@houtini/voice-analyser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. package/README.md +566 -0
  2. package/dist/analyzers/anti-mechanical.d.ts +58 -0
  3. package/dist/analyzers/anti-mechanical.d.ts.map +1 -0
  4. package/dist/analyzers/anti-mechanical.js +223 -0
  5. package/dist/analyzers/anti-mechanical.js.map +1 -0
  6. package/dist/analyzers/char-ngrams.d.ts +54 -0
  7. package/dist/analyzers/char-ngrams.d.ts.map +1 -0
  8. package/dist/analyzers/char-ngrams.js +208 -0
  9. package/dist/analyzers/char-ngrams.js.map +1 -0
  10. package/dist/analyzers/function-words.d.ts +41 -0
  11. package/dist/analyzers/function-words.d.ts.map +1 -0
  12. package/dist/analyzers/function-words.js +167 -0
  13. package/dist/analyzers/function-words.js.map +1 -0
  14. package/dist/analyzers/information-density.d.ts +78 -0
  15. package/dist/analyzers/information-density.d.ts.map +1 -0
  16. package/dist/analyzers/information-density.js +384 -0
  17. package/dist/analyzers/information-density.js.map +1 -0
  18. package/dist/analyzers/paragraph.d.ts +28 -0
  19. package/dist/analyzers/paragraph.d.ts.map +1 -0
  20. package/dist/analyzers/paragraph.js +78 -0
  21. package/dist/analyzers/paragraph.js.map +1 -0
  22. package/dist/analyzers/pos-ngrams.d.ts +59 -0
  23. package/dist/analyzers/pos-ngrams.d.ts.map +1 -0
  24. package/dist/analyzers/pos-ngrams.js +249 -0
  25. package/dist/analyzers/pos-ngrams.js.map +1 -0
  26. package/dist/analyzers/punctuation.d.ts +34 -0
  27. package/dist/analyzers/punctuation.d.ts.map +1 -0
  28. package/dist/analyzers/punctuation.js +174 -0
  29. package/dist/analyzers/punctuation.js.map +1 -0
  30. package/dist/analyzers/sentence.d.ts +33 -0
  31. package/dist/analyzers/sentence.d.ts.map +1 -0
  32. package/dist/analyzers/sentence.js +74 -0
  33. package/dist/analyzers/sentence.js.map +1 -0
  34. package/dist/analyzers/vocabulary.d.ts +40 -0
  35. package/dist/analyzers/vocabulary.d.ts.map +1 -0
  36. package/dist/analyzers/vocabulary.js +96 -0
  37. package/dist/analyzers/vocabulary.js.map +1 -0
  38. package/dist/analyzers/voice-markers.d.ts +88 -0
  39. package/dist/analyzers/voice-markers.d.ts.map +1 -0
  40. package/dist/analyzers/voice-markers.js +297 -0
  41. package/dist/analyzers/voice-markers.js.map +1 -0
  42. package/dist/analyzers/word-ngrams.d.ts +59 -0
  43. package/dist/analyzers/word-ngrams.d.ts.map +1 -0
  44. package/dist/analyzers/word-ngrams.js +259 -0
  45. package/dist/analyzers/word-ngrams.js.map +1 -0
  46. package/dist/index.d.ts +7 -0
  47. package/dist/index.d.ts.map +1 -0
  48. package/dist/index.js +190 -0
  49. package/dist/index.js.map +1 -0
  50. package/dist/reference/function-words.d.ts +48 -0
  51. package/dist/reference/function-words.d.ts.map +1 -0
  52. package/dist/reference/function-words.js +164 -0
  53. package/dist/reference/function-words.js.map +1 -0
  54. package/dist/tools/analyze-corpus.d.ts +15 -0
  55. package/dist/tools/analyze-corpus.d.ts.map +1 -0
  56. package/dist/tools/analyze-corpus.js +188 -0
  57. package/dist/tools/analyze-corpus.js.map +1 -0
  58. package/dist/tools/collect-corpus.d.ts +25 -0
  59. package/dist/tools/collect-corpus.d.ts.map +1 -0
  60. package/dist/tools/collect-corpus.js +109 -0
  61. package/dist/tools/collect-corpus.js.map +1 -0
  62. package/dist/tools/generate-enhanced-guide.d.ts +45 -0
  63. package/dist/tools/generate-enhanced-guide.d.ts.map +1 -0
  64. package/dist/tools/generate-enhanced-guide.js +881 -0
  65. package/dist/tools/generate-enhanced-guide.js.map +1 -0
  66. package/dist/tools/generate-guide.d.ts +16 -0
  67. package/dist/tools/generate-guide.d.ts.map +1 -0
  68. package/dist/tools/generate-guide.js +228 -0
  69. package/dist/tools/generate-guide.js.map +1 -0
  70. package/dist/utils/cleaner.d.ts +56 -0
  71. package/dist/utils/cleaner.d.ts.map +1 -0
  72. package/dist/utils/cleaner.js +193 -0
  73. package/dist/utils/cleaner.js.map +1 -0
  74. package/dist/utils/crawler.d.ts +13 -0
  75. package/dist/utils/crawler.d.ts.map +1 -0
  76. package/dist/utils/crawler.js +66 -0
  77. package/dist/utils/crawler.js.map +1 -0
  78. package/dist/utils/delta.d.ts +56 -0
  79. package/dist/utils/delta.d.ts.map +1 -0
  80. package/dist/utils/delta.js +124 -0
  81. package/dist/utils/delta.js.map +1 -0
  82. package/dist/utils/extractor.d.ts +14 -0
  83. package/dist/utils/extractor.d.ts.map +1 -0
  84. package/dist/utils/extractor.js +92 -0
  85. package/dist/utils/extractor.js.map +1 -0
  86. package/dist/utils/ngrams.d.ts +72 -0
  87. package/dist/utils/ngrams.d.ts.map +1 -0
  88. package/dist/utils/ngrams.js +154 -0
  89. package/dist/utils/ngrams.js.map +1 -0
  90. package/dist/utils/statistics.d.ts +22 -0
  91. package/dist/utils/statistics.d.ts.map +1 -0
  92. package/dist/utils/statistics.js +54 -0
  93. package/dist/utils/statistics.js.map +1 -0
  94. package/dist/utils/zscore.d.ts +44 -0
  95. package/dist/utils/zscore.d.ts.map +1 -0
  96. package/dist/utils/zscore.js +76 -0
  97. package/dist/utils/zscore.js.map +1 -0
  98. package/package.json +67 -0
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Voice Analysis MCP Server
4
+ * Automatic tone-of-voice analysis from published writing corpus
5
+ */
6
+ export {};
7
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;GAGG"}
package/dist/index.js ADDED
@@ -0,0 +1,190 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Voice Analysis MCP Server
4
+ * Automatic tone-of-voice analysis from published writing corpus
5
+ */
6
+ import { Server } from '@modelcontextprotocol/sdk/server/index.js';
7
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
8
+ import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
9
+ import { collectCorpus } from './tools/collect-corpus.js';
10
+ import { analyzeCorpus } from './tools/analyze-corpus.js';
11
+ import { generateTovGuide } from './tools/generate-guide.js';
12
+ import { generateEnhancedGuide } from './tools/generate-enhanced-guide.js';
13
+ const server = new Server({
14
+ name: 'voice-analysis-server',
15
+ version: '1.0.0',
16
+ }, {
17
+ capabilities: {
18
+ tools: {},
19
+ },
20
+ });
21
+ // Register tools
22
+ server.setRequestHandler(ListToolsRequestSchema, async () => ({
23
+ tools: [
24
+ {
25
+ name: 'collect_corpus',
26
+ description: 'Crawl sitemap and collect clean writing corpus from published articles',
27
+ inputSchema: {
28
+ type: 'object',
29
+ properties: {
30
+ sitemap_url: {
31
+ type: 'string',
32
+ description: 'URL to XML sitemap (e.g., https://example.com/post-sitemap.xml)'
33
+ },
34
+ output_name: {
35
+ type: 'string',
36
+ description: 'Corpus identifier/name (e.g., "richard-baxter")'
37
+ },
38
+ max_articles: {
39
+ type: 'number',
40
+ description: 'Maximum articles to process (default: 100)',
41
+ default: 100
42
+ },
43
+ article_pattern: {
44
+ type: 'string',
45
+ description: 'Optional regex to filter URLs'
46
+ }
47
+ },
48
+ required: ['sitemap_url', 'output_name'],
49
+ },
50
+ },
51
+ {
52
+ name: 'analyze_corpus',
53
+ description: 'Perform linguistic analysis on collected corpus (vocabulary, sentence structure, voice markers)',
54
+ inputSchema: {
55
+ type: 'object',
56
+ properties: {
57
+ corpus_name: {
58
+ type: 'string',
59
+ description: 'Name of corpus to analyze'
60
+ },
61
+ analysis_type: {
62
+ type: 'string',
63
+ enum: ['full', 'quick', 'vocabulary', 'syntax'],
64
+ description: 'Type of analysis to perform (default: full)',
65
+ default: 'full'
66
+ },
67
+ },
68
+ required: ['corpus_name'],
69
+ },
70
+ },
71
+ {
72
+ name: 'generate_tov_guide',
73
+ description: 'Generate tone-of-voice guide from analysis results (LLM-optimized statistical model)',
74
+ inputSchema: {
75
+ type: 'object',
76
+ properties: {
77
+ corpus_name: {
78
+ type: 'string',
79
+ description: 'Name of analyzed corpus'
80
+ },
81
+ output_format: {
82
+ type: 'string',
83
+ enum: ['llm', 'human', 'both'],
84
+ description: 'Output format (default: both)',
85
+ default: 'both'
86
+ },
87
+ template: {
88
+ type: 'string',
89
+ enum: ['minimal', 'standard', 'comprehensive'],
90
+ description: 'Guide template (default: standard)',
91
+ default: 'standard'
92
+ }
93
+ },
94
+ required: ['corpus_name'],
95
+ },
96
+ },
97
+ {
98
+ name: 'generate_enhanced_guide',
99
+ description: 'Generate ENHANCED tone-of-voice guide integrating all n-gram patterns (character, word, POS) with function words and traditional metrics. Creates comprehensive LLM instruction set with contrastive examples.',
100
+ inputSchema: {
101
+ type: 'object',
102
+ properties: {
103
+ corpus_name: {
104
+ type: 'string',
105
+ description: 'Name of analyzed corpus'
106
+ },
107
+ output_format: {
108
+ type: 'string',
109
+ enum: ['llm', 'human', 'both'],
110
+ description: 'Output format (default: both)',
111
+ default: 'both'
112
+ }
113
+ },
114
+ required: ['corpus_name'],
115
+ },
116
+ },
117
+ ],
118
+ }));
119
+ // Tool handlers
120
+ server.setRequestHandler(CallToolRequestSchema, async (request) => {
121
+ try {
122
+ switch (request.params.name) {
123
+ case 'collect_corpus': {
124
+ const params = request.params.arguments;
125
+ const result = await collectCorpus(params);
126
+ return {
127
+ content: [
128
+ {
129
+ type: 'text',
130
+ text: JSON.stringify(result, null, 2)
131
+ }
132
+ ]
133
+ };
134
+ }
135
+ case 'analyze_corpus': {
136
+ const params = request.params.arguments;
137
+ const result = await analyzeCorpus(params);
138
+ return {
139
+ content: [
140
+ {
141
+ type: 'text',
142
+ text: JSON.stringify(result, null, 2)
143
+ }
144
+ ]
145
+ };
146
+ }
147
+ case 'generate_tov_guide': {
148
+ const params = request.params.arguments;
149
+ const result = await generateTovGuide(params);
150
+ return {
151
+ content: [
152
+ {
153
+ type: 'text',
154
+ text: JSON.stringify(result, null, 2)
155
+ }
156
+ ]
157
+ };
158
+ }
159
+ case 'generate_enhanced_guide': {
160
+ const params = request.params.arguments;
161
+ const result = await generateEnhancedGuide(params);
162
+ return {
163
+ content: [
164
+ {
165
+ type: 'text',
166
+ text: JSON.stringify(result, null, 2)
167
+ }
168
+ ]
169
+ };
170
+ }
171
+ default:
172
+ throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${request.params.name}`);
173
+ }
174
+ }
175
+ catch (error) {
176
+ if (error instanceof McpError) {
177
+ throw error;
178
+ }
179
+ throw new McpError(ErrorCode.InternalError, `Tool execution failed: ${error instanceof Error ? error.message : String(error)}`);
180
+ }
181
+ });
182
+ // Start server
183
+ async function main() {
184
+ const transport = new StdioServerTransport();
185
+ await server.connect(transport);
186
+ }
187
+ main().catch((error) => {
188
+ process.exit(1);
189
+ });
190
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAEA;;;GAGG;AAEH,OAAO,EAAE,MAAM,EAAE,MAAM,2CAA2C,CAAC;AACnE,OAAO,EAAE,oBAAoB,EAAE,MAAM,2CAA2C,CAAC;AACjF,OAAO,EACL,qBAAqB,EACrB,sBAAsB,EACtB,SAAS,EACT,QAAQ,EACT,MAAM,oCAAoC,CAAC;AAE5C,OAAO,EAAE,aAAa,EAAuB,MAAM,2BAA2B,CAAC;AAC/E,OAAO,EAAE,aAAa,EAAuB,MAAM,2BAA2B,CAAC;AAC/E,OAAO,EAAE,gBAAgB,EAA0B,MAAM,2BAA2B,CAAC;AACrF,OAAO,EAAE,qBAAqB,EAAuB,MAAM,oCAAoC,CAAC;AAEhG,MAAM,MAAM,GAAG,IAAI,MAAM,CACvB;IACE,IAAI,EAAE,uBAAuB;IAC7B,OAAO,EAAE,OAAO;CACjB,EACD;IACE,YAAY,EAAE;QACZ,KAAK,EAAE,EAAE;KACV;CACF,CACF,CAAC;AAEF,iBAAiB;AACjB,MAAM,CAAC,iBAAiB,CAAC,sBAAsB,EAAE,KAAK,IAAI,EAAE,CAAC,CAAC;IAC5D,KAAK,EAAE;QACL;YACE,IAAI,EAAE,gBAAgB;YACtB,WAAW,EAAE,wEAAwE;YACrF,WAAW,EAAE;gBACX,IAAI,EAAE,QAAQ;gBACd,UAAU,EAAE;oBACV,WAAW,EAAE;wBACX,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,iEAAiE;qBAC/E;oBACD,WAAW,EAAE;wBACX,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,iDAAiD;qBAC/D;oBACD,YAAY,EAAE;wBACZ,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,4CAA4C;wBACzD,OAAO,EAAE,GAAG;qBACb;oBACD,eAAe,EAAE;wBACf,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,+BAA+B;qBAC7C;iBACF;gBACD,QAAQ,EAAE,CAAC,aAAa,EAAE,aAAa,CAAC;aACzC;SACF;QACD;YACE,IAAI,EAAE,gBAAgB;YACtB,WAAW,EAAE,iGAAiG;YAC9G,WAAW,EAAE;gBACX,IAAI,EAAE,QAAQ;gBACd,UAAU,EAAE;oBACV,WAAW,EAAE;wBACX,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,2BAA2B;qBACzC;oBACD,aAAa,EAAE;wBACb,IAAI,EAAE,QAAQ;wBACd,IAAI,EAAE,CAAC,MAAM,EAAE,OAAO,EAAE,YAAY,EAAE,QAAQ,CAAC;wBAC/C,WAAW,EAAE,6CAA6C;wBAC1D,OAAO,EAAE,MAAM;qBAChB;iBACF;gBACD,QAAQ,EAAE,CAAC,aAAa,CAAC;aAC1B;SACF;QACD;YACE,IAAI,EAAE,oBAAoB;YAC1B,WAAW,EAAE,sFAAsF;YACnG,WAAW,EAAE;gBACX,IAAI,EAAE,QAAQ;gBACd,UAAU,EAAE;oBACV,WAAW,EAAE;wBACX,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,yBAAyB;qBACvC;oBACD,aAAa,EAAE;wBACb,IAAI,EAAE,QAAQ;wBACd,IAAI,EAAE,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,CAAC;wBAC9B,WAAW,EAAE,+BAA+B;wBAC5C,OAAO,EAAE,MAAM;qBAChB;oBACD,QAAQ,EAAE;wBACR,IAAI,EAAE,QAAQ;wBACd,IAAI,EAAE,CAAC,SAAS,EAAE,UAAU,EAAE,eAAe,CAAC;wBAC9C,WAAW,EAAE,oCAAoC;wBACjD,OAAO,EAAE,UAAU;qBACpB;iBACF;gBACD,QAAQ,EAAE,CAAC,aA
Aa,CAAC;aAC1B;SACF;QACD;YACE,IAAI,EAAE,yBAAyB;YAC/B,WAAW,EAAE,gNAAgN;YAC7N,WAAW,EAAE;gBACX,IAAI,EAAE,QAAQ;gBACd,UAAU,EAAE;oBACV,WAAW,EAAE;wBACX,IAAI,EAAE,QAAQ;wBACd,WAAW,EAAE,yBAAyB;qBACvC;oBACD,aAAa,EAAE;wBACb,IAAI,EAAE,QAAQ;wBACd,IAAI,EAAE,CAAC,KAAK,EAAE,OAAO,EAAE,MAAM,CAAC;wBAC9B,WAAW,EAAE,+BAA+B;wBAC5C,OAAO,EAAE,MAAM;qBAChB;iBACF;gBACD,QAAQ,EAAE,CAAC,aAAa,CAAC;aAC1B;SACF;KACF;CACF,CAAC,CAAC,CAAC;AAEJ,gBAAgB;AAChB,MAAM,CAAC,iBAAiB,CAAC,qBAAqB,EAAE,KAAK,EAAE,OAAO,EAAE,EAAE;IAChE,IAAI,CAAC;QACH,QAAQ,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;YAC5B,KAAK,gBAAgB,CAAC,CAAC,CAAC;gBACtB,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,SAA2C,CAAC;gBAC1E,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,MAAM,CAAC,CAAC;gBAC3C,OAAO;oBACL,OAAO,EAAE;wBACP;4BACE,IAAI,EAAE,MAAM;4BACZ,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC;yBACtC;qBACF;iBACF,CAAC;YACJ,CAAC;YAED,KAAK,gBAAgB,CAAC,CAAC,CAAC;gBACtB,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,SAA2C,CAAC;gBAC1E,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,MAAM,CAAC,CAAC;gBAC3C,OAAO;oBACL,OAAO,EAAE;wBACP;4BACE,IAAI,EAAE,MAAM;4BACZ,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC;yBACtC;qBACF;iBACF,CAAC;YACJ,CAAC;YAED,KAAK,oBAAoB,CAAC,CAAC,CAAC;gBAC1B,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,SAA8C,CAAC;gBAC7E,MAAM,MAAM,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,CAAC;gBAC9C,OAAO;oBACL,OAAO,EAAE;wBACP;4BACE,IAAI,EAAE,MAAM;4BACZ,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC;yBACtC;qBACF;iBACF,CAAC;YACJ,CAAC;YAED,KAAK,yBAAyB,CAAC,CAAC,CAAC;gBAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,SAA2C,CAAC;gBAC1E,MAAM,MAAM,GAAG,MAAM,qBAAqB,CAAC,MAAM,CAAC,CAAC;gBACnD,OAAO;oBACL,OAAO,EAAE;wBACP;4BACE,IAAI,EAAE,MAAM;4BACZ,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC;yBACtC;qBACF;iBACF,CAAC;YACJ,CAAC;YAED;gBACE,MAAM,IAAI,QAAQ,CAChB,SAAS,CAAC,cAAc,EACxB,iBAAiB,OAAO,CAAC,MAAM,CAAC,IAAI,EAAE,CACvC,CAAC;QACN,CAAC;IACH,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,IAAI,KAAK,YAAY,QAAQ,EAAE,CAAC;YAC9B,MAAM,KAAK,CAAC;QACd,CAAC;QAED,MAAM,IAAI,QAAQ,CAChB,SAAS,CAAC,aAAa,EACvB,0BAA0B,KAAK,YAAY,KAAK,CAAC,
CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CACnF,CAAC;IACJ,CAAC;AACH,CAAC,CAAC,CAAC;AAEH,eAAe;AACf,KAAK,UAAU,IAAI;IACjB,MAAM,SAAS,GAAG,IAAI,oBAAoB,EAAE,CAAC;IAC7C,MAAM,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;AAClC,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE,EAAE;IACrB,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC,CAAC,CAAC"}
@@ -0,0 +1,48 @@
1
+ /**
2
+ * Function word reference list for stylometric analysis
3
+ *
4
+ * Based on Mosteller-Wallace (1963), Burrows' Delta research
5
+ * 70 core function words optimized for authorship fingerprinting
6
+ */
7
+ export type FunctionWordCategory = 'article' | 'determiner' | 'preposition' | 'conjunction' | 'modal' | 'auxiliary' | 'pronoun';
8
+ export interface FunctionWord {
9
+ word: string;
10
+ category: FunctionWordCategory;
11
+ tier: 1 | 2 | 3 | 4;
12
+ britishMarker?: boolean;
13
+ notes?: string;
14
+ }
15
+ /**
16
+ * Core 70 function words for stylometric analysis
17
+ */
18
+ export declare const FUNCTION_WORDS: FunctionWord[];
19
+ /**
20
+ * Get function words by tier (discriminative priority)
21
+ */
22
+ export declare function getFunctionWordsByTier(tier: 1 | 2 | 3 | 4): FunctionWord[];
23
+ /**
24
+ * Get function words by category
25
+ */
26
+ export declare function getFunctionWordsByCategory(category: FunctionWordCategory): FunctionWord[];
27
+ /**
28
+ * Get British marker function words
29
+ */
30
+ export declare function getBritishMarkers(): FunctionWord[];
31
+ /**
32
+ * Create lookup map for fast access
33
+ */
34
+ export declare function getFunctionWordMap(): Map<string, FunctionWord>;
35
+ /**
36
+ * General English reference statistics for z-score calculation
37
+ *
38
+ * These are baseline frequencies (per 1000 words) from large English corpora
39
+ * Source: Brown Corpus, BNC (British National Corpus)
40
+ *
41
+ * Note: These should ideally be calculated from a reference corpus
42
+ * For now, using typical values from linguistic research
43
+ */
44
+ export declare const GENERAL_ENGLISH_STATS: Record<string, {
45
+ mean: number;
46
+ stdDev: number;
47
+ }>;
48
+ //# sourceMappingURL=function-words.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"function-words.d.ts","sourceRoot":"","sources":["../../src/reference/function-words.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,MAAM,MAAM,oBAAoB,GAC5B,SAAS,GACT,YAAY,GACZ,aAAa,GACb,aAAa,GACb,OAAO,GACP,WAAW,GACX,SAAS,CAAC;AAEd,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,oBAAoB,CAAC;IAC/B,IAAI,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IACpB,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,eAAO,MAAM,cAAc,EAAE,YAAY,EAmFxC,CAAC;AAEF;;GAEG;AACH,wBAAgB,sBAAsB,CAAC,IAAI,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,YAAY,EAAE,CAE1E;AAED;;GAEG;AACH,wBAAgB,0BAA0B,CAAC,QAAQ,EAAE,oBAAoB,GAAG,YAAY,EAAE,CAEzF;AAED;;GAEG;AACH,wBAAgB,iBAAiB,IAAI,YAAY,EAAE,CAElD;AAED;;GAEG;AACH,wBAAgB,kBAAkB,IAAI,GAAG,CAAC,MAAM,EAAE,YAAY,CAAC,CAM9D;AAED;;;;;;;;GAQG;AACH,eAAO,MAAM,qBAAqB,EAAE,MAAM,CAAC,MAAM,EAAE;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAwClF,CAAC"}
@@ -0,0 +1,164 @@
1
+ /**
2
+ * Function word reference list for stylometric analysis
3
+ *
4
+ * Based on Mosteller-Wallace (1963), Burrows' Delta research
5
+ * 70 core function words optimized for authorship fingerprinting
6
+ */
7
+ /**
8
+ * Core 70 function words for stylometric analysis
9
+ */
10
+ export const FUNCTION_WORDS = [
11
+ // Articles & Determiners (12 words)
12
+ { word: 'a', category: 'article', tier: 2 },
13
+ { word: 'an', category: 'article', tier: 2 },
14
+ { word: 'the', category: 'article', tier: 2 },
15
+ { word: 'this', category: 'determiner', tier: 2 },
16
+ { word: 'that', category: 'determiner', tier: 2 },
17
+ { word: 'these', category: 'determiner', tier: 2 },
18
+ { word: 'those', category: 'determiner', tier: 2 },
19
+ { word: 'some', category: 'determiner', tier: 2 },
20
+ { word: 'any', category: 'determiner', tier: 2 },
21
+ { word: 'all', category: 'determiner', tier: 2 },
22
+ { word: 'every', category: 'determiner', tier: 2 },
23
+ { word: 'no', category: 'determiner', tier: 2 },
24
+ // Prepositions (16 words)
25
+ { word: 'at', category: 'preposition', tier: 2 },
26
+ { word: 'by', category: 'preposition', tier: 2 },
27
+ { word: 'for', category: 'preposition', tier: 2 },
28
+ { word: 'from', category: 'preposition', tier: 2 },
29
+ { word: 'in', category: 'preposition', tier: 2 },
30
+ { word: 'into', category: 'preposition', tier: 2 },
31
+ { word: 'of', category: 'preposition', tier: 2 },
32
+ { word: 'on', category: 'preposition', tier: 2 },
33
+ { word: 'to', category: 'preposition', tier: 2 },
34
+ { word: 'upon', category: 'preposition', tier: 1, notes: 'Highly discriminative' },
35
+ { word: 'with', category: 'preposition', tier: 2 },
36
+ { word: 'without', category: 'preposition', tier: 1 },
37
+ { word: 'through', category: 'preposition', tier: 2 },
38
+ { word: 'between', category: 'preposition', tier: 2 },
39
+ { word: 'within', category: 'preposition', tier: 1 },
40
+ { word: 'across', category: 'preposition', tier: 2 },
41
+ // Conjunctions (14 words)
42
+ { word: 'and', category: 'conjunction', tier: 2 },
43
+ { word: 'as', category: 'conjunction', tier: 2 },
44
+ { word: 'but', category: 'conjunction', tier: 2 },
45
+ { word: 'if', category: 'conjunction', tier: 2 },
46
+ { word: 'or', category: 'conjunction', tier: 2 },
47
+ { word: 'so', category: 'conjunction', tier: 2 },
48
+ { word: 'than', category: 'conjunction', tier: 2 },
49
+ { word: 'that', category: 'conjunction', tier: 2 },
50
+ { word: 'though', category: 'conjunction', tier: 1, notes: 'Highly discriminative' },
51
+ { word: 'when', category: 'conjunction', tier: 2 },
52
+ { word: 'while', category: 'conjunction', tier: 2 },
53
+ { word: 'whilst', category: 'conjunction', tier: 1, britishMarker: true, notes: 'British preference' },
54
+ { word: 'because', category: 'conjunction', tier: 2 },
55
+ { word: 'although', category: 'conjunction', tier: 2 },
56
+ // Modal Verbs (9 words)
57
+ { word: 'can', category: 'modal', tier: 3 },
58
+ { word: 'could', category: 'modal', tier: 3 },
59
+ { word: 'may', category: 'modal', tier: 1, notes: 'Highly discriminative' },
60
+ { word: 'might', category: 'modal', tier: 3 },
61
+ { word: 'must', category: 'modal', tier: 1, notes: 'Highly discriminative' },
62
+ { word: 'shall', category: 'modal', tier: 1, notes: 'Highly discriminative' },
63
+ { word: 'should', category: 'modal', tier: 3 },
64
+ { word: 'will', category: 'modal', tier: 3 },
65
+ { word: 'would', category: 'modal', tier: 3 },
66
+ // Auxiliary Verbs (10 words)
67
+ { word: 'be', category: 'auxiliary', tier: 3 },
68
+ { word: 'been', category: 'auxiliary', tier: 3 },
69
+ { word: 'being', category: 'auxiliary', tier: 3 },
70
+ { word: 'do', category: 'auxiliary', tier: 3 },
71
+ { word: 'does', category: 'auxiliary', tier: 3 },
72
+ { word: 'had', category: 'auxiliary', tier: 3 },
73
+ { word: 'has', category: 'auxiliary', tier: 3 },
74
+ { word: 'have', category: 'auxiliary', tier: 3 },
75
+ { word: 'is', category: 'auxiliary', tier: 3 },
76
+ { word: 'was', category: 'auxiliary', tier: 3 },
77
+ { word: 'were', category: 'auxiliary', tier: 3 },
78
+ // Pronouns (9 words - use with caution, genre-sensitive)
79
+ { word: 'I', category: 'pronoun', tier: 4, notes: 'Genre-sensitive, track separately' },
80
+ { word: 'we', category: 'pronoun', tier: 4, notes: 'Genre-sensitive' },
81
+ { word: 'you', category: 'pronoun', tier: 4, notes: 'Genre-sensitive' },
82
+ { word: 'he', category: 'pronoun', tier: 4, notes: 'Genre-sensitive' },
83
+ { word: 'she', category: 'pronoun', tier: 4, notes: 'Genre-sensitive' },
84
+ { word: 'it', category: 'pronoun', tier: 4, notes: 'Genre-sensitive' },
85
+ { word: 'they', category: 'pronoun', tier: 4, notes: 'Genre-sensitive' },
86
+ { word: 'one', category: 'pronoun', tier: 4, notes: 'Genre-sensitive' },
87
+ { word: 'who', category: 'pronoun', tier: 4, notes: 'Genre-sensitive' },
88
+ ];
89
+ /**
90
+ * Get function words by tier (discriminative priority)
91
+ */
92
+ export function getFunctionWordsByTier(tier) {
93
+ return FUNCTION_WORDS.filter(fw => fw.tier === tier);
94
+ }
95
+ /**
96
+ * Get function words by category
97
+ */
98
+ export function getFunctionWordsByCategory(category) {
99
+ return FUNCTION_WORDS.filter(fw => fw.category === category);
100
+ }
101
+ /**
102
+ * Get British marker function words
103
+ */
104
+ export function getBritishMarkers() {
105
+ return FUNCTION_WORDS.filter(fw => fw.britishMarker);
106
+ }
107
+ /**
108
+ * Create lookup map for fast access
109
+ */
110
+ export function getFunctionWordMap() {
111
+ const map = new Map();
112
+ for (const fw of FUNCTION_WORDS) {
113
+ map.set(fw.word.toLowerCase(), fw);
114
+ }
115
+ return map;
116
+ }
117
+ /**
118
+ * General English reference statistics for z-score calculation
119
+ *
120
+ * These are baseline frequencies (per 1000 words) from large English corpora
121
+ * Source: Brown Corpus, BNC (British National Corpus)
122
+ *
123
+ * Note: These should ideally be calculated from a reference corpus
124
+ * For now, using typical values from linguistic research
125
+ */
126
+ export const GENERAL_ENGLISH_STATS = {
127
+ // High-frequency words (very stable across corpora)
128
+ 'the': { mean: 60.0, stdDev: 10.0 },
129
+ 'of': { mean: 35.0, stdDev: 8.0 },
130
+ 'and': { mean: 28.0, stdDev: 7.0 },
131
+ 'a': { mean: 22.0, stdDev: 5.0 },
132
+ 'to': { mean: 25.0, stdDev: 6.0 },
133
+ 'in': { mean: 20.0, stdDev: 5.0 },
134
+ 'is': { mean: 10.0, stdDev: 3.0 },
135
+ 'that': { mean: 12.0, stdDev: 4.0 },
136
+ 'for': { mean: 12.0, stdDev: 4.0 },
137
+ 'it': { mean: 11.0, stdDev: 3.0 },
138
+ 'with': { mean: 9.0, stdDev: 3.0 },
139
+ 'as': { mean: 8.0, stdDev: 3.0 },
140
+ 'was': { mean: 7.0, stdDev: 2.5 },
141
+ 'on': { mean: 7.0, stdDev: 2.5 },
142
+ 'be': { mean: 7.0, stdDev: 2.5 },
143
+ // Modal verbs
144
+ 'can': { mean: 3.0, stdDev: 1.5 },
145
+ 'would': { mean: 4.0, stdDev: 2.0 },
146
+ 'will': { mean: 3.5, stdDev: 2.0 },
147
+ 'could': { mean: 2.0, stdDev: 1.0 },
148
+ 'should': { mean: 1.5, stdDev: 1.0 },
149
+ 'may': { mean: 1.0, stdDev: 0.8 },
150
+ 'might': { mean: 0.8, stdDev: 0.6 },
151
+ 'must': { mean: 0.9, stdDev: 0.7 },
152
+ 'shall': { mean: 0.2, stdDev: 0.3 },
153
+ // British markers
154
+ 'whilst': { mean: 0.1, stdDev: 0.2 }, // Very rare in American English
155
+ 'upon': { mean: 0.5, stdDev: 0.5 },
156
+ // Pronouns (highly variable by genre)
157
+ 'I': { mean: 5.0, stdDev: 5.0 }, // Large variance
158
+ 'we': { mean: 3.0, stdDev: 3.0 },
159
+ 'you': { mean: 4.0, stdDev: 4.0 },
160
+ 'he': { mean: 2.5, stdDev: 2.0 },
161
+ 'she': { mean: 1.5, stdDev: 1.5 },
162
+ 'they': { mean: 2.0, stdDev: 1.5 },
163
+ };
164
+ //# sourceMappingURL=function-words.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"function-words.js","sourceRoot":"","sources":["../../src/reference/function-words.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAmBH;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAmB;IAC5C,oCAAoC;IACpC,EAAE,IAAI,EAAE,GAAG,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE;IAC3C,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE;IAC5C,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE;IAC7C,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;IACjD,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;IACjD,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;IAClD,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;IAClD,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;IACjD,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;IAClD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,IAAI,EAAE,CAAC,EAAE;IAE/C,0BAA0B;IAC1B,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IACjD,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAClD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAClD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,uBAAuB,EAAE;IAClF,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAClD,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IACrD,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EA
AE,CAAC,EAAE;IACrD,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IACrD,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IACpD,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAEpD,0BAA0B;IAC1B,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IACjD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IACjD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAClD,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAClD,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,uBAAuB,EAAE;IACpF,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAClD,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IACnD,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE,aAAa,EAAE,IAAI,EAAE,KAAK,EAAE,oBAAoB,EAAE;IACtG,EAAE,IAAI,EAAE,SAAS,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IACrD,EAAE,IAAI,EAAE,UAAU,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,CAAC,EAAE;IAEtD,wBAAwB;IACxB,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE;IAC3C,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE;IAC7C,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,uBAAuB,EAAE;IAC3E,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE;IAC7C,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,uBAAuB,EAAE;IAC5E,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,uBAAuB,EAAE;IAC7E,EAAE,IAAI,EAAE,QAAQ,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE;IAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,EAAE;IAC5C,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,OAAO,EAAE,IAAI,
EAAE,CAAC,EAAE;IAE7C,6BAA6B;IAC7B,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IACjD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAC9C,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAC/C,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAChD,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAC9C,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAC/C,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,EAAE;IAEhD,yDAAyD;IACzD,EAAE,IAAI,EAAE,GAAG,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,mCAAmC,EAAE;IACvF,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE;IACtE,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE;IACvE,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE;IACtE,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE;IACvE,EAAE,IAAI,EAAE,IAAI,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE;IACtE,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE;IACxE,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE;IACvE,EAAE,IAAI,EAAE,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE,iBAAiB,EAAE;CACxE,CAAC;AAEF;;GAEG;AACH,MAAM,UAAU,sBAAsB,CAAC,IAAmB;IACxD,OAAO,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,IAAI,KAAK,IAAI,CAAC,CAAC;AACvD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,0BAA0B,CAAC,QAA8B;IACvE,OAAO,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,QAAQ,KAAK,QAAQ,CAAC,CAAC;AAC/D,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,iBAAiB;IAC/B,OAAO,cAAc
,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,aAAa,CAAC,CAAC;AACvD,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB;IAChC,MAAM,GAAG,GAAG,IAAI,GAAG,EAAwB,CAAC;IAC5C,KAAK,MAAM,EAAE,IAAI,cAAc,EAAE,CAAC;QAChC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,WAAW,EAAE,EAAE,EAAE,CAAC,CAAC;IACrC,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,CAAC,MAAM,qBAAqB,GAAqD;IACrF,oDAAoD;IACpD,KAAK,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE;IACnC,IAAI,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,KAAK,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;IAClC,GAAG,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;IAChC,IAAI,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,IAAI,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,IAAI,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,MAAM,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;IACnC,KAAK,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;IAClC,IAAI,EAAE,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,MAAM,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAClC,IAAI,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAChC,KAAK,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,IAAI,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAChC,IAAI,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAEhC,cAAc;IACd,KAAK,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,OAAO,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IACnC,MAAM,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAClC,OAAO,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IACnC,QAAQ,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IACpC,KAAK,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,OAAO,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IACnC,MAAM,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAClC,OAAO,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAEnC,kBAAkB;IAClB,QAAQ,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,EAAG,gCAAgC;IACvE,MAAM,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAElC,sCAAs
C;IACtC,GAAG,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,EAAG,iBAAiB;IACnD,IAAI,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAChC,KAAK,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,IAAI,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IAChC,KAAK,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;IACjC,MAAM,EAAE,EAAE,IAAI,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE;CACnC,CAAC"}
@@ -0,0 +1,15 @@
1
/**
 * Tool: analyze_corpus
 * Perform linguistic analysis on collected corpus
 */
export interface AnalyzeCorpusParams {
    /** Name of the corpus directory (under `corpus/`) to analyse. */
    corpus_name: string;
    /** Which analyzer groups to run; the implementation defaults to 'full'. */
    analysis_type?: 'full' | 'quick' | 'vocabulary' | 'syntax';
}
export interface AnalyzeCorpusResult {
    /** True when the analysis completed and artefacts were written. */
    success: boolean;
    /** Echo of the corpus name that was analysed. */
    corpus_name: string;
    /** Path of the `analysis/` directory holding the generated files. */
    analysis_path: string;
}
export declare function analyzeCorpus(params: AnalyzeCorpusParams): Promise<AnalyzeCorpusResult>;
//# sourceMappingURL=analyze-corpus.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"analyze-corpus.d.ts","sourceRoot":"","sources":["../../src/tools/analyze-corpus.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAgBH,MAAM,WAAW,mBAAmB;IAClC,WAAW,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,MAAM,GAAG,OAAO,GAAG,YAAY,GAAG,QAAQ,CAAC;CAC5D;AAED,MAAM,WAAW,mBAAmB;IAClC,OAAO,EAAE,OAAO,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,wBAAsB,aAAa,CAAC,MAAM,EAAE,mBAAmB,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAuL7F"}
@@ -0,0 +1,188 @@
1
+ /**
2
+ * Tool: analyze_corpus
3
+ * Perform linguistic analysis on collected corpus
4
+ */
5
+ import fs from 'fs/promises';
6
+ import path from 'path';
7
+ import { analyzeVocabulary } from '../analyzers/vocabulary.js';
8
+ import { analyzeSentences } from '../analyzers/sentence.js';
9
+ import { analyzeVoiceMarkers } from '../analyzers/voice-markers.js';
10
+ import { analyzeParagraphs } from '../analyzers/paragraph.js';
11
+ import { analyzePunctuation, summarizePunctuation } from '../analyzers/punctuation.js';
12
+ import { analyzeFunctionWords, summarizeFunctionWordAnalysis } from '../analyzers/function-words.js';
13
+ import { analyzeCharNGrams, summarizeCharNGrams } from '../analyzers/char-ngrams.js';
14
+ import { analyzeWordNGrams, summarizeWordNGrams } from '../analyzers/word-ngrams.js';
15
+ import { analyzePOSNGrams, summarizePOSNGrams } from '../analyzers/pos-ngrams.js';
16
+ import { analyzeAntiMechanical } from '../analyzers/anti-mechanical.js';
17
+ import { analyzeInformationDensity, summarizeInformationDensity } from '../analyzers/information-density.js';
18
/**
 * Tool: analyze_corpus
 * Run the configured linguistic analyzers over a collected corpus and write
 * the results (JSON artefacts plus markdown summaries) into
 * `<cwd>/corpus/<corpus_name>/analysis/`.
 *
 * @param {AnalyzeCorpusParams} params
 * @param {string} params.corpus_name - Corpus directory name under `./corpus`.
 * @param {'full'|'quick'|'vocabulary'|'syntax'} [params.analysis_type='full']
 *        Which analyzer groups to run.
 * @returns {Promise<AnalyzeCorpusResult>} Success flag and the analysis path.
 */
export async function analyzeCorpus(params) {
    const { corpus_name, analysis_type = 'full' } = params;
    const corpusDir = path.join(process.cwd(), 'corpus', corpus_name);
    const articlesDir = path.join(corpusDir, 'articles');
    const analysisDir = path.join(corpusDir, 'analysis');
    // Create analysis directory (idempotent).
    await fs.mkdir(analysisDir, { recursive: true });
    // Collect the markdown article files.
    const files = await fs.readdir(articlesDir);
    const markdownFiles = files.filter((f) => f.endsWith('.md'));
    const articleCount = markdownFiles.length;
    // Read all articles in parallel — Promise.all preserves the readdir
    // order, so the combined text is identical to a sequential read.
    const contents = await Promise.all(
        markdownFiles.map((file) => fs.readFile(path.join(articlesDir, file), 'utf-8'))
    );
    // Strip YAML frontmatter and concatenate, each article followed by a blank line.
    const combinedText = contents
        .map((content) => content.replace(/^---[\s\S]*?---\n\n/, '') + '\n\n')
        .join('');
    // Small helpers to avoid repeating the write boilerplate for every analyzer.
    const writeJson = (name, data) =>
        fs.writeFile(path.join(analysisDir, name), JSON.stringify(data, null, 2), 'utf-8');
    const writeText = (name, text) =>
        fs.writeFile(path.join(analysisDir, name), text, 'utf-8');
    // Run analyses based on type.
    if (analysis_type === 'full' || analysis_type === 'vocabulary') {
        await writeJson('vocabulary.json', analyzeVocabulary(combinedText));
    }
    if (analysis_type === 'full' || analysis_type === 'syntax') {
        await writeJson('sentence.json', analyzeSentences(combinedText));
        const punctuationAnalysis = analyzePunctuation(combinedText);
        await writeJson('punctuation.json', punctuationAnalysis);
        // Punctuation summary includes AI detection for dash consistency.
        await writeText('punctuation-summary.md', summarizePunctuation(punctuationAnalysis));
        await writeJson('paragraph.json', analyzeParagraphs(combinedText));
    }
    if (analysis_type === 'full' || analysis_type === 'quick') {
        await writeJson('voice.json', analyzeVoiceMarkers(combinedText));
        // Function word analysis with z-scores, plus human-readable summary.
        const functionWordAnalysis = analyzeFunctionWords(combinedText);
        await writeJson('function-words.json', functionWordAnalysis);
        await writeText('function-words-summary.md', summarizeFunctionWordAnalysis(functionWordAnalysis));
        // Character n-gram analysis.
        const charNGramAnalysis = analyzeCharNGrams(combinedText);
        await writeJson('char-ngrams.json', charNGramAnalysis);
        await writeText('char-ngrams-summary.md', summarizeCharNGrams(charNGramAnalysis));
        // Word n-gram analysis.
        const wordNGramAnalysis = analyzeWordNGrams(combinedText);
        await writeJson('word-ngrams.json', wordNGramAnalysis);
        await writeText('word-ngrams-summary.md', summarizeWordNGrams(wordNGramAnalysis));
        // POS n-gram analysis.
        const posNGramAnalysis = analyzePOSNGrams(combinedText);
        await writeJson('pos-ngrams.json', posNGramAnalysis);
        await writeText('pos-ngrams-summary.md', summarizePOSNGrams(posNGramAnalysis));
        // Anti-mechanical (naturalness) analysis.
        const antiMechanicalAnalysis = analyzeAntiMechanical(combinedText);
        await writeJson('anti-mechanical.json', antiMechanicalAnalysis);
        await writeText('anti-mechanical-summary.md', generateAntiMechanicalSummary(antiMechanicalAnalysis));
        // Information density analysis (based on Dejan AI research).
        const informationDensityAnalysis = analyzeInformationDensity(combinedText, articleCount);
        await writeJson('information-density.json', informationDensityAnalysis);
        await writeText('information-density-summary.md', summarizeInformationDensity(informationDensityAnalysis));
    }
    // Aggregate everything written above into a single markdown overview.
    const summary = await generateSummary(analysisDir);
    await writeText('summary.md', summary);
    return {
        success: true,
        corpus_name,
        analysis_path: analysisDir
    };
}
/**
 * Build a markdown digest of every JSON analysis artefact in a directory.
 *
 * Each `*.json` file contributes a section headed by its base name and a
 * fenced code block containing the first 500 characters of its
 * pretty-printed content. The trailing ellipsis is only added when the
 * payload was actually truncated.
 *
 * @param {string} analysisDir - Directory containing the `*.json` analysis files.
 * @returns {Promise<string>} Markdown summary document.
 */
async function generateSummary(analysisDir) {
    const files = await fs.readdir(analysisDir);
    const jsonFiles = files.filter((f) => f.endsWith('.json'));
    let summary = '# Analysis Summary\n\n';
    for (const file of jsonFiles) {
        const content = await fs.readFile(path.join(analysisDir, file), 'utf-8');
        const data = JSON.parse(content);
        const pretty = JSON.stringify(data, null, 2);
        // Truncate long payloads; mark truncation only when it happened.
        const excerpt = pretty.length > 500 ? pretty.substring(0, 500) + '...' : pretty;
        // Anchored regex strips only the `.json` suffix, not an interior match.
        summary += `## ${file.replace(/\.json$/, '')}\n\n`;
        summary += '```json\n';
        summary += excerpt + '\n';
        summary += '```\n\n';
    }
    return summary;
}
/**
 * Render the anti-mechanical analysis as a human-readable markdown report
 * (evaluates writing naturalness vs robotic/AI patterns).
 *
 * @param {object} analysis - Result object produced by analyzeAntiMechanical.
 * @returns {string} Markdown summary document, newline-terminated.
 */
function generateAntiMechanicalSummary(analysis) {
    const {
        naturalness: nat,
        sentenceLengthVariation: sent,
        paragraphAsymmetry: para,
        firstPersonDistribution: fp,
        repetitiveStarts: rep
    } = analysis;
    // Fixed-order report: overall score, then one section per sub-analysis.
    const out = [
        '# Anti-Mechanical Analysis Summary',
        '',
        '*Evaluates writing naturalness vs robotic/AI patterns*',
        '',
        '## Overall Naturalness Score',
        '',
        `**Total Score:** ${nat.totalScore}/100 (${nat.interpretation.replace('_', ' ')})`,
        '',
        '| Component | Score | Max |',
        '|-----------|-------|-----|',
        `| Sentence Variation | ${nat.sentenceVariationScore} | 25 |`,
        `| Paragraph Variation | ${nat.paragraphVariationScore} | 25 |`,
        `| First-Person Distribution | ${nat.firstPersonScore} | 25 |`,
        `| Repetition Avoidance | ${nat.repetitionScore} | 25 |`,
        '',
        '## Sentence Length Variation',
        '',
        `- **Mean length:** ${sent.mean.toFixed(1)} words`,
        `- **Standard deviation:** ±${sent.stdDev.toFixed(1)}`,
        `- **Coefficient of variation:** ${sent.coefficientOfVariation.toFixed(2)}`,
        `- **Natural variation:** ${sent.hasNaturalVariation ? 'Yes (CV > 0.5)' : 'No (too uniform)'}`,
        '',
        '**Length Distribution:**',
        `- Short (1-8 words): ${sent.distribution.short}`,
        `- Medium (9-20 words): ${sent.distribution.medium}`,
        `- Long (21-40 words): ${sent.distribution.long}`,
        `- Very long (40+ words): ${sent.distribution.veryLong}`,
        '',
        '## Paragraph Asymmetry',
        '',
        `- **Mean sentences per paragraph:** ${para.meanSentences.toFixed(1)}`,
        `- **Standard deviation:** ±${para.stdDev.toFixed(1)}`,
        `- **Single-sentence paragraphs:** ${para.singleSentenceParagraphs}`,
        `- **Long paragraphs (5+):** ${para.longParagraphs}`,
        '',
        '## First-Person Distribution',
        '',
        `- **Total first-person instances:** ${fp.totalCount}`,
        `- **Sentence-start instances:** ${fp.sentenceStartCount}`,
        `- **Sentence-start ratio:** ${(fp.sentenceStartRatio * 100).toFixed(1)}%`,
        `- **Max consecutive "I" starts:** ${fp.consecutiveIStart}`,
        `- **Balanced distribution:** ${fp.isBalanced ? 'Yes' : 'No (too many sentence starts)'}`,
        '',
        '## Repetitive Starts',
        '',
        `- **Max consecutive same-start:** ${rep.maxConsecutiveSameStart}`,
        `- **Has repetition problem:** ${rep.hasRepetitionProblem ? 'Yes' : 'No'}`
    ];
    // List concrete patterns only when the analyzer actually found some.
    if (rep.problematicPatterns.length > 0) {
        out.push(`- **Problematic patterns:** ${rep.problematicPatterns.join(', ')}`);
    }
    out.push(
        '',
        '## Interpretation Guide',
        '',
        '| Score Range | Interpretation |',
        '|-------------|----------------|',
        '| 85-100 | Very natural - authentic human writing |',
        '| 65-84 | Natural - good variation |',
        '| 45-64 | Somewhat mechanical - needs more variation |',
        '| 0-44 | Mechanical - likely AI-generated or very formulaic |',
        ''
    );
    return out.join('\n');
}
//# sourceMappingURL=analyze-corpus.js.map