@nahisaho/katashiro-analyzer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/entity/entity-extractor.d.ts +44 -0
- package/dist/entity/entity-extractor.d.ts.map +1 -0
- package/dist/entity/entity-extractor.js +176 -0
- package/dist/entity/entity-extractor.js.map +1 -0
- package/dist/entity/index.d.ts +6 -0
- package/dist/entity/index.d.ts.map +1 -0
- package/dist/entity/index.js +5 -0
- package/dist/entity/index.js.map +1 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +16 -0
- package/dist/index.js.map +1 -0
- package/dist/interfaces.d.ts +37 -0
- package/dist/interfaces.d.ts.map +1 -0
- package/dist/interfaces.js +8 -0
- package/dist/interfaces.js.map +1 -0
- package/dist/quality/index.d.ts +6 -0
- package/dist/quality/index.d.ts.map +1 -0
- package/dist/quality/index.js +5 -0
- package/dist/quality/index.js.map +1 -0
- package/dist/quality/quality-scorer.d.ts +65 -0
- package/dist/quality/quality-scorer.d.ts.map +1 -0
- package/dist/quality/quality-scorer.js +308 -0
- package/dist/quality/quality-scorer.js.map +1 -0
- package/dist/relation/index.d.ts +6 -0
- package/dist/relation/index.d.ts.map +1 -0
- package/dist/relation/index.js +5 -0
- package/dist/relation/index.js.map +1 -0
- package/dist/relation/relation-analyzer.d.ts +80 -0
- package/dist/relation/relation-analyzer.d.ts.map +1 -0
- package/dist/relation/relation-analyzer.js +192 -0
- package/dist/relation/relation-analyzer.js.map +1 -0
- package/dist/structure/index.d.ts +5 -0
- package/dist/structure/index.d.ts.map +1 -0
- package/dist/structure/index.js +5 -0
- package/dist/structure/index.js.map +1 -0
- package/dist/structure/structure-analyzer.d.ts +108 -0
- package/dist/structure/structure-analyzer.d.ts.map +1 -0
- package/dist/structure/structure-analyzer.js +248 -0
- package/dist/structure/structure-analyzer.js.map +1 -0
- package/dist/text/index.d.ts +5 -0
- package/dist/text/index.d.ts.map +1 -0
- package/dist/text/index.js +5 -0
- package/dist/text/index.js.map +1 -0
- package/dist/text/text-analyzer.d.ts +56 -0
- package/dist/text/text-analyzer.d.ts.map +1 -0
- package/dist/text/text-analyzer.js +281 -0
- package/dist/text/text-analyzer.js.map +1 -0
- package/dist/topic/index.d.ts +6 -0
- package/dist/topic/index.d.ts.map +1 -0
- package/dist/topic/index.js +5 -0
- package/dist/topic/index.js.map +1 -0
- package/dist/topic/topic-modeler.d.ts +79 -0
- package/dist/topic/topic-modeler.d.ts.map +1 -0
- package/dist/topic/topic-modeler.js +267 -0
- package/dist/topic/topic-modeler.js.map +1 -0
- package/dist/types.d.ts +74 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +7 -0
- package/dist/types.js.map +1 -0
- package/package.json +39 -0
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TextAnalyzer - テキスト分析
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-ANALYZE-001, REQ-ANALYZE-006
|
|
5
|
+
* @design DES-KATASHIRO-001 §2.2 Analyzer Container
|
|
6
|
+
* @task TSK-020
|
|
7
|
+
*/
|
|
8
|
+
import { ok, err, generateId, formatTimestamp, } from '@nahisaho/katashiro-core';
|
|
9
|
+
/**
 * English stop words.
 *
 * Tokens found in this set are ignored by keyword extraction. Lookups are
 * done on lower-cased tokens, so every entry is lower case.
 */
const STOP_WORDS = new Set([
    // articles, conjunctions, prepositions
    'a', 'an', 'the', 'and', 'or', 'but',
    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'as',
    // auxiliary and modal verbs
    'is', 'was', 'are', 'were', 'been', 'be',
    'have', 'has', 'had', 'do', 'does', 'did',
    'will', 'would', 'could', 'should', 'may', 'might', 'must', 'shall',
    'can', 'need', 'dare', 'ought', 'used',
    // pronouns and determiners
    'it', 'its', 'this', 'that', 'these', 'those',
    'i', 'you', 'he', 'she', 'we', 'they',
    // interrogatives
    'what', 'which', 'who', 'whom', 'when', 'where', 'why', 'how',
    // quantifiers
    'all', 'each', 'every', 'both', 'few', 'more', 'most', 'other', 'some', 'such',
    // negations, intensifiers and other function words
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    'just', 'also', 'now', 'over', 'into', 'about',
]);
|
|
23
|
+
/**
 * Positive sentiment lexicon.
 *
 * Each lower-cased token found in this set counts as one positive hit
 * during sentiment scoring.
 */
const POSITIVE_WORDS = new Set([
    'good', 'great', 'excellent', 'amazing', 'wonderful', 'fantastic',
    'love', 'best', 'happy', 'joy', 'beautiful', 'perfect',
    'awesome', 'brilliant', 'outstanding', 'superb', 'magnificent', 'delightful',
    'pleasant', 'nice', 'positive', 'success', 'successful', 'win',
    'winning', 'benefit', 'improve', 'improvement', 'helpful', 'useful',
    'valuable', 'impressive', 'incredible',
]);
|
|
33
|
+
/**
 * Negative sentiment lexicon.
 *
 * Each lower-cased token found in this set counts as one negative hit
 * during sentiment scoring. Entries are unique; the list previously carried
 * 'terrible' twice, which a Set silently collapses.
 */
const NEGATIVE_WORDS = new Set([
    'bad', 'terrible', 'awful', 'horrible', 'worst', 'hate', 'disappointed',
    'disappointing', 'frustrating', 'frustrated', 'angry', 'sad', 'poor',
    'failure', 'fail', 'failed', 'problem', 'issue', 'wrong', 'error',
    'mistake', 'difficult', 'hard', 'impossible', 'negative', 'loss', 'lose',
    'losing', 'damage', 'harmful', 'dangerous', 'risk', 'threat', 'concern',
    'worried', 'worry', 'fear', 'scary', 'dreadful', 'miserable',
]);
|
|
44
|
+
/**
 * Text analysis implementation: extractive summarization, lexicon-based
 * sentiment scoring and frequency-based keyword extraction.
 */
export class TextAnalyzer {
    /**
     * Summarize a piece of content.
     *
     * @param {{ body?: string }} content - Content whose `body` is analyzed.
     * @param {number} [maxLength] - Character budget for the summary text;
     *   defaults to 200 when null or undefined.
     * @returns {Promise<*>} `err(new Error('Content text is empty'))` when the
     *   trimmed body is empty; otherwise `ok(summary)` where `summary` carries
     *   `id`, `text`, `keyPoints`, `wordCount` and `createdAt`.
     */
    async summarize(content, maxLength) {
        const text = content.body?.trim() ?? '';
        if (!text) {
            return err(new Error('Content text is empty'));
        }
        const sentences = this.splitSentences(text);
        const keyPoints = this.extractKeyPoints(sentences);
        const summaryText = this.generateSummary(sentences, maxLength ?? 200);
        return ok({
            id: generateId('SUM'),
            text: summaryText,
            keyPoints,
            wordCount: this.countWords(text),
            createdAt: formatTimestamp(),
        });
    }

    /**
     * Sentiment analysis entry point (the original comment describes it as the
     * ISentimentAnalyzer interface implementation); delegates to
     * {@link TextAnalyzer#analyzeSentiment}.
     *
     * @param {{ body?: string }} content - Content whose `body` is analyzed.
     * @returns {Promise<*>} Whatever `analyzeSentiment` resolves to.
     */
    async analyze(content) {
        return this.analyzeSentiment(content);
    }

    /**
     * Score the overall sentiment of a piece of content.
     *
     * The score is `(positive - negative) / (positive + negative)` over lexicon
     * hits, so it lies in [-1, 1]; it is 0 when no lexicon word occurs.
     * Confidence grows linearly with the number of hits and saturates at 10.
     *
     * @param {{ body?: string }} content - Content whose `body` is analyzed.
     * @returns {Promise<*>} `err(new Error('Content text is empty'))` when the
     *   trimmed body is empty; otherwise `ok({ sentiment, score, confidence, aspects })`
     *   with `sentiment` one of 'positive', 'negative', 'mixed' or 'neutral'.
     */
    async analyzeSentiment(content) {
        const text = content.body?.trim() ?? '';
        if (!text) {
            return err(new Error('Content text is empty'));
        }
        const { positiveCount, negativeCount } = this.countSentimentWords(
            this.tokenize(text.toLowerCase()),
        );
        const total = positiveCount + negativeCount;
        let score = 0;
        let sentiment = 'neutral';
        if (total > 0) {
            score = (positiveCount - negativeCount) / total;
            if (score > 0.2) {
                sentiment = 'positive';
            } else if (score < -0.2) {
                sentiment = 'negative';
            } else if (positiveCount > 0 && negativeCount > 0) {
                // Near-balanced score with hits on both sides.
                sentiment = 'mixed';
            }
        }
        // Confidence based on total sentiment words found.
        const confidence = Math.min(total / 10, 1);
        return ok({
            sentiment,
            score,
            confidence,
            aspects: this.extractAspectSentiments(text),
        });
    }

    /**
     * Extract the most frequent non-trivial words of a text.
     *
     * Tokens shorter than three characters and stop words are ignored. Ties
     * keep first-occurrence order because the sort is stable.
     *
     * @param {string} text - Text to analyze (lower-cased internally).
     * @param {number} [maxKeywords=10] - Maximum number of keywords returned.
     * @returns {string[]} Keywords ordered by descending frequency.
     */
    extractKeywords(text, maxKeywords = 10) {
        const frequencies = new Map();
        for (const word of this.tokenize(text.toLowerCase())) {
            if (word.length < 3 || STOP_WORDS.has(word)) {
                continue;
            }
            frequencies.set(word, (frequencies.get(word) ?? 0) + 1);
        }
        return [...frequencies.entries()]
            .sort((a, b) => b[1] - a[1])
            .slice(0, maxKeywords)
            .map(([word]) => word);
    }

    /**
     * Count whitespace-separated words.
     *
     * @param {string} text - Text to count.
     * @returns {number} Number of non-empty whitespace-delimited chunks.
     */
    countWords(text) {
        return text.trim().split(/\s+/).filter((chunk) => chunk.length > 0).length;
    }

    /**
     * Split text into sentences.
     *
     * A sentence ends at '.', '!' or '?' followed by whitespace, or at a line
     * break. This is a plain punctuation split: abbreviations such as "e.g. "
     * are NOT special-cased and will end a sentence.
     *
     * @param {string} text - Text to split.
     * @returns {string[]} Trimmed, non-empty sentences in document order.
     */
    splitSentences(text) {
        return text
            .replace(/([.!?])\s+/g, '$1\n')
            .replace(/\n+/g, '\n')
            .split('\n')
            .map((sentence) => sentence.trim())
            .filter((sentence) => sentence.length > 0);
    }

    /**
     * Split text into word tokens.
     *
     * Everything that is not a Unicode letter, combining mark, digit,
     * underscore or whitespace is treated as a separator. The previous
     * ASCII-only `\w` class discarded all non-ASCII letters, so "café" became
     * "caf" and Japanese text produced no tokens at all.
     *
     * @param {string} text - Text to tokenize; case is left untouched.
     * @returns {string[]} Non-empty tokens in document order.
     */
    tokenize(text) {
        return text
            .replace(/[^\p{L}\p{M}\p{N}_\s]/gu, ' ')
            .split(/\s+/)
            .filter((token) => token.length > 0);
    }

    /**
     * Count lexicon hits in a token list.
     *
     * @param {string[]} words - Lower-cased tokens.
     * @returns {{ positiveCount: number, negativeCount: number }} Number of
     *   tokens found in the positive and negative word sets.
     */
    countSentimentWords(words) {
        let positiveCount = 0;
        let negativeCount = 0;
        for (const word of words) {
            if (POSITIVE_WORDS.has(word)) {
                positiveCount++;
            }
            if (NEGATIVE_WORDS.has(word)) {
                negativeCount++;
            }
        }
        return { positiveCount, negativeCount };
    }

    /**
     * Pick up to five key-point sentences.
     *
     * Sentences are scored by indicator substrings, numbered-item and colon
     * cues, and a length penalty. Sentences scoring zero or less are dropped
     * unless the input has five sentences or fewer.
     *
     * @param {string[]} sentences - Candidate sentences.
     * @returns {string[]} At most five sentences, highest score first.
     */
    extractKeyPoints(sentences) {
        // Substring cues for important sentences and their weights.
        const indicators = [
            ['important', 2],
            ['key', 2],
            ['main', 2],
            ['significant', 2],
            ['conclusion', 3],
            ['result', 2],
            ['finding', 2],
        ];
        const scored = sentences.map((sentence) => {
            const lower = sentence.toLowerCase();
            let score = 0;
            for (const [cue, weight] of indicators) {
                if (lower.includes(cue)) {
                    score += weight;
                }
            }
            // Numbered items ("1. ...") and sentences with a colon get a small bonus.
            if (/^\d+\.\s/.test(sentence)) {
                score += 1;
            }
            if (sentence.includes(':')) {
                score += 1;
            }
            // Penalize very short or very long sentences.
            const wordCount = this.countWords(sentence);
            if (wordCount < 5) {
                score -= 2;
            }
            if (wordCount > 40) {
                score -= 1;
            }
            return { sentence, score };
        });
        const keepAll = scored.length <= 5;
        return scored
            .filter((entry) => keepAll || entry.score > 0)
            .sort((a, b) => b.score - a.score)
            .slice(0, 5)
            .map((entry) => entry.sentence);
    }

    /**
     * Build an extractive summary within a character budget.
     *
     * Sentences are ranked by position (the first sentences score higher),
     * length (10-25 words preferred) and keyword count, then appended in rank
     * order, separated by single spaces, until the next one would push the
     * summary past `maxLength`. The separator is included in that check, so
     * the result never exceeds `maxLength` unless the top-ranked sentence
     * alone is longer; that sentence is always kept so the summary is
     * non-empty.
     *
     * @param {string[]} sentences - Sentences in document order.
     * @param {number} maxLength - Maximum summary length in characters.
     * @returns {string} Summary text, or '' when `sentences` is empty.
     */
    generateSummary(sentences, maxLength) {
        if (sentences.length === 0) {
            return '';
        }
        const ranked = sentences
            .map((sentence, index) => {
                let score = 0;
                // Position score: first sentences often contain key info.
                if (index === 0) {
                    score += 3;
                }
                if (index < 3) {
                    score += 1;
                }
                // Length score: medium-length sentences tend to summarize better.
                const wordCount = this.countWords(sentence);
                if (wordCount >= 10 && wordCount <= 25) {
                    score += 2;
                }
                // Keyword density.
                score += this.extractKeywords(sentence, 3).length * 0.5;
                return { sentence, score };
            })
            .sort((a, b) => b.score - a.score);
        let summary = '';
        for (const { sentence } of ranked) {
            const separator = summary ? ' ' : '';
            const nextLength = summary.length + separator.length + sentence.length;
            // Stop at the first sentence that no longer fits, but never leave the summary empty.
            if (summary.length > 0 && nextLength > maxLength) {
                break;
            }
            summary += separator + sentence;
        }
        return summary || sentences[0] || '';
    }

    /**
     * Extract per-aspect sentiment.
     *
     * For each aspect pattern and each sentence matching it, the sentence is
     * scored with the same lexicon ratio as the overall sentiment. Sentences
     * without any lexicon hit are skipped, and an aspect yields one entry per
     * matching sentence.
     *
     * @param {string} text - Text to analyze.
     * @returns {Array<{ aspect: string, sentiment: string, score: number }>}
     *   Entries grouped by aspect, sentences in document order within each.
     */
    extractAspectSentiments(text) {
        const aspectPatterns = [
            { pattern: /quality/i, aspect: 'quality' },
            { pattern: /price|cost/i, aspect: 'price' },
            { pattern: /service/i, aspect: 'service' },
            { pattern: /design|look/i, aspect: 'design' },
            { pattern: /performance|speed/i, aspect: 'performance' },
        ];
        const sentences = this.splitSentences(text);
        const aspects = [];
        for (const { pattern, aspect } of aspectPatterns) {
            for (const sentence of sentences) {
                if (!pattern.test(sentence)) {
                    continue;
                }
                const { positiveCount, negativeCount } = this.countSentimentWords(
                    this.tokenize(sentence.toLowerCase()),
                );
                const total = positiveCount + negativeCount;
                if (total === 0) {
                    continue;
                }
                const score = (positiveCount - negativeCount) / total;
                let sentiment = 'neutral';
                if (score > 0.1) {
                    sentiment = 'positive';
                } else if (score < -0.1) {
                    sentiment = 'negative';
                }
                aspects.push({ aspect, sentiment, score });
            }
        }
        return aspects;
    }
}
|
|
281
|
+
//# sourceMappingURL=text-analyzer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text-analyzer.js","sourceRoot":"","sources":["../../src/text/text-analyzer.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAEL,EAAE,EACF,GAAG,EACH,UAAU,EACV,eAAe,GAEhB,MAAM,0BAA0B,CAAC;AAIlC;;GAEG;AACH,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC;IACzB,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK;IACnE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM;IACpE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO;IACzE,QAAQ,EAAE,KAAK,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO;IACzE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,GAAG,EAAE,KAAK,EAAE,IAAI;IACvE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO;IACpE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM;IACnE,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI;IACxE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO;CACtE,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC;IAC7B,MAAM,EAAE,OAAO,EAAE,WAAW,EAAE,SAAS,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM;IACzE,MAAM,EAAE,OAAO,EAAE,KAAK,EAAE,WAAW,EAAE,SAAS,EAAE,SAAS,EAAE,WAAW;IACtE,aAAa,EAAE,QAAQ,EAAE,aAAa,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM;IACxE,UAAU,EAAE,SAAS,EAAE,YAAY,EAAE,KAAK,EAAE,SAAS,EAAE,SAAS,EAAE,SAAS;IAC3E,aAAa,EAAE,SAAS,EAAE,QAAQ,EAAE,UAAU,EAAE,YAAY,EAAE,YAAY;CAC3E,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC;IAC7B,KAAK,EAAE,UAAU,EAAE,OAAO,EAAE,UAAU,EAAE,OAAO,EAAE,MAAM,EAAE,cAAc;IACvE,eAAe,EAAE,aAAa,EAAE,YAAY,EAAE,OAAO,EAAE,KAAK,EAAE,MAAM;IACpE,SAAS,EAAE,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,EAAE,OAAO;IACjE,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM;IACxE,QAAQ,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,QAAQ,EAAE,SAAS;IACvE,SAAS,EAAE,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,UA
AU,EAAE,WAAW;CACzE,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,OAAO,YAAY;IACvB;;OAEG;IACH,KAAK,CAAC,SAAS,CACb,OAAgB,EAChB,SAAkB;QAElB,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAExC,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC,CAAC;QACjD,CAAC;QAED,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;QAC5C,MAAM,SAAS,GAAG,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,CAAC;QACnD,MAAM,WAAW,GAAG,IAAI,CAAC,eAAe,CAAC,SAAS,EAAE,SAAS,IAAI,GAAG,CAAC,CAAC;QAEtE,MAAM,OAAO,GAAY;YACvB,EAAE,EAAE,UAAU,CAAC,KAAK,CAAC;YACrB,IAAI,EAAE,WAAW;YACjB,SAAS;YACT,SAAS,EAAE,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC;YAChC,SAAS,EAAE,eAAe,EAAE;SAC7B,CAAC;QAEF,OAAO,EAAE,CAAC,OAAO,CAAC,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CAAC,OAAgB;QAC5B,OAAO,IAAI,CAAC,gBAAgB,CAAC,OAAO,CAAC,CAAC;IACxC,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,gBAAgB,CAAC,OAAgB;QACrC,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QAExC,IAAI,CAAC,IAAI,EAAE,CAAC;YACV,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,uBAAuB,CAAC,CAAC,CAAC;QACjD,CAAC;QAED,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC;QAChD,IAAI,aAAa,GAAG,CAAC,CAAC;QACtB,IAAI,aAAa,GAAG,CAAC,CAAC;QAEtB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,aAAa,EAAE,CAAC;YAC9C,IAAI,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,aAAa,EAAE,CAAC;QAChD,CAAC;QAED,MAAM,KAAK,GAAG,aAAa,GAAG,aAAa,CAAC;QAC5C,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,SAAS,GAAkD,SAAS,CAAC;QAEzE,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACd,KAAK,GAAG,CAAC,aAAa,GAAG,aAAa,CAAC,GAAG,KAAK,CAAC;YAEhD,IAAI,KAAK,GAAG,GAAG,EAAE,CAAC;gBAChB,SAAS,GAAG,UAAU,CAAC;YACzB,CAAC;iBAAM,IAAI,KAAK,GAAG,CAAC,GAAG,EAAE,CAAC;gBACxB,SAAS,GAAG,UAAU,CAAC;YACzB,CAAC;iBAAM,IAAI,aAAa,GAAG,CAAC,IAAI,aAAa,GAAG,CAAC,EAAE,CAAC;gBAClD,SAAS,GAAG,OAAO,CAAC;YACtB,CAAC;QACH,CAAC;QAED,kDAAkD;QAClD,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC;QAE3C,MAAM,MAAM,GAAoB;YAC9B,SAAS;YACT,KAAK;YACL,UAAU;YACV,OAAO,EAAE,IAAI,CAAC,uBAAuB,CAAC,IAAI,CAAC;SAC5C,CAAC;QAEF,OAAO,EAAE,CAAC,MAAM,CAAC,CAAC;IACpB,CAAC;I
AED;;OAEG;IACH,eAAe,CAAC,IAAY,EAAE,cAAsB,EAAE;QACpD,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC;QAChD,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAkB,CAAC;QAE3C,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,IAAI,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,SAAS;YACtD,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QACpD,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE,CAAC;aAClC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;aAC3B,KAAK,CAAC,CAAC,EAAE,WAAW,CAAC;aACrB,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,IAAY;QACrB,MAAM,KAAK,GAAG,IAAI,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACjE,OAAO,KAAK,CAAC,MAAM,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,cAAc,CAAC,IAAY;QACzB,2DAA2D;QAC3D,MAAM,UAAU,GAAG,IAAI;aACpB,OAAO,CAAC,aAAa,EAAE,MAAM,CAAC;aAC9B,OAAO,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;QAEzB,OAAO,UAAU;aACd,KAAK,CAAC,IAAI,CAAC;aACX,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACK,QAAQ,CAAC,IAAY;QAC3B,OAAO,IAAI;aACR,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;aACxB,KAAK,CAAC,KAAK,CAAC;aACZ,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACrC,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,SAAmB;QAC1C,2CAA2C;QAC3C,MAAM,eAAe,GAAG,SAAS,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE;YAC/C,IAAI,KAAK,GAAG,CAAC,CAAC;YACd,MAAM,KAAK,GAAG,QAAQ,CAAC,WAAW,EAAE,CAAC;YAErC,oCAAoC;YACpC,IAAI,KAAK,CAAC,QAAQ,CAAC,WAAW,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YAC5C,IAAI,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YACtC,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YACvC,IAAI,KAAK,CAAC,QAAQ,CAAC,aAAa,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YAC9C,IAAI,KAAK,CAAC,QAAQ,CAAC,YAAY,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YAC7C,IAAI,KAAK,CAAC,QAAQ,CAAC,QAAQ,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YACzC,IAAI
,KAAK,CAAC,QAAQ,CAAC,SAAS,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YAC1C,IAAI,UAAU,CAAC,IAAI,CAAC,QAAQ,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC,CAAC,iBAAiB;YAC5D,IAAI,QAAQ,CAAC,QAAQ,CAAC,GAAG,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YAEvC,6CAA6C;YAC7C,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;YAC5C,IAAI,SAAS,GAAG,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YAC9B,IAAI,SAAS,GAAG,EAAE;gBAAE,KAAK,IAAI,CAAC,CAAC;YAE/B,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,CAAC;QAC7B,CAAC,CAAC,CAAC;QAEH,kCAAkC;QAClC,OAAO,eAAe;aACnB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,IAAI,eAAe,CAAC,MAAM,IAAI,CAAC,CAAC;aACvD,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC;aACjC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC;aACX,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC;IAC1B,CAAC;IAED;;OAEG;IACK,eAAe,CAAC,SAAmB,EAAE,SAAiB;QAC5D,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,CAAC;QAEtC,+DAA+D;QAC/D,MAAM,eAAe,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,KAAK,EAAE,EAAE;YACxD,IAAI,KAAK,GAAG,CAAC,CAAC;YAEd,0DAA0D;YAC1D,IAAI,KAAK,KAAK,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YAC5B,IAAI,KAAK,GAAG,CAAC;gBAAE,KAAK,IAAI,CAAC,CAAC;YAE1B,oEAAoE;YACpE,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;YAC5C,IAAI,SAAS,IAAI,EAAE,IAAI,SAAS,IAAI,EAAE;gBAAE,KAAK,IAAI,CAAC,CAAC;YAEnD,kBAAkB;YAClB,MAAM,QAAQ,GAAG,IAAI,CAAC,eAAe,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;YACnD,KAAK,IAAI,QAAQ,CAAC,MAAM,GAAG,GAAG,CAAC;YAE/B,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;QACpC,CAAC,CAAC,CAAC;QAEH,kCAAkC;QAClC,MAAM,eAAe,GAAG,eAAe;aACpC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QAErC,IAAI,OAAO,GAAG,EAAE,CAAC;QACjB,MAAM,WAAW,GAAG,IAAI,GAAG,EAAU,CAAC;QAEtC,KAAK,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,IAAI,eAAe,EAAE,CAAC;YAClD,IAAI,OAAO,CAAC,MAAM,GAAG,QAAQ,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;gBACjD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;oBAAE,MAAM;YAChC,CAAC;YAED,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;gBAC5B,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;gBACvB,OAAO,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,C
AAC;YAC7C,CAAC;QACH,CAAC;QAED,OAAO,OAAO,IAAI,SAAS,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;IACvC,CAAC;IAED;;OAEG;IACK,uBAAuB,CAAC,IAAY;QAC1C,oDAAoD;QACpD,MAAM,OAAO,GAAsB,EAAE,CAAC;QACtC,MAAM,SAAS,GAAG,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC;QAE5C,MAAM,cAAc,GAAG;YACrB,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE;YAC1C,EAAE,OAAO,EAAE,aAAa,EAAE,MAAM,EAAE,OAAO,EAAE;YAC3C,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,EAAE,SAAS,EAAE;YAC1C,EAAE,OAAO,EAAE,cAAc,EAAE,MAAM,EAAE,QAAQ,EAAE;YAC7C,EAAE,OAAO,EAAE,oBAAoB,EAAE,MAAM,EAAE,aAAa,EAAE;SACzD,CAAC;QAEF,KAAK,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,cAAc,EAAE,CAAC;YACjD,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;gBACjC,IAAI,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE,CAAC;oBAC3B,MAAM,KAAK,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,WAAW,EAAE,CAAC,CAAC;oBACpD,IAAI,aAAa,GAAG,CAAC,CAAC;oBACtB,IAAI,aAAa,GAAG,CAAC,CAAC;oBAEtB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;wBACzB,IAAI,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC;4BAAE,aAAa,EAAE,CAAC;wBAC9C,IAAI,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC;4BAAE,aAAa,EAAE,CAAC;oBAChD,CAAC;oBAED,MAAM,KAAK,GAAG,aAAa,GAAG,aAAa,CAAC;oBAC5C,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;wBACd,MAAM,KAAK,GAAG,CAAC,aAAa,GAAG,aAAa,CAAC,GAAG,KAAK,CAAC;wBACtD,MAAM,SAAS,GAAG,KAAK,GAAG,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,SAAS,CAAC;wBAEnF,OAAO,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,CAAC,CAAC;oBAC7C,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/topic/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EACL,YAAY,GACb,MAAM,oBAAoB,CAAC;AAE5B,YAAY,EACV,KAAK,EACL,iBAAiB,EACjB,eAAe,EACf,eAAe,GAChB,MAAM,oBAAoB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/topic/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EACL,YAAY,GACb,MAAM,oBAAoB,CAAC"}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TopicModeler - トピックモデリング
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-ANALYZE-010
|
|
5
|
+
* @design DES-KATASHIRO-001 §2.2 Analyzer Container
|
|
6
|
+
* @task TSK-024
|
|
7
|
+
*/
|
|
8
|
+
/**
 * A topic extracted from a document.
 */
export interface Topic {
    /** Topic identifier. */
    readonly id: string;
    /** Display name of the topic. */
    readonly name: string;
    /** Terms grouped under this topic. */
    readonly keywords: Array<string>;
    /** Confidence value attached to the topic. */
    readonly confidence: number;
}
|
|
17
|
+
/**
 * Topic distribution: a list of topics, each paired with a weight.
 */
export interface TopicDistribution {
    /** Weighted topics. */
    readonly topics: {
        /** The topic itself. */
        readonly topic: Topic;
        /** Weight assigned to the topic. */
        readonly weight: number;
    }[];
}
|
|
26
|
+
/**
 * A similar document returned by a similarity search.
 */
export interface SimilarDocument {
    /** Index of the document. */
    readonly index: number;
    /** Text of the document. */
    readonly text: string;
    /** Similarity value for the document. */
    readonly similarity: number;
}
|
|
34
|
+
/**
 * A cluster of documents.
 */
export interface DocumentCluster {
    /** Numeric cluster identifier. */
    readonly id: number;
    /** Documents assigned to this cluster. */
    readonly documents: Array<string>;
    /** Terms representing the cluster. */
    readonly centroid: Array<string>;
}
|
|
42
|
+
/**
 * Topic modeling implementation.
 */
export declare class TopicModeler {
    /**
     * Extract topics from a document.
     *
     * @param text - Document text.
     * @param numTopics - Optional number of topics to extract.
     * @returns The extracted topics.
     */
    extractTopics(text: string, numTopics?: number): Array<Topic>;
    /**
     * Compute TF-IDF scores over a set of documents.
     *
     * @param documents - Documents to score.
     * @returns A record mapping each term to a numeric score.
     */
    calculateTfIdf(documents: Array<string>): Record<string, number>;
    /**
     * Cluster documents.
     *
     * @param documents - Documents to cluster.
     * @param numClusters - Number of clusters requested.
     * @returns The resulting clusters.
     */
    clusterDocuments(documents: Array<string>, numClusters: number): Array<DocumentCluster>;
    /**
     * Search a corpus for documents similar to a query.
     *
     * @param query - Query text.
     * @param corpus - Documents to search.
     * @param topK - Optional limit on the number of results.
     * @returns The matching documents.
     */
    findSimilarDocuments(query: string, corpus: Array<string>, topK?: number): Array<SimilarDocument>;
    /**
     * Get the topic distribution of a text.
     *
     * @param text - Text to analyze.
     * @returns The topic distribution.
     */
    getTopicDistribution(text: string): TopicDistribution;
    /**
     * Tokenize text.
     */
    private tokenize;
    /**
     * Compute term frequencies.
     */
    private calculateTermFrequency;
    /**
     * Compute Jaccard similarity.
     */
    private jaccardSimilarity;
}
|
|
79
|
+
//# sourceMappingURL=topic-modeler.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"topic-modeler.d.ts","sourceRoot":"","sources":["../../src/topic/topic-modeler.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,CAAC;IAC5B,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,MAAM,EAAE,KAAK,CAAC;QACrB,QAAQ,CAAC,KAAK,EAAE,KAAK,CAAC;QACtB,QAAQ,CAAC,MAAM,EAAE,MAAM,CAAC;KACzB,CAAC,CAAC;CACJ;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,UAAU,EAAE,MAAM,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,EAAE,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,SAAS,EAAE,MAAM,EAAE,CAAC;IAC7B,QAAQ,CAAC,QAAQ,EAAE,MAAM,EAAE,CAAC;CAC7B;AAyBD;;GAEG;AACH,qBAAa,YAAY;IACvB;;OAEG;IACH,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,GAAE,MAAU,GAAG,KAAK,EAAE;IAqC3D;;OAEG;IACH,cAAc,CAAC,SAAS,EAAE,MAAM,EAAE,GAAG,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;IAwC3D;;OAEG;IACH,gBAAgB,CAAC,SAAS,EAAE,MAAM,EAAE,EAAE,WAAW,EAAE,MAAM,GAAG,eAAe,EAAE;IAgF7E;;OAEG;IACH,oBAAoB,CAClB,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,MAAM,EAAE,EAChB,IAAI,GAAE,MAAU,GACf,eAAe,EAAE;IA2BpB;;OAEG;IACH,oBAAoB,CAAC,IAAI,EAAE,MAAM,GAAG,iBAAiB;IAkBrD;;OAEG;IACH,OAAO,CAAC,QAAQ;IAgBhB;;OAEG;IACH,OAAO,CAAC,sBAAsB;IAQ9B;;OAEG;IACH,OAAO,CAAC,iBAAiB;CAc1B"}
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TopicModeler - トピックモデリング
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-ANALYZE-010
|
|
5
|
+
* @design DES-KATASHIRO-001 §2.2 Analyzer Container
|
|
6
|
+
* @task TSK-024
|
|
7
|
+
*/
|
|
8
|
+
/**
 * Japanese stop words (particles, auxiliaries and common function words).
 *
 * Entries are unique; the list previously carried 'です' twice, which a Set
 * silently collapses.
 */
const JAPANESE_STOPWORDS = new Set([
    'の', 'に', 'は', 'を', 'た', 'が', 'で', 'て', 'と', 'し', 'れ', 'さ',
    'ある', 'いる', 'も', 'な', 'する', 'から', 'こと', 'として', 'い', 'や',
    'など', 'なっ', 'ない', 'この', 'ため', 'その', 'あっ', 'よう', 'また',
    'もの', 'という', 'あり', 'まで', 'られ', 'なる', 'へ', 'か', 'だ', 'これ',
    'によって', 'により', 'おり', 'より', 'による', 'ず', 'なり', 'られる',
    'です', 'ます', 'した', 'して', 'ました',
]);
|
|
19
|
+
/**
 * English stop words.
 */
const ENGLISH_STOPWORDS = new Set([
    // articles and conjunctions
    'the', 'a', 'an', 'and', 'or', 'but',
    // prepositions
    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from', 'up',
    'about', 'into', 'through', 'during',
    // forms of "be" and "have"
    'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
    // auxiliaries and modals
    'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might',
    // demonstratives and pronouns
    'this', 'that', 'these', 'those', 'it', 'its',
]);
|
|
29
|
+
/**
|
|
30
|
+
* トピックモデリング実装
|
|
31
|
+
*/
|
|
32
|
+
export class TopicModeler {
|
|
33
|
+
/**
|
|
34
|
+
* 文書からトピックを抽出
|
|
35
|
+
*/
|
|
36
|
+
extractTopics(text, numTopics = 3) {
|
|
37
|
+
if (!text || text.trim().length === 0) {
|
|
38
|
+
return [];
|
|
39
|
+
}
|
|
40
|
+
// Tokenize and get term frequencies
|
|
41
|
+
const tokens = this.tokenize(text);
|
|
42
|
+
const termFreq = this.calculateTermFrequency(tokens);
|
|
43
|
+
// Sort by frequency and get top terms
|
|
44
|
+
const sortedTerms = Array.from(termFreq.entries())
|
|
45
|
+
.sort((a, b) => b[1] - a[1])
|
|
46
|
+
.slice(0, numTopics * 5);
|
|
47
|
+
// Group into topics
|
|
48
|
+
const topics = [];
|
|
49
|
+
const termsPerTopic = Math.ceil(sortedTerms.length / numTopics);
|
|
50
|
+
for (let i = 0; i < numTopics && i * termsPerTopic < sortedTerms.length; i++) {
|
|
51
|
+
const topicTerms = sortedTerms.slice(i * termsPerTopic, (i + 1) * termsPerTopic);
|
|
52
|
+
if (topicTerms.length === 0)
|
|
53
|
+
continue;
|
|
54
|
+
const keywords = topicTerms.map(t => t[0]);
|
|
55
|
+
const avgFreq = topicTerms.reduce((sum, t) => sum + t[1], 0) / topicTerms.length;
|
|
56
|
+
const maxFreq = Math.max(...Array.from(termFreq.values()));
|
|
57
|
+
topics.push({
|
|
58
|
+
id: `topic-${i + 1}`,
|
|
59
|
+
name: keywords[0] ?? `Topic ${i + 1}`,
|
|
60
|
+
keywords,
|
|
61
|
+
confidence: maxFreq > 0 ? Math.min(avgFreq / maxFreq + 0.3, 1) : 0.5,
|
|
62
|
+
});
|
|
63
|
+
}
|
|
64
|
+
return topics;
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* TF-IDF計算
|
|
68
|
+
*/
|
|
69
|
+
calculateTfIdf(documents) {
|
|
70
|
+
if (documents.length === 0) {
|
|
71
|
+
return {};
|
|
72
|
+
}
|
|
73
|
+
// Tokenize all documents
|
|
74
|
+
const tokenizedDocs = documents.map(doc => this.tokenize(doc));
|
|
75
|
+
// Calculate document frequency
|
|
76
|
+
const docFreq = new Map();
|
|
77
|
+
for (const tokens of tokenizedDocs) {
|
|
78
|
+
const uniqueTokens = new Set(tokens);
|
|
79
|
+
for (const token of uniqueTokens) {
|
|
80
|
+
docFreq.set(token, (docFreq.get(token) ?? 0) + 1);
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
// Calculate TF-IDF for each term
|
|
84
|
+
const tfidf = {};
|
|
85
|
+
const numDocs = documents.length;
|
|
86
|
+
for (const tokens of tokenizedDocs) {
|
|
87
|
+
const termFreq = this.calculateTermFrequency(tokens);
|
|
88
|
+
const maxFreq = Math.max(...Array.from(termFreq.values()), 1);
|
|
89
|
+
for (const [term, freq] of termFreq) {
|
|
90
|
+
const tf = freq / maxFreq;
|
|
91
|
+
const df = docFreq.get(term) ?? 1;
|
|
92
|
+
const idf = Math.log((numDocs + 1) / (df + 1)) + 1;
|
|
93
|
+
const score = tf * idf;
|
|
94
|
+
if (!tfidf[term] || score > tfidf[term]) {
|
|
95
|
+
tfidf[term] = score;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
return tfidf;
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* 文書クラスタリング
|
|
103
|
+
*/
|
|
104
|
+
clusterDocuments(documents, numClusters) {
|
|
105
|
+
if (documents.length === 0) {
|
|
106
|
+
return [];
|
|
107
|
+
}
|
|
108
|
+
if (documents.length <= numClusters) {
|
|
109
|
+
return documents.map((doc, i) => ({
|
|
110
|
+
id: i,
|
|
111
|
+
documents: [doc],
|
|
112
|
+
centroid: this.tokenize(doc).slice(0, 5),
|
|
113
|
+
}));
|
|
114
|
+
}
|
|
115
|
+
// Simple clustering based on term overlap
|
|
116
|
+
const tokenizedDocs = documents.map(doc => new Set(this.tokenize(doc)));
|
|
117
|
+
const clusters = [];
|
|
118
|
+
const assigned = new Set();
|
|
119
|
+
for (let c = 0; c < numClusters && assigned.size < documents.length; c++) {
|
|
120
|
+
// Find unassigned document as seed
|
|
121
|
+
let seedIdx = -1;
|
|
122
|
+
for (let i = 0; i < documents.length; i++) {
|
|
123
|
+
if (!assigned.has(i)) {
|
|
124
|
+
seedIdx = i;
|
|
125
|
+
break;
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
if (seedIdx === -1)
|
|
129
|
+
break;
|
|
130
|
+
const clusterDocs = [];
|
|
131
|
+
const clusterIndices = [];
|
|
132
|
+
const seedTokens = tokenizedDocs[seedIdx];
|
|
133
|
+
if (!seedTokens)
|
|
134
|
+
continue;
|
|
135
|
+
// Find similar documents
|
|
136
|
+
for (let i = 0; i < documents.length; i++) {
|
|
137
|
+
if (assigned.has(i))
|
|
138
|
+
continue;
|
|
139
|
+
const docTokens = tokenizedDocs[i];
|
|
140
|
+
if (!docTokens)
|
|
141
|
+
continue;
|
|
142
|
+
const similarity = this.jaccardSimilarity(seedTokens, docTokens);
|
|
143
|
+
if (similarity > 0.1 || i === seedIdx) {
|
|
144
|
+
const doc = documents[i];
|
|
145
|
+
if (doc) {
|
|
146
|
+
clusterDocs.push(doc);
|
|
147
|
+
clusterIndices.push(i);
|
|
148
|
+
assigned.add(i);
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
if (clusterDocs.length > 0) {
|
|
153
|
+
// Calculate centroid (most common terms)
|
|
154
|
+
const termCounts = new Map();
|
|
155
|
+
for (const idx of clusterIndices) {
|
|
156
|
+
const tokens = tokenizedDocs[idx];
|
|
157
|
+
if (tokens) {
|
|
158
|
+
for (const token of tokens) {
|
|
159
|
+
termCounts.set(token, (termCounts.get(token) ?? 0) + 1);
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
const centroid = Array.from(termCounts.entries())
|
|
164
|
+
.sort((a, b) => b[1] - a[1])
|
|
165
|
+
.slice(0, 5)
|
|
166
|
+
.map(e => e[0]);
|
|
167
|
+
clusters.push({
|
|
168
|
+
id: c,
|
|
169
|
+
documents: clusterDocs,
|
|
170
|
+
centroid,
|
|
171
|
+
});
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
return clusters;
|
|
175
|
+
}
|
|
176
|
+
/**
|
|
177
|
+
* 類似文書検索
|
|
178
|
+
*/
|
|
179
|
+
findSimilarDocuments(query, corpus, topK = 5) {
|
|
180
|
+
if (corpus.length === 0) {
|
|
181
|
+
return [];
|
|
182
|
+
}
|
|
183
|
+
const queryTokens = new Set(this.tokenize(query));
|
|
184
|
+
const similarities = [];
|
|
185
|
+
for (let i = 0; i < corpus.length; i++) {
|
|
186
|
+
const doc = corpus[i];
|
|
187
|
+
if (!doc)
|
|
188
|
+
continue;
|
|
189
|
+
const docTokens = new Set(this.tokenize(doc));
|
|
190
|
+
const similarity = this.jaccardSimilarity(queryTokens, docTokens);
|
|
191
|
+
similarities.push({
|
|
192
|
+
index: i,
|
|
193
|
+
text: doc,
|
|
194
|
+
similarity,
|
|
195
|
+
});
|
|
196
|
+
}
|
|
197
|
+
return similarities
|
|
198
|
+
.sort((a, b) => b.similarity - a.similarity)
|
|
199
|
+
.slice(0, topK);
|
|
200
|
+
}
|
|
201
|
+
/**
|
|
202
|
+
* トピック分布取得
|
|
203
|
+
*/
|
|
204
|
+
getTopicDistribution(text) {
|
|
205
|
+
const topics = this.extractTopics(text, 3);
|
|
206
|
+
if (topics.length === 0) {
|
|
207
|
+
return { topics: [] };
|
|
208
|
+
}
|
|
209
|
+
// Normalize weights to sum to 1
|
|
210
|
+
const totalConfidence = topics.reduce((sum, t) => sum + t.confidence, 0);
|
|
211
|
+
return {
|
|
212
|
+
topics: topics.map(topic => ({
|
|
213
|
+
topic,
|
|
214
|
+
weight: totalConfidence > 0 ? topic.confidence / totalConfidence : 1 / topics.length,
|
|
215
|
+
})),
|
|
216
|
+
};
|
|
217
|
+
}
|
|
218
|
+
/**
|
|
219
|
+
* テキストをトークン化
|
|
220
|
+
*/
|
|
221
|
+
tokenize(text) {
|
|
222
|
+
// Simple tokenization: split by non-word characters
|
|
223
|
+
const tokens = text
|
|
224
|
+
.toLowerCase()
|
|
225
|
+
.split(/[\s\p{P}]+/u)
|
|
226
|
+
.filter(token => {
|
|
227
|
+
if (token.length < 2)
|
|
228
|
+
return false;
|
|
229
|
+
if (JAPANESE_STOPWORDS.has(token))
|
|
230
|
+
return false;
|
|
231
|
+
if (ENGLISH_STOPWORDS.has(token))
|
|
232
|
+
return false;
|
|
233
|
+
if (/^\d+$/.test(token))
|
|
234
|
+
return false;
|
|
235
|
+
return true;
|
|
236
|
+
});
|
|
237
|
+
return tokens;
|
|
238
|
+
}
|
|
239
|
+
/**
|
|
240
|
+
* 単語頻度計算
|
|
241
|
+
*/
|
|
242
|
+
calculateTermFrequency(tokens) {
|
|
243
|
+
const freq = new Map();
|
|
244
|
+
for (const token of tokens) {
|
|
245
|
+
freq.set(token, (freq.get(token) ?? 0) + 1);
|
|
246
|
+
}
|
|
247
|
+
return freq;
|
|
248
|
+
}
|
|
249
|
+
/**
|
|
250
|
+
* Jaccard類似度計算
|
|
251
|
+
*/
|
|
252
|
+
jaccardSimilarity(set1, set2) {
|
|
253
|
+
if (set1.size === 0 && set2.size === 0)
|
|
254
|
+
return 1;
|
|
255
|
+
if (set1.size === 0 || set2.size === 0)
|
|
256
|
+
return 0;
|
|
257
|
+
let intersection = 0;
|
|
258
|
+
for (const item of set1) {
|
|
259
|
+
if (set2.has(item)) {
|
|
260
|
+
intersection++;
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
const union = set1.size + set2.size - intersection;
|
|
264
|
+
return union > 0 ? intersection / union : 0;
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
//# sourceMappingURL=topic-modeler.js.map
|