kakaotalk-chat-analyzer 0.2.21 → 0.2.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,256 @@
1
+ import { FRAGMENT_TAIL_RE, KOREAN_CHAT_STOPWORDS, MORPHOLOGICAL_FRAGMENTS, REACTION_ONLY_RE, VERB_FRAGMENT_RE, } from "./korean-stopwords.js";
2
+ const URL_RE = /\bhttps?:\/\/[^\s<>"']+|www\.[^\s<>"']+/gi;
3
+ const EMAIL_RE = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi;
4
+ const PHONE_RE = /\b(?:\+?\d[\d\s().-]{7,}\d)\b/g;
5
+ const EMOJI_RE = /\p{Extended_Pictographic}/gu;
6
+ const HASHTAG_RE = /#([가-힣A-Za-z][가-힣A-Za-z0-9_]{1,22})/g;
7
+ /** 자모 축약어 (SLANG_CANON 키와 동기화) */
8
+ const JAMO_SLANG_RE = /(?:ㄹㅇ|ㅇㅈ|ㅇㅋ|ㄱㅅ|ㄳ|ㅈㅅ|ㅊㅋ|ㅊㅊ|ㄴㄴ|ㅁㅊ|ㅂㅃ|ㅃㅃ)/g;
9
+ /** 구두점·공백·괄호 등으로 1차 분할 */
10
+ const SEGMENT_SPLIT_RE = /[\s,.!?…~·|/\\;:()[\]{}<>"'`「」『』【】\u2014\u2013]+/u;
11
+ const HANGUL_RUN_RE = /[\uAC00-\uD7A3]+/g;
12
+ const HANGUL_CHAR_RE = /[\uAC00-\uD7A3]/;
13
+ const LATIN_TOKEN_RE = /[A-Za-z][A-Za-z0-9_+-]{1,}/g;
14
+ /** 긴 붙여쓰기에서 조사 경계로 추가 분할 (보수적) */
15
+ const PARTICLE_BOUNDARY_RE = /(?<=[\uAC00-\uD7A3]{2,})(?:에서|으로|부터|까지|에게|한테|께서|라며|잖아|거든|습니다|했어요|해요|어요|네요|습니다|이에요|예요)(?=[\uAC00-\uD7A3])|(?<=[\uAC00-\uD7A3]{2,})(?:은|는|이|가|을|를|의|에|와|과|도|만|며|고|서|면|니|다|게|지|네|요|죠|함|라)(?=[\uAC00-\uD7A3]|$)/gu;
16
+ const TRAILING_PARTICLE_RE = /(?:에서|으로|부터|까지|에게|한테|께서|습니다|했어요|해요|어요|네요|이에요|예요|거든요|잖아요|은|는|이|가|을|를|의|에|와|과|도|만|며|고|서|면|니|다|게|지|네|요|죠|함|라)$/u;
17
/**
 * Chat abbreviation → unified spelling used as the keyword label.
 * Built from "label → accepted spellings" groups; every spelling becomes a key
 * that maps to its group's label.
 */
const SLANG_CANON = new Map();
for (const [label, spellings] of Object.entries({
    "리얼": ["ㄹㅇ", "레알", "리얼"],
    "인정": ["ㅇㅈ"],
    "오케이": ["ㅇㅋ", "오키", "ㅇㅋㅇㅋ"],
    "감사": ["ㄱㅅ", "ㄳ"],
    "죄송": ["ㅈㅅ"],
    "축하": ["ㅊㅋ", "ㅊㅊ"],
    "노노": ["ㄴㄴ", "노노"],
    "바이": ["ㅂㅂ", "ㅃㅃ"],
    "미친": ["ㅁㅊ", "미쳤"],
    "개웃김": ["개웃"],
    "레전드": ["레전드", "레게"],
    "킹받음": ["킹받", "킹받음"],
    "노답": ["노답"],
    "현타": ["현타"],
    "인싸": ["인싸"],
    "아싸": ["아싸"],
    "갓": ["갓"],
    "존맛": ["존맛"],
    "존잼": ["존잼"],
    "노잼": ["노잼", "핵노잼"],
    "tmi": ["TMI", "tmi"],
    "jmt": ["JMT", "jmt"],
})) {
    for (const spelling of spellings) {
        SLANG_CANON.set(spelling, label);
    }
}
56
/**
 * Keyword extraction tuned for Korean open-chat / colloquial text.
 * - copes with mixed spaced / unspaced writing, particle boundaries,
 *   2-3 token phrases and hashtags
 * - adds length-limited n-grams mined from unspaced Hangul chunks
 * - each token is reported at most once per message (collected in a Set)
 *
 * @param message raw chat message
 * @param options filter options forwarded to the per-token acceptance check
 * @returns distinct keywords in first-seen order; empty when nothing is left after cleaning
 */
export function extractKoreanKeywords(message, options) {
    const cleaned = prepareMessage(message);
    if (!cleaned) {
        return [];
    }
    const unique = new Set();
    cleaned
        .split(SEGMENT_SPLIT_RE)
        .filter((part) => part.length > 0)
        .forEach((part) => collectFromSegment(part, unique, options));
    return Array.from(unique);
}
73
/**
 * Supplement for KR-WordRank: hashtags only (slang and jamo are handled by the
 * WordRank / stopword layers).
 *
 * @param message raw chat message
 * @param options filter options forwarded to the per-token acceptance check
 * @returns distinct accepted hashtag bodies (without the leading "#")
 */
export function extractSupplementalKeywords(message, options) {
    const cleaned = prepareMessage(message);
    if (!cleaned) {
        return [];
    }
    const tags = new Set();
    const pieces = cleaned.split(SEGMENT_SPLIT_RE).filter((piece) => piece.length > 0);
    pieces.forEach((piece) => {
        for (const [, tag] of piece.matchAll(HASHTAG_RE)) {
            tryAdd(tags, tag, options);
        }
    });
    return Array.from(tags);
}
86
/**
 * Cleans a raw message before tokenising: NFC-normalises it, blanks out URLs,
 * e-mail addresses, phone-like digit runs and emoji (in that order), squeezes
 * runs of non-newline whitespace to one space, collapses stretched characters
 * and trims the result.
 */
function prepareMessage(message) {
    const scrubbed = [URL_RE, EMAIL_RE, PHONE_RE, EMOJI_RE].reduce(
        (text, pattern) => text.replace(pattern, " "),
        message.normalize("NFC"),
    );
    const singleSpaced = scrubbed.replace(/[^\S\n]+/g, " ");
    return collapseStretch(singleSpaced).trim();
}
94
/**
 * Shortens stretched characters: a run of four or more of the SAME laugh/cry
 * jamo (ㅋ, ㅎ, ㅠ, ㅜ) becomes three, and a run of four or more of the same
 * Hangul syllable becomes two.
 *
 * The jamo pattern uses a backreference so that only identical characters are
 * collapsed; without it a mixed run such as "ㅋㅋㅠㅠ" was rewritten to three
 * copies of its last character.
 *
 * @param text input text
 * @returns text with stretched runs shortened
 */
function collapseStretch(text) {
    return text
        .replace(/([ㅋㅎㅠㅜ])\1{3,}/gu, "$1$1$1")
        .replace(/([\uAC00-\uD7A3])\1{3,}/gu, "$1$1");
}
100
/**
 * Harvests keyword candidates from one segment into `bag`, in this order:
 * hashtags, jamo slang, single tokens (Latin tokens first, then Hangul pieces),
 * 2- and 3-token phrases over that token list, and finally Hangul n-grams
 * when the segment has no whitespace and holds 5-28 Hangul syllables.
 *
 * @param segment one piece of the cleaned message
 * @param bag Set receiving accepted, canonicalised tokens
 * @param options filter options forwarded to the acceptance check
 */
function collectFromSegment(segment, bag, options) {
    const add = (candidate) => tryAdd(bag, candidate, options);

    Array.from(segment.matchAll(HASHTAG_RE), (hit) => hit[1]).forEach((tag) => add(tag));
    Array.from(segment.matchAll(JAMO_SLANG_RE), (hit) => hit[0]).forEach((slang) => add(slang));

    const latinTokens = Array.from(segment.matchAll(LATIN_TOKEN_RE), (hit) => canonToken(hit[0]))
        .filter((word) => word && acceptToken(word, options));
    const hangulTokens = splitHangulChunk(segment)
        .flatMap((piece) => expandHangulPiece(piece))
        .filter((word) => acceptToken(word, options));
    const tokens = [...latinTokens, ...hangulTokens];

    tokens.forEach((word) => add(word));

    // Phrases: for every start index, the bigram first and then the trigram.
    tokens.forEach((head, idx) => {
        const hasSecond = idx + 1 < tokens.length;
        const hasThird = idx + 2 < tokens.length;
        if (hasSecond) {
            const pair = joinPhrase(head, tokens[idx + 1]);
            if (pair) {
                add(pair);
            }
        }
        if (hasThird) {
            const triple = joinPhrase(head, tokens[idx + 1], tokens[idx + 2]);
            if (triple && triple.length <= 24) {
                add(triple);
            }
        }
    });

    const compact = segment.replace(/[^\uAC00-\uD7A3]/g, "");
    const unspaced = !/\s/.test(segment);
    if (unspaced && compact.length >= 5 && compact.length <= 28) {
        mineHangulNgrams(compact)
            .filter((gram) => acceptToken(gram, options))
            .forEach((gram) => add(gram));
    }
}
143
/**
 * Returns the Hangul runs of a segment. Runs of up to 18 syllables are kept
 * whole; longer runs are split at particle boundaries, keeping only pieces of
 * three or more syllables, and fall back to the whole run when the split
 * leaves fewer than two such pieces.
 */
function splitHangulChunk(segment) {
    const runs = segment.match(new RegExp(HANGUL_RUN_RE.source, "gu")) ?? [];
    return runs.flatMap((run) => {
        if (run.length <= 18) {
            return [run];
        }
        const pieces = run.split(PARTICLE_BOUNDARY_RE).filter((piece) => piece.length >= 3);
        return pieces.length > 1 ? pieces : [run];
    });
}
161
/**
 * Turns one Hangul piece into a single-element token list (or an empty list
 * for blank input). The piece is canonicalised first; when the result has four
 * or more characters and contains Hangul, its particle-stripped stem is
 * returned instead, provided the stem differs, has at least two characters and
 * is not a known morphological fragment.
 */
function expandHangulPiece(piece) {
    const trimmed = piece.trim();
    const label = trimmed ? canonToken(trimmed) : null;
    if (!label) {
        return [];
    }
    const stemCandidate = label.length >= 4 && HANGUL_CHAR_RE.test(label);
    if (!stemCandidate) {
        return [label];
    }
    const stem = stripTrailingParticle(label);
    const usable = Boolean(stem) &&
        stem.length >= 2 &&
        stem !== label &&
        !MORPHOLOGICAL_FRAGMENTS.has(stem);
    return [usable ? stem : label];
}
176
/**
 * Removes up to two trailing particles / endings from a word. Stops as soon
 * as nothing more matches or the remainder would be shorter than two
 * characters, returning the last acceptable form.
 */
function stripTrailingParticle(word) {
    let current = word;
    let passesLeft = 2;
    while (passesLeft > 0) {
        passesLeft -= 1;
        const trimmed = current.replace(TRAILING_PARTICLE_RE, "");
        const unchanged = trimmed === current;
        if (unchanged || trimmed.length < 2) {
            return current;
        }
        current = trimmed;
    }
    return current;
}
186
/**
 * Slides windows of 3 up to 6 syllables (shortest first, left to right) over a
 * compact Hangul string and collects distinct windows that are not known
 * morphological fragments. Collection stops once eight n-grams are gathered.
 */
function mineHangulNgrams(compact) {
    const limit = 8;
    const grams = new Set();
    const widest = Math.min(6, compact.length);
    collect: for (let width = 3; width <= widest; width += 1) {
        const lastStart = compact.length - width;
        for (let start = 0; start <= lastStart; start += 1) {
            const candidate = compact.slice(start, start + width);
            if (!MORPHOLOGICAL_FRAGMENTS.has(candidate)) {
                grams.add(candidate);
            }
            if (grams.size >= limit) {
                break collect;
            }
        }
    }
    return Array.from(grams);
}
200
/**
 * Joins tokens into a space-separated phrase.
 *
 * Every part must be at least two characters long; otherwise no phrase is
 * produced. The earlier all-Hangul branch indexed parts 0..2 directly, which
 * yielded "x undefined undefined" for a single part and dropped anything past
 * the third part; a plain join gives the same result for two or three parts
 * and is correct for any other count.
 *
 * @param parts tokens to combine
 * @returns the phrase, or null when there are no parts or any part is too short
 */
function joinPhrase(...parts) {
    if (parts.length === 0 || parts.some((part) => part.length < 2)) {
        return null;
    }
    return parts.join(" ");
}
212
/**
 * Canonicalises a raw token: trims it, lower-cases it when it consists only
 * of ASCII letters, digits, "_", "+" or "-", then maps it through SLANG_CANON
 * (lower-cased form first, trimmed form second).
 *
 * @returns the canonical label, the (possibly lower-cased) token when no
 *          mapping exists, or null for blank input
 */
function canonToken(raw) {
    const trimmed = raw.trim();
    if (!trimmed) {
        return null;
    }
    const isAsciiWord = /^[A-Za-z0-9_+-]+$/.test(trimmed);
    const folded = isAsciiWord ? trimmed.toLowerCase() : trimmed;
    for (const key of [folded, trimmed]) {
        const label = SLANG_CANON.get(key);
        if (label !== undefined && label !== null) {
            return label;
        }
    }
    return folded;
}
219
/**
 * Decides whether a canonical token may be reported as a keyword.
 *
 * Rejected: tokens shorter than 2 or longer than 28 characters, digit-only
 * tokens, morphological fragments, verb-ending fragments, reaction-only jamo
 * (unless the token is a SLANG_CANON key), stopwords, entries of
 * `options.exclude` (optional) or `options.senderNames`, two-syllable Hangul
 * words that end in a particle-like syllable (unless a SLANG_CANON key),
 * tokens of at most two characters with exactly one Hangul syllable, and
 * one- or two-letter Latin tokens.
 *
 * @param token canonical token
 * @param options `exclude` (optional Set) and `senderNames` (Set) to filter against
 * @returns true when the token is acceptable
 */
function acceptToken(token, options) {
    const size = token.length;
    if (size < 2 || size > 28) {
        return false;
    }
    if (/^\d+$/.test(token)) {
        return false;
    }
    const isKnownSlang = SLANG_CANON.has(token);
    const isNoise = MORPHOLOGICAL_FRAGMENTS.has(token) ||
        VERB_FRAGMENT_RE.test(token) ||
        (REACTION_ONLY_RE.test(token) && !isKnownSlang) ||
        KOREAN_CHAT_STOPWORDS.has(token);
    if (isNoise) {
        return false;
    }
    if (options.exclude?.has(token) || options.senderNames.has(token)) {
        return false;
    }
    // Count Hangul syllables by code point; `size` stays in UTF-16 units.
    let hangulCount = 0;
    for (const ch of token) {
        if (HANGUL_CHAR_RE.test(ch)) {
            hangulCount += 1;
        }
    }
    const twoSyllableHangul = size === 2 && hangulCount === size;
    if (twoSyllableHangul && FRAGMENT_TAIL_RE.test(token) && !isKnownSlang) {
        return false;
    }
    if (hangulCount === 1 && size <= 2) {
        return false;
    }
    return !/^[A-Za-z]{1,2}$/.test(token);
}
250
/**
 * Canonicalises `token` and stores it in `bag` when the canonical form exists
 * and passes the acceptance check; otherwise does nothing.
 */
function tryAdd(bag, token, options) {
    const label = canonToken(token);
    if (label && acceptToken(label, options)) {
        bag.add(label);
    }
}
256
+ //# sourceMappingURL=korean-keywords.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"korean-keywords.js","sourceRoot":"","sources":["../../src/korean-keywords.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,gBAAgB,EAChB,qBAAqB,EACrB,uBAAuB,EACvB,gBAAgB,EAChB,gBAAgB,GACjB,MAAM,uBAAuB,CAAC;AAE/B,MAAM,MAAM,GAAG,2CAA2C,CAAC;AAC3D,MAAM,QAAQ,GAAG,6CAA6C,CAAC;AAC/D,MAAM,QAAQ,GAAG,gCAAgC,CAAC;AAClD,MAAM,QAAQ,GAAG,6BAA6B,CAAC;AAC/C,MAAM,UAAU,GAAG,sCAAsC,CAAC;AAC1D,kCAAkC;AAClC,MAAM,aAAa,GACjB,yCAAyC,CAAC;AAE5C,0BAA0B;AAC1B,MAAM,gBAAgB,GAAG,mDAAmD,CAAC;AAC7E,MAAM,aAAa,GAAG,mBAAmB,CAAC;AAC1C,MAAM,cAAc,GAAG,iBAAiB,CAAC;AACzC,MAAM,cAAc,GAAG,6BAA6B,CAAC;AAErD,kCAAkC;AAClC,MAAM,oBAAoB,GACxB,+MAA+M,CAAC;AAElN,MAAM,oBAAoB,GACxB,8GAA8G,CAAC;AAEjH,6BAA6B;AAC7B,MAAM,WAAW,GAAG,IAAI,GAAG,CAAiB;IAC1C,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,KAAK,CAAC;IACb,CAAC,IAAI,EAAE,KAAK,CAAC;IACb,CAAC,MAAM,EAAE,KAAK,CAAC;IACf,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,GAAG,EAAE,IAAI,CAAC;IACX,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,KAAK,CAAC;IACb,CAAC,KAAK,EAAE,KAAK,CAAC;IACd,CAAC,IAAI,EAAE,KAAK,CAAC;IACb,CAAC,IAAI,EAAE,KAAK,CAAC;IACb,CAAC,KAAK,EAAE,KAAK,CAAC;IACd,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,GAAG,EAAE,GAAG,CAAC;IACV,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,IAAI,EAAE,IAAI,CAAC;IACZ,CAAC,KAAK,EAAE,IAAI,CAAC;IACb,CAAC,KAAK,EAAE,KAAK,CAAC;IACd,CAAC,KAAK,EAAE,KAAK,CAAC;IACd,CAAC,KAAK,EAAE,KAAK,CAAC;IACd,CAAC,KAAK,EAAE,KAAK,CAAC;CACf,CAAC,CAAC;AAQH;;;;;GAKG;AACH,MAAM,UAAU,qBAAqB,CACnC,OAAe,EACf,OAA6B;IAE7B,MAAM,QAAQ,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;IACzC,IAAI,CAAC,QAAQ;QAAE,OAAO,EAAE,CAAC;IAEzB,MAAM,GAAG,GAAG,IAAI,GAAG,EAAU,CAAC;IA
C9B,MAAM,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAE9E,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,kBAAkB,CAAC,OAAO,EAAE,GAAG,EAAE,OAAO,CAAC,CAAC;IAC5C,CAAC;IAED,OAAO,CAAC,GAAG,GAAG,CAAC,CAAC;AAClB,CAAC;AAED,yDAAyD;AACzD,MAAM,UAAU,2BAA2B,CACzC,OAAe,EACf,OAA6B;IAE7B,MAAM,QAAQ,GAAG,cAAc,CAAC,OAAO,CAAC,CAAC;IACzC,IAAI,CAAC,QAAQ;QAAE,OAAO,EAAE,CAAC;IACzB,MAAM,GAAG,GAAG,IAAI,GAAG,EAAU,CAAC;IAC9B,KAAK,MAAM,OAAO,IAAI,QAAQ,CAAC,KAAK,CAAC,gBAAgB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,CAAC;QACnF,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;YAC7C,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAE,EAAE,OAAO,CAAC,CAAC;QAC9B,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,GAAG,CAAC,CAAC;AAClB,CAAC;AAED,SAAS,cAAc,CAAC,OAAe;IACrC,IAAI,CAAC,GAAG,OAAO,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;IACjC,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;IACzE,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC,CAAC;IAC7B,CAAC,GAAG,CAAC,CAAC,OAAO,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC;IAChC,CAAC,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;IACvB,OAAO,CAAC,CAAC,IAAI,EAAE,CAAC;AAClB,CAAC;AAED,8BAA8B;AAC9B,SAAS,eAAe,CAAC,IAAY;IACnC,OAAO,IAAI;SACR,OAAO,CAAC,gBAAgB,EAAE,QAAQ,CAAC;SACnC,OAAO,CAAC,2BAA2B,EAAE,MAAM,CAAC,CAAC;AAClD,CAAC;AAED,SAAS,kBAAkB,CACzB,OAAe,EACf,GAAgB,EAChB,OAA6B;IAE7B,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,EAAE,CAAC;QAC7C,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAE,EAAE,OAAO,CAAC,CAAC;IAC9B,CAAC;IACD,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,EAAE,CAAC;QAChD,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAE,EAAE,OAAO,CAAC,CAAC;IAC9B,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAE5B,KAAK,MAAM,CAAC,IAAI,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAC,EAAE,CAAC;QACjD,MAAM,KAAK,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC;QAChC,IAAI,KAAK,IAAI,WAAW,CAAC,KAAK,EAAE,OAAO,CAAC;YAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;IAC/D,CAAC;IAED,MAAM,YAAY,GAAG,gBAAgB,CAAC,OAAO,CA
AC,CAAC;IAC/C,KAAK,MAAM,KAAK,IAAI,YAAY,EAAE,CAAC;QACjC,KAAK,MAAM,QAAQ,IAAI,iBAAiB,CAAC,KAAK,CAAC,EAAE,CAAC;YAChD,IAAI,WAAW,CAAC,QAAQ,EAAE,OAAO,CAAC;gBAAE,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC5D,CAAC;IACH,CAAC;IAED,KAAK,MAAM,CAAC,IAAI,MAAM;QAAE,MAAM,CAAC,GAAG,EAAE,CAAC,EAAE,OAAO,CAAC,CAAC;IAEhD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC;QAC1C,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC;YAC1B,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,MAAM,CAAC,CAAC,GAAG,CAAC,CAAE,CAAC,CAAC;YACtD,IAAI,MAAM;gBAAE,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QAC3C,CAAC;QACD,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC;YAC1B,MAAM,MAAM,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,MAAM,CAAC,CAAC,GAAG,CAAC,CAAE,EAAE,MAAM,CAAC,CAAC,GAAG,CAAC,CAAE,CAAC,CAAC;YACtE,IAAI,MAAM,IAAI,MAAM,CAAC,MAAM,IAAI,EAAE;gBAAE,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QAClE,CAAC;IACH,CAAC;IAED,MAAM,UAAU,GAAG,OAAO,CAAC,OAAO,CAAC,mBAAmB,EAAE,EAAE,CAAC,CAAC;IAC5D,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACpC,IAAI,CAAC,QAAQ,IAAI,UAAU,CAAC,MAAM,IAAI,CAAC,IAAI,UAAU,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;QACnE,KAAK,MAAM,IAAI,IAAI,gBAAgB,CAAC,UAAU,CAAC,EAAE,CAAC;YAChD,IAAI,WAAW,CAAC,IAAI,EAAE,OAAO,CAAC;gBAAE,MAAM,CAAC,GAAG,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;QAC7D,CAAC;IACH,CAAC;AACH,CAAC;AAED,SAAS,gBAAgB,CAAC,OAAe;IACvC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,CAAyB,CAAC;IAC9B,MAAM,EAAE,GAAG,IAAI,MAAM,CAAC,aAAa,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAClD,OAAO,CAAC,CAAC,GAAG,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACvC,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC;QAClB,IAAI,GAAG,CAAC,MAAM,IAAI,EAAE,EAAE,CAAC;YACrB,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAChB,SAAS;QACX,CAAC;QACD,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,CAAC,oBAAoB,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC;QAC3E,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC;YAAE,KAAK,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC;;YACtC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IACvB,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED,SAAS,iBA
AiB,CAAC,KAAa;IACtC,MAAM,IAAI,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC;IAC1B,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IACrB,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC;IAC/B,IAAI,CAAC,KAAK;QAAE,OAAO,EAAE,CAAC;IACtB,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC,IAAI,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;QACpD,MAAM,IAAI,GAAG,qBAAqB,CAAC,KAAK,CAAC,CAAC;QAC1C,IAAI,IAAI,IAAI,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,IAAI,KAAK,KAAK,IAAI,CAAC,uBAAuB,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;YACrF,OAAO,CAAC,IAAI,CAAC,CAAC;QAChB,CAAC;IACH,CAAC;IACD,OAAO,CAAC,KAAK,CAAC,CAAC;AACjB,CAAC;AAED,SAAS,qBAAqB,CAAC,IAAY;IACzC,IAAI,CAAC,GAAG,IAAI,CAAC;IACb,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC;QAC9B,MAAM,IAAI,GAAG,CAAC,CAAC,OAAO,CAAC,oBAAoB,EAAE,EAAE,CAAC,CAAC;QACjD,IAAI,IAAI,KAAK,CAAC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;YAAE,MAAM;QACzC,CAAC,GAAG,IAAI,CAAC;IACX,CAAC;IACD,OAAO,CAAC,CAAC;AACX,CAAC;AAED,SAAS,gBAAgB,CAAC,OAAe;IACvC,MAAM,GAAG,GAAG,IAAI,GAAG,EAAU,CAAC;IAC9B,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,MAAM,CAAC,CAAC;IAC5C,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,IAAI,OAAO,EAAE,IAAI,IAAI,CAAC,EAAE,CAAC;QAC9C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,EAAE,CAAC,IAAI,CAAC,EAAE,CAAC;YACnD,MAAM,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,CAAC;YACxC,IAAI,CAAC,uBAAuB,CAAC,GAAG,CAAC,IAAI,CAAC;gBAAE,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;YACtD,IAAI,GAAG,CAAC,IAAI,IAAI,CAAC;gBAAE,OAAO,CAAC,GAAG,GAAG,CAAC,CAAC;QACrC,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,GAAG,CAAC,CAAC;AAClB,CAAC;AAED,SAAS,UAAU,CAAC,GAAG,KAAe;IACpC,MAAM,QAAQ,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC;IACpD,IAAI,QAAQ,CAAC,MAAM,KAAK,KAAK,CAAC,MAAM;QAAE,OAAO,IAAI,CAAC;IAClD,MAAM,SAAS,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,oBAAoB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;IACtE,IAAI,SAAS,EAAE,CAAC;QACd,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,CAAC,CAAC,EAAE,CAAC;QAClE,OAAO,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,QA
AQ,CAAC,CAAC,CAAC,EAAE,CAAC;IACxD,CAAC;IACD,OAAO,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAC5B,CAAC;AAED,SAAS,UAAU,CAAC,GAAW;IAC7B,MAAM,CAAC,GAAG,GAAG,CAAC,IAAI,EAAE,CAAC;IACrB,IAAI,CAAC,CAAC;QAAE,OAAO,IAAI,CAAC;IACpB,MAAM,KAAK,GAAG,mBAAmB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAChE,OAAO,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,KAAK,CAAC;AAC/D,CAAC;AAED,SAAS,WAAW,CAAC,KAAa,EAAE,OAA6B;IAC/D,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,IAAI,KAAK,CAAC,MAAM,GAAG,EAAE;QAAE,OAAO,KAAK,CAAC;IACxD,IAAI,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACtC,IAAI,uBAAuB,CAAC,GAAG,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACrD,IAAI,gBAAgB,CAAC,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IAC/C,IAAI,gBAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IAC1E,IAAI,qBAAqB,CAAC,GAAG,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACnD,IAAI,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IAC9C,IAAI,OAAO,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IACjD,MAAM,WAAW,GAAG,CAAC,GAAG,KAAK,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IAC5E,MAAM,UAAU,GAAG,WAAW,KAAK,KAAK,CAAC,MAAM,CAAC;IAChD,IAAI,UAAU,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACrC,IAAI,uBAAuB,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,OAAO,KAAK,CAAC;QACrD,IAAI,gBAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,KAAK,CAAC;YAAE,OAAO,KAAK,CAAC;IAC5E,CAAC;IACD,IAAI,WAAW,KAAK,CAAC,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC;IACzD,IAAI,iBAAiB,CAAC,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,KAAK,CAAC;IAChD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,SAAS,MAAM,CAAC,GAAgB,EAAE,KAAa,EAAE,OAA6B;IAC5E,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC;IAChC,IAAI,CAAC,KAAK,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,OAAO,CAAC;QAAE,OAAO;IACnD,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;AACjB,CAAC"}
@@ -0,0 +1,7 @@
1
/** Switches for {@link normalizeKoreanText}; every field is optional. */
export interface NormalizeKoreanTextOptions {
  /** Keep ASCII letters a-z / A-Z. */
  keepEnglish?: boolean;
  /** Keep ASCII digits 0-9. */
  keepNumbers?: boolean;
  /** Keep the punctuation marks `.`, `,`, `?` and `!`. */
  keepPunctuation?: boolean;
  /** Length that a run of four or more identical characters is rewritten to. */
  repeatCollapseTo?: number;
}

/**
 * Normalises Korean text for keyword extraction.
 *
 * @param doc text to normalise
 * @param options which character classes to keep and how to collapse repeats
 * @returns the normalised text
 */
export declare function normalizeKoreanText(
  doc: string,
  options?: NormalizeKoreanTextOptions,
): string;
@@ -0,0 +1,21 @@
1
/** Korean preprocessing in the KR-WordRank / soynlp style (port of lovit/krwordrank hangle.normalize). */
const DOUBLESPACE_RE = /\s+/g;
/** A non-whitespace character followed by three or more copies of itself. */
const REPEAT_CHARS_RE = /(\S)\1{3,}/gu;
/**
 * Normalises text for keyword extraction: NFC-normalises, collapses runs of
 * four or more identical characters, replaces every character outside the
 * allowed set with a space, then squeezes whitespace and trims.
 *
 * Hangul syllables are always kept. Defaults: keepEnglish = true,
 * keepNumbers = true, keepPunctuation = false, repeatCollapseTo = 2.
 *
 * @param doc text to normalise
 * @param options see the defaults above; `repeatCollapseTo <= 0` disables
 *        repeat collapsing
 * @returns the normalised text
 */
export function normalizeKoreanText(doc, options = {}) {
    const { keepEnglish = true, keepNumbers = true, keepPunctuation = false, repeatCollapseTo = 2 } = options;
    const allowed = [
        "가-힣",
        keepEnglish ? "a-zA-Z" : "",
        keepNumbers ? "0-9" : "",
        keepPunctuation ? ".,?!" : "",
    ].join("");
    const stripRe = new RegExp(`[^${allowed}]`, "gu");
    let out = doc.normalize("NFC");
    if (repeatCollapseTo > 0) {
        out = out.replace(REPEAT_CHARS_RE, (run, ch) => {
            // Collapsing must never lengthen a run, and an unbounded target
            // (Infinity) must not reach String.prototype.repeat, which throws.
            const runLength = run.length / ch.length;
            return ch.repeat(Math.min(repeatCollapseTo, runLength));
        });
    }
    out = out.replace(stripRe, " ");
    return out.replace(DOUBLESPACE_RE, " ").trim();
}
21
+ //# sourceMappingURL=korean-normalize.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"korean-normalize.js","sourceRoot":"","sources":["../../src/korean-normalize.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,MAAM,cAAc,GAAG,MAAM,CAAC;AAC9B,MAAM,eAAe,GAAG,cAAc,CAAC;AASvC,MAAM,UAAU,mBAAmB,CACjC,GAAW,EACX,UAAsC,EAAE;IAExC,MAAM,EAAE,WAAW,GAAG,IAAI,EAAE,WAAW,GAAG,IAAI,EAAE,eAAe,GAAG,KAAK,EAAE,gBAAgB,GAAG,CAAC,EAAE,GAC7F,OAAO,CAAC;IAEV,IAAI,OAAO,GAAG,KAAK,CAAC;IACpB,IAAI,WAAW;QAAE,OAAO,IAAI,QAAQ,CAAC;IACrC,IAAI,WAAW;QAAE,OAAO,IAAI,KAAK,CAAC;IAClC,IAAI,eAAe;QAAE,OAAO,IAAI,MAAM,CAAC;IACvC,MAAM,OAAO,GAAG,IAAI,MAAM,CAAC,KAAK,OAAO,GAAG,EAAE,IAAI,CAAC,CAAC;IAElD,IAAI,GAAG,GAAG,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;IAC/B,IAAI,gBAAgB,GAAG,CAAC,EAAE,CAAC;QACzB,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,MAAM,CAAC,gBAAgB,CAAC,CAAC,CAAC;IAC7E,CAAC;IACD,GAAG,GAAG,GAAG,CAAC,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC,CAAC;IAChC,OAAO,GAAG,CAAC,OAAO,CAAC,cAAc,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;AACjD,CAAC"}
@@ -0,0 +1,10 @@
1
/** Stopwords for chat / colloquial Korean: connectives, pronouns, short replies. */
export declare const KOREAN_CHAT_STOPWORDS: Set<string>;

/** Matches tokens made up solely of chat-reaction jamo. */
export declare const REACTION_ONLY_RE: RegExp;

/** Ending fragments left over from particle splitting / n-gram mining; never a keyword on their own. */
export declare const MORPHOLOGICAL_FRAGMENTS: Set<string>;

/** Matches leftover verb-conjugation tails of the form "하는" + up to two characters. */
export declare const VERB_FRAGMENT_RE: RegExp;

/** Matches words whose last syllable is an ending / particle; used to reject two-syllable leftovers. */
export declare const FRAGMENT_TAIL_RE: RegExp;
@@ -0,0 +1,272 @@
1
/**
 * Stopwords for chat / colloquial Korean: connectives, pronouns, short replies.
 * Each word is listed once; the row comments are informal groupings only.
 */
export const KOREAN_CHAT_STOPWORDS = new Set([
    // connectives
    "그리고", "그냥", "근데", "그런데", "그래서", "그러면", "그러니까", "그럼", "그래",
    // pronouns
    "저는", "제가", "우리", "너희", "자기", "본인",
    // time words
    "오늘", "내일", "어제", "지금", "방금",
    // demonstratives
    "이거", "저거", "그거", "여기", "거기", "이게", "저게", "그게",
    // question words
    "뭔데", "뭐야", "뭐임", "뭐냐", "왜요", "왜냐", "어떻게", "어떡해",
    // agreement / disagreement
    "아니", "아냐", "맞아", "맞지", "맞음", "맞고",
    // single-syllable replies
    "응", "웅", "네", "예", "아", "어", "음", "흠", "좀",
    // intensifiers and hedges
    "조금", "진짜", "정말", "완전", "되게", "엄청", "약간", "일단", "일단은", "사실", "근본", "솔직히", "아마", "혹시",
    "수정", "확인", "가능",
    // sentence endings and light verbs
    "입니다", "합니다", "있습니다", "없습니다", "했습니다", "해요", "이에요", "예요", "거예요",
    "있는", "없는", "하는", "하다", "했다", "해서", "하면", "하니", "하네", "하냐", "하냐고", "하자", "하지", "하지마", "하지말고",
    "같아", "같고", "같은", "같음", "같네", "인데", "인거", "인것",
    // dependent nouns
    "것", "거", "수", "때", "중", "뿐", "정도", "정도임", "정도네",
    // English / URL residue
    "the", "and", "for", "with", "this", "that", "from", "http", "https",
    "정치성향", "강퇴됩니다", "가려짐", "초중반", "비속어", "반가워", "닉네임", "들어왔습니다", "나갔습니다", "삭제되었습니다", "가렸습니다", "보냈습니다",
    // canonical slang labels
    "리얼", "오케이", "인정", "노노", "죄송", "감사", "축하", "미친", "레전드", "노답",
    "사람", "사람들", "내가", "생각", "같아요", "조건", "능력",
    "com", "youtu", "www", "node", "si", "보룸",
]);
159
/**
 * Matches a token that consists solely of Hangul compatibility jamo
 * (consonants ㄱ-ㅎ and vowels ㅏ-ㅣ), of any length — chat reactions such as
 * "ㅋㅋ" or "ㅠㅠ" that should not become keywords by themselves.
 *
 * The earlier pattern also listed `[ㅋㅎㅠㅜㅇㅂㅅㅈㅇㅋㅍㅊ]+` and `[ㅋㅎ]{1,2}`;
 * both are subsets of the jamo range below, so the matched language is unchanged.
 */
export const REACTION_ONLY_RE = /^[ㄱ-ㅎㅏ-ㅣ]+$/u;
161
/**
 * Ending / particle fragments left over from particle splitting and n-gram
 * mining; none of them may stand alone as a keyword. Each fragment is listed once.
 */
export const MORPHOLOGICAL_FRAGMENTS = new Set([
    "니다", "습니다", "습니", "합니", "입니다", "해요", "했어", "하세", "세요", "하세요", "으세요", "네요", "거예", "이에",
    "으로", "로서", "에서", "부터", "까지", "에게", "한테", "라고", "다고", "이라", "아니", "아니라", "는데", "인데",
    "하게", "하지", "하면", "해서", "하니", "하네", "하냐", "적으", "적이", "적으로",
    "입니", "있습", "없습", "됩니", "같습", "것입", "수있", "할수",
    "하는", "있는", "없는", "된다", "했던", "했음", "했을", "할것", "것같", "같은", "같아", "보이", "보는",
    "대한", "관련", "통해", "위해", "때문", "정도", "정말", "진짜", "너무", "완전", "이거", "그거", "저거",
    "하는데", "하는거", "하는게", "주세요", "인가요", "이라고", "잖아요", "그렇게",
    "아님", "그건", "이제", "무슨", "많이", "사람이", "때문에", "있는데", "같은데", "마세요",
    "하시는", "생각하", "다들", "했는데", "겠습니", "한다고", "그런", "우리나", "공산주", "건가요",
]);
268
/**
 * Leftover verb-conjugation tail: "하는" followed by at most two further
 * characters (lower-case Latin letters or Hangul syllables).
 */
export const VERB_FRAGMENT_RE = /^하는[a-z가-힣]{0,2}$/u;

/**
 * Word whose final syllable is an ending or particle; the keyword filter uses
 * it to reject two-syllable words that are really just a trailing fragment.
 */
export const FRAGMENT_TAIL_RE = /(?:다|요|음|네|죠|임|함|라|가|는|은|을|를|의|에|와|과|도|만|서|고|니|지|게)$/u;
272
+ //# sourceMappingURL=korean-stopwords.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"korean-stopwords.js","sourceRoot":"","sources":["../../src/korean-stopwords.ts"],"names":[],"mappings":"AAAA,8BAA8B;AAC9B,MAAM,CAAC,MAAM,qBAAqB,GAAG,IAAI,GAAG,CAAC;IAC3C,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,MAAM;IACN,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,GAAG;IACH,GAAG;IACH,GAAG;IACH,GAAG;IACH,GAAG;IACH,GAAG;IACH,GAAG;IACH,GAAG;IACH,GAAG;IACH,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,MAAM;IACN,MAAM;IACN,MAAM;IACN,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,MAAM;IACN,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,GAAG;IACH,GAAG;IACH,GAAG;IACH,GAAG;IACH,GAAG;IACH,GAAG;IACH,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,MAAM;IACN,MAAM;IACN,MAAM;IACN,MAAM;IACN,MAAM;IACN,OAAO;IACP,MAAM;IACN,OAAO;IACP,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,QAAQ;IACR,OAAO;IACP,SAAS;IACT,OAAO;IACP,OAAO;IACP,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,OAAO;IACP,KAAK;IACL,MAAM;IACN,IAAI;IACJ,IAAI;CACL,CAAC,CAAC;AAEH,mCAAmC;AACnC,MAAM,CAAC,MAAM,gBAAgB,GAC3B,4CAA4C,CAAC;AAE/C,2CAA2C;AAC3C,MAAM,CAAC,MAAM,uBAAuB,GAAG,IAAI,GAAG,CAAC;IAC7C,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,IAAI;IACJ,IA
AI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,KAAK;IACL,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;IACL,IAAI;IACJ,KAAK;IACL,KAAK;IACL,KAAK;CACN,CAAC,CAAC;AAEH,+BAA+B;AAC/B,MAAM,CAAC,MAAM,gBAAgB,GAAG,oBAAoB,CAAC;AAErD,4BAA4B;AAC5B,MAAM,CAAC,MAAM,gBAAgB,GAC3B,uDAAuD,CAAC"}
@@ -0,0 +1,37 @@
1
/** Position tag of a subword: "L" or "R". */
export type SubwordPos = "L" | "R";

/** A subword together with its position tag. */
export type SubwordToken = readonly [word: string, pos: SubwordPos];

/** Constructor options for {@link KrWordRankStream}; all fields are optional. */
export interface KrWordRankOptions {
  minCount?: number;
  maxLength?: number;
  beta?: number;
  maxIter?: number;
  numRset?: number;
  converge?: number;
}

/** Options for {@link KrWordRankStream.extractKeywords}. */
export interface KrWordRankExtractOptions {
  stopwords?: ReadonlySet<string>;
  limit?: number;
}

/** Learns from a stream of messages, then ranks with HITS and yields L-part keywords. */
export declare class KrWordRankStream {
  private minCount;
  private readonly maxLength;
  private readonly beta;
  private readonly maxIter;
  private readonly numRset;
  private readonly converge;
  private readonly counter;
  private readonly edgeCounts;
  private documents;

  constructor(options?: KrWordRankOptions);

  /** Feeds one raw message into the stream. */
  addDocument(raw: string): void;

  /** Returns a keyword → number map (presumably the rank score — confirm against the implementation). */
  extractKeywords(options?: KrWordRankExtractOptions): Map<string, number>;

  private scanToken;
  private bumpCounter;
  private pruneCounter;
  private bumpEdge;
  private buildVocabulary;
  private buildGraph;
}

/** min_count derived from the number of messages (relaxed for small rooms). */
export declare function adaptiveMinCount(messageCount: number): number;