kakaotalk-chat-analyzer 0.16.5 → 0.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/README.md +38 -1
  2. package/data/korean-profanity.txt +33 -0
  3. package/dist/src/aggregator.d.ts +27 -3
  4. package/dist/src/aggregator.js +152 -68
  5. package/dist/src/aggregator.js.map +1 -1
  6. package/dist/src/analysis-budget.d.ts +11 -0
  7. package/dist/src/analysis-budget.js +26 -0
  8. package/dist/src/analysis-budget.js.map +1 -0
  9. package/dist/src/analysis-capability.d.ts +19 -0
  10. package/dist/src/analysis-capability.js +48 -0
  11. package/dist/src/analysis-capability.js.map +1 -0
  12. package/dist/src/analysis-phase-profile.d.ts +9 -0
  13. package/dist/src/analysis-phase-profile.js +49 -0
  14. package/dist/src/analysis-phase-profile.js.map +1 -0
  15. package/dist/src/analysis-preset.d.ts +19 -0
  16. package/dist/src/analysis-preset.js +73 -0
  17. package/dist/src/analysis-preset.js.map +1 -0
  18. package/dist/src/analysis-profile.js +4 -0
  19. package/dist/src/analysis-profile.js.map +1 -1
  20. package/dist/src/analysis-spool.d.ts +5 -0
  21. package/dist/src/analysis-spool.js.map +1 -1
  22. package/dist/src/analysis.d.ts +2 -5
  23. package/dist/src/analysis.js +200 -57
  24. package/dist/src/analysis.js.map +1 -1
  25. package/dist/src/analyze-pool.d.ts +5 -0
  26. package/dist/src/analyze-pool.js +5 -1
  27. package/dist/src/analyze-pool.js.map +1 -1
  28. package/dist/src/cli.js +59 -1
  29. package/dist/src/cli.js.map +1 -1
  30. package/dist/src/embedding-topics.d.ts +5 -2
  31. package/dist/src/embedding-topics.js +11 -11
  32. package/dist/src/embedding-topics.js.map +1 -1
  33. package/dist/src/keyword-eligibility.d.ts +2 -0
  34. package/dist/src/keyword-eligibility.js +36 -0
  35. package/dist/src/keyword-eligibility.js.map +1 -0
  36. package/dist/src/keyword-rank-dual.d.ts +20 -0
  37. package/dist/src/keyword-rank-dual.js +126 -0
  38. package/dist/src/keyword-rank-dual.js.map +1 -0
  39. package/dist/src/keyword-record-tokens.d.ts +7 -0
  40. package/dist/src/keyword-record-tokens.js +25 -0
  41. package/dist/src/keyword-record-tokens.js.map +1 -0
  42. package/dist/src/keyword-seed-topics.d.ts +5 -0
  43. package/dist/src/keyword-seed-topics.js +67 -0
  44. package/dist/src/keyword-seed-topics.js.map +1 -0
  45. package/dist/src/kiwi-keyword-pool.d.ts +4 -0
  46. package/dist/src/kiwi-keyword-pool.js +89 -0
  47. package/dist/src/kiwi-keyword-pool.js.map +1 -0
  48. package/dist/src/kiwi-runtime.d.ts +2 -0
  49. package/dist/src/kiwi-runtime.js +22 -2
  50. package/dist/src/kiwi-runtime.js.map +1 -1
  51. package/dist/src/kiwi-tokenize-worker.d.ts +1 -0
  52. package/dist/src/kiwi-tokenize-worker.js +19 -0
  53. package/dist/src/kiwi-tokenize-worker.js.map +1 -0
  54. package/dist/src/kiwi-worker-config.d.ts +3 -0
  55. package/dist/src/kiwi-worker-config.js +23 -0
  56. package/dist/src/kiwi-worker-config.js.map +1 -0
  57. package/dist/src/llm-apply.d.ts +4 -0
  58. package/dist/src/llm-apply.js +19 -0
  59. package/dist/src/llm-apply.js.map +1 -0
  60. package/dist/src/llm-cache.d.ts +9 -0
  61. package/dist/src/llm-cache.js +31 -0
  62. package/dist/src/llm-cache.js.map +1 -0
  63. package/dist/src/llm-input.d.ts +4 -0
  64. package/dist/src/llm-input.js +42 -0
  65. package/dist/src/llm-input.js.map +1 -0
  66. package/dist/src/llm-policy.d.ts +12 -0
  67. package/dist/src/llm-policy.js +48 -0
  68. package/dist/src/llm-policy.js.map +1 -0
  69. package/dist/src/llm-pull.d.ts +3 -0
  70. package/dist/src/llm-pull.js +48 -0
  71. package/dist/src/llm-pull.js.map +1 -0
  72. package/dist/src/llm-summarize.d.ts +17 -0
  73. package/dist/src/llm-summarize.js +181 -0
  74. package/dist/src/llm-summarize.js.map +1 -0
  75. package/dist/src/message-reservoir.d.ts +3 -1
  76. package/dist/src/message-reservoir.js +8 -0
  77. package/dist/src/message-reservoir.js.map +1 -1
  78. package/dist/src/ml-batch-size.d.ts +5 -0
  79. package/dist/src/ml-batch-size.js +45 -0
  80. package/dist/src/ml-batch-size.js.map +1 -0
  81. package/dist/src/ml-runtime.d.ts +8 -0
  82. package/dist/src/ml-runtime.js +54 -0
  83. package/dist/src/ml-runtime.js.map +1 -0
  84. package/dist/src/profanity.d.ts +16 -0
  85. package/dist/src/profanity.js +101 -0
  86. package/dist/src/profanity.js.map +1 -0
  87. package/dist/src/report-charts.d.ts +17 -2
  88. package/dist/src/report-charts.js +124 -13
  89. package/dist/src/report-charts.js.map +1 -1
  90. package/dist/src/report-config.d.ts +6 -0
  91. package/dist/src/report-config.js +32 -0
  92. package/dist/src/report-config.js.map +1 -0
  93. package/dist/src/report-empty.js +8 -0
  94. package/dist/src/report-empty.js.map +1 -1
  95. package/dist/src/report-innovation.js +31 -3
  96. package/dist/src/report-innovation.js.map +1 -1
  97. package/dist/src/report-provenance.d.ts +8 -0
  98. package/dist/src/report-provenance.js +31 -1
  99. package/dist/src/report-provenance.js.map +1 -1
  100. package/dist/src/report-section-visibility.js +1 -1
  101. package/dist/src/report-section-visibility.js.map +1 -1
  102. package/dist/src/report-styles.d.ts +1 -1
  103. package/dist/src/report-styles.js +20 -0
  104. package/dist/src/report-styles.js.map +1 -1
  105. package/dist/src/report.js +79 -11
  106. package/dist/src/report.js.map +1 -1
  107. package/dist/src/semantic-keywords.d.ts +12 -0
  108. package/dist/src/semantic-keywords.js +42 -22
  109. package/dist/src/semantic-keywords.js.map +1 -1
  110. package/dist/src/semantic-policy.d.ts +6 -2
  111. package/dist/src/semantic-policy.js +18 -2
  112. package/dist/src/semantic-policy.js.map +1 -1
  113. package/dist/src/sender-message-reservoir.d.ts +15 -0
  114. package/dist/src/sender-message-reservoir.js +37 -0
  115. package/dist/src/sender-message-reservoir.js.map +1 -0
  116. package/dist/src/sentiment-analyze.d.ts +20 -0
  117. package/dist/src/sentiment-analyze.js +149 -0
  118. package/dist/src/sentiment-analyze.js.map +1 -0
  119. package/dist/src/sentiment-policy.d.ts +18 -0
  120. package/dist/src/sentiment-policy.js +53 -0
  121. package/dist/src/sentiment-policy.js.map +1 -0
  122. package/dist/src/streaming-tfidf-keywords.d.ts +2 -0
  123. package/dist/src/streaming-tfidf-keywords.js +7 -3
  124. package/dist/src/streaming-tfidf-keywords.js.map +1 -1
  125. package/dist/src/system-notices.js +13 -4
  126. package/dist/src/system-notices.js.map +1 -1
  127. package/dist/src/topic-generic.d.ts +4 -0
  128. package/dist/src/topic-generic.js +43 -0
  129. package/dist/src/topic-generic.js.map +1 -0
  130. package/dist/src/topic-map.d.ts +4 -0
  131. package/dist/src/topic-map.js +51 -18
  132. package/dist/src/topic-map.js.map +1 -1
  133. package/dist/src/topic-merge.d.ts +18 -0
  134. package/dist/src/topic-merge.js +153 -0
  135. package/dist/src/topic-merge.js.map +1 -0
  136. package/dist/src/types.d.ts +67 -0
  137. package/dist/src/version.d.ts +2 -2
  138. package/dist/src/version.js +1 -1
  139. package/package.json +8 -2
package/README.md CHANGED
@@ -186,6 +186,36 @@ kca --help
186
186
 
187
187
  </details>
188
188
 
189
+ <details>
190
+ <summary><strong>분석 preset·기능 (0.18+)</strong></summary>
191
+
192
+ | preset | 용도 | 90k 메시지 목표 | 시맨틱 | 감정 | LLM |
193
+ |--------|------|-----------------|--------|------|-----|
194
+ | `speed` | RAM·시간 최소 | ~3분 | 끔 | 끔 | 끔 |
195
+ | `balanced` | 기본 권장 | ~5분 | e5-small | 자동 | 끔 |
196
+ | `quality` | 한국어·서사 최대 | ~6분 | ko-v2 | KLUE | 2B/4B |
197
+ | `custom` | 기능 직접 지정 | — | env/플래그 | env | `KCA_LLM=1` |
198
+
199
+ ```bash
200
+ kca capabilities # RAM·추천 preset
201
+ kca ./chat.csv --preset balanced
202
+ kca ./chat.csv --preset quality --local
203
+ kca llm pull 2b # GGUF (optional node-llama-cpp)
204
+ KCA_LLM_BACKEND=ollama KCA_LLM=1 kca ./chat.csv --preset custom
205
+ ```
206
+
207
+ 환경 변수: `KCA_PRESET`, `KCA_SEMANTIC_MODEL`, `KCA_SENTIMENT_MODEL`, `KCA_LLM`, `KCA_LLM_MOCK`, `KCA_ONNX_GPU`, `KCA_EMBED_BATCH`, `KCA_SENTIMENT_BATCH`, `KCA_KIWI_WORKERS`, `KCA_NO_KIWI_WORKERS`, `KCA_PROFILE_PHASES`, `KCA_BENCH_CSV`, `KCA_KEYWORD_SUMMARY_TOP`, `KCA_SHOP_SEARCH_TOP`.
208
+
209
+ **속도(품질 유지):** 대용량 CSV는 Kiwi worker pool(`KCA_KIWI_WORKERS`, RAM≥8GB 기본 2–4)·시맨틱/감정을 키워드 패스와 겹쳐 실행. `KCA_PROFILE_PHASES=1`로 단계별 ms. quality에서 GPU 가속: `onnxruntime-node` 설치 후 `KCA_ONNX_GPU=metal`(macOS)·`cuda`(Linux)·`dml`(Windows).
210
+
211
+ **키워드:** 요약은 `KCA_KEYWORD_SUMMARY_TOP`(기본 12)·**빈도 순**; ④ 차트에서 **빈도/특이어** 탭 전환. 전체 ~120개는 집계 상한.
212
+
213
+ **주제 맵:** graph(공기 군집)·keyword(상위 키워드 시드)·semantic(임베딩 클러스터) 3레인 RRF 병합 — 대용량 방에서 의미 테마 최대 12장. `KCA_TOPIC_MAX`, `KCA_TOPIC_MIN_THEMES`.
214
+
215
+ **LLM (`quality` / `KCA_LLM=1`):** 주제 제목·서사 + `topicProposals`(키워드 화이트리스트) + 인사이트 bullet·샵검색/상호작용 한 줄(원문 미전송).
216
+
217
+ </details>
218
+
189
219
  <details>
190
220
  <summary><strong>성능·키워드·벤치 (개발·파워유저)</strong></summary>
191
221
 
@@ -197,11 +227,13 @@ kca --help
197
227
  ```bash
198
228
  npx kcachat@latest "./chat.csv" --profile --no-worker
199
229
  npm run bench:stream -- 100000 # 저장소 클론 후
230
+ npm run bench:preset # speed/balanced SLA 스모크
231
+ KCA_BENCH_COMPARE=1 npm run bench:semantic
200
232
  ```
201
233
 
202
234
  </details>
203
235
 
204
- **버전 고정:** `npx kakaotalk-chat-analyzer@0.16.1` · 최신은 `kcachat@latest`가 매번 본체를 받습니다. 리포트 사이드 카드·`#kca-provenance`로 실제 생성 버전을 확인할 수 있습니다.
236
+ **버전 고정:** `npx kakaotalk-chat-analyzer@0.18.2` · 최신은 `kcachat@latest`가 매번 본체를 받습니다. 리포트 사이드 카드·`#kca-provenance`로 실제 생성 버전을 확인할 수 있습니다.
205
237
 
206
238
  **로컬 개발:**
207
239
 
@@ -216,6 +248,11 @@ cd kakaotalk-chat-analyzer && npm install && npm run build && npm test
216
248
 
217
249
  | 버전 | 요약 |
218
250
  |------|------|
251
+ | **0.18.2** | 주제 맵 3레인(graph·키워드·임베딩) 병합·테마 6~12·LLM `topicProposals` |
252
+ | **0.18.1** | 키워드 빈도/특이어 dual-view·샵검색 통계·dyad 셀 숫자·LLM 인사이트 필드 |
253
+ | **0.18.0** | preset(speed/balanced/quality)·5분 예산 skip·LLM 서사·KLUE 감정·dual-lane 툴팁·CI Playwright |
254
+ | **0.17.2** | `kca llm pull`·provenance `llmUsed`·분석 예산 라우터 |
255
+ | **0.16.6** | 글자 수 랭킹·비속어 패턴 통계·transformers 감정 분석(자동/선택) |
219
256
  | **0.16.5** | 상호작용 히트맵: 말 많은 사람 축 상단·지연 로드·로딩 스켈레톤 |
220
257
  | **0.16.4** | 대용량 방 키워드: minDf 스케일·메시지 수 우선 정렬·시맨틱은 BM25 후보만 보강 |
221
258
  | **0.16.3** | 기본 **품질 우선** 프로필(메인 스레드·시맨틱 샘플 확대·RRF 완화·임베딩 주제). 가속은 `--worker` / `--fast` |
package/data/korean-profanity.txt ADDED
@@ -0,0 +1,33 @@
1
+ # 한국어 채팅 비속·욕설 패턴(부분 일치, normalize 후 검사). 줄 앞 # 은 주석.
2
+ 시발
3
+ 씨발
4
+ 씨팔
5
+
6
+ 지랄
7
+ 병신
8
+ 븅신
9
+ 미친
10
+ 미친놈
11
+ 개새
12
+ 개새끼
13
+ 개쉐
14
+ 개씨
15
+ 개같
16
+ 닥쳐
17
+ 꺼져
18
+ 엿먹
19
+ 엿먹어
20
+ 죽어
21
+ 죽을
22
+ 좆같
23
+ 좆만
24
+ ㅅㅂ
25
+ ㅆㅂ
26
+
27
+ ㅂㅅ
28
+ ㅈㄹ
29
+ ㅁㅊ
30
+ ㅗㅗ
31
+ fuck
32
+ shit
33
+ bitch
package/dist/src/aggregator.d.ts CHANGED
@@ -1,4 +1,5 @@
1
- import type { ChatRecord, EncodingName, PrivacyMode, ReportData } from "./types.js";
1
+ import type { ChatRecord, EncodingName, PrivacyMode, ReportData, SentimentStats } from "./types.js";
2
+ import type { BuildReportOptions } from "./analyze-pool.js";
2
3
  export interface FinalizeSourceMeta {
3
4
  filePath: string;
4
5
  encoding: EncodingName;
@@ -7,6 +8,7 @@ export interface FinalizeSourceMeta {
7
8
  }
8
9
  export interface FinalizeOptions {
9
10
  usedSemanticKeywords?: boolean;
11
+ usedSentimentAnalysis?: boolean;
10
12
  koreanPrimary?: boolean;
11
13
  useEmbeddingTopics?: boolean;
12
14
  semanticSupplementRrfWeight?: number;
@@ -16,7 +18,9 @@ export declare function semanticSupplementHitCap(corpusMessages: number): number
16
18
  export interface AggregatorOptions {
17
19
  /** 시맨틱 키워드용 메시지 샘플 수집 */
18
20
  semanticSamples?: boolean;
19
- /** 시맨틱 리저보어 상한 추정(스트리밍 생략 가능) */
21
+ /** 감정 분석용 메시지 샘플 수집 */
22
+ sentimentSamples?: boolean;
23
+ /** 시맨틱·감정 리저보어 상한 추정(스트리밍 시 생략 가능) */
20
24
  estimatedMessages?: number;
21
25
  }
22
26
  export declare class ReportAggregator {
@@ -73,11 +77,18 @@ export declare class ReportAggregator {
73
77
  private roomSubManagerMessages;
74
78
  private roomManagerMessages;
75
79
  private roomShopSearchMessages;
80
+ private shopSearchUntaggedNotices;
81
+ private readonly shopSearchMissSamples;
76
82
  private roomPhotoBundleMessages;
77
83
  private pureLaughMessages;
78
84
  private openChatBoilerplateExcluded;
79
85
  private semanticThemeCandidates;
80
86
  private readonly semanticReservoir;
87
+ private readonly sentimentReservoir;
88
+ private readonly profanityCounter;
89
+ private sentimentStats;
90
+ /** stats pass에서 리저보어를 채웠으면 keyword pass 중복 push 방지 */
91
+ private samplesCollectedInStatsPass;
81
92
  private prevMs;
82
93
  private prevSender;
83
94
  private runSender;
@@ -85,9 +96,21 @@ export declare class ReportAggregator {
85
96
  private firstDate;
86
97
  private lastDate;
87
98
  constructor(filePath: string, privacy: PrivacyMode, top: number, options?: AggregatorOptions);
88
- drainSemanticSamples(): string[];
99
+ /** 스트리밍 1패스 후 실제 건수로 리저보어 상한 보정(추정치 과소 시) */
100
+ ensureSampleCaps(messageCount: number): void;
101
+ drainSemanticSamples(buildOptions?: BuildReportOptions): string[];
102
+ drainSentimentSamples(): {
103
+ text: string;
104
+ sender: string;
105
+ }[];
106
+ applySentimentStats(stats: SentimentStats): void;
107
+ senderAliasMap(): Map<string, string>;
89
108
  messageCount(): number;
90
109
  resetKeywordPipeline(): void;
110
+ markSamplesCollectedInStatsPass(): void;
111
+ applyKeywordTokens(kwTokens: string[], monthKey: string): void;
112
+ private pushAnalysisSamples;
113
+ private pushSemanticSample;
91
114
  private consumeKeywords;
92
115
  applySemanticKeywordBoost(items: {
93
116
  label: string;
@@ -102,6 +125,7 @@ export declare class ReportAggregator {
102
125
  consume(record: ChatRecord, opts?: {
103
126
  keywordsOnly?: boolean;
104
127
  skipKeywords?: boolean;
128
+ collectSamples?: boolean;
105
129
  }): void;
106
130
  private bumpSystemNotice;
107
131
  finalize(meta: FinalizeSourceMeta, finalizeOpts?: FinalizeOptions): ReportData;
package/dist/src/aggregator.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import { formatDate, formatDateTime, partsToUtcMs, weekdayIndex } from "./date.js";
2
2
  import { maskPartialDisplayName, parseChatRoomNameFromExportPath, safeInputName } from "./analysis-labels.js";
3
3
  import { GapStreamStats, SessionGapStats } from "./gap-stats.js";
4
+ import { keywordTokensForRecord } from "./keyword-record-tokens.js";
4
5
  import { tokenizeForKeywords } from "./keyword-tokenize.js";
5
6
  import { adaptiveMinCount, StreamingTfidfKeywords } from "./streaming-tfidf-keywords.js";
6
7
  import { TopicMapAccumulator } from "./topic-map.js";
@@ -8,8 +9,13 @@ import { extractHashtagKeywords } from "./korean-hashtags.js";
8
9
  import { buildKeywordStopwords } from "./keyword-stopwords.js";
9
10
  import { buildTopicStopwords } from "./topic-stopwords.js";
10
11
  import { MessageReservoir } from "./message-reservoir.js";
11
- import { semanticReservoirCap, semanticSampleCap, subsampleSemanticMessages } from "./semantic-policy.js";
12
- import { mergeKeywordRankings } from "./keyword-merge.js";
12
+ import { SenderMessageReservoir } from "./sender-message-reservoir.js";
13
+ import { ProfanityCounter } from "./profanity.js";
14
+ import { sentimentReservoirCap, sentimentSampleCap, subsampleSentimentRecords, } from "./sentiment-policy.js";
15
+ import { effectiveSemanticSampleCap, semanticReservoirCap, subsampleSemanticMessages, } from "./semantic-policy.js";
16
+ import { getAttachmentMarkers, shouldExtractKeywords } from "./keyword-eligibility.js";
17
+ import { mergeDualLaneKeywords } from "./keyword-rank-dual.js";
18
+ import { shopSearchDisplayTop } from "./report-config.js";
13
19
  import { isNoiseKeyword } from "./keyword-quality.js";
14
20
  import { formatCompactNumber, formatReplyGapMinutes } from "./report-util.js";
15
21
  import { KeywordCounter } from "./keyword-counter.js";
@@ -22,7 +28,9 @@ import { buildEventSpine } from "./event-spine.js";
22
28
  import { buildRoomNarrative } from "./room-narrative.js";
23
29
  import { buildPeriodCompare } from "./period-compare.js";
24
30
  import { buildBenchmarkBandsFromValues } from "./benchmark-bands.js";
25
- import { mergeEmbeddingThemes } from "./embedding-topics.js";
31
+ import { semanticItemsToTopics } from "./embedding-topics.js";
32
+ import { buildKeywordSeedTopics } from "./keyword-seed-topics.js";
33
+ import { mergeTopicLanes } from "./topic-merge.js";
26
34
  const ATTACHMENT_MARKERS = [
27
35
  "사진",
28
36
  "동영상",
@@ -106,11 +114,18 @@ export class ReportAggregator {
106
114
  roomSubManagerMessages = 0;
107
115
  roomManagerMessages = 0;
108
116
  roomShopSearchMessages = 0;
117
+ shopSearchUntaggedNotices = 0;
118
+ shopSearchMissSamples = [];
109
119
  roomPhotoBundleMessages = 0;
110
120
  pureLaughMessages = 0;
111
121
  openChatBoilerplateExcluded = 0;
112
122
  semanticThemeCandidates = [];
113
123
  semanticReservoir;
124
+ sentimentReservoir;
125
+ profanityCounter;
126
+ sentimentStats = null;
127
+ /** stats pass에서 리저보어를 채웠으면 keyword pass 중복 push 방지 */
128
+ samplesCollectedInStatsPass = false;
114
129
  prevMs = null;
115
130
  prevSender = null;
116
131
  runSender = null;
@@ -125,14 +140,44 @@ export class ReportAggregator {
125
140
  this.semanticReservoir = options?.semanticSamples
126
141
  ? new MessageReservoir(semanticReservoirCap(options?.estimatedMessages))
127
142
  : null;
143
+ this.sentimentReservoir = options?.sentimentSamples
144
+ ? new SenderMessageReservoir(sentimentReservoirCap(options?.estimatedMessages))
145
+ : null;
146
+ this.profanityCounter = ProfanityCounter.create();
147
+ }
148
+ /** 스트리밍 1패스 후 실제 건수로 리저보어 상한 보정(추정치 과소 시) */
149
+ ensureSampleCaps(messageCount) {
150
+ if (messageCount <= 0)
151
+ return;
152
+ const semNeed = semanticReservoirCap(messageCount);
153
+ const sentNeed = sentimentReservoirCap(messageCount);
154
+ if (this.semanticReservoir && this.semanticReservoir.capacity() < semNeed) {
155
+ this.semanticReservoir.growTo(semNeed);
156
+ }
157
+ if (this.sentimentReservoir && this.sentimentReservoir.capacity() < sentNeed) {
158
+ this.sentimentReservoir.growTo(sentNeed);
159
+ }
128
160
  }
129
- drainSemanticSamples() {
161
+ drainSemanticSamples(buildOptions) {
130
162
  const raw = this.semanticReservoir?.drain() ?? [];
131
163
  if (raw.length === 0)
132
164
  return raw;
133
- const cap = semanticSampleCap(Math.max(this.total, raw.length));
165
+ const cap = effectiveSemanticSampleCap(Math.max(this.total, raw.length), buildOptions);
134
166
  return subsampleSemanticMessages(raw, cap);
135
167
  }
168
+ drainSentimentSamples() {
169
+ const raw = this.sentimentReservoir?.drain() ?? [];
170
+ if (raw.length === 0)
171
+ return raw;
172
+ const cap = sentimentSampleCap(Math.max(this.total, raw.length));
173
+ return subsampleSentimentRecords(raw, cap);
174
+ }
175
+ applySentimentStats(stats) {
176
+ this.sentimentStats = stats;
177
+ }
178
+ senderAliasMap() {
179
+ return buildSenderLabels([...this.senderStats.keys()], this.privacy);
180
+ }
136
181
  messageCount() {
137
182
  return this.total;
138
183
  }
@@ -140,27 +185,47 @@ export class ReportAggregator {
140
185
  this.keywordStream = new StreamingTfidfKeywords();
141
186
  this.topicMap = new TopicMapAccumulator();
142
187
  }
143
- consumeKeywords(record) {
144
- const split = splitMessageForAnalysis(record.message);
145
- const msg = split.userText.length > 0 ? split.userText : record.message;
146
- const messageLength = msg.length;
147
- if (split.notices.length > 0 && split.userText.length === 0)
148
- return;
149
- const foundAttachments = getAttachmentMarkers(msg);
150
- if (isOpenChatBoilerplate(msg)) {
151
- this.openChatBoilerplateExcluded += 1;
152
- return;
188
+ markSamplesCollectedInStatsPass() {
189
+ this.samplesCollectedInStatsPass = true;
190
+ }
191
+ applyKeywordTokens(kwTokens, monthKey) {
192
+ this.keywordStream.addDocumentTokens(kwTokens);
193
+ this.topicMap.addMessage(kwTokens, monthKey);
194
+ let monthBucket = this.monthlyKeywordBuckets.get(monthKey);
195
+ if (!monthBucket) {
196
+ monthBucket = new KeywordCounter();
197
+ this.monthlyKeywordBuckets.set(monthKey, monthBucket);
153
198
  }
154
- if (messageLength < 2 || !HAS_TOKEN_CHAR_RE.test(msg) || !shouldExtractKeywords(msg, foundAttachments)) {
199
+ for (const t of kwTokens)
200
+ monthBucket.add(t);
201
+ }
202
+ pushAnalysisSamples(msg, sender, messageLength, isPureSystem) {
203
+ if (isPureSystem || isOpenChatBoilerplate(msg))
155
204
  return;
205
+ if (this.sentimentReservoir && messageLength >= 12) {
206
+ this.sentimentReservoir.push(msg, sender);
156
207
  }
157
- const kwTokens = tokenizeForKeywords(msg);
158
- this.keywordStream.addDocumentTokens(kwTokens);
159
- const monthKey = `${record.date.year}-${pad2(record.date.month)}`;
160
- this.topicMap.addMessage(kwTokens, monthKey);
208
+ }
209
+ pushSemanticSample(msg, messageLength) {
161
210
  if (this.semanticReservoir && messageLength >= 12)
162
211
  this.semanticReservoir.push(msg);
163
212
  }
213
+ consumeKeywords(record) {
214
+ const row = keywordTokensForRecord(record);
215
+ if (!row) {
216
+ const split = splitMessageForAnalysis(record.message);
217
+ const msg = split.userText.length > 0 ? split.userText : record.message;
218
+ if (isOpenChatBoilerplate(msg))
219
+ this.openChatBoilerplateExcluded += 1;
220
+ return;
221
+ }
222
+ this.applyKeywordTokens(row.tokens, row.monthKey);
223
+ if (!this.samplesCollectedInStatsPass) {
224
+ const split = splitMessageForAnalysis(record.message);
225
+ const msg = split.userText.length > 0 ? split.userText : record.message;
226
+ this.pushSemanticSample(msg, msg.length);
227
+ }
228
+ }
164
229
  applySemanticKeywordBoost(items) {
165
230
  const valid = items.filter((item) => !isNoiseKeyword(item.label));
166
231
  this.semanticThemeCandidates = valid.map((item) => ({
@@ -199,6 +264,14 @@ export class ReportAggregator {
199
264
  this.bumpSystemNotice(kind, dayKey);
200
265
  for (const tag of split.shopSearchTags)
201
266
  increment(this.shopSearchTopics, tag);
267
+ if (split.notices.includes("shopSearch") && split.shopSearchTags.length === 0) {
268
+ this.shopSearchUntaggedNotices += 1;
269
+ if (this.shopSearchMissSamples.length < 8) {
270
+ const sample = record.message.trim().slice(0, 120).replace(/\s+/g, " ");
271
+ if (sample)
272
+ this.shopSearchMissSamples.push(sample);
273
+ }
274
+ }
202
275
  const msg = split.userText.length > 0 ? split.userText : record.message;
203
276
  const messageLength = msg.length;
204
277
  const isPureSystem = split.notices.length > 0 && split.userText.length === 0;
@@ -279,6 +352,15 @@ export class ReportAggregator {
279
352
  for (const domain of foundDomains)
280
353
  increment(this.domains, domain);
281
354
  }
355
+ if (!isPureSystem && !isOpenChatBoilerplate(msg)) {
356
+ this.profanityCounter.add(msg, record.sender);
357
+ if (opts?.collectSamples) {
358
+ this.pushAnalysisSamples(msg, record.sender, messageLength, isPureSystem);
359
+ }
360
+ else if (this.sentimentReservoir && messageLength >= 12) {
361
+ this.sentimentReservoir.push(msg, record.sender);
362
+ }
363
+ }
282
364
  if (isOpenChatBoilerplate(msg)) {
283
365
  this.openChatBoilerplateExcluded += 1;
284
366
  }
@@ -287,16 +369,7 @@ export class ReportAggregator {
287
369
  shouldExtractKeywords(msg, foundAttachments)) {
288
370
  if (!opts?.skipKeywords) {
289
371
  const kwTokens = tokenizeForKeywords(msg);
290
- this.keywordStream.addDocumentTokens(kwTokens);
291
- const monthKey = `${record.date.year}-${pad2(record.date.month)}`;
292
- this.topicMap.addMessage(kwTokens, monthKey);
293
- let monthBucket = this.monthlyKeywordBuckets.get(monthKey);
294
- if (!monthBucket) {
295
- monthBucket = new KeywordCounter();
296
- this.monthlyKeywordBuckets.set(monthKey, monthBucket);
297
- }
298
- for (const t of kwTokens)
299
- monthBucket.add(t);
372
+ this.applyKeywordTokens(kwTokens, `${record.date.year}-${pad2(record.date.month)}`);
300
373
  }
301
374
  if (!opts?.keywordsOnly) {
302
375
  const kwOpts = {
@@ -308,8 +381,15 @@ export class ReportAggregator {
308
381
  }
309
382
  if (messageLength >= 12)
310
383
  this.repeatPhraseCounter.add(msg, dayKey);
311
- if (this.semanticReservoir && messageLength >= 12)
312
- this.semanticReservoir.push(msg);
384
+ if (opts?.collectSamples) {
385
+ this.pushSemanticSample(msg, messageLength);
386
+ }
387
+ else if (!this.samplesCollectedInStatsPass) {
388
+ this.pushSemanticSample(msg, messageLength);
389
+ }
390
+ }
391
+ else if (opts?.collectSamples) {
392
+ this.pushSemanticSample(msg, messageLength);
313
393
  }
314
394
  }
315
395
  }
@@ -375,10 +455,11 @@ export class ReportAggregator {
375
455
  prevStat.maxConsecutive = Math.max(prevStat.maxConsecutive, this.runLen);
376
456
  }
377
457
  const total = this.total;
458
+ const totalChars = this.totalCharacters;
378
459
  const aliases = buildSenderLabels([...this.senderStats.keys()], this.privacy);
379
- const participantStats = [...this.senderStats.entries()]
380
- .map(([raw, stat]) => {
460
+ const allParticipants = [...this.senderStats.entries()].map(([raw, stat]) => {
381
461
  const sharePercent = total > 0 ? round((stat.messages / total) * 100, 1) : 0;
462
+ const characterSharePercent = totalChars > 0 ? round((stat.characters / totalChars) * 100, 1) : 0;
382
463
  return {
383
464
  alias: aliases.get(raw) ?? "???",
384
465
  messages: stat.messages,
@@ -387,12 +468,17 @@ export class ReportAggregator {
387
468
  attachmentMessages: stat.attachmentMessages,
388
469
  linkMessages: stat.linkMessages,
389
470
  sharePercent,
471
+ characterSharePercent,
390
472
  nightMessages: stat.nightMessages,
391
473
  maxConsecutive: stat.maxConsecutive,
392
474
  };
393
- })
475
+ });
476
+ const participantStats = [...allParticipants]
394
477
  .sort((a, b) => b.messages - a.messages)
395
478
  .slice(0, this.top);
479
+ const participantsByCharacters = [...allParticipants]
480
+ .sort((a, b) => b.characters - a.characters || b.messages - a.messages)
481
+ .slice(0, this.top);
396
482
  const sortedDays = [...this.daily.keys()].sort();
397
483
  const longestStreak = longestDateStreak(sortedDays);
398
484
  let peakHour = null;
@@ -453,17 +539,24 @@ export class ReportAggregator {
453
539
  }
454
540
  const keywordStop = buildKeywordStopwords();
455
541
  const keywordLimit = Math.max(120, this.top * 3);
456
- const wordRankItems = this.keywordStream.extractKeywordItems({
542
+ const minDocFreq = adaptiveMinCount(total, finalizeOpts?.koreanPrimary !== false);
543
+ const keywordCandidates = this.keywordStream.collectKeywordCandidates({
457
544
  stopwords: keywordStop,
458
- limit: keywordLimit,
459
- minDocFreq: adaptiveMinCount(total, finalizeOpts?.koreanPrimary !== false),
545
+ minDocFreq,
460
546
  });
461
- this.applySemanticSupplementForRanked(wordRankItems);
462
- const keywords = mergeKeywordRankings(wordRankItems, this.keywordSupplement, keywordLimit, finalizeOpts?.semanticSupplementRrfWeight ?? 0.5);
463
- let topics = this.topicMap.buildTopics(total, buildTopicStopwords());
464
- if (finalizeOpts?.useEmbeddingTopics && this.semanticThemeCandidates.length > 0) {
465
- topics = mergeEmbeddingThemes(topics, this.semanticThemeCandidates, total);
466
- }
547
+ const bm25LaneForSemantic = [...keywordCandidates]
548
+ .sort((a, b) => b.score - a.score || b.messageHits - a.messageHits)
549
+ .slice(0, Math.min(200, Math.floor(80 + Math.sqrt(Math.max(total, 1)))));
550
+ this.applySemanticSupplementForRanked(bm25LaneForSemantic);
551
+ const kwMerged = mergeDualLaneKeywords(keywordCandidates, this.keywordSupplement, total, keywordLimit, finalizeOpts?.semanticSupplementRrfWeight ?? 0.5);
552
+ const keywords = kwMerged.byFrequency;
553
+ const keywordsDistinctive = kwMerged.distinctive;
554
+ const graphTopics = this.topicMap.buildTopics(total, buildTopicStopwords());
555
+ const keywordTopics = buildKeywordSeedTopics(keywords, keywordsDistinctive, total, this.topicMap);
556
+ const semanticTopics = finalizeOpts?.useEmbeddingTopics && this.semanticThemeCandidates.length > 0
557
+ ? semanticItemsToTopics(this.semanticThemeCandidates, total)
558
+ : [];
559
+ let topics = mergeTopicLanes({ graph: graphTopics, keyword: keywordTopics, semantic: semanticTopics }, total);
467
560
  const burstDetectionMethod = resolveBurstDetectionMethod();
468
561
  const keywordTop1SharePercent = top1ShareFromCounts(keywords, total);
469
562
  let attachmentMarkerSum = 0;
@@ -661,9 +754,13 @@ export class ReportAggregator {
661
754
  nightSharePercent,
662
755
  emojiMessages: this.emojiMessages,
663
756
  usedSemanticKeywords: finalizeOpts?.usedSemanticKeywords === true,
757
+ usedSentimentAnalysis: finalizeOpts?.usedSentimentAnalysis === true,
664
758
  },
665
759
  insights,
666
760
  participants: participantStats,
761
+ participantsByCharacters,
762
+ profanity: this.profanityCounter.buildProfanityStats(total, aliases),
763
+ sentiment: this.sentimentStats,
667
764
  daily: dailySorted,
668
765
  hourly: this.hourly,
669
766
  weekdays: this.weekdays.map((count, index) => ({
@@ -674,6 +771,7 @@ export class ReportAggregator {
674
771
  attachments: topCounts(this.attachments, this.top),
675
772
  domains: topCounts(this.domains, this.top),
676
773
  keywords,
774
+ keywordsDistinctive,
677
775
  topics,
678
776
  roomEvents: buildRoomEventStats(total, {
679
777
  join: this.roomJoinMessages,
@@ -687,9 +785,14 @@ export class ReportAggregator {
687
785
  manager: this.roomManagerMessages,
688
786
  shopSearch: this.roomShopSearchMessages,
689
787
  photoBundle: this.roomPhotoBundleMessages,
788
+ }, {
789
+ tagExtractions: [...this.shopSearchTopics.values()].reduce((a, n) => a + n, 0),
790
+ uniqueTags: this.shopSearchTopics.size,
791
+ untaggedNotices: this.shopSearchUntaggedNotices,
690
792
  }),
691
793
  repeatedPhrases: this.repeatPhraseCounter.top(8, 3),
692
- shopSearchTopics: topCounts(this.shopSearchTopics, 10),
794
+ shopSearchTopics: topCounts(this.shopSearchTopics, shopSearchDisplayTop()),
795
+ shopSearchMissSamples: process.env.KCA_DEBUG_SHOP === "1" ? [...this.shopSearchMissSamples] : undefined,
693
796
  pureLaughMessages: this.pureLaughMessages,
694
797
  conversationPace,
695
798
  burstDays,
@@ -771,29 +874,7 @@ function getParticipantStat(stats, sender) {
771
874
  stats.set(sender, created);
772
875
  return created;
773
876
  }
774
- function shouldExtractKeywords(message, attachmentMarkers) {
775
- const trimmed = message.trim();
776
- if (trimmed.length === 0)
777
- return false;
778
- if (attachmentMarkers.length === 1 && trimmed === attachmentMarkers[0])
779
- return false;
780
- if (attachmentMarkers.length > 0 && trimmed.length <= 16) {
781
- const onlyMarkers = attachmentMarkers.every((m) => trimmed === m || trimmed.includes(m));
782
- if (onlyMarkers && !/[가-힣A-Za-z]{3,}/.test(trimmed.replace(/[^\p{L}\p{N}]/gu, ""))) {
783
- return false;
784
- }
785
- }
786
- return true;
787
- }
788
- function getAttachmentMarkers(message) {
789
- const found = ATTACHMENT_MARKERS.filter((marker) => message.includes(marker));
790
- const t = message.trim();
791
- if (PHOTO_BUNDLE_RE.test(t) && !found.includes("사진")) {
792
- found.push("사진");
793
- }
794
- return found;
795
- }
796
- function buildRoomEventStats(total, c) {
877
+ function buildRoomEventStats(total, c, shopExtra) {
797
878
  const sum = c.join +
798
879
  c.leave +
799
880
  c.deleted +
@@ -817,6 +898,9 @@ function buildRoomEventStats(total, c) {
817
898
  subManagerCount: c.subManager,
818
899
  managerCount: c.manager,
819
900
  shopSearchCount: c.shopSearch,
901
+ shopSearchTagExtractions: shopExtra?.tagExtractions ?? 0,
902
+ shopSearchUniqueTags: shopExtra?.uniqueTags ?? 0,
903
+ shopSearchUntaggedNotices: shopExtra?.untaggedNotices ?? 0,
820
904
  photoBundleCount: c.photoBundle,
821
905
  total: sum,
822
906
  joinSharePercent: pct(c.join),