kakaotalk-chat-analyzer 0.16.5 → 0.18.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -1
- package/data/korean-profanity.txt +33 -0
- package/dist/src/aggregator.d.ts +27 -3
- package/dist/src/aggregator.js +152 -68
- package/dist/src/aggregator.js.map +1 -1
- package/dist/src/analysis-budget.d.ts +11 -0
- package/dist/src/analysis-budget.js +26 -0
- package/dist/src/analysis-budget.js.map +1 -0
- package/dist/src/analysis-capability.d.ts +19 -0
- package/dist/src/analysis-capability.js +48 -0
- package/dist/src/analysis-capability.js.map +1 -0
- package/dist/src/analysis-phase-profile.d.ts +9 -0
- package/dist/src/analysis-phase-profile.js +49 -0
- package/dist/src/analysis-phase-profile.js.map +1 -0
- package/dist/src/analysis-preset.d.ts +19 -0
- package/dist/src/analysis-preset.js +73 -0
- package/dist/src/analysis-preset.js.map +1 -0
- package/dist/src/analysis-profile.js +4 -0
- package/dist/src/analysis-profile.js.map +1 -1
- package/dist/src/analysis-spool.d.ts +5 -0
- package/dist/src/analysis-spool.js.map +1 -1
- package/dist/src/analysis.d.ts +2 -5
- package/dist/src/analysis.js +200 -57
- package/dist/src/analysis.js.map +1 -1
- package/dist/src/analyze-pool.d.ts +5 -0
- package/dist/src/analyze-pool.js +5 -1
- package/dist/src/analyze-pool.js.map +1 -1
- package/dist/src/cli.js +59 -1
- package/dist/src/cli.js.map +1 -1
- package/dist/src/embedding-topics.d.ts +5 -2
- package/dist/src/embedding-topics.js +11 -11
- package/dist/src/embedding-topics.js.map +1 -1
- package/dist/src/keyword-eligibility.d.ts +2 -0
- package/dist/src/keyword-eligibility.js +36 -0
- package/dist/src/keyword-eligibility.js.map +1 -0
- package/dist/src/keyword-rank-dual.d.ts +20 -0
- package/dist/src/keyword-rank-dual.js +126 -0
- package/dist/src/keyword-rank-dual.js.map +1 -0
- package/dist/src/keyword-record-tokens.d.ts +7 -0
- package/dist/src/keyword-record-tokens.js +25 -0
- package/dist/src/keyword-record-tokens.js.map +1 -0
- package/dist/src/keyword-seed-topics.d.ts +5 -0
- package/dist/src/keyword-seed-topics.js +67 -0
- package/dist/src/keyword-seed-topics.js.map +1 -0
- package/dist/src/kiwi-keyword-pool.d.ts +4 -0
- package/dist/src/kiwi-keyword-pool.js +89 -0
- package/dist/src/kiwi-keyword-pool.js.map +1 -0
- package/dist/src/kiwi-runtime.d.ts +2 -0
- package/dist/src/kiwi-runtime.js +22 -2
- package/dist/src/kiwi-runtime.js.map +1 -1
- package/dist/src/kiwi-tokenize-worker.d.ts +1 -0
- package/dist/src/kiwi-tokenize-worker.js +19 -0
- package/dist/src/kiwi-tokenize-worker.js.map +1 -0
- package/dist/src/kiwi-worker-config.d.ts +3 -0
- package/dist/src/kiwi-worker-config.js +23 -0
- package/dist/src/kiwi-worker-config.js.map +1 -0
- package/dist/src/llm-apply.d.ts +4 -0
- package/dist/src/llm-apply.js +19 -0
- package/dist/src/llm-apply.js.map +1 -0
- package/dist/src/llm-cache.d.ts +9 -0
- package/dist/src/llm-cache.js +31 -0
- package/dist/src/llm-cache.js.map +1 -0
- package/dist/src/llm-input.d.ts +4 -0
- package/dist/src/llm-input.js +42 -0
- package/dist/src/llm-input.js.map +1 -0
- package/dist/src/llm-policy.d.ts +12 -0
- package/dist/src/llm-policy.js +48 -0
- package/dist/src/llm-policy.js.map +1 -0
- package/dist/src/llm-pull.d.ts +3 -0
- package/dist/src/llm-pull.js +48 -0
- package/dist/src/llm-pull.js.map +1 -0
- package/dist/src/llm-summarize.d.ts +17 -0
- package/dist/src/llm-summarize.js +181 -0
- package/dist/src/llm-summarize.js.map +1 -0
- package/dist/src/message-reservoir.d.ts +3 -1
- package/dist/src/message-reservoir.js +8 -0
- package/dist/src/message-reservoir.js.map +1 -1
- package/dist/src/ml-batch-size.d.ts +5 -0
- package/dist/src/ml-batch-size.js +45 -0
- package/dist/src/ml-batch-size.js.map +1 -0
- package/dist/src/ml-runtime.d.ts +8 -0
- package/dist/src/ml-runtime.js +54 -0
- package/dist/src/ml-runtime.js.map +1 -0
- package/dist/src/profanity.d.ts +16 -0
- package/dist/src/profanity.js +101 -0
- package/dist/src/profanity.js.map +1 -0
- package/dist/src/report-charts.d.ts +17 -2
- package/dist/src/report-charts.js +124 -13
- package/dist/src/report-charts.js.map +1 -1
- package/dist/src/report-config.d.ts +6 -0
- package/dist/src/report-config.js +32 -0
- package/dist/src/report-config.js.map +1 -0
- package/dist/src/report-empty.js +8 -0
- package/dist/src/report-empty.js.map +1 -1
- package/dist/src/report-innovation.js +31 -3
- package/dist/src/report-innovation.js.map +1 -1
- package/dist/src/report-provenance.d.ts +8 -0
- package/dist/src/report-provenance.js +31 -1
- package/dist/src/report-provenance.js.map +1 -1
- package/dist/src/report-section-visibility.js +1 -1
- package/dist/src/report-section-visibility.js.map +1 -1
- package/dist/src/report-styles.d.ts +1 -1
- package/dist/src/report-styles.js +20 -0
- package/dist/src/report-styles.js.map +1 -1
- package/dist/src/report.js +79 -11
- package/dist/src/report.js.map +1 -1
- package/dist/src/semantic-keywords.d.ts +12 -0
- package/dist/src/semantic-keywords.js +42 -22
- package/dist/src/semantic-keywords.js.map +1 -1
- package/dist/src/semantic-policy.d.ts +6 -2
- package/dist/src/semantic-policy.js +18 -2
- package/dist/src/semantic-policy.js.map +1 -1
- package/dist/src/sender-message-reservoir.d.ts +15 -0
- package/dist/src/sender-message-reservoir.js +37 -0
- package/dist/src/sender-message-reservoir.js.map +1 -0
- package/dist/src/sentiment-analyze.d.ts +20 -0
- package/dist/src/sentiment-analyze.js +149 -0
- package/dist/src/sentiment-analyze.js.map +1 -0
- package/dist/src/sentiment-policy.d.ts +18 -0
- package/dist/src/sentiment-policy.js +53 -0
- package/dist/src/sentiment-policy.js.map +1 -0
- package/dist/src/streaming-tfidf-keywords.d.ts +2 -0
- package/dist/src/streaming-tfidf-keywords.js +7 -3
- package/dist/src/streaming-tfidf-keywords.js.map +1 -1
- package/dist/src/system-notices.js +13 -4
- package/dist/src/system-notices.js.map +1 -1
- package/dist/src/topic-generic.d.ts +4 -0
- package/dist/src/topic-generic.js +43 -0
- package/dist/src/topic-generic.js.map +1 -0
- package/dist/src/topic-map.d.ts +4 -0
- package/dist/src/topic-map.js +51 -18
- package/dist/src/topic-map.js.map +1 -1
- package/dist/src/topic-merge.d.ts +18 -0
- package/dist/src/topic-merge.js +153 -0
- package/dist/src/topic-merge.js.map +1 -0
- package/dist/src/types.d.ts +67 -0
- package/dist/src/version.d.ts +2 -2
- package/dist/src/version.js +1 -1
- package/package.json +8 -2
package/README.md
CHANGED
|
@@ -186,6 +186,36 @@ kca --help
|
|
|
186
186
|
|
|
187
187
|
</details>
|
|
188
188
|
|
|
189
|
+
<details>
|
|
190
|
+
<summary><strong>분석 preset·기능 (0.18+)</strong></summary>
|
|
191
|
+
|
|
192
|
+
| preset | 용도 | 90k 메시지 목표 | 시맨틱 | 감정 | LLM |
|
|
193
|
+
|--------|------|-----------------|--------|------|-----|
|
|
194
|
+
| `speed` | RAM·시간 최소 | ~3분 | 끔 | 끔 | 끔 |
|
|
195
|
+
| `balanced` | 기본 권장 | ~5분 | e5-small | 자동 | 끔 |
|
|
196
|
+
| `quality` | 한국어·서사 최대 | ~6분 | ko-v2 | KLUE | 2B/4B |
|
|
197
|
+
| `custom` | 기능 직접 지정 | — | env/플래그 | env | `KCA_LLM=1` |
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
kca capabilities # RAM·추천 preset
|
|
201
|
+
kca ./chat.csv --preset balanced
|
|
202
|
+
kca ./chat.csv --preset quality --local
|
|
203
|
+
kca llm pull 2b # GGUF (optional node-llama-cpp)
|
|
204
|
+
KCA_LLM_BACKEND=ollama KCA_LLM=1 kca ./chat.csv --preset custom
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
환경 변수: `KCA_PRESET`, `KCA_SEMANTIC_MODEL`, `KCA_SENTIMENT_MODEL`, `KCA_LLM`, `KCA_LLM_MOCK`, `KCA_ONNX_GPU`, `KCA_EMBED_BATCH`, `KCA_SENTIMENT_BATCH`, `KCA_KIWI_WORKERS`, `KCA_NO_KIWI_WORKERS`, `KCA_PROFILE_PHASES`, `KCA_BENCH_CSV`, `KCA_KEYWORD_SUMMARY_TOP`, `KCA_SHOP_SEARCH_TOP`.
|
|
208
|
+
|
|
209
|
+
**속도(품질 유지):** 대용량 CSV는 Kiwi worker pool(`KCA_KIWI_WORKERS`, RAM≥8GB 기본 2–4)·시맨틱/감정을 키워드 패스와 겹쳐 실행. `KCA_PROFILE_PHASES=1`로 단계별 ms. quality에서 GPU 가속: `onnxruntime-node` 설치 후 `KCA_ONNX_GPU=metal`(macOS)·`cuda`(Linux)·`dml`(Windows).
|
|
210
|
+
|
|
211
|
+
**키워드:** 요약은 `KCA_KEYWORD_SUMMARY_TOP`(기본 12)·**빈도 순**; ④ 차트에서 **빈도/특이어** 탭 전환. 전체 ~120개는 집계 상한.
|
|
212
|
+
|
|
213
|
+
**주제 맵:** graph(공기 군집)·keyword(상위 키워드 시드)·semantic(임베딩 클러스터) 3레인 RRF 병합 — 대용량 방에서 의미 테마 최대 12장. `KCA_TOPIC_MAX`, `KCA_TOPIC_MIN_THEMES`.
|
|
214
|
+
|
|
215
|
+
**LLM (`quality` / `KCA_LLM=1`):** 주제 제목·서사 + `topicProposals`(키워드 화이트리스트) + 인사이트 bullet·샵검색/상호작용 한 줄(원문 미전송).
|
|
216
|
+
|
|
217
|
+
</details>
|
|
218
|
+
|
|
189
219
|
<details>
|
|
190
220
|
<summary><strong>성능·키워드·벤치 (개발·파워유저)</strong></summary>
|
|
191
221
|
|
|
@@ -197,11 +227,13 @@ kca --help
|
|
|
197
227
|
```bash
|
|
198
228
|
npx kcachat@latest "./chat.csv" --profile --no-worker
|
|
199
229
|
npm run bench:stream -- 100000 # 저장소 클론 후
|
|
230
|
+
npm run bench:preset # speed/balanced SLA 스모크
|
|
231
|
+
KCA_BENCH_COMPARE=1 npm run bench:semantic
|
|
200
232
|
```
|
|
201
233
|
|
|
202
234
|
</details>
|
|
203
235
|
|
|
204
|
-
**버전 고정:** `npx kakaotalk-chat-analyzer@0.
|
|
236
|
+
**버전 고정:** `npx kakaotalk-chat-analyzer@0.18.2` · 최신은 `kcachat@latest`가 매번 본체를 받습니다. 리포트 사이드 카드·`#kca-provenance`로 실제 생성 버전을 확인할 수 있습니다.
|
|
205
237
|
|
|
206
238
|
**로컬 개발:**
|
|
207
239
|
|
|
@@ -216,6 +248,11 @@ cd kakaotalk-chat-analyzer && npm install && npm run build && npm test
|
|
|
216
248
|
|
|
217
249
|
| 버전 | 요약 |
|
|
218
250
|
|------|------|
|
|
251
|
+
| **0.18.2** | 주제 맵 3레인(graph·키워드·임베딩) 병합·테마 6~12·LLM `topicProposals` |
|
|
252
|
+
| **0.18.1** | 키워드 빈도/특이어 dual-view·샵검색 통계·dyad 셀 숫자·LLM 인사이트 필드 |
|
|
253
|
+
| **0.18.0** | preset(speed/balanced/quality)·5분 예산 skip·LLM 서사·KLUE 감정·dual-lane 툴팁·CI Playwright |
|
|
254
|
+
| **0.17.2** | `kca llm pull`·provenance `llmUsed`·분석 예산 라우터 |
|
|
255
|
+
| **0.16.6** | 글자 수 랭킹·비속어 패턴 통계·transformers 감정 분석(자동/선택) |
|
|
219
256
|
| **0.16.5** | 상호작용 히트맵: 말 많은 사람 축 상단·지연 로드·로딩 스켈레톤 |
|
|
220
257
|
| **0.16.4** | 대용량 방 키워드: minDf 스케일·메시지 수 우선 정렬·시맨틱은 BM25 후보만 보강 |
|
|
221
258
|
| **0.16.3** | 기본 **품질 우선** 프로필(메인 스레드·시맨틱 샘플 확대·RRF 완화·임베딩 주제). 가속은 `--worker` / `--fast` |
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# 한국어 채팅 비속·욕설 패턴(부분 일치, normalize 후 검사). 줄 앞 # 은 주석.
|
|
2
|
+
시발
|
|
3
|
+
씨발
|
|
4
|
+
씨팔
|
|
5
|
+
좆
|
|
6
|
+
지랄
|
|
7
|
+
병신
|
|
8
|
+
븅신
|
|
9
|
+
미친
|
|
10
|
+
미친놈
|
|
11
|
+
개새
|
|
12
|
+
개새끼
|
|
13
|
+
개쉐
|
|
14
|
+
개씨
|
|
15
|
+
개같
|
|
16
|
+
닥쳐
|
|
17
|
+
꺼져
|
|
18
|
+
엿먹
|
|
19
|
+
엿먹어
|
|
20
|
+
죽어
|
|
21
|
+
죽을
|
|
22
|
+
좆같
|
|
23
|
+
좆만
|
|
24
|
+
ㅅㅂ
|
|
25
|
+
ㅆㅂ
|
|
26
|
+
ㅄ
|
|
27
|
+
ㅂㅅ
|
|
28
|
+
ㅈㄹ
|
|
29
|
+
ㅁㅊ
|
|
30
|
+
ㅗㅗ
|
|
31
|
+
fuck
|
|
32
|
+
shit
|
|
33
|
+
bitch
|
package/dist/src/aggregator.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import type { ChatRecord, EncodingName, PrivacyMode, ReportData } from "./types.js";
|
|
1
|
+
import type { ChatRecord, EncodingName, PrivacyMode, ReportData, SentimentStats } from "./types.js";
|
|
2
|
+
import type { BuildReportOptions } from "./analyze-pool.js";
|
|
2
3
|
export interface FinalizeSourceMeta {
|
|
3
4
|
filePath: string;
|
|
4
5
|
encoding: EncodingName;
|
|
@@ -7,6 +8,7 @@ export interface FinalizeSourceMeta {
|
|
|
7
8
|
}
|
|
8
9
|
export interface FinalizeOptions {
|
|
9
10
|
usedSemanticKeywords?: boolean;
|
|
11
|
+
usedSentimentAnalysis?: boolean;
|
|
10
12
|
koreanPrimary?: boolean;
|
|
11
13
|
useEmbeddingTopics?: boolean;
|
|
12
14
|
semanticSupplementRrfWeight?: number;
|
|
@@ -16,7 +18,9 @@ export declare function semanticSupplementHitCap(corpusMessages: number): number
|
|
|
16
18
|
export interface AggregatorOptions {
|
|
17
19
|
/** 시맨틱 키워드용 메시지 샘플 수집 */
|
|
18
20
|
semanticSamples?: boolean;
|
|
19
|
-
/**
|
|
21
|
+
/** 감정 분석용 메시지 샘플 수집 */
|
|
22
|
+
sentimentSamples?: boolean;
|
|
23
|
+
/** 시맨틱·감정 리저보어 상한 추정(스트리밍 시 생략 가능) */
|
|
20
24
|
estimatedMessages?: number;
|
|
21
25
|
}
|
|
22
26
|
export declare class ReportAggregator {
|
|
@@ -73,11 +77,18 @@ export declare class ReportAggregator {
|
|
|
73
77
|
private roomSubManagerMessages;
|
|
74
78
|
private roomManagerMessages;
|
|
75
79
|
private roomShopSearchMessages;
|
|
80
|
+
private shopSearchUntaggedNotices;
|
|
81
|
+
private readonly shopSearchMissSamples;
|
|
76
82
|
private roomPhotoBundleMessages;
|
|
77
83
|
private pureLaughMessages;
|
|
78
84
|
private openChatBoilerplateExcluded;
|
|
79
85
|
private semanticThemeCandidates;
|
|
80
86
|
private readonly semanticReservoir;
|
|
87
|
+
private readonly sentimentReservoir;
|
|
88
|
+
private readonly profanityCounter;
|
|
89
|
+
private sentimentStats;
|
|
90
|
+
/** stats pass에서 리저보어를 채웠으면 keyword pass 중복 push 방지 */
|
|
91
|
+
private samplesCollectedInStatsPass;
|
|
81
92
|
private prevMs;
|
|
82
93
|
private prevSender;
|
|
83
94
|
private runSender;
|
|
@@ -85,9 +96,21 @@ export declare class ReportAggregator {
|
|
|
85
96
|
private firstDate;
|
|
86
97
|
private lastDate;
|
|
87
98
|
constructor(filePath: string, privacy: PrivacyMode, top: number, options?: AggregatorOptions);
|
|
88
|
-
|
|
99
|
+
/** 스트리밍 1패스 후 실제 건수로 리저보어 상한 보정(추정치 과소 시) */
|
|
100
|
+
ensureSampleCaps(messageCount: number): void;
|
|
101
|
+
drainSemanticSamples(buildOptions?: BuildReportOptions): string[];
|
|
102
|
+
drainSentimentSamples(): {
|
|
103
|
+
text: string;
|
|
104
|
+
sender: string;
|
|
105
|
+
}[];
|
|
106
|
+
applySentimentStats(stats: SentimentStats): void;
|
|
107
|
+
senderAliasMap(): Map<string, string>;
|
|
89
108
|
messageCount(): number;
|
|
90
109
|
resetKeywordPipeline(): void;
|
|
110
|
+
markSamplesCollectedInStatsPass(): void;
|
|
111
|
+
applyKeywordTokens(kwTokens: string[], monthKey: string): void;
|
|
112
|
+
private pushAnalysisSamples;
|
|
113
|
+
private pushSemanticSample;
|
|
91
114
|
private consumeKeywords;
|
|
92
115
|
applySemanticKeywordBoost(items: {
|
|
93
116
|
label: string;
|
|
@@ -102,6 +125,7 @@ export declare class ReportAggregator {
|
|
|
102
125
|
consume(record: ChatRecord, opts?: {
|
|
103
126
|
keywordsOnly?: boolean;
|
|
104
127
|
skipKeywords?: boolean;
|
|
128
|
+
collectSamples?: boolean;
|
|
105
129
|
}): void;
|
|
106
130
|
private bumpSystemNotice;
|
|
107
131
|
finalize(meta: FinalizeSourceMeta, finalizeOpts?: FinalizeOptions): ReportData;
|
package/dist/src/aggregator.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { formatDate, formatDateTime, partsToUtcMs, weekdayIndex } from "./date.js";
|
|
2
2
|
import { maskPartialDisplayName, parseChatRoomNameFromExportPath, safeInputName } from "./analysis-labels.js";
|
|
3
3
|
import { GapStreamStats, SessionGapStats } from "./gap-stats.js";
|
|
4
|
+
import { keywordTokensForRecord } from "./keyword-record-tokens.js";
|
|
4
5
|
import { tokenizeForKeywords } from "./keyword-tokenize.js";
|
|
5
6
|
import { adaptiveMinCount, StreamingTfidfKeywords } from "./streaming-tfidf-keywords.js";
|
|
6
7
|
import { TopicMapAccumulator } from "./topic-map.js";
|
|
@@ -8,8 +9,13 @@ import { extractHashtagKeywords } from "./korean-hashtags.js";
|
|
|
8
9
|
import { buildKeywordStopwords } from "./keyword-stopwords.js";
|
|
9
10
|
import { buildTopicStopwords } from "./topic-stopwords.js";
|
|
10
11
|
import { MessageReservoir } from "./message-reservoir.js";
|
|
11
|
-
import {
|
|
12
|
-
import {
|
|
12
|
+
import { SenderMessageReservoir } from "./sender-message-reservoir.js";
|
|
13
|
+
import { ProfanityCounter } from "./profanity.js";
|
|
14
|
+
import { sentimentReservoirCap, sentimentSampleCap, subsampleSentimentRecords, } from "./sentiment-policy.js";
|
|
15
|
+
import { effectiveSemanticSampleCap, semanticReservoirCap, subsampleSemanticMessages, } from "./semantic-policy.js";
|
|
16
|
+
import { getAttachmentMarkers, shouldExtractKeywords } from "./keyword-eligibility.js";
|
|
17
|
+
import { mergeDualLaneKeywords } from "./keyword-rank-dual.js";
|
|
18
|
+
import { shopSearchDisplayTop } from "./report-config.js";
|
|
13
19
|
import { isNoiseKeyword } from "./keyword-quality.js";
|
|
14
20
|
import { formatCompactNumber, formatReplyGapMinutes } from "./report-util.js";
|
|
15
21
|
import { KeywordCounter } from "./keyword-counter.js";
|
|
@@ -22,7 +28,9 @@ import { buildEventSpine } from "./event-spine.js";
|
|
|
22
28
|
import { buildRoomNarrative } from "./room-narrative.js";
|
|
23
29
|
import { buildPeriodCompare } from "./period-compare.js";
|
|
24
30
|
import { buildBenchmarkBandsFromValues } from "./benchmark-bands.js";
|
|
25
|
-
import {
|
|
31
|
+
import { semanticItemsToTopics } from "./embedding-topics.js";
|
|
32
|
+
import { buildKeywordSeedTopics } from "./keyword-seed-topics.js";
|
|
33
|
+
import { mergeTopicLanes } from "./topic-merge.js";
|
|
26
34
|
const ATTACHMENT_MARKERS = [
|
|
27
35
|
"사진",
|
|
28
36
|
"동영상",
|
|
@@ -106,11 +114,18 @@ export class ReportAggregator {
|
|
|
106
114
|
roomSubManagerMessages = 0;
|
|
107
115
|
roomManagerMessages = 0;
|
|
108
116
|
roomShopSearchMessages = 0;
|
|
117
|
+
shopSearchUntaggedNotices = 0;
|
|
118
|
+
shopSearchMissSamples = [];
|
|
109
119
|
roomPhotoBundleMessages = 0;
|
|
110
120
|
pureLaughMessages = 0;
|
|
111
121
|
openChatBoilerplateExcluded = 0;
|
|
112
122
|
semanticThemeCandidates = [];
|
|
113
123
|
semanticReservoir;
|
|
124
|
+
sentimentReservoir;
|
|
125
|
+
profanityCounter;
|
|
126
|
+
sentimentStats = null;
|
|
127
|
+
/** stats pass에서 리저보어를 채웠으면 keyword pass 중복 push 방지 */
|
|
128
|
+
samplesCollectedInStatsPass = false;
|
|
114
129
|
prevMs = null;
|
|
115
130
|
prevSender = null;
|
|
116
131
|
runSender = null;
|
|
@@ -125,14 +140,44 @@ export class ReportAggregator {
|
|
|
125
140
|
this.semanticReservoir = options?.semanticSamples
|
|
126
141
|
? new MessageReservoir(semanticReservoirCap(options?.estimatedMessages))
|
|
127
142
|
: null;
|
|
143
|
+
this.sentimentReservoir = options?.sentimentSamples
|
|
144
|
+
? new SenderMessageReservoir(sentimentReservoirCap(options?.estimatedMessages))
|
|
145
|
+
: null;
|
|
146
|
+
this.profanityCounter = ProfanityCounter.create();
|
|
147
|
+
}
|
|
148
|
+
/** 스트리밍 1패스 후 실제 건수로 리저보어 상한 보정(추정치 과소 시) */
|
|
149
|
+
ensureSampleCaps(messageCount) {
|
|
150
|
+
if (messageCount <= 0)
|
|
151
|
+
return;
|
|
152
|
+
const semNeed = semanticReservoirCap(messageCount);
|
|
153
|
+
const sentNeed = sentimentReservoirCap(messageCount);
|
|
154
|
+
if (this.semanticReservoir && this.semanticReservoir.capacity() < semNeed) {
|
|
155
|
+
this.semanticReservoir.growTo(semNeed);
|
|
156
|
+
}
|
|
157
|
+
if (this.sentimentReservoir && this.sentimentReservoir.capacity() < sentNeed) {
|
|
158
|
+
this.sentimentReservoir.growTo(sentNeed);
|
|
159
|
+
}
|
|
128
160
|
}
|
|
129
|
-
drainSemanticSamples() {
|
|
161
|
+
drainSemanticSamples(buildOptions) {
|
|
130
162
|
const raw = this.semanticReservoir?.drain() ?? [];
|
|
131
163
|
if (raw.length === 0)
|
|
132
164
|
return raw;
|
|
133
|
-
const cap =
|
|
165
|
+
const cap = effectiveSemanticSampleCap(Math.max(this.total, raw.length), buildOptions);
|
|
134
166
|
return subsampleSemanticMessages(raw, cap);
|
|
135
167
|
}
|
|
168
|
+
drainSentimentSamples() {
|
|
169
|
+
const raw = this.sentimentReservoir?.drain() ?? [];
|
|
170
|
+
if (raw.length === 0)
|
|
171
|
+
return raw;
|
|
172
|
+
const cap = sentimentSampleCap(Math.max(this.total, raw.length));
|
|
173
|
+
return subsampleSentimentRecords(raw, cap);
|
|
174
|
+
}
|
|
175
|
+
applySentimentStats(stats) {
|
|
176
|
+
this.sentimentStats = stats;
|
|
177
|
+
}
|
|
178
|
+
senderAliasMap() {
|
|
179
|
+
return buildSenderLabels([...this.senderStats.keys()], this.privacy);
|
|
180
|
+
}
|
|
136
181
|
messageCount() {
|
|
137
182
|
return this.total;
|
|
138
183
|
}
|
|
@@ -140,27 +185,47 @@ export class ReportAggregator {
|
|
|
140
185
|
this.keywordStream = new StreamingTfidfKeywords();
|
|
141
186
|
this.topicMap = new TopicMapAccumulator();
|
|
142
187
|
}
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
if (
|
|
151
|
-
|
|
152
|
-
|
|
188
|
+
markSamplesCollectedInStatsPass() {
|
|
189
|
+
this.samplesCollectedInStatsPass = true;
|
|
190
|
+
}
|
|
191
|
+
applyKeywordTokens(kwTokens, monthKey) {
|
|
192
|
+
this.keywordStream.addDocumentTokens(kwTokens);
|
|
193
|
+
this.topicMap.addMessage(kwTokens, monthKey);
|
|
194
|
+
let monthBucket = this.monthlyKeywordBuckets.get(monthKey);
|
|
195
|
+
if (!monthBucket) {
|
|
196
|
+
monthBucket = new KeywordCounter();
|
|
197
|
+
this.monthlyKeywordBuckets.set(monthKey, monthBucket);
|
|
153
198
|
}
|
|
154
|
-
|
|
199
|
+
for (const t of kwTokens)
|
|
200
|
+
monthBucket.add(t);
|
|
201
|
+
}
|
|
202
|
+
pushAnalysisSamples(msg, sender, messageLength, isPureSystem) {
|
|
203
|
+
if (isPureSystem || isOpenChatBoilerplate(msg))
|
|
155
204
|
return;
|
|
205
|
+
if (this.sentimentReservoir && messageLength >= 12) {
|
|
206
|
+
this.sentimentReservoir.push(msg, sender);
|
|
156
207
|
}
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
const monthKey = `${record.date.year}-${pad2(record.date.month)}`;
|
|
160
|
-
this.topicMap.addMessage(kwTokens, monthKey);
|
|
208
|
+
}
|
|
209
|
+
pushSemanticSample(msg, messageLength) {
|
|
161
210
|
if (this.semanticReservoir && messageLength >= 12)
|
|
162
211
|
this.semanticReservoir.push(msg);
|
|
163
212
|
}
|
|
213
|
+
consumeKeywords(record) {
|
|
214
|
+
const row = keywordTokensForRecord(record);
|
|
215
|
+
if (!row) {
|
|
216
|
+
const split = splitMessageForAnalysis(record.message);
|
|
217
|
+
const msg = split.userText.length > 0 ? split.userText : record.message;
|
|
218
|
+
if (isOpenChatBoilerplate(msg))
|
|
219
|
+
this.openChatBoilerplateExcluded += 1;
|
|
220
|
+
return;
|
|
221
|
+
}
|
|
222
|
+
this.applyKeywordTokens(row.tokens, row.monthKey);
|
|
223
|
+
if (!this.samplesCollectedInStatsPass) {
|
|
224
|
+
const split = splitMessageForAnalysis(record.message);
|
|
225
|
+
const msg = split.userText.length > 0 ? split.userText : record.message;
|
|
226
|
+
this.pushSemanticSample(msg, msg.length);
|
|
227
|
+
}
|
|
228
|
+
}
|
|
164
229
|
applySemanticKeywordBoost(items) {
|
|
165
230
|
const valid = items.filter((item) => !isNoiseKeyword(item.label));
|
|
166
231
|
this.semanticThemeCandidates = valid.map((item) => ({
|
|
@@ -199,6 +264,14 @@ export class ReportAggregator {
|
|
|
199
264
|
this.bumpSystemNotice(kind, dayKey);
|
|
200
265
|
for (const tag of split.shopSearchTags)
|
|
201
266
|
increment(this.shopSearchTopics, tag);
|
|
267
|
+
if (split.notices.includes("shopSearch") && split.shopSearchTags.length === 0) {
|
|
268
|
+
this.shopSearchUntaggedNotices += 1;
|
|
269
|
+
if (this.shopSearchMissSamples.length < 8) {
|
|
270
|
+
const sample = record.message.trim().slice(0, 120).replace(/\s+/g, " ");
|
|
271
|
+
if (sample)
|
|
272
|
+
this.shopSearchMissSamples.push(sample);
|
|
273
|
+
}
|
|
274
|
+
}
|
|
202
275
|
const msg = split.userText.length > 0 ? split.userText : record.message;
|
|
203
276
|
const messageLength = msg.length;
|
|
204
277
|
const isPureSystem = split.notices.length > 0 && split.userText.length === 0;
|
|
@@ -279,6 +352,15 @@ export class ReportAggregator {
|
|
|
279
352
|
for (const domain of foundDomains)
|
|
280
353
|
increment(this.domains, domain);
|
|
281
354
|
}
|
|
355
|
+
if (!isPureSystem && !isOpenChatBoilerplate(msg)) {
|
|
356
|
+
this.profanityCounter.add(msg, record.sender);
|
|
357
|
+
if (opts?.collectSamples) {
|
|
358
|
+
this.pushAnalysisSamples(msg, record.sender, messageLength, isPureSystem);
|
|
359
|
+
}
|
|
360
|
+
else if (this.sentimentReservoir && messageLength >= 12) {
|
|
361
|
+
this.sentimentReservoir.push(msg, record.sender);
|
|
362
|
+
}
|
|
363
|
+
}
|
|
282
364
|
if (isOpenChatBoilerplate(msg)) {
|
|
283
365
|
this.openChatBoilerplateExcluded += 1;
|
|
284
366
|
}
|
|
@@ -287,16 +369,7 @@ export class ReportAggregator {
|
|
|
287
369
|
shouldExtractKeywords(msg, foundAttachments)) {
|
|
288
370
|
if (!opts?.skipKeywords) {
|
|
289
371
|
const kwTokens = tokenizeForKeywords(msg);
|
|
290
|
-
this.
|
|
291
|
-
const monthKey = `${record.date.year}-${pad2(record.date.month)}`;
|
|
292
|
-
this.topicMap.addMessage(kwTokens, monthKey);
|
|
293
|
-
let monthBucket = this.monthlyKeywordBuckets.get(monthKey);
|
|
294
|
-
if (!monthBucket) {
|
|
295
|
-
monthBucket = new KeywordCounter();
|
|
296
|
-
this.monthlyKeywordBuckets.set(monthKey, monthBucket);
|
|
297
|
-
}
|
|
298
|
-
for (const t of kwTokens)
|
|
299
|
-
monthBucket.add(t);
|
|
372
|
+
this.applyKeywordTokens(kwTokens, `${record.date.year}-${pad2(record.date.month)}`);
|
|
300
373
|
}
|
|
301
374
|
if (!opts?.keywordsOnly) {
|
|
302
375
|
const kwOpts = {
|
|
@@ -308,8 +381,15 @@ export class ReportAggregator {
|
|
|
308
381
|
}
|
|
309
382
|
if (messageLength >= 12)
|
|
310
383
|
this.repeatPhraseCounter.add(msg, dayKey);
|
|
311
|
-
if (
|
|
312
|
-
this.
|
|
384
|
+
if (opts?.collectSamples) {
|
|
385
|
+
this.pushSemanticSample(msg, messageLength);
|
|
386
|
+
}
|
|
387
|
+
else if (!this.samplesCollectedInStatsPass) {
|
|
388
|
+
this.pushSemanticSample(msg, messageLength);
|
|
389
|
+
}
|
|
390
|
+
}
|
|
391
|
+
else if (opts?.collectSamples) {
|
|
392
|
+
this.pushSemanticSample(msg, messageLength);
|
|
313
393
|
}
|
|
314
394
|
}
|
|
315
395
|
}
|
|
@@ -375,10 +455,11 @@ export class ReportAggregator {
|
|
|
375
455
|
prevStat.maxConsecutive = Math.max(prevStat.maxConsecutive, this.runLen);
|
|
376
456
|
}
|
|
377
457
|
const total = this.total;
|
|
458
|
+
const totalChars = this.totalCharacters;
|
|
378
459
|
const aliases = buildSenderLabels([...this.senderStats.keys()], this.privacy);
|
|
379
|
-
const
|
|
380
|
-
.map(([raw, stat]) => {
|
|
460
|
+
const allParticipants = [...this.senderStats.entries()].map(([raw, stat]) => {
|
|
381
461
|
const sharePercent = total > 0 ? round((stat.messages / total) * 100, 1) : 0;
|
|
462
|
+
const characterSharePercent = totalChars > 0 ? round((stat.characters / totalChars) * 100, 1) : 0;
|
|
382
463
|
return {
|
|
383
464
|
alias: aliases.get(raw) ?? "???",
|
|
384
465
|
messages: stat.messages,
|
|
@@ -387,12 +468,17 @@ export class ReportAggregator {
|
|
|
387
468
|
attachmentMessages: stat.attachmentMessages,
|
|
388
469
|
linkMessages: stat.linkMessages,
|
|
389
470
|
sharePercent,
|
|
471
|
+
characterSharePercent,
|
|
390
472
|
nightMessages: stat.nightMessages,
|
|
391
473
|
maxConsecutive: stat.maxConsecutive,
|
|
392
474
|
};
|
|
393
|
-
})
|
|
475
|
+
});
|
|
476
|
+
const participantStats = [...allParticipants]
|
|
394
477
|
.sort((a, b) => b.messages - a.messages)
|
|
395
478
|
.slice(0, this.top);
|
|
479
|
+
const participantsByCharacters = [...allParticipants]
|
|
480
|
+
.sort((a, b) => b.characters - a.characters || b.messages - a.messages)
|
|
481
|
+
.slice(0, this.top);
|
|
396
482
|
const sortedDays = [...this.daily.keys()].sort();
|
|
397
483
|
const longestStreak = longestDateStreak(sortedDays);
|
|
398
484
|
let peakHour = null;
|
|
@@ -453,17 +539,24 @@ export class ReportAggregator {
|
|
|
453
539
|
}
|
|
454
540
|
const keywordStop = buildKeywordStopwords();
|
|
455
541
|
const keywordLimit = Math.max(120, this.top * 3);
|
|
456
|
-
const
|
|
542
|
+
const minDocFreq = adaptiveMinCount(total, finalizeOpts?.koreanPrimary !== false);
|
|
543
|
+
const keywordCandidates = this.keywordStream.collectKeywordCandidates({
|
|
457
544
|
stopwords: keywordStop,
|
|
458
|
-
|
|
459
|
-
minDocFreq: adaptiveMinCount(total, finalizeOpts?.koreanPrimary !== false),
|
|
545
|
+
minDocFreq,
|
|
460
546
|
});
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
547
|
+
const bm25LaneForSemantic = [...keywordCandidates]
|
|
548
|
+
.sort((a, b) => b.score - a.score || b.messageHits - a.messageHits)
|
|
549
|
+
.slice(0, Math.min(200, Math.floor(80 + Math.sqrt(Math.max(total, 1)))));
|
|
550
|
+
this.applySemanticSupplementForRanked(bm25LaneForSemantic);
|
|
551
|
+
const kwMerged = mergeDualLaneKeywords(keywordCandidates, this.keywordSupplement, total, keywordLimit, finalizeOpts?.semanticSupplementRrfWeight ?? 0.5);
|
|
552
|
+
const keywords = kwMerged.byFrequency;
|
|
553
|
+
const keywordsDistinctive = kwMerged.distinctive;
|
|
554
|
+
const graphTopics = this.topicMap.buildTopics(total, buildTopicStopwords());
|
|
555
|
+
const keywordTopics = buildKeywordSeedTopics(keywords, keywordsDistinctive, total, this.topicMap);
|
|
556
|
+
const semanticTopics = finalizeOpts?.useEmbeddingTopics && this.semanticThemeCandidates.length > 0
|
|
557
|
+
? semanticItemsToTopics(this.semanticThemeCandidates, total)
|
|
558
|
+
: [];
|
|
559
|
+
let topics = mergeTopicLanes({ graph: graphTopics, keyword: keywordTopics, semantic: semanticTopics }, total);
|
|
467
560
|
const burstDetectionMethod = resolveBurstDetectionMethod();
|
|
468
561
|
const keywordTop1SharePercent = top1ShareFromCounts(keywords, total);
|
|
469
562
|
let attachmentMarkerSum = 0;
|
|
@@ -661,9 +754,13 @@ export class ReportAggregator {
|
|
|
661
754
|
nightSharePercent,
|
|
662
755
|
emojiMessages: this.emojiMessages,
|
|
663
756
|
usedSemanticKeywords: finalizeOpts?.usedSemanticKeywords === true,
|
|
757
|
+
usedSentimentAnalysis: finalizeOpts?.usedSentimentAnalysis === true,
|
|
664
758
|
},
|
|
665
759
|
insights,
|
|
666
760
|
participants: participantStats,
|
|
761
|
+
participantsByCharacters,
|
|
762
|
+
profanity: this.profanityCounter.buildProfanityStats(total, aliases),
|
|
763
|
+
sentiment: this.sentimentStats,
|
|
667
764
|
daily: dailySorted,
|
|
668
765
|
hourly: this.hourly,
|
|
669
766
|
weekdays: this.weekdays.map((count, index) => ({
|
|
@@ -674,6 +771,7 @@ export class ReportAggregator {
|
|
|
674
771
|
attachments: topCounts(this.attachments, this.top),
|
|
675
772
|
domains: topCounts(this.domains, this.top),
|
|
676
773
|
keywords,
|
|
774
|
+
keywordsDistinctive,
|
|
677
775
|
topics,
|
|
678
776
|
roomEvents: buildRoomEventStats(total, {
|
|
679
777
|
join: this.roomJoinMessages,
|
|
@@ -687,9 +785,14 @@ export class ReportAggregator {
|
|
|
687
785
|
manager: this.roomManagerMessages,
|
|
688
786
|
shopSearch: this.roomShopSearchMessages,
|
|
689
787
|
photoBundle: this.roomPhotoBundleMessages,
|
|
788
|
+
}, {
|
|
789
|
+
tagExtractions: [...this.shopSearchTopics.values()].reduce((a, n) => a + n, 0),
|
|
790
|
+
uniqueTags: this.shopSearchTopics.size,
|
|
791
|
+
untaggedNotices: this.shopSearchUntaggedNotices,
|
|
690
792
|
}),
|
|
691
793
|
repeatedPhrases: this.repeatPhraseCounter.top(8, 3),
|
|
692
|
-
shopSearchTopics: topCounts(this.shopSearchTopics,
|
|
794
|
+
shopSearchTopics: topCounts(this.shopSearchTopics, shopSearchDisplayTop()),
|
|
795
|
+
shopSearchMissSamples: process.env.KCA_DEBUG_SHOP === "1" ? [...this.shopSearchMissSamples] : undefined,
|
|
693
796
|
pureLaughMessages: this.pureLaughMessages,
|
|
694
797
|
conversationPace,
|
|
695
798
|
burstDays,
|
|
@@ -771,29 +874,7 @@ function getParticipantStat(stats, sender) {
|
|
|
771
874
|
stats.set(sender, created);
|
|
772
875
|
return created;
|
|
773
876
|
}
|
|
774
|
-
function
|
|
775
|
-
const trimmed = message.trim();
|
|
776
|
-
if (trimmed.length === 0)
|
|
777
|
-
return false;
|
|
778
|
-
if (attachmentMarkers.length === 1 && trimmed === attachmentMarkers[0])
|
|
779
|
-
return false;
|
|
780
|
-
if (attachmentMarkers.length > 0 && trimmed.length <= 16) {
|
|
781
|
-
const onlyMarkers = attachmentMarkers.every((m) => trimmed === m || trimmed.includes(m));
|
|
782
|
-
if (onlyMarkers && !/[가-힣A-Za-z]{3,}/.test(trimmed.replace(/[^\p{L}\p{N}]/gu, ""))) {
|
|
783
|
-
return false;
|
|
784
|
-
}
|
|
785
|
-
}
|
|
786
|
-
return true;
|
|
787
|
-
}
|
|
788
|
-
function getAttachmentMarkers(message) {
|
|
789
|
-
const found = ATTACHMENT_MARKERS.filter((marker) => message.includes(marker));
|
|
790
|
-
const t = message.trim();
|
|
791
|
-
if (PHOTO_BUNDLE_RE.test(t) && !found.includes("사진")) {
|
|
792
|
-
found.push("사진");
|
|
793
|
-
}
|
|
794
|
-
return found;
|
|
795
|
-
}
|
|
796
|
-
function buildRoomEventStats(total, c) {
|
|
877
|
+
function buildRoomEventStats(total, c, shopExtra) {
|
|
797
878
|
const sum = c.join +
|
|
798
879
|
c.leave +
|
|
799
880
|
c.deleted +
|
|
@@ -817,6 +898,9 @@ function buildRoomEventStats(total, c) {
|
|
|
817
898
|
subManagerCount: c.subManager,
|
|
818
899
|
managerCount: c.manager,
|
|
819
900
|
shopSearchCount: c.shopSearch,
|
|
901
|
+
shopSearchTagExtractions: shopExtra?.tagExtractions ?? 0,
|
|
902
|
+
shopSearchUniqueTags: shopExtra?.uniqueTags ?? 0,
|
|
903
|
+
shopSearchUntaggedNotices: shopExtra?.untaggedNotices ?? 0,
|
|
820
904
|
photoBundleCount: c.photoBundle,
|
|
821
905
|
total: sum,
|
|
822
906
|
joinSharePercent: pct(c.join),
|