@chigichan24/crune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/LICENSE +201 -0
  2. package/README.md +155 -0
  3. package/bin/crune.js +2 -0
  4. package/dist-cli/__tests__/cli.test.js +63 -0
  5. package/dist-cli/__tests__/clustering.test.js +200 -0
  6. package/dist-cli/__tests__/community.test.js +115 -0
  7. package/dist-cli/__tests__/edges.test.js +130 -0
  8. package/dist-cli/__tests__/feature-extraction.test.js +66 -0
  9. package/dist-cli/__tests__/fixtures.js +192 -0
  10. package/dist-cli/__tests__/orchestrator.test.js +253 -0
  11. package/dist-cli/__tests__/session-parser.test.js +335 -0
  12. package/dist-cli/__tests__/session-summarizer.test.js +117 -0
  13. package/dist-cli/__tests__/skill-server.test.js +191 -0
  14. package/dist-cli/__tests__/svd.test.js +112 -0
  15. package/dist-cli/__tests__/tfidf.test.js +88 -0
  16. package/dist-cli/__tests__/tokenizer.test.js +125 -0
  17. package/dist-cli/__tests__/topic-nodes.test.js +184 -0
  18. package/dist-cli/analyze-sessions.js +476 -0
  19. package/dist-cli/cli.js +215 -0
  20. package/dist-cli/knowledge-graph/clustering.js +174 -0
  21. package/dist-cli/knowledge-graph/community.js +220 -0
  22. package/dist-cli/knowledge-graph/constants.js +58 -0
  23. package/dist-cli/knowledge-graph/edges.js +193 -0
  24. package/dist-cli/knowledge-graph/feature-extraction.js +124 -0
  25. package/dist-cli/knowledge-graph/index.js +235 -0
  26. package/dist-cli/knowledge-graph/reusability.js +51 -0
  27. package/dist-cli/knowledge-graph/similarity.js +13 -0
  28. package/dist-cli/knowledge-graph/skill-generator.js +203 -0
  29. package/dist-cli/knowledge-graph/svd.js +195 -0
  30. package/dist-cli/knowledge-graph/tfidf.js +54 -0
  31. package/dist-cli/knowledge-graph/tokenizer.js +66 -0
  32. package/dist-cli/knowledge-graph/tool-pattern.js +173 -0
  33. package/dist-cli/knowledge-graph/topic-nodes.js +199 -0
  34. package/dist-cli/knowledge-graph/types.js +4 -0
  35. package/dist-cli/knowledge-graph-builder.js +27 -0
  36. package/dist-cli/session-parser.js +360 -0
  37. package/dist-cli/session-summarizer.js +133 -0
  38. package/dist-cli/skill-server.js +62 -0
  39. package/dist-cli/skill-synthesizer.js +189 -0
  40. package/package.json +47 -0
@@ -0,0 +1,193 @@
1
+ /**
2
+ * Topic edge construction and classification.
3
+ */
4
+ import { cosineSimilarity } from "./similarity.js";
5
+ export function buildTopicEdges(topics, sessions, tfidf, svd) {
6
+ const edges = [];
7
+ const sessionIndex = new Map();
8
+ for (const s of sessions)
9
+ sessionIndex.set(s.sessionId, s);
10
+ // Precompute topic centroids in latent SVD space (if available)
11
+ // Falls back to TF-IDF centroids if SVD not provided
12
+ const centroids = new Map();
13
+ for (const topic of topics) {
14
+ if (svd) {
15
+ // Average SVD session vectors for this topic
16
+ const centroid = new Float64Array(svd.k);
17
+ let count = 0;
18
+ for (const sid of topic.sessionIds) {
19
+ const vec = svd.sessionVectors.get(sid);
20
+ if (vec) {
21
+ for (let k = 0; k < svd.k; k++)
22
+ centroid[k] += vec[k];
23
+ count++;
24
+ }
25
+ }
26
+ if (count > 0)
27
+ for (let k = 0; k < svd.k; k++)
28
+ centroid[k] /= count;
29
+ // L2 normalize
30
+ let norm = 0;
31
+ for (let k = 0; k < svd.k; k++)
32
+ norm += centroid[k] * centroid[k];
33
+ norm = Math.sqrt(norm);
34
+ if (norm > 0)
35
+ for (let k = 0; k < svd.k; k++)
36
+ centroid[k] /= norm;
37
+ centroids.set(topic.id, centroid);
38
+ }
39
+ else {
40
+ // Fallback: TF-IDF centroids
41
+ const centroid = new Float64Array(tfidf.vocabulary.length);
42
+ for (const sid of topic.sessionIds) {
43
+ const vec = tfidf.vectors.get(sid);
44
+ if (vec) {
45
+ for (let k = 0; k < centroid.length; k++)
46
+ centroid[k] += vec[k];
47
+ }
48
+ }
49
+ for (let k = 0; k < centroid.length; k++)
50
+ centroid[k] /= topic.sessionIds.length;
51
+ let norm = 0;
52
+ for (let k = 0; k < centroid.length; k++)
53
+ norm += centroid[k] * centroid[k];
54
+ norm = Math.sqrt(norm);
55
+ if (norm > 0)
56
+ for (let k = 0; k < centroid.length; k++)
57
+ centroid[k] /= norm;
58
+ centroids.set(topic.id, centroid);
59
+ }
60
+ }
61
+ for (let i = 0; i < topics.length; i++) {
62
+ for (let j = i + 1; j < topics.length; j++) {
63
+ const ti = topics[i];
64
+ const tj = topics[j];
65
+ // Signal 1: Latent semantic similarity (SVD or TF-IDF centroid cosine)
66
+ const ci = centroids.get(ti.id);
67
+ const cj = centroids.get(tj.id);
68
+ const semanticSim = cosineSimilarity(ci, cj);
69
+ // Signal 2: File overlap (Jaccard of all edited files)
70
+ const filesI = new Set();
71
+ const filesJ = new Set();
72
+ for (const sid of ti.sessionIds) {
73
+ const s = sessionIndex.get(sid);
74
+ if (s)
75
+ s.meta.filesEdited.forEach((f) => filesI.add(f));
76
+ }
77
+ for (const sid of tj.sessionIds) {
78
+ const s = sessionIndex.get(sid);
79
+ if (s)
80
+ s.meta.filesEdited.forEach((f) => filesJ.add(f));
81
+ }
82
+ const intersection = [...filesI].filter((f) => filesJ.has(f));
83
+ const union = new Set([...filesI, ...filesJ]);
84
+ const fileOverlap = union.size > 0 ? intersection.length / union.size : 0;
85
+ // Signal 3: Session overlap (temporal adjacency / same branch)
86
+ let sessionOverlap = 0;
87
+ for (const sidI of ti.sessionIds) {
88
+ const si = sessionIndex.get(sidI);
89
+ if (!si)
90
+ continue;
91
+ for (const sidJ of tj.sessionIds) {
92
+ const sj = sessionIndex.get(sidJ);
93
+ if (!sj)
94
+ continue;
95
+ if (si.projectDisplayName === sj.projectDisplayName &&
96
+ si.meta.gitBranch &&
97
+ si.meta.gitBranch === sj.meta.gitBranch) {
98
+ sessionOverlap = Math.max(sessionOverlap, 0.6);
99
+ }
100
+ // Temporal adjacency: sessions within 1 hour of each other
101
+ const timeDiff = Math.abs(new Date(si.meta.createdAt).getTime() -
102
+ new Date(sj.meta.createdAt).getTime());
103
+ if (timeDiff < 3600000) {
104
+ sessionOverlap = Math.max(sessionOverlap, 0.4);
105
+ }
106
+ }
107
+ }
108
+ // Weighted sum
109
+ const strength = semanticSim * 0.4 + fileOverlap * 0.3 + sessionOverlap * 0.3;
110
+ if (strength < 0.2)
111
+ continue;
112
+ // Determine dominant signal and generate label
113
+ const signals = { semanticSimilarity: semanticSim, fileOverlap, sessionOverlap };
114
+ const { type, label } = classifyEdge(ti, tj, signals, intersection, tfidf, centroids);
115
+ edges.push({
116
+ source: ti.id,
117
+ target: tj.id,
118
+ type,
119
+ strength: Math.round(strength * 100) / 100,
120
+ label,
121
+ signals,
122
+ });
123
+ }
124
+ }
125
+ return edges;
126
+ }
127
/**
 * Classify the relationship between two connected topics and build a label.
 *
 * Precedence: cross-project bridge first; otherwise the dominant weighted
 * signal (weights mirror the strength formula in buildTopicEdges) decides
 * between shared-module, workflow-continuation, and semantic-similarity.
 *
 * @param {Object} ti First topic (`id`, `project`, `projects`).
 * @param {Object} tj Second topic.
 * @param {Object} signals { semanticSimilarity, fileOverlap, sessionOverlap }.
 * @param {string[]} sharedFiles Files edited in both topics.
 * @param {Object} tfidf TF-IDF model (vocabulary).
 * @param {Map} centroids topicId → centroid vector.
 * @returns {{type: string, label: string}}
 */
export function classifyEdge(ti, tj, signals, sharedFiles, tfidf, centroids) {
    const sharesAnyProject = ti.projects.some((p) => tj.projects.includes(p));
    if (!sharesAnyProject && ti.project !== tj.project) {
        // Cross-project link: label with keywords common to both centroids.
        const bridgeKeywords = findSharedKeywords(ti.id, tj.id, tfidf, centroids, 3);
        return {
            type: "cross-project-bridge",
            label: `cross-project: ${bridgeKeywords.join(", ") || "related concepts"}`,
        };
    }
    const { semanticSimilarity, fileOverlap, sessionOverlap } = signals;
    // Weight each signal exactly as the strength formula does, then pick the max.
    const weightedFile = fileOverlap * 0.3;
    const weightedSession = sessionOverlap * 0.3;
    const maxSignal = Math.max(semanticSimilarity * 0.4, weightedFile, weightedSession);
    if (maxSignal === weightedFile && sharedFiles.length > 0) {
        const commonPrefix = findCommonPathPrefix(sharedFiles);
        return {
            type: "shared-module",
            label: `shared: ${commonPrefix || sharedFiles[0]?.split("/").slice(-2).join("/") || "files"}`,
        };
    }
    if (maxSignal === weightedSession) {
        return {
            type: "workflow-continuation",
            label: "workflow continuation",
        };
    }
    // Default: semantic similarity dominates.
    const relatedKeywords = findSharedKeywords(ti.id, tj.id, tfidf, centroids, 3);
    return {
        type: "semantic-similarity",
        label: `related: ${relatedKeywords.join(", ") || "similar topics"}`,
    };
}
161
/**
 * Find vocabulary terms weighted highly in both topic centroids.
 *
 * CAVEAT: this is only meaningful when the centroids live in TF-IDF space
 * (one slot per vocabulary term). When buildTopicEdges runs with an SVD
 * model, the centroids are k-dimensional latent vectors and slot/term
 * alignment is lost; the original code then read past the typed array's
 * end, getting `undefined` which silently failed the `> 0.01` threshold.
 * The loop is now explicitly bounded — same observable behavior, but the
 * limitation is visible instead of accidental.
 *
 * @param {string} topicIdA First topic id.
 * @param {string} topicIdB Second topic id.
 * @param {Object} tfidf    TF-IDF model ({ vocabulary }).
 * @param {Map} centroids   topicId → centroid vector (Float64Array).
 * @param {number} topK     Maximum number of terms to return.
 * @returns {string[]} Up to topK shared terms, highest joint weight first.
 */
export function findSharedKeywords(topicIdA, topicIdB, tfidf, centroids, topK) {
    const ca = centroids.get(topicIdA);
    const cb = centroids.get(topicIdB);
    if (!ca || !cb)
        return [];
    // Only compare slots that exist in both centroids AND the vocabulary.
    const limit = Math.min(tfidf.vocabulary.length, ca.length, cb.length);
    const shared = [];
    for (let i = 0; i < limit; i++) {
        // Both centroids must weight the term above a small noise floor.
        if (ca[i] > 0.01 && cb[i] > 0.01) {
            shared.push({ term: tfidf.vocabulary[i], score: ca[i] * cb[i] });
        }
    }
    shared.sort((a, b) => b.score - a.score);
    return shared.slice(0, topK).map((s) => s.term);
}
176
/**
 * Longest common leading path (by "/"-separated segments) across all paths.
 * Returns "" for an empty list or when only zero or one segment is shared
 * (a single top-level directory is considered too generic to label with).
 *
 * @param {string[]} paths File paths using "/" separators.
 * @returns {string} Common prefix joined with "/", or "".
 */
export function findCommonPathPrefix(paths) {
    if (paths.length === 0)
        return "";
    const split = paths.map((p) => p.split("/"));
    const shortest = Math.min(...split.map((s) => s.length));
    let shared = 0;
    while (shared < shortest) {
        const reference = split[0][shared];
        if (!split.every((s) => s[shared] === reference))
            break;
        shared++;
    }
    // A one-segment prefix (e.g. just "src") is too generic to be useful.
    if (shared <= 1)
        return "";
    return split[0].slice(0, shared).join("/");
}
@@ -0,0 +1,124 @@
1
+ /**
2
+ * Tool-IDF and structural feature extraction.
3
+ */
4
+ import { STRUCTURAL_DIM } from "./constants.js";
5
/**
 * Build a tool-usage IDF model over sessions.
 *
 * Counts tool invocations per session (main session `toolBreakdown` plus one
 * per subagent tool call), computes IDF = ln(n / df) per tool, and produces
 * per-session vectors of log(1 + count) · idf, each L2-normalized.
 *
 * @param {Array} sessions Parsed sessions (`sessionId`, `meta.toolBreakdown`, `subagents`).
 * @returns {{toolVocabulary: string[], toolVocabIndex: Map, toolIdfWeights: Map, vectors: Map}}
 */
export function buildToolIdf(sessions) {
    const sessionTotal = sessions.length;
    const toolNames = new Set();
    const countsBySession = new Map();
    for (const session of sessions) {
        const counts = new Map();
        const bump = (name, by) => {
            counts.set(name, (counts.get(name) || 0) + by);
            toolNames.add(name);
        };
        // Tools invoked directly in the main session.
        for (const [tool, count] of Object.entries(session.meta.toolBreakdown)) {
            bump(tool, count);
        }
        // Tools invoked from subagent turns: one per recorded call.
        for (const sub of Object.values(session.subagents)) {
            for (const turn of sub.turns) {
                for (const call of turn.toolCalls) {
                    bump(call.toolName, 1);
                }
            }
        }
        countsBySession.set(session.sessionId, counts);
    }
    // Stable, sorted vocabulary with a name → index lookup.
    const toolVocabulary = [...toolNames].sort();
    const toolVocabIndex = new Map(toolVocabulary.map((tool, i) => [tool, i]));
    // Document frequency: number of sessions in which each tool appears.
    const docFreq = new Map();
    for (const counts of countsBySession.values()) {
        for (const tool of counts.keys()) {
            docFreq.set(tool, (docFreq.get(tool) || 0) + 1);
        }
    }
    const toolIdfWeights = new Map(toolVocabulary.map((tool) => [tool, Math.log(sessionTotal / (docFreq.get(tool) || 1))]));
    // Per-session vectors: log(1 + count) * idf, then L2-normalized.
    const vectors = new Map();
    for (const session of sessions) {
        const counts = countsBySession.get(session.sessionId);
        const vec = new Float64Array(toolVocabulary.length);
        for (const [tool, count] of counts) {
            const idx = toolVocabIndex.get(tool);
            if (idx !== undefined) {
                // NOTE(review): `|| 1` means a tool used in EVERY session
                // (idf = 0) still contributes with weight 1 — presumably
                // deliberate smoothing; confirm before changing to `??`.
                vec[idx] = Math.log(1 + count) * (toolIdfWeights.get(tool) || 1);
            }
        }
        // L2 normalize
        let sumSquares = 0;
        for (const value of vec)
            sumSquares += value * value;
        const norm = Math.sqrt(sumSquares);
        if (norm > 0) {
            for (let i = 0; i < vec.length; i++)
                vec[i] /= norm;
        }
        vectors.set(session.sessionId, vec);
    }
    return { toolVocabulary, toolVocabIndex, toolIdfWeights, vectors };
}
68
/**
 * Compute a fixed-size structural feature vector for each session.
 *
 * Dimensions written (assumes STRUCTURAL_DIM === 7 — indices 0..6 are set):
 *   0: share of counted entries that are user-prompt turns
 *   1: share that are assistant-text turns
 *   2: share that are tool-calling turns
 *   3: subagent involvement (0 when the session has no subagents)
 *   4: log-dampened average tool calls per turn
 *   5: edit-heaviness (Edit+Write share of all tool calls)
 *   6: read-heaviness (Read+Grep+Glob share of all tool calls)
 * Each vector is L2-normalized; sessions with no turns get a zero vector.
 *
 * @param {Array} sessions Parsed sessions.
 * @returns {Map<string, Float64Array>} sessionId → structural vector.
 */
export function buildStructuralVectors(sessions) {
    const vectors = new Map();
    for (const session of sessions) {
        const vec = new Float64Array(STRUCTURAL_DIM);
        const turnTotal = session.turns.length;
        if (turnTotal === 0) {
            // Empty session: keep the zero vector.
            vectors.set(session.sessionId, vec);
            continue;
        }
        // Tally per-turn structure in one pass.
        let promptTurns = 0;
        let assistantTurns = 0;
        let toolTurns = 0;
        let agentTurns = 0;
        let toolCallTotal = 0;
        for (const turn of session.turns) {
            if (turn.userPrompt)
                promptTurns += 1;
            if (turn.assistantTexts.length > 0)
                assistantTurns += 1;
            const callsHere = turn.toolCalls.length;
            if (callsHere > 0)
                toolTurns += 1;
            toolCallTotal += callsHere;
            // An "Agent" tool call marks a turn that spawned a subagent.
            if (turn.toolCalls.some((call) => call.toolName === "Agent"))
                agentTurns += 1;
        }
        // Subagent involvement also counts entries in the subagents object.
        const subagentCount = Object.keys(session.subagents).length;
        // `|| 1` guards the ratios against division by zero.
        const entryTotal = promptTurns + assistantTurns + toolTurns || 1;
        vec[0] = promptTurns / entryTotal; // userRatio
        vec[1] = assistantTurns / entryTotal; // assistantRatio
        vec[2] = toolTurns / entryTotal; // toolCallRatio
        vec[3] = subagentCount > 0
            ? Math.min(1, (agentTurns + subagentCount) / turnTotal)
            : 0; // subagentRatio
        vec[4] = Math.log(1 + toolCallTotal / turnTotal); // avgToolsPerTurn (log dampened)
        // Edit heaviness vs Read heaviness from the aggregate breakdown.
        const breakdown = session.meta.toolBreakdown;
        const breakdownTotal = Object.values(breakdown).reduce((sum, c) => sum + c, 0) || 1;
        vec[5] = ((breakdown["Edit"] || 0) + (breakdown["Write"] || 0)) / breakdownTotal; // editHeaviness
        vec[6] = ((breakdown["Read"] || 0) + (breakdown["Grep"] || 0) + (breakdown["Glob"] || 0)) / breakdownTotal; // readHeaviness
        // L2 normalize
        let sumSquares = 0;
        for (const value of vec)
            sumSquares += value * value;
        const norm = Math.sqrt(sumSquares);
        if (norm > 0) {
            for (let i = 0; i < vec.length; i++)
                vec[i] /= norm;
        }
        vectors.set(session.sessionId, vec);
    }
    return vectors;
}
@@ -0,0 +1,235 @@
1
+ /**
2
+ * Semantic knowledge graph construction from Claude Code session data.
3
+ * Pipeline: TF-IDF + Tool-IDF + Structure → SVD (Latent Semantic) → Clustering → Louvain → Brandes
4
+ */
5
+ import { STRUCTURAL_DIM } from "./constants.js";
6
+ import { tokenize } from "./tokenizer.js";
7
+ import { buildTfidf } from "./tfidf.js";
8
+ import { buildToolIdf, buildStructuralVectors } from "./feature-extraction.js";
9
+ import { buildCombinedMatrix, truncatedSvd, interpretLatentDimensions } from "./svd.js";
10
+ import { cosineDistance } from "./similarity.js";
11
+ import { agglomerativeClusteringFromDistMatrix, splitOversizedClusters, } from "./clustering.js";
12
+ import { buildTopicNodes } from "./topic-nodes.js";
13
+ import { buildTopicEdges } from "./edges.js";
14
+ import { louvainDetection, brandesBetweenness } from "./community.js";
15
+ import { computeReusabilityScores } from "./reusability.js";
16
+ import { extractEnrichedSequences } from "./tool-pattern.js";
17
+ import { generateSkillCandidates } from "./skill-generator.js";
18
+ export { tokenize, splitCamelCase, extractPathTokens, isNoiseToken } from "./tokenizer.js";
19
+ export { buildTfidf } from "./tfidf.js";
20
+ export { buildToolIdf, buildStructuralVectors } from "./feature-extraction.js";
21
+ export { buildCombinedMatrix, truncatedSvd, interpretLatentDimensions } from "./svd.js";
22
+ export { cosineSimilarity, cosineDistance } from "./similarity.js";
23
+ export { agglomerativeClusteringFromDistMatrix, findElbowThreshold, clusterWithThresholdFromDistMatrix, splitOversizedClusters, } from "./clustering.js";
24
+ export { extractDominantAction, selectRepresentativePrompts, generateSuggestedPrompt, computeToolSignature, classifyDominantRole, buildTopicNodes, } from "./topic-nodes.js";
25
+ export { buildTopicEdges, classifyEdge, findSharedKeywords, findCommonPathPrefix, } from "./edges.js";
26
+ export { louvainDetection, brandesBetweenness } from "./community.js";
27
+ export { computeReusabilityScores } from "./reusability.js";
28
+ export { abstractToolCall, extractEnrichedSequences } from "./tool-pattern.js";
29
+ export { generateSkillMarkdown, generateHookJson, generateSkillCandidates } from "./skill-generator.js";
30
+ // ─── Main Entry Point ───────────────────────────────────────────────────────
31
/**
 * Build a semantic knowledge graph from parsed Claude Code sessions.
 *
 * Pipeline: tokenize session text → TF-IDF + Tool-IDF + structural features →
 * combined matrix → truncated SVD → agglomerative clustering → topic nodes →
 * reusability scoring → topic edges → Louvain communities → Brandes
 * betweenness → skill candidates.
 *
 * @param {Array} sessions Parsed session objects (turns, meta, subagents).
 * @param {Object} [options]
 * @param {boolean} [options.enableLouvain=true] Run Louvain community detection;
 *   when false each topic becomes its own community with modularity 0.
 * @param {boolean} [options.enableBrandes=true] Run Brandes betweenness; when
 *   false only degree centrality is computed.
 * @returns {{nodes, edges, communities, metrics, enrichedToolSequences, skillCandidates}}
 */
export function buildSemanticKnowledgeGraph(sessions, options = {}) {
    const { enableLouvain = true, enableBrandes = true } = options;
    console.log(` [Knowledge Graph] Processing ${sessions.length} sessions...`);
    // Edge case: too few sessions — return an empty but fully-shaped result.
    if (sessions.length === 0) {
        return {
            nodes: [],
            edges: [],
            communities: [],
            metrics: {
                totalTopics: 0,
                totalEdges: 0,
                graphDensity: 0,
                modularity: 0,
                isolatedTopicCount: 0,
                bridgeTopicIds: [],
            },
            enrichedToolSequences: [],
            skillCandidates: [],
        };
    }
    // Step 1: Extract session text documents (for TF-IDF).
    // Each document is the user prompts + assistant texts + edited file paths
    // + git branch name, all joined and tokenized.
    const documents = new Map();
    for (const session of sessions) {
        const textParts = [];
        for (const turn of session.turns) {
            if (turn.userPrompt)
                textParts.push(turn.userPrompt);
            for (const text of turn.assistantTexts) {
                textParts.push(text);
            }
        }
        // Also include file paths as tokens
        for (const f of session.meta.filesEdited) {
            textParts.push(f);
        }
        // Include branch name
        if (session.meta.gitBranch) {
            textParts.push(session.meta.gitBranch);
        }
        const fullText = textParts.join(" ");
        const tokens = tokenize(fullText);
        // Sessions that tokenize to nothing are excluded from the pipeline.
        if (tokens.length > 0) {
            documents.set(session.sessionId, tokens);
        }
    }
    console.log(` [Knowledge Graph] Tokenized ${documents.size} sessions (${sessions.length - documents.size} excluded: empty)`);
    // Filter sessions to only those with documents
    const activeSessions = sessions.filter((s) => documents.has(s.sessionId));
    const sessionIds = activeSessions.map((s) => s.sessionId);
    // Step 1b: Build Tool-IDF and Structural vectors (always needed, even for single topic)
    const toolIdf = buildToolIdf(activeSessions);
    console.log(` [Knowledge Graph] Tool-IDF: ${toolIdf.toolVocabulary.length} tool types`);
    const structVectors = buildStructuralVectors(activeSessions);
    console.log(` [Knowledge Graph] Structural features: ${STRUCTURAL_DIM} dimensions`);
    // Extract enriched tool sequences from all sessions
    const enrichedSequences = extractEnrichedSequences(activeSessions);
    console.log(` [Knowledge Graph] Enriched sequences: ${enrichedSequences.length} patterns detected`);
    if (activeSessions.length < 2) {
        // Single session: create one topic containing everything.
        // NOTE(review): if every session tokenized empty, activeSessions is []
        // and this passes a single empty cluster ([[]]) to buildTopicNodes —
        // confirm downstream handles the empty-cluster case.
        const emptyTfidf = { vocabulary: [], vocabIndex: new Map(), vectors: new Map() };
        const singleTopic = buildTopicNodes([sessionIds.map((_, i) => i)], activeSessions, emptyTfidf, toolIdf);
        computeReusabilityScores(singleTopic);
        const skillCandidates = generateSkillCandidates(singleTopic, enrichedSequences);
        return {
            nodes: singleTopic,
            edges: [],
            communities: [
                {
                    id: 0,
                    topicIds: singleTopic.map((t) => t.id),
                    label: singleTopic[0]?.label || "All",
                    dominantProject: singleTopic[0]?.project || "",
                },
            ],
            metrics: {
                totalTopics: singleTopic.length,
                totalEdges: 0,
                graphDensity: 0,
                modularity: 0,
                isolatedTopicCount: singleTopic.length,
                bridgeTopicIds: [],
            },
            enrichedToolSequences: enrichedSequences,
            skillCandidates,
        };
    }
    // Step 2: TF-IDF (text features)
    const tfidf = buildTfidf(documents);
    console.log(` [Knowledge Graph] TF-IDF: ${tfidf.vocabulary.length} terms in vocabulary`);
    // Step 3: Build combined matrix and apply Truncated SVD
    const textDim = tfidf.vocabulary.length;
    const toolDim = toolIdf.toolVocabulary.length;
    const { matrix, totalDim } = buildCombinedMatrix(sessionIds, tfidf.vectors, toolIdf.vectors, structVectors, textDim, toolDim, STRUCTURAL_DIM);
    // Choose k: enough dimensions to capture nuanced clusters.
    // Use m/4 clamped to [20, 80] — higher than sqrt(m) to preserve more signal.
    const targetK = Math.min(80, Math.max(20, Math.round(activeSessions.length / 4)));
    const svd = truncatedSvd(sessionIds, matrix, totalDim, targetK);
    console.log(` [Knowledge Graph] SVD: ${totalDim}d → ${svd.k}d latent space (top-3 σ: ${[...svd.sigma.slice(0, 3)].map(s => s.toFixed(2)).join(', ')})`);
    // Interpret latent dimensions (for logging and potential use in labeling)
    const latentDims = interpretLatentDimensions(svd, tfidf.vocabulary, toolIdf.toolVocabulary, textDim, toolDim, 5);
    // Log top 3 latent dimensions
    for (const dim of latentDims.slice(0, 3)) {
        const terms = dim.topTerms.slice(0, 3).map(t => t.term).join(', ');
        const tools = dim.topTools.slice(0, 2).map(t => t.tool).join(', ');
        console.log(` dim-${dim.index}: var=${(dim.varianceRatio * 100).toFixed(1)}% terms=[${terms}] tools=[${tools}]`);
    }
    // Step 4: Clustering on dense SVD vectors (cosine distance is now reliable)
    let clusterMembers;
    if (activeSessions.length < 5) {
        // Too few sessions to cluster: one singleton cluster per session.
        clusterMembers = sessionIds.map((_, i) => [i]);
    }
    else {
        // Build distance matrix from SVD session vectors; keys are "i:j" with i < j.
        const distKey = (i, j) => i < j ? `${i}:${j}` : `${j}:${i}`;
        const svdDist = new Map();
        for (let i = 0; i < sessionIds.length; i++) {
            const vi = svd.sessionVectors.get(sessionIds[i]);
            for (let j = i + 1; j < sessionIds.length; j++) {
                const vj = svd.sessionVectors.get(sessionIds[j]);
                svdDist.set(distKey(i, j), cosineDistance(vi, vj));
            }
        }
        clusterMembers = agglomerativeClusteringFromDistMatrix(sessionIds, svdDist);
        // Split oversized clusters
        clusterMembers = splitOversizedClusters(clusterMembers, activeSessions.length, svdDist);
    }
    console.log(` [Knowledge Graph] Clustering: ${clusterMembers.length} topics from ${activeSessions.length} sessions`);
    // Step 5: Build topic nodes
    const topics = buildTopicNodes(clusterMembers, activeSessions, tfidf, toolIdf);
    // Step 5b: Compute reusability scores (mutates topics in place)
    computeReusabilityScores(topics);
    console.log(` [Knowledge Graph] Reusability scores computed for ${topics.length} topics`);
    // Step 6: Build topic edges (using SVD vectors for semantic similarity)
    const edges = buildTopicEdges(topics, activeSessions, tfidf, svd);
    console.log(` [Knowledge Graph] Edges: ${edges.length} topic connections`);
    // Step 7: Louvain community detection (optional)
    let communities;
    let modularity;
    if (enableLouvain) {
        const louvainResult = louvainDetection(topics, edges);
        communities = louvainResult.communities;
        modularity = louvainResult.modularity;
        console.log(` [Knowledge Graph] Communities: ${communities.length} (modularity: ${modularity.toFixed(3)})`);
    }
    else {
        // Fallback: each cluster is its own community
        communities = topics.map((t, i) => ({
            id: i,
            topicIds: [t.id],
            label: t.label,
            dominantProject: t.project,
        }));
        // Assign communityId to topics
        for (let i = 0; i < topics.length; i++) {
            topics[i].communityId = i;
        }
        modularity = 0;
        console.log(` [Knowledge Graph] Communities: ${communities.length} (Louvain disabled, using cluster-based)`);
    }
    // Step 8: Graph metrics (Brandes + degree centrality, optional)
    if (enableBrandes) {
        brandesBetweenness(topics, edges);
    }
    else {
        // Compute degree centrality only
        const nTopics = topics.length;
        const degreeMap = new Map();
        for (const e of edges) {
            degreeMap.set(e.source, (degreeMap.get(e.source) || 0) + 1);
            degreeMap.set(e.target, (degreeMap.get(e.target) || 0) + 1);
        }
        for (const t of topics) {
            t.degreeCentrality = nTopics > 1
                ? (degreeMap.get(t.id) || 0) / (nTopics - 1)
                : 0;
        }
        console.log(` [Knowledge Graph] Brandes disabled, degree centrality only`);
    }
    const isolatedCount = topics.filter((t) => t.degreeCentrality === 0).length;
    const nTopics = topics.length;
    const maxEdges = (nTopics * (nTopics - 1)) / 2;
    const density = maxEdges > 0 ? edges.length / maxEdges : 0;
    // Bridge topics: top 10% by betweenness centrality
    const sortedByBetweenness = [...topics]
        .filter((t) => t.betweennessCentrality > 0)
        .sort((a, b) => b.betweennessCentrality - a.betweennessCentrality);
    const bridgeCount = Math.max(1, Math.ceil(sortedByBetweenness.length * 0.1));
    const bridgeTopicIds = sortedByBetweenness
        .slice(0, bridgeCount)
        .map((t) => t.id);
    const metrics = {
        totalTopics: nTopics,
        totalEdges: edges.length,
        graphDensity: Math.round(density * 10000) / 10000,
        modularity: Math.round(modularity * 10000) / 10000,
        isolatedTopicCount: isolatedCount,
        bridgeTopicIds,
    };
    // Step 9: Generate skill candidates
    const skillCandidates = generateSkillCandidates(topics, enrichedSequences);
    console.log(` [Knowledge Graph] Skill candidates: ${skillCandidates.length} generated`);
    console.log(` [Knowledge Graph] Done. ${nTopics} topics, ${edges.length} edges, ${communities.length} communities, ${isolatedCount} isolated`);
    return { nodes: topics, edges, communities, metrics, enrichedToolSequences: enrichedSequences, skillCandidates };
}
@@ -0,0 +1,51 @@
1
+ /**
2
+ * Reusability score computation for topic nodes.
3
+ * Quantifies how valuable a topic pattern is for automation as skill/hook.
4
+ */
5
/**
 * Attach a reusability score to every topic, in place.
 *
 * Each component is normalized against the maximum observed across all
 * topics, then blended: 35% frequency, 25% time cost, 25% cross-project
 * reach, 15% recency. All components and the overall score are rounded to
 * three decimals.
 *
 * @param {Array} topics Topic nodes; each gets a `reusabilityScore` object.
 * @param {Date} [now]   Reference time for recency (defaults to current time).
 */
export function computeReusabilityScores(topics, now = new Date()) {
    if (topics.length === 0)
        return;
    const round3 = (x) => Math.round(x * 1000) / 1000;
    const meanDuration = (t) => t.sessionCount > 0 ? t.totalDurationMinutes / t.sessionCount : 0;
    const maxSessionCount = Math.max(...topics.map((t) => t.sessionCount));
    const maxProjects = Math.max(...topics.map((t) => t.projects.length));
    // Floor of 1 keeps the time-cost denominator sane when all durations are 0.
    const maxAvgDuration = Math.max(...topics.map(meanDuration), 1);
    const nowMs = now.getTime();
    // Age in days since each topic was last seen; Infinity when unknown.
    const ageDays = topics.map((t) => {
        if (!t.lastSeen)
            return Infinity;
        return Math.max(0, (nowMs - new Date(t.lastSeen).getTime()) / 86400000);
    });
    const maxDays = Math.max(...ageDays.filter((d) => isFinite(d)), 1);
    topics.forEach((topic, i) => {
        const frequency = maxSessionCount > 0
            ? topic.sessionCount / maxSessionCount
            : 0;
        const timeCost = maxAvgDuration > 0
            ? meanDuration(topic) / maxAvgDuration
            : 0;
        // 0 when the topic spans a single project, 1 at the observed maximum.
        const crossProjectScore = maxProjects > 1
            ? (topic.projects.length - 1) / (maxProjects - 1)
            : 0;
        const age = ageDays[i];
        // Topics never seen (age = Infinity) get zero recency.
        const recency = isFinite(age) && maxDays > 0
            ? 1 - age / maxDays
            : 0;
        const overall = 0.35 * frequency +
            0.25 * timeCost +
            0.25 * crossProjectScore +
            0.15 * recency;
        topic.reusabilityScore = {
            overall: round3(overall),
            frequency: round3(frequency),
            timeCost: round3(timeCost),
            crossProjectScore: round3(crossProjectScore),
            recency: round3(recency),
        };
    });
}
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Cosine similarity and distance for L2-normalized vectors.
3
+ */
4
/**
 * Cosine similarity for vectors that are already L2-normalized: the cosine
 * reduces to the plain dot product. Both vectors must have equal length.
 *
 * @param {Float64Array|number[]} a Unit-norm vector.
 * @param {Float64Array|number[]} b Unit-norm vector of the same length.
 * @returns {number} Dot product (= cosine for unit vectors).
 */
export function cosineSimilarity(a, b) {
    return a.reduce((acc, value, i) => acc + value * b[i], 0);
}
/**
 * Cosine distance: 1 - cosineSimilarity. Same preconditions apply.
 *
 * @param {Float64Array|number[]} a Unit-norm vector.
 * @param {Float64Array|number[]} b Unit-norm vector of the same length.
 * @returns {number} Distance in [0, 2] for unit vectors.
 */
export function cosineDistance(a, b) {
    return 1 - cosineSimilarity(a, b);
}