@chigichan24/crune 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +155 -0
- package/bin/crune.js +2 -0
- package/dist-cli/__tests__/cli.test.js +63 -0
- package/dist-cli/__tests__/clustering.test.js +200 -0
- package/dist-cli/__tests__/community.test.js +115 -0
- package/dist-cli/__tests__/edges.test.js +130 -0
- package/dist-cli/__tests__/feature-extraction.test.js +66 -0
- package/dist-cli/__tests__/fixtures.js +192 -0
- package/dist-cli/__tests__/orchestrator.test.js +253 -0
- package/dist-cli/__tests__/session-parser.test.js +335 -0
- package/dist-cli/__tests__/session-summarizer.test.js +117 -0
- package/dist-cli/__tests__/skill-server.test.js +191 -0
- package/dist-cli/__tests__/svd.test.js +112 -0
- package/dist-cli/__tests__/tfidf.test.js +88 -0
- package/dist-cli/__tests__/tokenizer.test.js +125 -0
- package/dist-cli/__tests__/topic-nodes.test.js +184 -0
- package/dist-cli/analyze-sessions.js +476 -0
- package/dist-cli/cli.js +215 -0
- package/dist-cli/knowledge-graph/clustering.js +174 -0
- package/dist-cli/knowledge-graph/community.js +220 -0
- package/dist-cli/knowledge-graph/constants.js +58 -0
- package/dist-cli/knowledge-graph/edges.js +193 -0
- package/dist-cli/knowledge-graph/feature-extraction.js +124 -0
- package/dist-cli/knowledge-graph/index.js +235 -0
- package/dist-cli/knowledge-graph/reusability.js +51 -0
- package/dist-cli/knowledge-graph/similarity.js +13 -0
- package/dist-cli/knowledge-graph/skill-generator.js +203 -0
- package/dist-cli/knowledge-graph/svd.js +195 -0
- package/dist-cli/knowledge-graph/tfidf.js +54 -0
- package/dist-cli/knowledge-graph/tokenizer.js +66 -0
- package/dist-cli/knowledge-graph/tool-pattern.js +173 -0
- package/dist-cli/knowledge-graph/topic-nodes.js +199 -0
- package/dist-cli/knowledge-graph/types.js +4 -0
- package/dist-cli/knowledge-graph-builder.js +27 -0
- package/dist-cli/session-parser.js +360 -0
- package/dist-cli/session-summarizer.js +133 -0
- package/dist-cli/skill-server.js +62 -0
- package/dist-cli/skill-synthesizer.js +189 -0
- package/package.json +47 -0
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Topic edge construction and classification.
|
|
3
|
+
*/
|
|
4
|
+
import { cosineSimilarity } from "./similarity.js";
|
|
5
|
+
/**
 * Build weighted edges between topic nodes.
 *
 * Each candidate pair is scored by three signals — latent semantic similarity
 * (SVD centroid cosine, TF-IDF fallback), Jaccard overlap of edited files, and
 * session adjacency (same branch or created within one hour) — combined as
 * 0.4/0.3/0.3. Pairs below strength 0.2 are dropped.
 *
 * @param topics   topic nodes (id, sessionIds, …)
 * @param sessions parsed sessions used to resolve sessionIds
 * @param tfidf    TF-IDF model ({ vocabulary, vectors }) for the fallback centroid
 * @param svd      optional SVD model ({ k, sessionVectors }); preferred when present
 * @returns array of { source, target, type, strength, label, signals }
 */
export function buildTopicEdges(topics, sessions, tfidf, svd) {
    const bySessionId = new Map(sessions.map((s) => [s.sessionId, s]));
    // In-place L2 normalization; leaves all-zero (or NaN) vectors untouched.
    const l2NormalizeInPlace = (v) => {
        let sumSq = 0;
        for (const x of v)
            sumSq += x * x;
        const mag = Math.sqrt(sumSq);
        if (mag > 0) {
            for (let d = 0; d < v.length; d++)
                v[d] /= mag;
        }
    };
    // One centroid per topic: averaged SVD session vectors when an SVD model is
    // supplied, otherwise summed TF-IDF vectors divided by the session count.
    const centroids = new Map();
    for (const topic of topics) {
        let centroid;
        if (svd) {
            centroid = new Float64Array(svd.k);
            let contributing = 0;
            for (const sid of topic.sessionIds) {
                const v = svd.sessionVectors.get(sid);
                if (!v)
                    continue;
                for (let d = 0; d < svd.k; d++)
                    centroid[d] += v[d];
                contributing++;
            }
            if (contributing > 0) {
                for (let d = 0; d < svd.k; d++)
                    centroid[d] /= contributing;
            }
        }
        else {
            centroid = new Float64Array(tfidf.vocabulary.length);
            for (const sid of topic.sessionIds) {
                const v = tfidf.vectors.get(sid);
                if (!v)
                    continue;
                for (let d = 0; d < centroid.length; d++)
                    centroid[d] += v[d];
            }
            // Matches the original: divides by the full sessionId count even if
            // some sessions had no TF-IDF vector.
            for (let d = 0; d < centroid.length; d++)
                centroid[d] /= topic.sessionIds.length;
        }
        l2NormalizeInPlace(centroid);
        centroids.set(topic.id, centroid);
    }
    // Union of files edited across every session of a topic.
    const collectFiles = (topic) => {
        const files = new Set();
        for (const sid of topic.sessionIds) {
            bySessionId.get(sid)?.meta.filesEdited.forEach((f) => files.add(f));
        }
        return files;
    };
    const edges = [];
    for (let i = 0; i < topics.length; i++) {
        for (let j = i + 1; j < topics.length; j++) {
            const ti = topics[i];
            const tj = topics[j];
            // Signal 1: latent semantic similarity of the two centroids.
            const semanticSim = cosineSimilarity(centroids.get(ti.id), centroids.get(tj.id));
            // Signal 2: Jaccard overlap of edited files.
            const filesI = collectFiles(ti);
            const filesJ = collectFiles(tj);
            const intersection = [...filesI].filter((f) => filesJ.has(f));
            const union = new Set([...filesI, ...filesJ]);
            const fileOverlap = union.size > 0 ? intersection.length / union.size : 0;
            // Signal 3: session adjacency — 0.6 for a shared non-empty git branch in
            // the same project, 0.4 for sessions created within one hour.
            let sessionOverlap = 0;
            for (const sidI of ti.sessionIds) {
                const si = bySessionId.get(sidI);
                if (!si)
                    continue;
                for (const sidJ of tj.sessionIds) {
                    const sj = bySessionId.get(sidJ);
                    if (!sj)
                        continue;
                    if (si.projectDisplayName === sj.projectDisplayName &&
                        si.meta.gitBranch &&
                        si.meta.gitBranch === sj.meta.gitBranch) {
                        sessionOverlap = Math.max(sessionOverlap, 0.6);
                    }
                    const gapMs = Math.abs(new Date(si.meta.createdAt).getTime() -
                        new Date(sj.meta.createdAt).getTime());
                    if (gapMs < 3600000) {
                        sessionOverlap = Math.max(sessionOverlap, 0.4);
                    }
                }
            }
            const strength = semanticSim * 0.4 + fileOverlap * 0.3 + sessionOverlap * 0.3;
            if (strength < 0.2)
                continue;
            const signals = { semanticSimilarity: semanticSim, fileOverlap, sessionOverlap };
            const { type, label } = classifyEdge(ti, tj, signals, intersection, tfidf, centroids);
            edges.push({
                source: ti.id,
                target: tj.id,
                type,
                strength: Math.round(strength * 100) / 100,
                label,
                signals,
            });
        }
    }
    return edges;
}
|
|
127
|
+
/**
 * Classify an edge between two topics and produce a human-readable label.
 *
 * Precedence: cross-project bridge (no shared project) first, then whichever
 * weighted signal dominates — shared files, session adjacency, or (default)
 * semantic similarity.
 *
 * @param ti, tj      topic nodes ({ id, project, projects, … })
 * @param signals     { semanticSimilarity, fileOverlap, sessionOverlap }
 * @param sharedFiles file paths edited by sessions of both topics
 * @param tfidf       TF-IDF model (vocabulary) for keyword extraction
 * @param centroids   topicId → centroid vector map
 * @returns { type, label }
 */
export function classifyEdge(ti, tj, signals, sharedFiles, tfidf, centroids) {
    // Topics sharing no project at all form a cross-project bridge.
    const sharesProject = ti.projects.some((p) => tj.projects.includes(p));
    if (!sharesProject && ti.project !== tj.project) {
        const keywords = findSharedKeywords(ti.id, tj.id, tfidf, centroids, 3);
        return {
            type: "cross-project-bridge",
            label: `cross-project: ${keywords.join(", ") || "related concepts"}`,
        };
    }
    // Dominant-signal comparison re-applies the same 0.4/0.3/0.3 weights used
    // when the strength was computed; ties favor file overlap, then sessions.
    const { semanticSimilarity, fileOverlap, sessionOverlap } = signals;
    const weightedFile = fileOverlap * 0.3;
    const weightedSession = sessionOverlap * 0.3;
    const dominant = Math.max(semanticSimilarity * 0.4, weightedFile, weightedSession);
    if (dominant === weightedFile && sharedFiles.length > 0) {
        const prefix = findCommonPathPrefix(sharedFiles);
        return {
            type: "shared-module",
            label: `shared: ${prefix || sharedFiles[0]?.split("/").slice(-2).join("/") || "files"}`,
        };
    }
    if (dominant === weightedSession) {
        return {
            type: "workflow-continuation",
            label: "workflow continuation",
        };
    }
    // Default bucket: semantic similarity.
    const keywords = findSharedKeywords(ti.id, tj.id, tfidf, centroids, 3);
    return {
        type: "semantic-similarity",
        label: `related: ${keywords.join(", ") || "similar topics"}`,
    };
}
|
|
161
|
+
/**
 * Extract up to topK vocabulary terms weighted highly (> 0.01) in BOTH topic
 * centroids, ranked by the product of the two weights.
 *
 * @returns array of term strings; empty when either centroid is missing.
 */
export function findSharedKeywords(topicIdA, topicIdB, tfidf, centroids, topK) {
    const first = centroids.get(topicIdA);
    const second = centroids.get(topicIdB);
    if (!first || !second) {
        return [];
    }
    const candidates = tfidf.vocabulary
        .map((term, idx) => ({ term, idx, score: first[idx] * second[idx] }))
        .filter(({ idx }) => first[idx] > 0.01 && second[idx] > 0.01);
    candidates.sort((x, y) => y.score - x.score);
    return candidates.slice(0, topK).map(({ term }) => term);
}
|
|
176
|
+
/**
 * Longest common leading path of a set of "/"-separated paths.
 *
 * Returns "" for an empty input or when the shared prefix is a single segment
 * or less (too generic to be a useful label).
 */
export function findCommonPathPrefix(paths) {
    if (paths.length === 0)
        return "";
    const split = paths.map((p) => p.split("/"));
    const shortest = Math.min(...split.map((s) => s.length));
    let depth = 0;
    while (depth < shortest && split.every((s) => s[depth] === split[0][depth])) {
        depth++;
    }
    return depth <= 1 ? "" : split[0].slice(0, depth).join("/");
}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tool-IDF and structural feature extraction.
|
|
3
|
+
*/
|
|
4
|
+
import { STRUCTURAL_DIM } from "./constants.js";
|
|
5
|
+
/**
 * Build a Tool-IDF model over all sessions.
 *
 * Counts tool invocations per session (main turns via meta.toolBreakdown plus
 * every subagent tool call), derives IDF = ln(n / df) per tool, and produces an
 * L2-normalized log(1 + count) * idf vector per session.
 *
 * @returns { toolVocabulary, toolVocabIndex, toolIdfWeights, vectors }
 */
export function buildToolIdf(sessions) {
    const sessionCount = sessions.length;
    const allTools = new Set();
    const perSessionCounts = new Map();
    for (const session of sessions) {
        const counts = new Map();
        const bump = (tool, by) => {
            counts.set(tool, (counts.get(tool) || 0) + by);
            allTools.add(tool);
        };
        // Main-session tools come pre-aggregated in meta.toolBreakdown.
        for (const [tool, n] of Object.entries(session.meta.toolBreakdown)) {
            bump(tool, n);
        }
        // Subagent tools are counted call-by-call.
        for (const sub of Object.values(session.subagents)) {
            for (const turn of sub.turns) {
                for (const call of turn.toolCalls) {
                    bump(call.toolName, 1);
                }
            }
        }
        perSessionCounts.set(session.sessionId, counts);
    }
    const toolVocabulary = [...allTools].sort();
    const toolVocabIndex = new Map(toolVocabulary.map((t, i) => [t, i]));
    // Document frequency: number of sessions in which each tool appears.
    const docFreq = new Map();
    for (const counts of perSessionCounts.values()) {
        for (const tool of counts.keys()) {
            docFreq.set(tool, (docFreq.get(tool) || 0) + 1);
        }
    }
    const toolIdfWeights = new Map(toolVocabulary.map((tool) => [tool, Math.log(sessionCount / (docFreq.get(tool) || 1))]));
    // Per-session vectors: log(1 + count) * idf, then L2 normalize.
    const vectors = new Map();
    for (const session of sessions) {
        const counts = perSessionCounts.get(session.sessionId);
        const vec = new Float64Array(toolVocabulary.length);
        for (const [tool, n] of counts) {
            const idx = toolVocabIndex.get(tool);
            if (idx !== undefined) {
                // NOTE(review): `|| 1` also replaces an IDF of exactly 0 (a tool that
                // appears in every session) with weight 1 — confirm this is intended
                // rather than a `??` mix-up, since it undoes the IDF down-weighting.
                vec[idx] = Math.log(1 + n) * (toolIdfWeights.get(tool) || 1);
            }
        }
        let sumSq = 0;
        for (const x of vec)
            sumSq += x * x;
        const mag = Math.sqrt(sumSq);
        if (mag > 0) {
            for (let i = 0; i < vec.length; i++)
                vec[i] /= mag;
        }
        vectors.set(session.sessionId, vec);
    }
    return { toolVocabulary, toolVocabIndex, toolIdfWeights, vectors };
}
|
|
68
|
+
/**
 * Build an L2-normalized structural feature vector (STRUCTURAL_DIM slots) per
 * session: role ratios, subagent involvement, tool density, and edit-vs-read
 * heaviness derived from meta.toolBreakdown.
 *
 * Sessions with no turns map to an all-zero vector.
 *
 * @returns Map of sessionId → Float64Array(STRUCTURAL_DIM)
 */
export function buildStructuralVectors(sessions) {
    const vectors = new Map();
    for (const session of sessions) {
        const vec = new Float64Array(STRUCTURAL_DIM);
        const turnCount = session.turns.length;
        if (turnCount === 0) {
            vectors.set(session.sessionId, vec);
            continue;
        }
        // Tally per-turn role presence and tool usage.
        let promptTurns = 0;
        let replyTurns = 0;
        let toolTurns = 0;
        let agentTurns = 0;
        let toolCallTotal = 0;
        for (const turn of session.turns) {
            if (turn.userPrompt)
                promptTurns++;
            if (turn.assistantTexts.length > 0)
                replyTurns++;
            const nCalls = turn.toolCalls.length;
            if (nCalls > 0)
                toolTurns++;
            toolCallTotal += nCalls;
            // A turn counts as a subagent turn when it spawns an "Agent" tool call.
            if (turn.toolCalls.some((tc) => tc.toolName === "Agent")) {
                agentTurns++;
            }
        }
        const subagentCount = Object.keys(session.subagents).length;
        const denom = promptTurns + replyTurns + toolTurns || 1;
        vec[0] = promptTurns / denom; // userRatio
        vec[1] = replyTurns / denom; // assistantRatio
        vec[2] = toolTurns / denom; // toolCallRatio
        vec[3] = subagentCount > 0
            ? Math.min(1, (agentTurns + subagentCount) / turnCount)
            : 0; // subagentRatio
        vec[4] = Math.log(1 + toolCallTotal / turnCount); // avgToolsPerTurn (log dampened)
        // Edit heaviness vs read heaviness from the aggregate tool breakdown.
        const breakdown = session.meta.toolBreakdown;
        const breakdownTotal = Object.values(breakdown).reduce((acc, n) => acc + n, 0) || 1;
        vec[5] = ((breakdown["Edit"] || 0) + (breakdown["Write"] || 0)) / breakdownTotal;
        vec[6] = ((breakdown["Read"] || 0) + (breakdown["Grep"] || 0) + (breakdown["Glob"] || 0)) / breakdownTotal;
        // L2 normalize.
        let sumSq = 0;
        for (const x of vec)
            sumSq += x * x;
        const mag = Math.sqrt(sumSq);
        if (mag > 0) {
            for (let i = 0; i < vec.length; i++)
                vec[i] /= mag;
        }
        vectors.set(session.sessionId, vec);
    }
    return vectors;
}
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Semantic knowledge graph construction from Claude Code session data.
|
|
3
|
+
* Pipeline: TF-IDF + Tool-IDF + Structure → SVD (Latent Semantic) → Clustering → Louvain → Brandes
|
|
4
|
+
*/
|
|
5
|
+
import { STRUCTURAL_DIM } from "./constants.js";
|
|
6
|
+
import { tokenize } from "./tokenizer.js";
|
|
7
|
+
import { buildTfidf } from "./tfidf.js";
|
|
8
|
+
import { buildToolIdf, buildStructuralVectors } from "./feature-extraction.js";
|
|
9
|
+
import { buildCombinedMatrix, truncatedSvd, interpretLatentDimensions } from "./svd.js";
|
|
10
|
+
import { cosineDistance } from "./similarity.js";
|
|
11
|
+
import { agglomerativeClusteringFromDistMatrix, splitOversizedClusters, } from "./clustering.js";
|
|
12
|
+
import { buildTopicNodes } from "./topic-nodes.js";
|
|
13
|
+
import { buildTopicEdges } from "./edges.js";
|
|
14
|
+
import { louvainDetection, brandesBetweenness } from "./community.js";
|
|
15
|
+
import { computeReusabilityScores } from "./reusability.js";
|
|
16
|
+
import { extractEnrichedSequences } from "./tool-pattern.js";
|
|
17
|
+
import { generateSkillCandidates } from "./skill-generator.js";
|
|
18
|
+
export { tokenize, splitCamelCase, extractPathTokens, isNoiseToken } from "./tokenizer.js";
|
|
19
|
+
export { buildTfidf } from "./tfidf.js";
|
|
20
|
+
export { buildToolIdf, buildStructuralVectors } from "./feature-extraction.js";
|
|
21
|
+
export { buildCombinedMatrix, truncatedSvd, interpretLatentDimensions } from "./svd.js";
|
|
22
|
+
export { cosineSimilarity, cosineDistance } from "./similarity.js";
|
|
23
|
+
export { agglomerativeClusteringFromDistMatrix, findElbowThreshold, clusterWithThresholdFromDistMatrix, splitOversizedClusters, } from "./clustering.js";
|
|
24
|
+
export { extractDominantAction, selectRepresentativePrompts, generateSuggestedPrompt, computeToolSignature, classifyDominantRole, buildTopicNodes, } from "./topic-nodes.js";
|
|
25
|
+
export { buildTopicEdges, classifyEdge, findSharedKeywords, findCommonPathPrefix, } from "./edges.js";
|
|
26
|
+
export { louvainDetection, brandesBetweenness } from "./community.js";
|
|
27
|
+
export { computeReusabilityScores } from "./reusability.js";
|
|
28
|
+
export { abstractToolCall, extractEnrichedSequences } from "./tool-pattern.js";
|
|
29
|
+
export { generateSkillMarkdown, generateHookJson, generateSkillCandidates } from "./skill-generator.js";
|
|
30
|
+
// ─── Main Entry Point ───────────────────────────────────────────────────────
|
|
31
|
+
/**
 * Build the full semantic knowledge graph from parsed Claude Code sessions.
 *
 * Pipeline: tokenize → TF-IDF + Tool-IDF + structural features → truncated SVD
 * → agglomerative clustering → topic nodes/edges → Louvain communities →
 * Brandes betweenness → reusability scores → skill candidates.
 *
 * @param sessions - parsed session objects (turns, subagents, meta).
 * @param options - feature toggles: enableLouvain, enableBrandes (default true).
 * @returns { nodes, edges, communities, metrics, enrichedToolSequences, skillCandidates }
 */
export function buildSemanticKnowledgeGraph(sessions, options = {}) {
    const { enableLouvain = true, enableBrandes = true } = options;
    console.log(` [Knowledge Graph] Processing ${sessions.length} sessions...`);
    // Edge case: too few sessions — return an empty but fully-shaped result.
    if (sessions.length === 0) {
        return {
            nodes: [],
            edges: [],
            communities: [],
            metrics: {
                totalTopics: 0,
                totalEdges: 0,
                graphDensity: 0,
                modularity: 0,
                isolatedTopicCount: 0,
                bridgeTopicIds: [],
            },
            enrichedToolSequences: [],
            skillCandidates: [],
        };
    }
    // Step 1: Extract session text documents (for TF-IDF).
    // Each document = user prompts + assistant texts + edited file paths + branch name.
    const documents = new Map();
    for (const session of sessions) {
        const textParts = [];
        for (const turn of session.turns) {
            if (turn.userPrompt)
                textParts.push(turn.userPrompt);
            for (const text of turn.assistantTexts) {
                textParts.push(text);
            }
        }
        // Also include file paths as tokens
        for (const f of session.meta.filesEdited) {
            textParts.push(f);
        }
        // Include branch name
        if (session.meta.gitBranch) {
            textParts.push(session.meta.gitBranch);
        }
        const fullText = textParts.join(" ");
        const tokens = tokenize(fullText);
        // Sessions that tokenize to nothing are excluded from the graph entirely.
        if (tokens.length > 0) {
            documents.set(session.sessionId, tokens);
        }
    }
    console.log(` [Knowledge Graph] Tokenized ${documents.size} sessions (${sessions.length - documents.size} excluded: empty)`);
    // Filter sessions to only those with documents
    const activeSessions = sessions.filter((s) => documents.has(s.sessionId));
    const sessionIds = activeSessions.map((s) => s.sessionId);
    // Step 1b: Build Tool-IDF and Structural vectors (always needed, even for single topic)
    const toolIdf = buildToolIdf(activeSessions);
    console.log(` [Knowledge Graph] Tool-IDF: ${toolIdf.toolVocabulary.length} tool types`);
    const structVectors = buildStructuralVectors(activeSessions);
    console.log(` [Knowledge Graph] Structural features: ${STRUCTURAL_DIM} dimensions`);
    // Extract enriched tool sequences from all sessions
    const enrichedSequences = extractEnrichedSequences(activeSessions);
    console.log(` [Knowledge Graph] Enriched sequences: ${enrichedSequences.length} patterns detected`);
    // Fewer than 2 usable sessions: skip TF-IDF/SVD/clustering and emit a single
    // topic with one catch-all community.
    if (activeSessions.length < 2) {
        // Single session: create one topic
        const emptyTfidf = { vocabulary: [], vocabIndex: new Map(), vectors: new Map() };
        const singleTopic = buildTopicNodes([sessionIds.map((_, i) => i)], activeSessions, emptyTfidf, toolIdf);
        // computeReusabilityScores mutates the topic objects in place.
        computeReusabilityScores(singleTopic);
        const skillCandidates = generateSkillCandidates(singleTopic, enrichedSequences);
        return {
            nodes: singleTopic,
            edges: [],
            communities: [
                {
                    id: 0,
                    topicIds: singleTopic.map((t) => t.id),
                    label: singleTopic[0]?.label || "All",
                    dominantProject: singleTopic[0]?.project || "",
                },
            ],
            metrics: {
                totalTopics: singleTopic.length,
                totalEdges: 0,
                graphDensity: 0,
                modularity: 0,
                isolatedTopicCount: singleTopic.length,
                bridgeTopicIds: [],
            },
            enrichedToolSequences: enrichedSequences,
            skillCandidates,
        };
    }
    // Step 2: TF-IDF (text features)
    const tfidf = buildTfidf(documents);
    console.log(` [Knowledge Graph] TF-IDF: ${tfidf.vocabulary.length} terms in vocabulary`);
    // Step 3: Build combined matrix and apply Truncated SVD
    const textDim = tfidf.vocabulary.length;
    const toolDim = toolIdf.toolVocabulary.length;
    const { matrix, totalDim } = buildCombinedMatrix(sessionIds, tfidf.vectors, toolIdf.vectors, structVectors, textDim, toolDim, STRUCTURAL_DIM);
    // Choose k: enough dimensions to capture nuanced clusters.
    // Use m/4 clamped to [20, 80] — higher than sqrt(m) to preserve more signal.
    const targetK = Math.min(80, Math.max(20, Math.round(activeSessions.length / 4)));
    const svd = truncatedSvd(sessionIds, matrix, totalDim, targetK);
    console.log(` [Knowledge Graph] SVD: ${totalDim}d → ${svd.k}d latent space (top-3 σ: ${[...svd.sigma.slice(0, 3)].map(s => s.toFixed(2)).join(', ')})`);
    // Interpret latent dimensions (for logging and potential use in labeling)
    const latentDims = interpretLatentDimensions(svd, tfidf.vocabulary, toolIdf.toolVocabulary, textDim, toolDim, 5);
    // Log top 3 latent dimensions
    for (const dim of latentDims.slice(0, 3)) {
        const terms = dim.topTerms.slice(0, 3).map(t => t.term).join(', ');
        const tools = dim.topTools.slice(0, 2).map(t => t.tool).join(', ');
        console.log(` dim-${dim.index}: var=${(dim.varianceRatio * 100).toFixed(1)}% terms=[${terms}] tools=[${tools}]`);
    }
    // Step 4: Clustering on dense SVD vectors (cosine distance is now reliable)
    let clusterMembers;
    if (activeSessions.length < 5) {
        // Too few sessions to cluster meaningfully: one singleton cluster each.
        clusterMembers = sessionIds.map((_, i) => [i]);
    }
    else {
        // Build distance matrix from SVD session vectors
        // Keys are canonical "i:j" strings with i < j, so each pair is stored once.
        const distKey = (i, j) => i < j ? `${i}:${j}` : `${j}:${i}`;
        const svdDist = new Map();
        for (let i = 0; i < sessionIds.length; i++) {
            const vi = svd.sessionVectors.get(sessionIds[i]);
            for (let j = i + 1; j < sessionIds.length; j++) {
                const vj = svd.sessionVectors.get(sessionIds[j]);
                svdDist.set(distKey(i, j), cosineDistance(vi, vj));
            }
        }
        clusterMembers = agglomerativeClusteringFromDistMatrix(sessionIds, svdDist);
        // Split oversized clusters
        clusterMembers = splitOversizedClusters(clusterMembers, activeSessions.length, svdDist);
    }
    console.log(` [Knowledge Graph] Clustering: ${clusterMembers.length} topics from ${activeSessions.length} sessions`);
    // Step 5: Build topic nodes
    const topics = buildTopicNodes(clusterMembers, activeSessions, tfidf, toolIdf);
    // Step 5b: Compute reusability scores (mutates topics in place)
    computeReusabilityScores(topics);
    console.log(` [Knowledge Graph] Reusability scores computed for ${topics.length} topics`);
    // Step 6: Build topic edges (using SVD vectors for semantic similarity)
    const edges = buildTopicEdges(topics, activeSessions, tfidf, svd);
    console.log(` [Knowledge Graph] Edges: ${edges.length} topic connections`);
    // Step 7: Louvain community detection (optional)
    let communities;
    let modularity;
    if (enableLouvain) {
        const louvainResult = louvainDetection(topics, edges);
        communities = louvainResult.communities;
        modularity = louvainResult.modularity;
        console.log(` [Knowledge Graph] Communities: ${communities.length} (modularity: ${modularity.toFixed(3)})`);
    }
    else {
        // Fallback: each cluster is its own community
        communities = topics.map((t, i) => ({
            id: i,
            topicIds: [t.id],
            label: t.label,
            dominantProject: t.project,
        }));
        // Assign communityId to topics
        for (let i = 0; i < topics.length; i++) {
            topics[i].communityId = i;
        }
        modularity = 0;
        console.log(` [Knowledge Graph] Communities: ${communities.length} (Louvain disabled, using cluster-based)`);
    }
    // Step 8: Graph metrics (Brandes + degree centrality, optional)
    if (enableBrandes) {
        // NOTE(review): presumably brandesBetweenness annotates each topic with
        // betweennessCentrality/degreeCentrality in place (the fields are read
        // below) — confirm against community.js.
        brandesBetweenness(topics, edges);
    }
    else {
        // Compute degree centrality only
        const nTopics = topics.length;
        const degreeMap = new Map();
        for (const e of edges) {
            degreeMap.set(e.source, (degreeMap.get(e.source) || 0) + 1);
            degreeMap.set(e.target, (degreeMap.get(e.target) || 0) + 1);
        }
        for (const t of topics) {
            t.degreeCentrality = nTopics > 1
                ? (degreeMap.get(t.id) || 0) / (nTopics - 1)
                : 0;
        }
        console.log(` [Knowledge Graph] Brandes disabled, degree centrality only`);
    }
    const isolatedCount = topics.filter((t) => t.degreeCentrality === 0).length;
    const nTopics = topics.length;
    // Density = actual edges / max possible undirected edges.
    const maxEdges = (nTopics * (nTopics - 1)) / 2;
    const density = maxEdges > 0 ? edges.length / maxEdges : 0;
    // Bridge topics: top 10% by betweenness centrality
    const sortedByBetweenness = [...topics]
        .filter((t) => t.betweennessCentrality > 0)
        .sort((a, b) => b.betweennessCentrality - a.betweennessCentrality);
    const bridgeCount = Math.max(1, Math.ceil(sortedByBetweenness.length * 0.1));
    const bridgeTopicIds = sortedByBetweenness
        .slice(0, bridgeCount)
        .map((t) => t.id);
    const metrics = {
        totalTopics: nTopics,
        totalEdges: edges.length,
        graphDensity: Math.round(density * 10000) / 10000,
        modularity: Math.round(modularity * 10000) / 10000,
        isolatedTopicCount: isolatedCount,
        bridgeTopicIds,
    };
    // Step 9: Generate skill candidates
    const skillCandidates = generateSkillCandidates(topics, enrichedSequences);
    console.log(` [Knowledge Graph] Skill candidates: ${skillCandidates.length} generated`);
    console.log(` [Knowledge Graph] Done. ${nTopics} topics, ${edges.length} edges, ${communities.length} communities, ${isolatedCount} isolated`);
    return { nodes: topics, edges, communities, metrics, enrichedToolSequences: enrichedSequences, skillCandidates };
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Reusability score computation for topic nodes.
|
|
3
|
+
* Quantifies how valuable a topic pattern is for automation as skill/hook.
|
|
4
|
+
*/
|
|
5
|
+
/**
 * Annotate each topic with a reusabilityScore object (mutates topics in place).
 *
 * All four components are normalized against the cohort maxima, then blended:
 * 0.35 * frequency + 0.25 * timeCost + 0.25 * crossProjectScore + 0.15 * recency.
 * Every component and the overall score are rounded to 3 decimal places.
 *
 * @param topics - topic nodes with sessionCount, projects, totalDurationMinutes, lastSeen.
 * @param now - reference time for recency (defaults to the current time).
 */
export function computeReusabilityScores(topics, now = new Date()) {
    if (topics.length === 0) {
        return;
    }
    // Cohort maxima used as normalization denominators.
    const peakSessions = Math.max(...topics.map((t) => t.sessionCount));
    const peakProjects = Math.max(...topics.map((t) => t.projects.length));
    const meanDuration = (t) => (t.sessionCount > 0 ? t.totalDurationMinutes / t.sessionCount : 0);
    const peakAvgDuration = Math.max(...topics.map(meanDuration), 1);
    const nowMs = now.getTime();
    // Days since each topic was last seen; Infinity when lastSeen is absent.
    const ageDays = topics.map((t) => {
        if (!t.lastSeen) {
            return Infinity;
        }
        return Math.max(0, (nowMs - new Date(t.lastSeen).getTime()) / (1000 * 60 * 60 * 24));
    });
    const peakDays = Math.max(...ageDays.filter((d) => isFinite(d)), 1);
    const round3 = (x) => Math.round(x * 1000) / 1000;
    topics.forEach((topic, i) => {
        const frequency = peakSessions > 0 ? topic.sessionCount / peakSessions : 0;
        const timeCost = peakAvgDuration > 0 ? meanDuration(topic) / peakAvgDuration : 0;
        const crossProjectScore = peakProjects > 1
            ? (topic.projects.length - 1) / (peakProjects - 1)
            : 0;
        const age = ageDays[i];
        // Topics never seen (Infinity) get zero recency.
        const recency = isFinite(age) && peakDays > 0 ? 1 - age / peakDays : 0;
        const overall = 0.35 * frequency +
            0.25 * timeCost +
            0.25 * crossProjectScore +
            0.15 * recency;
        topic.reusabilityScore = {
            overall: round3(overall),
            frequency: round3(frequency),
            timeCost: round3(timeCost),
            crossProjectScore: round3(crossProjectScore),
            recency: round3(recency),
        };
    });
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cosine similarity and distance for L2-normalized vectors.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Cosine similarity as a plain dot product — valid only because all vectors in
 * this pipeline are already L2-normalized before use.
 */
export function cosineSimilarity(a, b) {
    return a.reduce((acc, value, i) => acc + value * b[i], 0);
}
/**
 * Cosine distance: 1 minus the cosine similarity of two L2-normalized vectors.
 */
export function cosineDistance(a, b) {
    return 1 - cosineSimilarity(a, b);
}
|