@chigichan24/crune 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +155 -0
- package/bin/crune.js +2 -0
- package/dist-cli/__tests__/cli.test.js +63 -0
- package/dist-cli/__tests__/clustering.test.js +200 -0
- package/dist-cli/__tests__/community.test.js +115 -0
- package/dist-cli/__tests__/edges.test.js +130 -0
- package/dist-cli/__tests__/feature-extraction.test.js +66 -0
- package/dist-cli/__tests__/fixtures.js +192 -0
- package/dist-cli/__tests__/orchestrator.test.js +253 -0
- package/dist-cli/__tests__/session-parser.test.js +335 -0
- package/dist-cli/__tests__/session-summarizer.test.js +117 -0
- package/dist-cli/__tests__/skill-server.test.js +191 -0
- package/dist-cli/__tests__/svd.test.js +112 -0
- package/dist-cli/__tests__/tfidf.test.js +88 -0
- package/dist-cli/__tests__/tokenizer.test.js +125 -0
- package/dist-cli/__tests__/topic-nodes.test.js +184 -0
- package/dist-cli/analyze-sessions.js +476 -0
- package/dist-cli/cli.js +215 -0
- package/dist-cli/knowledge-graph/clustering.js +174 -0
- package/dist-cli/knowledge-graph/community.js +220 -0
- package/dist-cli/knowledge-graph/constants.js +58 -0
- package/dist-cli/knowledge-graph/edges.js +193 -0
- package/dist-cli/knowledge-graph/feature-extraction.js +124 -0
- package/dist-cli/knowledge-graph/index.js +235 -0
- package/dist-cli/knowledge-graph/reusability.js +51 -0
- package/dist-cli/knowledge-graph/similarity.js +13 -0
- package/dist-cli/knowledge-graph/skill-generator.js +203 -0
- package/dist-cli/knowledge-graph/svd.js +195 -0
- package/dist-cli/knowledge-graph/tfidf.js +54 -0
- package/dist-cli/knowledge-graph/tokenizer.js +66 -0
- package/dist-cli/knowledge-graph/tool-pattern.js +173 -0
- package/dist-cli/knowledge-graph/topic-nodes.js +199 -0
- package/dist-cli/knowledge-graph/types.js +4 -0
- package/dist-cli/knowledge-graph-builder.js +27 -0
- package/dist-cli/session-parser.js +360 -0
- package/dist-cli/session-summarizer.js +133 -0
- package/dist-cli/skill-server.js +62 -0
- package/dist-cli/skill-synthesizer.js +189 -0
- package/package.json +47 -0
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generates Claude Code skill Markdown and hook JSON from detected patterns.
|
|
3
|
+
*
|
|
4
|
+
* Skill format follows the conventions from anthropics/skills (Apache-2.0):
|
|
5
|
+
* https://github.com/anthropics/skills/tree/main/skills/skill-creator
|
|
6
|
+
*
|
|
7
|
+
* Key conventions applied:
|
|
8
|
+
* - YAML frontmatter with `name` and `description` (description includes "when to use")
|
|
9
|
+
* - Imperative writing style
|
|
10
|
+
* - "Why" explanations over bare MUST/NEVER rules
|
|
11
|
+
* - Concrete examples from representative prompts
|
|
12
|
+
* - Progressive disclosure: keep SKILL.md body concise
|
|
13
|
+
*/
|
|
14
|
+
// ─── Skill name generation ──────────────────────────────────────────────────
|
|
15
|
+
/**
 * Build a kebab-case skill name from the top keywords plus the project
 * basename, capped at 40 characters.
 *
 * @param {string[]} keywords - Topic keywords, most significant first.
 * @param {string} project - Project path; only its last "/" segment is used.
 * @returns {string} Non-empty kebab-case name (falls back to "skill").
 */
function toSkillName(keywords, project) {
  const parts = keywords
    .slice(0, 3)
    .map((k) => k.toLowerCase().replace(/[^a-z0-9]/g, ""))
    .filter((k) => k.length > 0);
  const projectSuffix = project
    .split("/")
    .pop()
    ?.toLowerCase()
    .replace(/[^a-z0-9]/g, "");
  if (projectSuffix && !parts.includes(projectSuffix)) {
    parts.push(projectSuffix);
  }
  // Trim any hyphen left dangling by the 40-char cap, and never return an
  // empty string: an empty `name:` would make the skill frontmatter invalid.
  const name = parts.join("-").slice(0, 40).replace(/-+$/, "");
  return name || "skill";
}
|
|
31
|
+
// ─── Description generation (with "pushiness" per skill-creator guidance) ───
|
|
32
|
+
/**
 * Build the frontmatter description for a topic: WHEN to trigger plus WHAT
 * the workflow does ("pushiness" per skill-creator guidance, to counteract
 * under-triggering).
 *
 * @param {object} topic - Detected topic node.
 * @returns {string} One-paragraph description.
 */
function buildDescription(topic) {
  const action = topic.suggestedPrompt.split(" — ")[0] || "work on this domain";
  const tools = topic.toolSignature
    .slice(0, 3)
    .map((t) => t.tool)
    .join(", ");
  let roleHint;
  if (topic.dominantRole === "subagent-delegated") {
    roleHint = "Delegates to specialized subagents.";
  } else if (topic.dominantRole === "tool-heavy") {
    roleHint = `Tool-intensive workflow using ${tools}.`;
  } else {
    roleHint = `Interactive workflow using ${tools}.`;
  }
  let projectScope;
  if (topic.projects.length > 1) {
    const shown = topic.projects.slice(0, 2).join(", ");
    const ellipsis = topic.projects.length > 2 ? ", ..." : "";
    projectScope = `Applies across ${topic.projects.length} projects (${shown}${ellipsis}).`;
  } else {
    projectScope = `Scoped to ${topic.project}.`;
  }
  return `Use when you need to ${action}. ${roleHint} ${projectScope} Detected from ${topic.sessionCount} sessions over ${topic.totalDurationMinutes} minutes of usage.`;
}
|
|
51
|
+
// ─── Skill body generation ──────────────────────────────────────────────────
|
|
52
|
+
/**
 * Assemble the SKILL.md Markdown body for one topic.
 *
 * Sections, in order: Overview, When to Use, Workflow, Detected Patterns
 * (only when sequences exist), Guidelines. Written in imperative style with
 * "why" explanations, per the skill-creator conventions cited in the file
 * header.
 *
 * @param {object} topic - Detected topic node (keywords, role, tool signature, prompts).
 * @param {Array} relatedSequences - Enriched tool sequences overlapping this topic's sessions.
 * @returns {string} Markdown body (sections joined with "\n").
 */
function buildSkillBody(topic, relatedSequences) {
  const sections = [];
  // Section 1: Overview — what this skill automates and why it exists.
  sections.push("## Overview");
  sections.push("");
  sections.push(`This skill captures a recurring workflow pattern detected across ${topic.sessionCount} sessions.`);
  if (topic.projects.length > 1) {
    sections.push(`It spans ${topic.projects.length} projects, indicating cross-project reusable knowledge.`);
  }
  sections.push("");
  // Section 2: When to Use — explicit trigger guidance ("pushiness").
  sections.push("## When to Use");
  sections.push("");
  if (topic.representativePrompts.length > 0) {
    // Concrete example prompts beat abstract keyword lists for triggering.
    sections.push("Activate this skill when the user's request resembles:");
    for (const prompt of topic.representativePrompts) {
      sections.push(`- "${prompt}"`);
    }
  }
  else {
    // Fallback when no representative prompts were captured for the topic.
    sections.push(`Activate when working on tasks involving: ${topic.keywords.join(", ")}.`);
  }
  sections.push("");
  // Section 3: Workflow steps — imperative style per skill-creator guidance.
  sections.push("## Workflow");
  sections.push("");
  if (topic.dominantRole === "subagent-delegated") {
    sections.push("Delegate to specialized subagents. This pattern benefits from parallel execution:");
    sections.push("");
    sections.push("1. Analyze the task scope and identify subtasks");
    sections.push("2. Spawn subagents for each independent subtask using the Agent tool");
    sections.push("3. Collect and synthesize results");
    sections.push("");
  }
  else {
    // Derive numbered steps from the topic's top-5 tool signature entries.
    sections.push("Follow this tool sequence:");
    sections.push("");
    const toolSteps = topic.toolSignature.slice(0, 5);
    for (let i = 0; i < toolSteps.length; i++) {
      const ts = toolSteps[i];
      const purpose = describeToolPurpose(ts.tool);
      sections.push(`${i + 1}. Use **${ts.tool}** to ${purpose}`);
    }
    sections.push("");
  }
  // Section 4: Detected Patterns — concrete tool sequences with targets;
  // emitted only when mining produced related sequences.
  if (relatedSequences.length > 0) {
    sections.push("## Detected Patterns");
    sections.push("");
    sections.push("The following tool call patterns were frequently observed in this workflow:");
    sections.push("");
    for (const seq of relatedSequences.slice(0, 3)) {
      // Render each sequence as "Tool (target) → Tool (target) → ...".
      const flow = seq.sequence
        .map((s) => {
          const target = s.targetPattern ? ` (${s.targetPattern})` : "";
          return `${s.toolName}${target}`;
        })
        .join(" → ");
      sections.push(`- \`${flow}\` — ${seq.count} occurrences`);
    }
    sections.push("");
  }
  // Section 5: Guidelines — why-based advice rather than bare MUST/NEVER rules.
  sections.push("## Guidelines");
  sections.push("");
  if (topic.dominantRole === "tool-heavy") {
    sections.push("- Prefer tool calls over asking the user, because this pattern historically involves intensive automated operations.");
  }
  if (topic.projects.length > 1) {
    sections.push("- Check project-specific conventions before applying, because this pattern spans multiple projects that may have different standards.");
  }
  // Exploration-heavy topics get a "read before you write" guideline.
  const readTools = topic.toolSignature.filter((t) => t.tool === "Read" || t.tool === "Grep" || t.tool === "Glob");
  if (readTools.length > 0) {
    sections.push("- Read and understand existing code before making changes, because this pattern involves significant code exploration.");
  }
  sections.push("");
  return sections.join("\n");
}
|
|
131
|
+
/**
 * Imperative phrase completing "Use **<tool>** to ..." in the workflow list.
 *
 * @param {string} tool - Tool name (e.g. "Read", "Bash").
 * @returns {string} Purpose phrase; generic fallback for unknown tools.
 */
function describeToolPurpose(tool) {
  // Map avoids accidental hits on Object.prototype keys for odd tool names.
  const purposes = new Map([
    ["Read", "examine existing files and understand current state"],
    ["Grep", "search for relevant patterns and references"],
    ["Glob", "locate target files by pattern"],
    ["Edit", "apply targeted modifications to existing files"],
    ["Write", "create new files as needed"],
    ["Bash", "execute commands (build, test, git operations)"],
    ["Agent", "delegate subtasks to specialized subagents"],
  ]);
  return purposes.get(tool) ?? `perform ${tool} operations`;
}
|
|
151
|
+
// ─── Public API ─────────────────────────────────────────────────────────────
|
|
152
|
+
/**
 * Render a complete SKILL.md document for a topic: YAML frontmatter
 * (`name`, folded `description`) followed by a heading and the generated body.
 *
 * @param {object} topic - Detected topic node.
 * @param {Array} [relatedSequences=[]] - Enriched sequences for the "Detected Patterns" section.
 * @returns {string} Trimmed Markdown document.
 */
export function generateSkillMarkdown(topic, relatedSequences = []) {
  const name = toSkillName(topic.keywords, topic.project);
  const description = buildDescription(topic);
  const body = buildSkillBody(topic, relatedSequences);
  // NOTE(review): the description line must stay indented under the `>-`
  // folded-block scalar for the YAML frontmatter to parse — confirm the
  // two-space indent matches the published output.
  return `---
name: ${name}
description: >-
  ${description}
---

# ${topic.keywords.slice(0, 3).join(" / ")}

${body}`.trim();
}
|
|
166
|
+
/**
 * Serialize a detected tool sequence as hook-definition JSON.
 *
 * Hooks are only emitted for sequences containing at least one Bash step
 * whose command was classified into a concrete category (not "other").
 *
 * @param {object} sequence - Enriched sequence (steps, count, projects).
 * @returns {string|undefined} Pretty-printed JSON, or undefined when no
 *   categorized Bash step exists.
 */
export function generateHookJson(sequence) {
  const hasCategorizedBash = sequence.sequence.some(
    (s) => s.toolName === "Bash" && s.targetPattern && s.targetPattern !== "other"
  );
  if (!hasCategorizedBash) {
    return undefined;
  }
  const pattern = sequence.sequence.map((s) => ({
    tool: s.toolName,
    category: s.category,
    target: s.targetPattern || null,
  }));
  const hookDef = {
    description: `Auto-detected pattern from ${sequence.count} occurrences across ${sequence.projects.length} project(s)`,
    pattern,
    sessionCount: sequence.count,
    projects: sequence.projects,
  };
  return JSON.stringify(hookDef, null, 2);
}
|
|
183
|
+
/**
 * Produce skill candidates for every topic, highest reusability first.
 *
 * Each candidate pairs the generated SKILL.md with an optional hook JSON
 * built from the first sequence sharing a session with the topic.
 *
 * @param {Array} topics - Topic nodes with reusabilityScore and sessionIds.
 * @param {Array} enrichedSequences - Mined tool sequences with sessionIds.
 * @returns {Array} Candidate objects { topicId, reusabilityScore, skillMarkdown, hookJson }.
 */
export function generateSkillCandidates(topics, enrichedSequences) {
  const byReusability = [...topics].sort(
    (a, b) => b.reusabilityScore.overall - a.reusabilityScore.overall
  );
  return byReusability.map((topic) => {
    // Sequences are "related" when they share at least one session with the topic.
    const sessionIdSet = new Set(topic.sessionIds);
    const related = enrichedSequences.filter((seq) =>
      seq.sessionIds.some((sid) => sessionIdSet.has(sid))
    );
    return {
      topicId: topic.id,
      reusabilityScore: topic.reusabilityScore.overall,
      skillMarkdown: generateSkillMarkdown(topic, related),
      hookJson: related.length > 0 ? generateHookJson(related[0]) : undefined,
    };
  });
}
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Truncated SVD (Latent Semantic Analysis) via Gram matrix power iteration.
|
|
3
|
+
*/
|
|
4
|
+
import { WEIGHT_TEXT, WEIGHT_TOOL, WEIGHT_STRUCT } from "./constants.js";
|
|
5
|
+
/**
|
|
6
|
+
* Build a combined feature matrix from text, tool, and structural vectors.
|
|
7
|
+
* Each group is L2-normalized, then scaled by its weight before concatenation.
|
|
8
|
+
* Returns a dense row-major matrix (m × n) and sessionId ordering.
|
|
9
|
+
*/
|
|
10
|
+
/**
 * Build a combined feature matrix from text, tool, and structural vectors.
 *
 * Each group is scaled by sqrt(weight) per element, so each group contributes
 * its configured weight to squared norms and dot products after concatenation.
 * Missing vectors for a session leave that group's columns at zero.
 *
 * @param {Array} sessionIds - Row ordering for the matrix.
 * @param {Map} textVectors - sessionId → Float64Array of length textDim.
 * @param {Map} toolVectors - sessionId → Float64Array of length toolDim.
 * @param {Map} structVectors - sessionId → Float64Array of length structDim.
 * @param {number} textDim
 * @param {number} toolDim
 * @param {number} structDim
 * @returns {{matrix: Float64Array[], totalDim: number}} Dense row-major matrix.
 */
export function buildCombinedMatrix(sessionIds, textVectors, toolVectors, structVectors, textDim, toolDim, structDim) {
  const totalDim = textDim + toolDim + structDim;
  const textScale = Math.sqrt(WEIGHT_TEXT);
  const toolScale = Math.sqrt(WEIGHT_TOOL);
  const structScale = Math.sqrt(WEIGHT_STRUCT);
  const matrix = [];
  for (const sid of sessionIds) {
    const row = new Float64Array(totalDim);
    // Copy one feature group into the row at `offset`, scaled; no-op when
    // the session has no vector for that group.
    const copyScaled = (src, offset, dim, scale) => {
      if (!src) {
        return;
      }
      for (let i = 0; i < dim; i++) {
        row[offset + i] = src[i] * scale;
      }
    };
    copyScaled(textVectors.get(sid), 0, textDim, textScale);
    copyScaled(toolVectors.get(sid), textDim, toolDim, toolScale);
    copyScaled(structVectors.get(sid), textDim + toolDim, structDim, structScale);
    matrix.push(row);
  }
  return { matrix, totalDim };
}
|
|
34
|
+
/**
|
|
35
|
+
* Truncated SVD via power iteration on A·A^T (the Gram matrix).
|
|
36
|
+
*
|
|
37
|
+
* For m sessions × n features where m << n, computing the m×m Gram matrix
|
|
38
|
+
* and extracting its top-k eigenvectors is far cheaper than full SVD.
|
|
39
|
+
*/
|
|
40
|
+
/**
 * Truncated SVD via power iteration on the Gram matrix G = A·A^T.
 *
 * For m sessions × n features with m << n, extracting the top-k eigenpairs of
 * the m×m Gram matrix is far cheaper than a full SVD of A.
 *
 * @param {Array} sessionIds - Row labels for `matrix`, in order.
 * @param {Float64Array[]} matrix - Dense row-major m×totalDim matrix.
 * @param {number} totalDim - Number of columns (n).
 * @param {number} targetK - Desired number of latent dimensions.
 * @returns {{U: Float64Array[], sigma: Float64Array, V: Float64Array[], k: number, sessionVectors: Map}}
 *   U: left singular vectors (length-m each); sigma: singular values;
 *   V: right singular vectors (length-n each); sessionVectors: sessionId →
 *   L2-normalized k-dim embedding.
 */
export function truncatedSvd(sessionIds, matrix, totalDim, targetK) {
  const m = matrix.length;
  const n = totalDim;
  // k capped at m-1 (Gram rank bound) and n.
  // NOTE(review): for m <= 1 this makes k <= 0 and all outputs are empty —
  // confirm callers guard against single-session input.
  const k = Math.min(targetK, m - 1, n);
  // Step 1: Gram matrix G = A · A^T (m × m, symmetric — fill both triangles).
  const G = new Float64Array(m * m);
  for (let i = 0; i < m; i++) {
    for (let j = i; j < m; j++) {
      let dot = 0;
      for (let d = 0; d < n; d++) {
        dot += matrix[i][d] * matrix[j][d];
      }
      G[i * m + j] = dot;
      G[j * m + i] = dot;
    }
  }
  // Step 2: Power iteration with deflation to extract top-k eigenvectors of G.
  const eigenvectors = [];
  const eigenvalues = [];
  // Seeded LCG PRNG so results are reproducible across runs.
  let seed = 42;
  const nextRand = () => {
    seed = (seed * 1103515245 + 12345) & 0x7fffffff;
    return seed / 0x7fffffff;
  };
  for (let ki = 0; ki < k; ki++) {
    // Random initial vector, centered around zero.
    const v = new Float64Array(m);
    for (let i = 0; i < m; i++)
      v[i] = nextRand() - 0.5;
    // Normalize the starting vector to unit length.
    let norm = 0;
    for (let i = 0; i < m; i++)
      norm += v[i] * v[i];
    norm = Math.sqrt(norm);
    for (let i = 0; i < m; i++)
      v[i] /= norm;
    // Power iteration (50 fixed iterations; no convergence test needed here).
    for (let iter = 0; iter < 50; iter++) {
      // w = G · v
      const w = new Float64Array(m);
      for (let i = 0; i < m; i++) {
        let s = 0;
        for (let j = 0; j < m; j++) {
          s += G[i * m + j] * v[j];
        }
        w[i] = s;
      }
      // Deflate: remove projections onto previously found eigenvectors so
      // this iteration converges to the next-largest eigenpair.
      for (let prev = 0; prev < ki; prev++) {
        const ev = eigenvectors[prev];
        let proj = 0;
        for (let i = 0; i < m; i++)
          proj += w[i] * ev[i];
        for (let i = 0; i < m; i++)
          w[i] -= proj * ev[i];
      }
      // Renormalize; a vanishing norm means the remaining spectrum is ~zero.
      norm = 0;
      for (let i = 0; i < m; i++)
        norm += w[i] * w[i];
      norm = Math.sqrt(norm);
      if (norm < 1e-12)
        break;
      for (let i = 0; i < m; i++)
        v[i] = w[i] / norm;
    }
    // Rayleigh quotient: eigenvalue = v^T G v (v is unit-length).
    let eigenvalue = 0;
    for (let i = 0; i < m; i++) {
      let s = 0;
      for (let j = 0; j < m; j++)
        s += G[i * m + j] * v[j];
      eigenvalue += v[i] * s;
    }
    eigenvectors.push(new Float64Array(v));
    // Clamp tiny negative values caused by floating-point roundoff.
    eigenvalues.push(Math.max(0, eigenvalue));
  }
  // Step 3: Singular values of A = sqrt(eigenvalues of G).
  const sigma = new Float64Array(k);
  for (let i = 0; i < k; i++) {
    sigma[i] = Math.sqrt(eigenvalues[i]);
  }
  // Step 4: Right singular vectors V = A^T · U · Σ^{-1}; left as zero when
  // the singular value is numerically zero.
  const V = [];
  for (let ki = 0; ki < k; ki++) {
    const vk = new Float64Array(n);
    if (sigma[ki] > 1e-12) {
      const invSigma = 1 / sigma[ki];
      for (let j = 0; j < n; j++) {
        let s = 0;
        for (let i = 0; i < m; i++) {
          s += matrix[i][j] * eigenvectors[ki][i];
        }
        vk[j] = s * invSigma;
      }
    }
    V.push(vk);
  }
  // Step 5: Session embeddings = U · Σ, then L2-normalized so downstream
  // clustering can use cosine similarity via plain dot products.
  const sessionVectors = new Map();
  for (let i = 0; i < m; i++) {
    const vec = new Float64Array(k);
    for (let ki = 0; ki < k; ki++) {
      vec[ki] = eigenvectors[ki][i] * sigma[ki];
    }
    let norm = 0;
    for (let d = 0; d < k; d++)
      norm += vec[d] * vec[d];
    norm = Math.sqrt(norm);
    if (norm > 0)
      for (let d = 0; d < k; d++)
        vec[d] /= norm;
    sessionVectors.set(sessionIds[i], vec);
  }
  return { U: eigenvectors, sigma, V, k, sessionVectors };
}
|
|
158
|
+
/**
|
|
159
|
+
* Interpret latent dimensions from V matrix.
|
|
160
|
+
* Returns top-N terms per latent dimension, useful for cluster labeling.
|
|
161
|
+
*/
|
|
162
|
+
/**
 * Interpret latent dimensions from the V matrix for cluster labeling.
 *
 * For each latent axis, reports its share of total variance plus the top-N
 * text terms and tools by absolute loading (loadings <= 0.01 are ignored).
 *
 * @param {object} svd - Result of truncatedSvd ({ sigma, V, k }).
 * @param {string[]} textVocabulary - Term per text feature column.
 * @param {string[]} toolVocabulary - Tool per tool feature column.
 * @param {number} textDim - Width of the text block in V.
 * @param {number} toolDim - Width of the tool block in V.
 * @param {number} [topN=5] - Entries to keep per list.
 * @returns {Array} One { index, varianceRatio, topTerms, topTools } per dimension.
 */
export function interpretLatentDimensions(svd, textVocabulary, toolVocabulary, textDim, toolDim, topN = 5) {
  const totalVariance = svd.sigma.reduce((acc, s) => acc + s * s, 0);
  const byWeight = (a, b) => b.weight - a.weight;
  const dimensions = [];
  for (let d = 0; d < svd.k; d++) {
    const axis = svd.V[d];
    const variance = svd.sigma[d] * svd.sigma[d];
    const varianceRatio = totalVariance > 0 ? variance / totalVariance : 0;
    // Text terms: loadings from the text block of V.
    const textScored = [];
    const textLimit = Math.min(textDim, textVocabulary.length);
    for (let i = 0; i < textLimit; i++) {
      const weight = Math.abs(axis[i]);
      if (weight > 0.01) {
        textScored.push({ term: textVocabulary[i], weight });
      }
    }
    textScored.sort(byWeight);
    // Tools: loadings from the tool block of V (offset by textDim).
    const toolScored = [];
    const toolLimit = Math.min(toolDim, toolVocabulary.length);
    for (let i = 0; i < toolLimit; i++) {
      const weight = Math.abs(axis[textDim + i]);
      if (weight > 0.01) {
        toolScored.push({ tool: toolVocabulary[i], weight });
      }
    }
    toolScored.sort(byWeight);
    dimensions.push({
      index: d,
      varianceRatio: Math.round(varianceRatio * 10000) / 10000,
      topTerms: textScored.slice(0, topN),
      topTools: toolScored.slice(0, topN),
    });
  }
  return dimensions;
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* TF-IDF vectorization for session documents.
|
|
3
|
+
*/
|
|
4
|
+
/**
 * TF-IDF vectorization for session documents.
 *
 * Vocabulary keeps terms appearing in >= 2 documents but <= 80% of them
 * (floor of 80%, never below 2). Term weights use log-scaled TF × raw-count
 * IDF, and each document vector is L2-normalized.
 *
 * @param {Map<string, string[]>} documents - docId → token list.
 * @returns {{vocabulary: string[], vocabIndex: Map, vectors: Map}}
 */
export function buildTfidf(documents) {
  // Pass 1: document frequency per term.
  const df = new Map();
  for (const [, tokens] of documents) {
    for (const term of new Set(tokens)) {
      df.set(term, (df.get(term) || 0) + 1);
    }
  }
  // Vocabulary filter: drop hapax terms and near-ubiquitous terms.
  const n = documents.size;
  const maxDf = Math.max(2, Math.floor(n * 0.8));
  const vocabulary = [];
  const vocabIndex = new Map();
  for (const [term, count] of df) {
    if (count < 2 || count > maxDf) {
      continue;
    }
    vocabIndex.set(term, vocabulary.length);
    vocabulary.push(term);
  }
  // Pass 2: per-document TF-IDF vectors, L2-normalized.
  const vectors = new Map();
  for (const [docId, tokens] of documents) {
    const counts = new Map();
    for (const token of tokens) {
      if (vocabIndex.has(token)) {
        counts.set(token, (counts.get(token) || 0) + 1);
      }
    }
    const vec = new Float64Array(vocabulary.length);
    for (const [term, count] of counts) {
      const termFreq = Math.log(1 + count);
      const invDocFreq = Math.log(n / (df.get(term) || 1));
      vec[vocabIndex.get(term)] = termFreq * invDocFreq;
    }
    let sumSquares = 0;
    for (const x of vec) {
      sumSquares += x * x;
    }
    const norm = Math.sqrt(sumSquares);
    if (norm > 0) {
      for (let i = 0; i < vec.length; i++) {
        vec[i] /= norm;
      }
    }
    vectors.set(docId, vec);
  }
  return { vocabulary, vocabIndex, vectors };
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text tokenization for knowledge graph feature extraction.
|
|
3
|
+
*/
|
|
4
|
+
import { STOP_WORDS, UUID_PATTERN, HEX_PATTERN, NUM_PATTERN } from "./constants.js";
|
|
5
|
+
/**
 * Split a camelCase/PascalCase word into lowercase pieces.
 *
 * Inserts a boundary at lower→upper transitions ("fooBar" → "foo bar") and
 * at acronym→word transitions ("HTTPResponse" → "http response").
 *
 * @param {string} word
 * @returns {string[]} Lowercased pieces (at least one element).
 */
export function splitCamelCase(word) {
  const spaced = word
    .replace(/([a-z])([A-Z])/g, "$1 $2")
    .replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2");
  return spaced.split(/\s+/).map((piece) => piece.toLowerCase());
}
|
|
12
|
+
/**
 * Extract tokens from file-path-like substrings in free text.
 *
 * Matches runs of two or more "/segment" parts, strips each segment's file
 * extension, and camelCase-splits segments longer than 2 characters.
 *
 * @param {string} text
 * @returns {string[]} Lowercased path-derived tokens.
 */
export function extractPathTokens(text) {
  const tokens = [];
  for (const match of text.matchAll(/(?:\/[\w.-]+){2,}/g)) {
    for (const segment of match[0].split("/")) {
      if (!segment) {
        continue;
      }
      const base = segment.replace(/\.[^.]+$/, ""); // strip file extension
      if (base.length > 2) {
        tokens.push(...splitCamelCase(base));
      }
    }
  }
  return tokens;
}
|
|
27
|
+
/**
 * True when a token carries no semantic signal: a UUID, a hex blob, a bare
 * number, or an absurdly long identifier (> 40 chars).
 *
 * NOTE(review): assumes UUID_PATTERN/HEX_PATTERN/NUM_PATTERN are non-global
 * regexes; a /g flag would make .test() stateful across calls — confirm in
 * constants.js.
 *
 * @param {string} token
 * @returns {boolean}
 */
export function isNoiseToken(token) {
  if (UUID_PATTERN.test(token)) {
    return true;
  }
  if (HEX_PATTERN.test(token)) {
    return true;
  }
  if (NUM_PATTERN.test(token)) {
    return true;
  }
  return token.length > 40;
}
|
|
34
|
+
/**
 * Tokenize free text for feature extraction.
 *
 * Emits path-derived tokens first, then word tokens: punctuation and slashes
 * are stripped, URLs and UUIDs are skipped, kebab/snake/camel case is split,
 * and short, stop-word, or noise tokens are discarded.
 *
 * @param {string} text
 * @returns {string[]} Lowercased tokens in encounter order.
 */
export function tokenize(text) {
  const tokens = [];
  // Path segments first, so they survive the slash-stripping below.
  tokens.push(...extractPathTokens(text));
  const words = text
    .replace(/[`'"{}()[\]<>;:,!?@#$%^&*=+|\\~]/g, " ")
    .replace(/\//g, " ")
    .split(/\s+/)
    .filter(Boolean);
  for (const word of words) {
    if (word.startsWith("http")) {
      continue; // URL
    }
    if (UUID_PATTERN.test(word)) {
      continue; // opaque identifier
    }
    // kebab-case / snake_case → parts, then CamelCase → sub-tokens.
    for (const part of word.split(/[-_]/).filter(Boolean)) {
      for (const sub of splitCamelCase(part)) {
        // Keep ASCII alphanumerics plus CJK; drop everything else.
        const normalized = sub.toLowerCase().replace(/[^a-z0-9\u3040-\u9fff]/g, "");
        if (normalized.length <= 2) {
          continue;
        }
        if (STOP_WORDS.has(normalized)) {
          continue;
        }
        if (isNoiseToken(normalized)) {
          continue;
        }
        tokens.push(normalized);
      }
    }
  }
  return tokens;
}
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Enriched tool sequence extraction with parameter abstraction.
|
|
3
|
+
* Extracts variable-length tool call patterns with context (file patterns, command categories).
|
|
4
|
+
*/
|
|
5
|
+
// ─── Tool call abstraction ──────────────────────────────────────────────────
|
|
6
|
+
// Coarse category per Claude Code tool name; tools not listed here fall back
// to "execute" in abstractToolCall.
const TOOL_CATEGORY_MAP = {
  Read: "read",
  Grep: "search",
  Glob: "search",
  Edit: "write",
  Write: "write",
  Bash: "execute",
  Agent: "delegate",
};
// Ordered [pattern, category] pairs for classifying Bash commands; the first
// matching pattern wins, so anchored prefixes (git/npm) are checked before
// broad keyword matches.
// NOTE(review): the unanchored patterns (test/build/lint/...) match anywhere
// in the command string — e.g. "cat run_tests.log" classifies as "test";
// confirm this looseness is intended.
const BASH_CATEGORIES = [
  [/^git\s/, "git"],
  [/^npm\s|^npx\s|^yarn\s|^pnpm\s/, "npm"],
  [/test|jest|vitest|mocha|pytest/, "test"],
  [/build|tsc|webpack|vite\s+build|esbuild/, "build"],
  [/lint|eslint|prettier/, "lint"],
  [/docker|kubectl|helm/, "container"],
  [/curl|wget|fetch/, "http"],
  [/mkdir|rm\s|cp\s|mv\s|chmod|chown/, "filesystem"],
  [/cat\s|head\s|tail\s|less\s|grep\s/, "read"],
];
|
|
26
|
+
/**
 * Classify a Bash command into a BASH_CATEGORIES bucket.
 *
 * The first matching pattern wins (the list is ordered most-specific first);
 * unmatched commands get "other".
 *
 * @param {string} command
 * @returns {string} Category label.
 */
function classifyBashCommand(command) {
  const trimmed = command.trim();
  const entry = BASH_CATEGORIES.find(([pattern]) => pattern.test(trimmed));
  return entry ? entry[1] : "other";
}
|
|
34
|
+
/**
 * Abstract a concrete file path into a glob-like pattern so similar paths
 * compare equal: the first two directory segments plus "**" and the file's
 * extension. A bare filename (no "/") is returned unchanged.
 *
 * @param {string} filePath
 * @returns {string} e.g. "src/utils/**" + "/*.ts"
 */
function abstractFilePath(filePath) {
  const segments = filePath.split("/");
  if (segments.length <= 1) {
    return filePath; // bare filename: nothing to abstract
  }
  const ext = filePath.match(/\.[a-zA-Z0-9]+$/)?.[0] || "";
  const dirs = segments.slice(0, -1).filter((segment) => segment.length > 0);
  const prefix = dirs.slice(0, 2).join("/");
  return prefix ? `${prefix}/**/*${ext}` : `**/*${ext}`;
}
|
|
44
|
+
/**
 * Reduce a concrete tool call to (toolName, category, abstract target) so
 * calls with similar intent compare equal during sequence mining.
 *
 * Target abstraction by tool: file paths → glob-like pattern; Bash commands →
 * category label; Grep → truncated pattern; Glob → raw pattern; Agent →
 * subagent type. Other tools get no target.
 *
 * @param {object} toolCall - { toolName, input }.
 * @returns {{toolName: string, category: string, targetPattern: (string|undefined)}}
 */
export function abstractToolCall(toolCall) {
  const { toolName, input } = toolCall;
  const category = TOOL_CATEGORY_MAP[toolName] || "execute";
  let targetPattern;
  if (toolName === "Edit" || toolName === "Write" || toolName === "Read") {
    if (typeof input.file_path === "string") {
      targetPattern = abstractFilePath(input.file_path);
    }
  } else if (toolName === "Bash") {
    if (typeof input.command === "string") {
      targetPattern = classifyBashCommand(input.command);
    }
  } else if (toolName === "Grep") {
    if (typeof input.pattern === "string") {
      const p = input.pattern;
      targetPattern = `grep:${p.length > 30 ? p.slice(0, 30) + "..." : p}`;
    }
  } else if (toolName === "Glob") {
    if (typeof input.pattern === "string") {
      targetPattern = `glob:${input.pattern}`;
    }
  } else if (toolName === "Agent") {
    if (typeof input.subagent_type === "string") {
      targetPattern = input.subagent_type;
    }
  }
  return { toolName, category, targetPattern };
}
|
|
88
|
+
// ─── Enriched sequence extraction ───────────────────────────────────────────
|
|
89
|
+
/**
 * Stable string identity for one abstracted tool step.
 * @param {object} step - { toolName, category, targetPattern? }.
 * @returns {string} "Tool:category:target" (target empty when absent).
 */
function stepKey(step) {
  const target = step.targetPattern || "";
  return `${step.toolName}:${step.category}:${target}`;
}
/**
 * Stable string identity for an ordered run of steps ("|"-joined step keys).
 * @param {object[]} steps
 * @returns {string}
 */
function sequenceKey(steps) {
  return steps.map((s) => stepKey(s)).join("|");
}
|
|
95
|
+
/**
 * Mine frequent, maximal tool-call n-gram patterns across sessions.
 *
 * Abstracts every tool call (main turns plus subagent turns), counts all
 * n-grams of length minN..maxN, keeps those seen >= minCount times, removes
 * patterns subsumed by longer ones, and returns the top 30 by count.
 *
 * @param {Array} sessions - Parsed sessions (turns, subagents, ids).
 * @param {number} [minN=3] - Minimum pattern length.
 * @param {number} [maxN=7] - Maximum pattern length.
 * @param {number} [minCount=2] - Minimum occurrences to keep a pattern.
 * @returns {Array} { sequence, count, sessionIds, projects } objects.
 */
export function extractEnrichedSequences(sessions, minN = 3, maxN = 7, minCount = 2) {
  // Collect abstracted tool steps per session (sessions shorter than minN
  // cannot contribute any n-gram and are skipped).
  const sessionSteps = [];
  for (const session of sessions) {
    const steps = [];
    for (const turn of session.turns) {
      for (const tc of turn.toolCalls) {
        steps.push(abstractToolCall(tc));
      }
    }
    // Subagent activity counts toward the session's step stream too.
    for (const sub of Object.values(session.subagents)) {
      for (const turn of sub.turns) {
        for (const tc of turn.toolCalls) {
          steps.push(abstractToolCall(tc));
        }
      }
    }
    if (steps.length >= minN) {
      sessionSteps.push({
        sessionId: session.sessionId,
        project: session.projectDisplayName,
        steps,
      });
    }
  }
  // Count every n-gram of each length, keyed by its sequenceKey.
  const allSequences = new Map();
  for (let n = minN; n <= maxN; n++) {
    for (const { sessionId, project, steps } of sessionSteps) {
      if (steps.length < n)
        continue;
      for (let i = 0; i <= steps.length - n; i++) {
        const ngram = steps.slice(i, i + n);
        const key = sequenceKey(ngram);
        const existing = allSequences.get(key);
        if (existing) {
          existing.count++;
          existing.sessionIds.add(sessionId);
          existing.projects.add(project);
        }
        else {
          allSequences.set(key, {
            steps: ngram,
            count: 1,
            sessionIds: new Set([sessionId]),
            projects: new Set([project]),
          });
        }
      }
    }
  }
  // Keep patterns meeting the frequency threshold, most frequent first.
  const frequent = [...allSequences.entries()]
    .filter(([, v]) => v.count >= minCount)
    .sort((a, b) => b[1].count - a[1].count);
  // Maximal pattern mining: drop a pattern when a longer frequent pattern
  // contains it and retains >= 80% of its occurrence count.
  // BUG FIX: the containment test previously used raw `otherKey.includes(key)`,
  // which could match across step boundaries (e.g. "grep:foo" inside
  // "grep:foobar"), wrongly discarding maximal patterns. Wrapping both keys
  // in the "|" separator forces whole-step alignment.
  const result = [];
  for (const [key, acc] of frequent) {
    const needle = `|${key}|`;
    const isSubsumed = frequent.some(([otherKey, otherAcc]) =>
      otherKey !== key &&
      otherAcc.steps.length > acc.steps.length &&
      otherAcc.count >= acc.count * 0.8 &&
      `|${otherKey}|`.includes(needle));
    if (!isSubsumed) {
      result.push({
        sequence: acc.steps,
        count: acc.count,
        sessionIds: [...acc.sessionIds],
        projects: [...acc.projects],
      });
    }
  }
  return result.slice(0, 30); // cap at the 30 most frequent maximal patterns
}
|