@chigichan24/crune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/LICENSE +201 -0
  2. package/README.md +155 -0
  3. package/bin/crune.js +2 -0
  4. package/dist-cli/__tests__/cli.test.js +63 -0
  5. package/dist-cli/__tests__/clustering.test.js +200 -0
  6. package/dist-cli/__tests__/community.test.js +115 -0
  7. package/dist-cli/__tests__/edges.test.js +130 -0
  8. package/dist-cli/__tests__/feature-extraction.test.js +66 -0
  9. package/dist-cli/__tests__/fixtures.js +192 -0
  10. package/dist-cli/__tests__/orchestrator.test.js +253 -0
  11. package/dist-cli/__tests__/session-parser.test.js +335 -0
  12. package/dist-cli/__tests__/session-summarizer.test.js +117 -0
  13. package/dist-cli/__tests__/skill-server.test.js +191 -0
  14. package/dist-cli/__tests__/svd.test.js +112 -0
  15. package/dist-cli/__tests__/tfidf.test.js +88 -0
  16. package/dist-cli/__tests__/tokenizer.test.js +125 -0
  17. package/dist-cli/__tests__/topic-nodes.test.js +184 -0
  18. package/dist-cli/analyze-sessions.js +476 -0
  19. package/dist-cli/cli.js +215 -0
  20. package/dist-cli/knowledge-graph/clustering.js +174 -0
  21. package/dist-cli/knowledge-graph/community.js +220 -0
  22. package/dist-cli/knowledge-graph/constants.js +58 -0
  23. package/dist-cli/knowledge-graph/edges.js +193 -0
  24. package/dist-cli/knowledge-graph/feature-extraction.js +124 -0
  25. package/dist-cli/knowledge-graph/index.js +235 -0
  26. package/dist-cli/knowledge-graph/reusability.js +51 -0
  27. package/dist-cli/knowledge-graph/similarity.js +13 -0
  28. package/dist-cli/knowledge-graph/skill-generator.js +203 -0
  29. package/dist-cli/knowledge-graph/svd.js +195 -0
  30. package/dist-cli/knowledge-graph/tfidf.js +54 -0
  31. package/dist-cli/knowledge-graph/tokenizer.js +66 -0
  32. package/dist-cli/knowledge-graph/tool-pattern.js +173 -0
  33. package/dist-cli/knowledge-graph/topic-nodes.js +199 -0
  34. package/dist-cli/knowledge-graph/types.js +4 -0
  35. package/dist-cli/knowledge-graph-builder.js +27 -0
  36. package/dist-cli/session-parser.js +360 -0
  37. package/dist-cli/session-summarizer.js +133 -0
  38. package/dist-cli/skill-server.js +62 -0
  39. package/dist-cli/skill-synthesizer.js +189 -0
  40. package/package.json +47 -0
@@ -0,0 +1,203 @@
1
+ /**
2
+ * Generates Claude Code skill Markdown and hook JSON from detected patterns.
3
+ *
4
+ * Skill format follows the conventions from anthropics/skills (Apache-2.0):
5
+ * https://github.com/anthropics/skills/tree/main/skills/skill-creator
6
+ *
7
+ * Key conventions applied:
8
+ * - YAML frontmatter with `name` and `description` (description includes "when to use")
9
+ * - Imperative writing style
10
+ * - "Why" explanations over bare MUST/NEVER rules
11
+ * - Concrete examples from representative prompts
12
+ * - Progressive disclosure: keep SKILL.md body concise
13
+ */
14
// ─── Skill name generation ──────────────────────────────────────────────────

/**
 * Build a kebab-case skill name from the top keywords plus the project's
 * basename, capped at 40 characters.
 *
 * Fixes over the naive join/slice approach:
 * - truncation can no longer leave a dangling "-" at the end of the name
 * - an all-empty input no longer yields an empty (invalid) skill name
 *
 * @param {string[]} keywords - ranked topic keywords (top 3 are used)
 * @param {string} project - project path; its last segment becomes a suffix
 * @returns {string} kebab-case name, 1..40 chars
 */
function toSkillName(keywords, project) {
  const parts = keywords
    .slice(0, 3)
    .map((k) => k.toLowerCase().replace(/[^a-z0-9]/g, ""))
    .filter((k) => k.length > 0);
  const projectSuffix = project
    .split("/")
    .pop()
    ?.toLowerCase()
    .replace(/[^a-z0-9]/g, "");
  if (projectSuffix && !parts.includes(projectSuffix)) {
    parts.push(projectSuffix);
  }
  // Cap at 40 chars, then strip any hyphen left dangling by the cut.
  const name = parts.join("-").slice(0, 40).replace(/-+$/, "");
  // Guard against a fully-empty result (no usable keywords or project).
  return name || "skill";
}
31
// ─── Description generation (with "pushiness" per skill-creator guidance) ───

/**
 * Compose the frontmatter description for a topic: WHEN to trigger plus WHAT
 * the workflow does. skill-creator calls this "pushiness" — being explicit
 * about the trigger context counteracts under-triggering.
 */
function buildDescription(topic) {
  const [firstClause] = topic.suggestedPrompt.split(" — ");
  const action = firstClause || "work on this domain";

  const topTools = topic.toolSignature
    .slice(0, 3)
    .map(({ tool }) => tool)
    .join(", ");

  let roleHint;
  if (topic.dominantRole === "subagent-delegated") {
    roleHint = "Delegates to specialized subagents.";
  } else if (topic.dominantRole === "tool-heavy") {
    roleHint = `Tool-intensive workflow using ${topTools}.`;
  } else {
    roleHint = `Interactive workflow using ${topTools}.`;
  }

  let projectScope;
  if (topic.projects.length > 1) {
    const sample = topic.projects.slice(0, 2).join(", ");
    const ellipsis = topic.projects.length > 2 ? ", ..." : "";
    projectScope = `Applies across ${topic.projects.length} projects (${sample}${ellipsis}).`;
  } else {
    projectScope = `Scoped to ${topic.project}.`;
  }

  return `Use when you need to ${action}. ${roleHint} ${projectScope} Detected from ${topic.sessionCount} sessions over ${topic.totalDurationMinutes} minutes of usage.`;
}
51
// ─── Skill body generation ──────────────────────────────────────────────────

/**
 * Assemble the Markdown body of a SKILL.md from a topic plus its related
 * enriched tool sequences. Sections: Overview, When to Use, Workflow,
 * Detected Patterns (only when sequences exist), Guidelines. Written
 * imperatively with why-based guidance, per skill-creator conventions.
 */
function buildSkillBody(topic, relatedSequences) {
  const lines = [];

  // Overview — what this skill automates and why it exists.
  lines.push("## Overview", "");
  lines.push(`This skill captures a recurring workflow pattern detected across ${topic.sessionCount} sessions.`);
  if (topic.projects.length > 1) {
    lines.push(`It spans ${topic.projects.length} projects, indicating cross-project reusable knowledge.`);
  }
  lines.push("");

  // When to use — explicit trigger guidance ("pushiness").
  lines.push("## When to Use", "");
  if (topic.representativePrompts.length > 0) {
    lines.push("Activate this skill when the user's request resembles:");
    topic.representativePrompts.forEach((prompt) => lines.push(`- "${prompt}"`));
  } else {
    lines.push(`Activate when working on tasks involving: ${topic.keywords.join(", ")}.`);
  }
  lines.push("");

  // Workflow — imperative steps.
  lines.push("## Workflow", "");
  if (topic.dominantRole === "subagent-delegated") {
    lines.push(
      "Delegate to specialized subagents. This pattern benefits from parallel execution:",
      "",
      "1. Analyze the task scope and identify subtasks",
      "2. Spawn subagents for each independent subtask using the Agent tool",
      "3. Collect and synthesize results",
      ""
    );
  } else {
    // Steps derived from the observed tool signature — concrete and imperative.
    lines.push("Follow this tool sequence:", "");
    topic.toolSignature.slice(0, 5).forEach((ts, i) => {
      lines.push(`${i + 1}. Use **${ts.tool}** to ${describeToolPurpose(ts.tool)}`);
    });
    lines.push("");
  }

  // Detected patterns — concrete tool sequences with abstracted targets.
  if (relatedSequences.length > 0) {
    lines.push("## Detected Patterns", "");
    lines.push("The following tool call patterns were frequently observed in this workflow:", "");
    for (const seq of relatedSequences.slice(0, 3)) {
      const flow = seq.sequence
        .map((s) => (s.targetPattern ? `${s.toolName} (${s.targetPattern})` : s.toolName))
        .join(" → ");
      lines.push(`- \`${flow}\` — ${seq.count} occurrences`);
    }
    lines.push("");
  }

  // Guidelines — "why"-based advice rather than bare MUST/NEVER rules.
  lines.push("## Guidelines", "");
  if (topic.dominantRole === "tool-heavy") {
    lines.push("- Prefer tool calls over asking the user, because this pattern historically involves intensive automated operations.");
  }
  if (topic.projects.length > 1) {
    lines.push("- Check project-specific conventions before applying, because this pattern spans multiple projects that may have different standards.");
  }
  const involvesExploration = topic.toolSignature.some(
    (t) => t.tool === "Read" || t.tool === "Grep" || t.tool === "Glob"
  );
  if (involvesExploration) {
    lines.push("- Read and understand existing code before making changes, because this pattern involves significant code exploration.");
  }
  lines.push("");

  return lines.join("\n");
}
131
/**
 * Map a tool name to a short imperative purpose phrase for workflow steps.
 * Unknown tools fall back to a generic "perform <tool> operations".
 */
function describeToolPurpose(tool) {
  // Map (not a plain object) so inherited keys like "toString" cannot
  // shadow the fallback.
  const purposes = new Map([
    ["Read", "examine existing files and understand current state"],
    ["Grep", "search for relevant patterns and references"],
    ["Glob", "locate target files by pattern"],
    ["Edit", "apply targeted modifications to existing files"],
    ["Write", "create new files as needed"],
    ["Bash", "execute commands (build, test, git operations)"],
    ["Agent", "delegate subtasks to specialized subagents"],
  ]);
  return purposes.get(tool) ?? `perform ${tool} operations`;
}
151
// ─── Public API ─────────────────────────────────────────────────────────────

/**
 * Render a complete SKILL.md document (YAML frontmatter + Markdown body)
 * for one topic. Frontmatter carries `name` and a folded-scalar
 * `description`, per the anthropics/skills conventions.
 */
export function generateSkillMarkdown(topic, relatedSequences = []) {
  const frontmatter = [
    "---",
    `name: ${toSkillName(topic.keywords, topic.project)}`,
    "description: >-",
    // Two-space indent: required continuation indentation for the YAML
    // `>-` folded scalar above.
    `  ${buildDescription(topic)}`,
    "---",
  ].join("\n");
  const title = topic.keywords.slice(0, 3).join(" / ");
  const body = buildSkillBody(topic, relatedSequences);
  return `${frontmatter}\n\n# ${title}\n\n${body}`.trim();
}
166
/**
 * Produce a hook definition (pretty-printed JSON string) for an enriched
 * tool sequence, or undefined when the sequence contains no categorizable
 * Bash step — only Bash commands with an identified category (not "other")
 * are worth hooking.
 */
export function generateHookJson(sequence) {
  const hasCategorizedBash = sequence.sequence.some(
    (s) => s.toolName === "Bash" && s.targetPattern && s.targetPattern !== "other"
  );
  if (!hasCategorizedBash) {
    return undefined;
  }
  const pattern = sequence.sequence.map(({ toolName, category, targetPattern }) => ({
    tool: toolName,
    category,
    target: targetPattern || null,
  }));
  const hookDef = {
    description: `Auto-detected pattern from ${sequence.count} occurrences across ${sequence.projects.length} project(s)`,
    pattern,
    sessionCount: sequence.count,
    projects: sequence.projects,
  };
  return JSON.stringify(hookDef, null, 2);
}
183
/**
 * Build skill candidates for all topics, ordered by reusability score
 * (highest first). Each candidate pairs the rendered SKILL.md with an
 * optional hook JSON derived from the topic's first related tool sequence.
 */
export function generateSkillCandidates(topics, enrichedSequences) {
  const byScoreDesc = [...topics].sort(
    (a, b) => b.reusabilityScore.overall - a.reusabilityScore.overall
  );
  return byScoreDesc.map((topic) => {
    // A sequence is "related" when it shares at least one session with the topic.
    const topicSessions = new Set(topic.sessionIds);
    const related = enrichedSequences.filter((seq) =>
      seq.sessionIds.some((sid) => topicSessions.has(sid))
    );
    return {
      topicId: topic.id,
      reusabilityScore: topic.reusabilityScore.overall,
      skillMarkdown: generateSkillMarkdown(topic, related),
      hookJson: related.length > 0 ? generateHookJson(related[0]) : undefined,
    };
  });
}
@@ -0,0 +1,195 @@
1
+ /**
2
+ * Truncated SVD (Latent Semantic Analysis) via Gram matrix power iteration.
3
+ */
4
+ import { WEIGHT_TEXT, WEIGHT_TOOL, WEIGHT_STRUCT } from "./constants.js";
5
+ /**
6
+ * Build a combined feature matrix from text, tool, and structural vectors.
7
+ * Each group is L2-normalized, then scaled by its weight before concatenation.
8
+ * Returns a dense row-major matrix (m × n) and sessionId ordering.
9
+ */
10
+ export function buildCombinedMatrix(sessionIds, textVectors, toolVectors, structVectors, textDim, toolDim, structDim) {
11
+ const totalDim = textDim + toolDim + structDim;
12
+ const wt = Math.sqrt(WEIGHT_TEXT);
13
+ const wl = Math.sqrt(WEIGHT_TOOL);
14
+ const ws = Math.sqrt(WEIGHT_STRUCT);
15
+ const matrix = [];
16
+ for (const sid of sessionIds) {
17
+ const row = new Float64Array(totalDim);
18
+ const tv = textVectors.get(sid);
19
+ const lv = toolVectors.get(sid);
20
+ const sv = structVectors.get(sid);
21
+ if (tv)
22
+ for (let i = 0; i < textDim; i++)
23
+ row[i] = tv[i] * wt;
24
+ if (lv)
25
+ for (let i = 0; i < toolDim; i++)
26
+ row[textDim + i] = lv[i] * wl;
27
+ if (sv)
28
+ for (let i = 0; i < structDim; i++)
29
+ row[textDim + toolDim + i] = sv[i] * ws;
30
+ matrix.push(row);
31
+ }
32
+ return { matrix, totalDim };
33
+ }
34
/**
 * Truncated SVD via power iteration on A·A^T (the Gram matrix).
 *
 * For m sessions × n features where m << n, computing the m×m Gram matrix
 * and extracting its top-k eigenvectors is far cheaper than full SVD.
 *
 * @param sessionIds - row labels; sessionIds[i] corresponds to matrix[i]
 * @param matrix - dense row-major matrix (array of length-n Float64Array rows)
 * @param totalDim - number of columns n in each row
 * @param targetK - requested rank; actual k is clamped to min(targetK, m-1, n)
 * @returns {U, sigma, V, k, sessionVectors}: U = left singular vectors
 *   (length-m each), sigma = singular values, V = right singular vectors
 *   (length-n each), sessionVectors = sessionId -> L2-normalized
 *   k-dimensional embedding (rows of U·Σ, normalized for cosine use).
 */
export function truncatedSvd(sessionIds, matrix, totalDim, targetK) {
    const m = matrix.length;
    const n = totalDim;
    // Clamp k: deflation yields at most m-1 useful directions, rank(A) <= n.
    const k = Math.min(targetK, m - 1, n);
    // Step 1: Compute Gram matrix G = A · A^T (m × m).
    // G is symmetric, so compute each dot product once and mirror it.
    const G = new Float64Array(m * m);
    for (let i = 0; i < m; i++) {
        for (let j = i; j < m; j++) {
            let dot = 0;
            for (let d = 0; d < n; d++) {
                dot += matrix[i][d] * matrix[j][d];
            }
            G[i * m + j] = dot;
            G[j * m + i] = dot;
        }
    }
    // Step 2: Power iteration with deflation to extract top-k eigenvectors of G
    const eigenvectors = [];
    const eigenvalues = [];
    // Seeded PRNG (linear congruential generator) for reproducibility
    const nextRand = () => {
        seed = (seed * 1103515245 + 12345) & 0x7fffffff;
        return seed / 0x7fffffff;
    };
    let seed = 42;
    for (let ki = 0; ki < k; ki++) {
        // Random initial vector
        const v = new Float64Array(m);
        for (let i = 0; i < m; i++)
            v[i] = nextRand() - 0.5;
        // Normalize
        let norm = 0;
        for (let i = 0; i < m; i++)
            norm += v[i] * v[i];
        norm = Math.sqrt(norm);
        for (let i = 0; i < m; i++)
            v[i] /= norm;
        // Power iteration (50 iterations is more than enough for convergence)
        for (let iter = 0; iter < 50; iter++) {
            // w = G · v
            const w = new Float64Array(m);
            for (let i = 0; i < m; i++) {
                let s = 0;
                for (let j = 0; j < m; j++) {
                    s += G[i * m + j] * v[j];
                }
                w[i] = s;
            }
            // Deflate: remove projections onto previously found eigenvectors,
            // so the iteration converges to the next-largest remaining eigenpair.
            for (let prev = 0; prev < ki; prev++) {
                const ev = eigenvectors[prev];
                let proj = 0;
                for (let i = 0; i < m; i++)
                    proj += w[i] * ev[i];
                for (let i = 0; i < m; i++)
                    w[i] -= proj * ev[i];
            }
            // Normalize
            norm = 0;
            for (let i = 0; i < m; i++)
                norm += w[i] * w[i];
            norm = Math.sqrt(norm);
            // A vanishing iterate means the remaining spectrum is ~zero;
            // keep the last good v and stop iterating.
            if (norm < 1e-12)
                break;
            for (let i = 0; i < m; i++)
                v[i] = w[i] / norm;
        }
        // Eigenvalue = v^T G v  (Rayleigh quotient; v is unit-norm here)
        let eigenvalue = 0;
        for (let i = 0; i < m; i++) {
            let s = 0;
            for (let j = 0; j < m; j++)
                s += G[i * m + j] * v[j];
            eigenvalue += v[i] * s;
        }
        eigenvectors.push(new Float64Array(v));
        // Clamp tiny negative values caused by floating-point round-off;
        // G is positive semi-definite so true eigenvalues are >= 0.
        eigenvalues.push(Math.max(0, eigenvalue));
    }
    // Step 3: Singular values = sqrt(eigenvalues of G)
    const sigma = new Float64Array(k);
    for (let i = 0; i < k; i++) {
        sigma[i] = Math.sqrt(eigenvalues[i]);
    }
    // Step 4: Right singular vectors V = A^T · U · Σ^{-1}
    // (components with ~zero sigma are left as zero vectors)
    const V = [];
    for (let ki = 0; ki < k; ki++) {
        const vk = new Float64Array(n);
        if (sigma[ki] > 1e-12) {
            const invSigma = 1 / sigma[ki];
            for (let j = 0; j < n; j++) {
                let s = 0;
                for (let i = 0; i < m; i++) {
                    s += matrix[i][j] * eigenvectors[ki][i];
                }
                vk[j] = s * invSigma;
            }
        }
        V.push(vk);
    }
    // Step 5: Session vectors = U · Σ (scaled embeddings)
    const sessionVectors = new Map();
    for (let i = 0; i < m; i++) {
        const vec = new Float64Array(k);
        for (let ki = 0; ki < k; ki++) {
            vec[ki] = eigenvectors[ki][i] * sigma[ki];
        }
        // L2 normalize for cosine-based clustering
        let norm = 0;
        for (let d = 0; d < k; d++)
            norm += vec[d] * vec[d];
        norm = Math.sqrt(norm);
        if (norm > 0)
            for (let d = 0; d < k; d++)
                vec[d] /= norm;
        sessionVectors.set(sessionIds[i], vec);
    }
    return { U: eigenvectors, sigma, V, k, sessionVectors };
}
158
/**
 * Interpret latent dimensions from the V matrix: for each latent dimension,
 * report its explained-variance ratio plus the top-N text terms and tools
 * by absolute loading (loadings below 0.01 are ignored). Useful for
 * labeling clusters.
 */
export function interpretLatentDimensions(svd, textVocabulary, toolVocabulary, textDim, toolDim, topN = 5) {
  const totalVariance = svd.sigma.reduce((acc, s) => acc + s * s, 0);
  const byWeightDesc = (a, b) => b.weight - a.weight;
  const dimensions = [];
  for (let ki = 0; ki < svd.k; ki++) {
    const loadings = svd.V[ki];
    const sigmaSq = svd.sigma[ki] * svd.sigma[ki];
    const varianceRatio = totalVariance > 0 ? sigmaSq / totalVariance : 0;
    // Text terms live in the first textDim components of V.
    const topTerms = [];
    const textLimit = Math.min(textDim, textVocabulary.length);
    for (let i = 0; i < textLimit; i++) {
      const weight = Math.abs(loadings[i]);
      if (weight > 0.01) {
        topTerms.push({ term: textVocabulary[i], weight });
      }
    }
    topTerms.sort(byWeightDesc);
    // Tool loadings occupy the block immediately after the text block.
    const topTools = [];
    const toolLimit = Math.min(toolDim, toolVocabulary.length);
    for (let i = 0; i < toolLimit; i++) {
      const weight = Math.abs(loadings[textDim + i]);
      if (weight > 0.01) {
        topTools.push({ tool: toolVocabulary[i], weight });
      }
    }
    topTools.sort(byWeightDesc);
    dimensions.push({
      index: ki,
      varianceRatio: Math.round(varianceRatio * 10000) / 10000, // 4 decimals
      topTerms: topTerms.slice(0, topN),
      topTools: topTools.slice(0, topN),
    });
  }
  return dimensions;
}
@@ -0,0 +1,54 @@
1
+ /**
2
+ * TF-IDF vectorization for session documents.
3
+ */
4
/**
 * TF-IDF vectorization for session documents.
 *
 * Vocabulary keeps terms with document frequency >= 2 and <= max(2, 80% of
 * docs). Weights are log-scaled TF times log IDF, and every vector is
 * L2-normalized so dot products are cosine similarities.
 *
 * @param {Map<string, string[]>} documents - docId -> token list
 * @returns {{vocabulary: string[], vocabIndex: Map, vectors: Map}}
 */
export function buildTfidf(documents) {
  // Document frequency per term (each doc counts a term at most once).
  const df = new Map();
  for (const tokens of documents.values()) {
    for (const term of new Set(tokens)) {
      df.set(term, (df.get(term) ?? 0) + 1);
    }
  }
  // Vocabulary filter: drop hapax terms and near-ubiquitous terms.
  const n = documents.size;
  const maxDf = Math.max(2, Math.floor(n * 0.8));
  const vocabulary = [];
  const vocabIndex = new Map();
  for (const [term, count] of df) {
    if (count >= 2 && count <= maxDf) {
      vocabIndex.set(term, vocabulary.length);
      vocabulary.push(term);
    }
  }
  // Per-document TF-IDF vectors.
  const vectors = new Map();
  for (const [docId, tokens] of documents) {
    const tf = new Map();
    for (const token of tokens) {
      if (vocabIndex.has(token)) {
        tf.set(token, (tf.get(token) ?? 0) + 1);
      }
    }
    const vec = new Float64Array(vocabulary.length);
    for (const [term, count] of tf) {
      const termFreq = Math.log(1 + count);
      const invDocFreq = Math.log(n / (df.get(term) || 1));
      vec[vocabIndex.get(term)] = termFreq * invDocFreq;
    }
    // L2 normalize (zero vectors are left untouched).
    let normSq = 0;
    for (let i = 0; i < vec.length; i++) {
      normSq += vec[i] * vec[i];
    }
    const norm = Math.sqrt(normSq);
    if (norm > 0) {
      for (let i = 0; i < vec.length; i++) {
        vec[i] /= norm;
      }
    }
    vectors.set(docId, vec);
  }
  return { vocabulary, vocabIndex, vectors };
}
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Text tokenization for knowledge graph feature extraction.
3
+ */
4
+ import { STOP_WORDS, UUID_PATTERN, HEX_PATTERN, NUM_PATTERN } from "./constants.js";
5
+ export function splitCamelCase(word) {
6
+ return word
7
+ .replace(/([a-z])([A-Z])/g, "$1 $2")
8
+ .replace(/([A-Z]+)([A-Z][a-z])/g, "$1 $2")
9
+ .split(/\s+/)
10
+ .map((w) => w.toLowerCase());
11
+ }
12
+ export function extractPathTokens(text) {
13
+ const pathPattern = /(?:\/[\w.-]+){2,}/g;
14
+ const tokens = [];
15
+ let match;
16
+ while ((match = pathPattern.exec(text)) !== null) {
17
+ const segments = match[0].split("/").filter(Boolean);
18
+ for (const seg of segments) {
19
+ const name = seg.replace(/\.[^.]+$/, ""); // remove extension
20
+ if (name.length > 2) {
21
+ tokens.push(...splitCamelCase(name));
22
+ }
23
+ }
24
+ }
25
+ return tokens;
26
+ }
27
/**
 * Heuristic noise filter: UUIDs, hex blobs, pure numbers, and extremely
 * long tokens carry no topical signal.
 * NOTE(review): assumes the shared *_PATTERN regexes are non-global —
 * a /g flag would make .test() stateful. Verify in constants.js.
 */
export function isNoiseToken(token) {
  if (token.length > 40) {
    return true; // runaway tokens (minified blobs, hashes) are noise
  }
  return UUID_PATTERN.test(token) || HEX_PATTERN.test(token) || NUM_PATTERN.test(token);
}
34
/**
 * Tokenize free text for TF-IDF: extract file-path tokens first, then split
 * the remaining words on punctuation, slashes, kebab/snake separators, and
 * CamelCase boundaries. Drops URLs, UUIDs, stop words, noise tokens, and
 * tokens of length <= 2. Keeps characters in \u3040-\u9fff (CJK range)
 * alongside ascii alphanumerics.
 */
export function tokenize(text) {
  // File-path tokens first — paths carry strong topical signal.
  const tokens = [...extractPathTokens(text)];
  const words = text
    .replace(/[`'"{}()[\]<>;:,!?@#$%^&*=+|\\~]/g, " ")
    .replace(/\//g, " ")
    .split(/\s+/)
    .filter(Boolean);
  for (const word of words) {
    // URLs and UUIDs are pure noise — skip before any splitting.
    if (word.startsWith("http") || UUID_PATTERN.test(word)) {
      continue;
    }
    // kebab-case / snake_case, then CamelCase within each piece.
    for (const piece of word.split(/[-_]/)) {
      if (!piece) continue;
      for (const sub of splitCamelCase(piece)) {
        const clean = sub.toLowerCase().replace(/[^a-z0-9\u3040-\u9fff]/g, "");
        if (clean.length > 2 && !STOP_WORDS.has(clean) && !isNoiseToken(clean)) {
          tokens.push(clean);
        }
      }
    }
  }
  return tokens;
}
@@ -0,0 +1,173 @@
1
+ /**
2
+ * Enriched tool sequence extraction with parameter abstraction.
3
+ * Extracts variable-length tool call patterns with context (file patterns, command categories).
4
+ */
5
// ─── Tool call abstraction ──────────────────────────────────────────────────

// Coarse action category per known tool; abstractToolCall falls back to
// "execute" for anything not listed here.
const TOOL_CATEGORY_MAP = {
  Read: "read",
  Grep: "search",
  Glob: "search",
  Edit: "write",
  Write: "write",
  Bash: "execute",
  Agent: "delegate",
};

// Ordered [pattern, category] pairs for classifying Bash commands.
// First match wins, so the anchored prefix checks (git/npm) take
// precedence over the looser substring checks further down.
const BASH_CATEGORIES = [
  [/^git\s/, "git"],
  [/^npm\s|^npx\s|^yarn\s|^pnpm\s/, "npm"],
  [/test|jest|vitest|mocha|pytest/, "test"],
  [/build|tsc|webpack|vite\s+build|esbuild/, "build"],
  [/lint|eslint|prettier/, "lint"],
  [/docker|kubectl|helm/, "container"],
  [/curl|wget|fetch/, "http"],
  [/mkdir|rm\s|cp\s|mv\s|chmod|chown/, "filesystem"],
  [/cat\s|head\s|tail\s|less\s|grep\s/, "read"],
];

/**
 * Classify a shell command into a coarse category ("git", "npm", "test",
 * ...) using the ordered BASH_CATEGORIES table; "other" when no pattern
 * matches.
 */
function classifyBashCommand(command) {
  const trimmed = command.trim();
  const match = BASH_CATEGORIES.find(([pattern]) => pattern.test(trimmed));
  return match ? match[1] : "other";
}
34
/**
 * Abstract a concrete file path into a reusable glob-like pattern: keep the
 * first two non-empty directory segments plus the file extension, e.g.
 * "/src/components/Button.tsx" -> "src/components/" + "**" + "/*.tsx".
 * Bare filenames (no directory separator) are returned unchanged.
 */
function abstractFilePath(filePath) {
  const parts = filePath.split("/");
  if (parts.length <= 1) {
    return filePath; // no directories to abstract
  }
  const ext = filePath.match(/\.[a-zA-Z0-9]+$/)?.[0] ?? "";
  const dirs = parts.slice(0, -1).filter((p) => p.length > 0);
  const prefix = dirs.slice(0, 2).join("/");
  return prefix ? `${prefix}/**/*${ext}` : `**/*${ext}`;
}
44
/**
 * Abstract a raw tool call into {toolName, category, targetPattern}.
 * The target pattern generalizes parameters — file paths become glob
 * patterns, Bash commands become command categories, Grep patterns are
 * truncated to 30 chars — so calls can be compared across sessions.
 * targetPattern stays undefined when the expected input field is missing
 * or not a string.
 */
export function abstractToolCall(toolCall) {
  const { toolName, input } = toolCall;
  const category = TOOL_CATEGORY_MAP[toolName] || "execute";
  let targetPattern;
  if (toolName === "Edit" || toolName === "Write" || toolName === "Read") {
    if (typeof input.file_path === "string") {
      targetPattern = abstractFilePath(input.file_path);
    }
  } else if (toolName === "Bash") {
    if (typeof input.command === "string") {
      targetPattern = classifyBashCommand(input.command);
    }
  } else if (toolName === "Grep") {
    if (typeof input.pattern === "string") {
      // Cap long grep patterns so sequence keys stay readable.
      const p = input.pattern;
      targetPattern = `grep:${p.length > 30 ? `${p.slice(0, 30)}...` : p}`;
    }
  } else if (toolName === "Glob") {
    if (typeof input.pattern === "string") {
      targetPattern = `glob:${input.pattern}`;
    }
  } else if (toolName === "Agent") {
    if (typeof input.subagent_type === "string") {
      targetPattern = input.subagent_type;
    }
  }
  return { toolName, category, targetPattern };
}
88
// ─── Enriched sequence extraction ───────────────────────────────────────────

/** Canonical string key for one abstracted step (missing target -> ""). */
function stepKey(step) {
  const target = step.targetPattern || "";
  return [step.toolName, step.category, target].join(":");
}

/** Canonical key for a whole step sequence; steps are "|"-separated. */
function sequenceKey(steps) {
  return steps.map(stepKey).join("|");
}
95
/**
 * Extract frequent enriched tool-call sequences (n-grams of abstracted
 * steps) across sessions, including subagent activity.
 *
 * Pipeline:
 *   1. Flatten each session's (and its subagents') tool calls into
 *      abstracted steps; sessions with fewer than minN steps are skipped.
 *   2. Count every n-gram for n in [minN, maxN] across all sessions.
 *   3. Keep n-grams seen at least minCount times, most frequent first.
 *   4. Maximal pattern mining: drop a shorter pattern when a longer
 *      frequent pattern contains it at step boundaries and captures
 *      >= 80% of its occurrences.
 *
 * Fix vs. the original: containment is now checked with "|" delimiters on
 * both keys, so a short pattern can no longer be falsely "subsumed" by a
 * longer key that merely contains its text mid-step (Grep-derived target
 * patterns may themselves contain ":" or "|"). The redundant
 * frequentKeys.has() check (always true for entries of `frequent`) is gone.
 *
 * @returns up to 30 patterns, each {sequence, count, sessionIds, projects}.
 */
export function extractEnrichedSequences(sessions, minN = 3, maxN = 7, minCount = 2) {
  // Step 1: collect abstracted tool steps per session (main + subagents).
  const sessionSteps = [];
  for (const session of sessions) {
    const steps = [];
    for (const turn of session.turns) {
      for (const tc of turn.toolCalls) {
        steps.push(abstractToolCall(tc));
      }
    }
    for (const sub of Object.values(session.subagents)) {
      for (const turn of sub.turns) {
        for (const tc of turn.toolCalls) {
          steps.push(abstractToolCall(tc));
        }
      }
    }
    if (steps.length >= minN) {
      sessionSteps.push({
        sessionId: session.sessionId,
        project: session.projectDisplayName,
        steps,
      });
    }
  }
  // Step 2: count all n-grams for each window length.
  const allSequences = new Map();
  for (let n = minN; n <= maxN; n++) {
    for (const { sessionId, project, steps } of sessionSteps) {
      if (steps.length < n) continue;
      for (let i = 0; i <= steps.length - n; i++) {
        const ngram = steps.slice(i, i + n);
        const key = sequenceKey(ngram);
        const existing = allSequences.get(key);
        if (existing) {
          existing.count++;
          existing.sessionIds.add(sessionId);
          existing.projects.add(project);
        } else {
          allSequences.set(key, {
            steps: ngram,
            count: 1,
            sessionIds: new Set([sessionId]),
            projects: new Set([project]),
          });
        }
      }
    }
  }
  // Step 3: keep frequent n-grams, most frequent first.
  const frequent = [...allSequences.entries()]
    .filter(([, v]) => v.count >= minCount)
    .sort((a, b) => b[1].count - a[1].count);
  // Step 4: maximal pattern mining. Wrap keys in "|" so containment is
  // only detected at whole-step boundaries, never inside a step's text.
  const result = [];
  for (const [key, acc] of frequent) {
    const delimited = `|${key}|`;
    const isSubsumed = frequent.some(([otherKey, otherAcc]) =>
      otherKey !== key &&
      otherAcc.steps.length > acc.steps.length &&
      otherAcc.count >= acc.count * 0.8 && // longer pattern captures >=80% of occurrences
      `|${otherKey}|`.includes(delimited));
    if (!isSubsumed) {
      result.push({
        sequence: acc.steps,
        count: acc.count,
        sessionIds: [...acc.sessionIds],
        projects: [...acc.projects],
      });
    }
  }
  return result.slice(0, 30); // Top 30 patterns
}