npm - @chigichan24/crune - Versions diffs - 0.1.0 - Mend

@chigichan24/crune 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

package/LICENSE +201 -0
package/README.md +155 -0
package/bin/crune.js +2 -0
package/dist-cli/__tests__/cli.test.js +63 -0
package/dist-cli/__tests__/clustering.test.js +200 -0
package/dist-cli/__tests__/community.test.js +115 -0
package/dist-cli/__tests__/edges.test.js +130 -0
package/dist-cli/__tests__/feature-extraction.test.js +66 -0
package/dist-cli/__tests__/fixtures.js +192 -0
package/dist-cli/__tests__/orchestrator.test.js +253 -0
package/dist-cli/__tests__/session-parser.test.js +335 -0
package/dist-cli/__tests__/session-summarizer.test.js +117 -0
package/dist-cli/__tests__/skill-server.test.js +191 -0
package/dist-cli/__tests__/svd.test.js +112 -0
package/dist-cli/__tests__/tfidf.test.js +88 -0
package/dist-cli/__tests__/tokenizer.test.js +125 -0
package/dist-cli/__tests__/topic-nodes.test.js +184 -0
package/dist-cli/analyze-sessions.js +476 -0
package/dist-cli/cli.js +215 -0
package/dist-cli/knowledge-graph/clustering.js +174 -0
package/dist-cli/knowledge-graph/community.js +220 -0
package/dist-cli/knowledge-graph/constants.js +58 -0
package/dist-cli/knowledge-graph/edges.js +193 -0
package/dist-cli/knowledge-graph/feature-extraction.js +124 -0
package/dist-cli/knowledge-graph/index.js +235 -0
package/dist-cli/knowledge-graph/reusability.js +51 -0
package/dist-cli/knowledge-graph/similarity.js +13 -0
package/dist-cli/knowledge-graph/skill-generator.js +203 -0
package/dist-cli/knowledge-graph/svd.js +195 -0
package/dist-cli/knowledge-graph/tfidf.js +54 -0
package/dist-cli/knowledge-graph/tokenizer.js +66 -0
package/dist-cli/knowledge-graph/tool-pattern.js +173 -0
package/dist-cli/knowledge-graph/topic-nodes.js +199 -0
package/dist-cli/knowledge-graph/types.js +4 -0
package/dist-cli/knowledge-graph-builder.js +27 -0
package/dist-cli/session-parser.js +360 -0
package/dist-cli/session-summarizer.js +133 -0
package/dist-cli/skill-server.js +62 -0
package/dist-cli/skill-synthesizer.js +189 -0
package/package.json +47 -0

package/dist-cli/__tests__/session-summarizer.test.js ADDED Viewed

@@ -0,0 +1,117 @@
+import { describe, it, expect } from "vitest";
+import { generateSessionSummary, classifyWorkType, findCommonPathPrefix, } from "../session-summarizer.js";
+describe("generateSessionSummary", () => { // Suite for the `summary` string returned by generateSessionSummary(prompts, stats).
+    it("single plan mode prompt: returns that prompt as summary", () => {
+        const result = generateSessionSummary([{ userPrompt: "Investigate the login bug", permissionMode: "plan" }], {
+            toolBreakdown: { Read: 5 },
+            filesEdited: [],
+            permissionMode: null, // stats-level mode is null in every case here; only the per-prompt mode varies
+            turnCount: 1,
+        });
+        expect(result.summary).toBe("Investigate the login bug");
+    });
+    it("multiple plan mode prompts: selects the most central one", () => {
+        const result = generateSessionSummary([
+            { userPrompt: "refactor authentication module", permissionMode: "plan" },
+            { userPrompt: "refactor authentication tests", permissionMode: "plan" },
+            { userPrompt: "deploy to staging server", permissionMode: "plan" },
+        ], {
+            toolBreakdown: { Edit: 10 },
+            filesEdited: [],
+            permissionMode: null,
+            turnCount: 3,
+        });
+        // The first two share "refactor" and "authentication", so one of them should be selected
+        // The first prompt gets higher position weight, so it should win
+        expect(result.summary).toBe("refactor authentication module");
+    });
+    it("no plan mode prompts: falls back to all user prompts", () => {
+        const result = generateSessionSummary([
+            { userPrompt: "fix the build error", permissionMode: "code" },
+            { userPrompt: "run the tests", permissionMode: "code" },
+        ], {
+            toolBreakdown: { Bash: 5 },
+            filesEdited: [],
+            permissionMode: null,
+            turnCount: 2,
+        });
+        expect(result.summary.length).toBeGreaterThan(0); // only non-emptiness is pinned; which prompt is chosen is not asserted
+    });
+    it("empty prompts filtered: skips whitespace-only prompts", () => {
+        const result = generateSessionSummary([
+            { userPrompt: "   ", permissionMode: "plan" },
+            { userPrompt: "", permissionMode: "plan" },
+            { userPrompt: "implement feature X", permissionMode: "plan" },
+        ], {
+            toolBreakdown: { Edit: 5 },
+            filesEdited: [],
+            permissionMode: null,
+            turnCount: 3,
+        });
+        expect(result.summary).toBe("implement feature X"); // the only prompt with non-whitespace content
+    });
+    it("summary truncated to 300 chars: long prompt gets cut", () => {
+        const longPrompt = "a".repeat(500);
+        const result = generateSessionSummary([{ userPrompt: longPrompt, permissionMode: "plan" }], {
+            toolBreakdown: {},
+            filesEdited: [],
+            permissionMode: null,
+            turnCount: 1,
+        });
+        expect(result.summary.length).toBe(300); // 500-char input must come back as exactly 300 chars
+    });
+});
+describe("classifyWorkType", () => { // Called here as (tool-name -> count map, permission mode or null, number); the number is presumably the turn count per the test titles — confirm against session-summarizer.js.
+    it("investigation: high read ratio", () => {
+        expect(classifyWorkType({ Read: 10, Grep: 5, Glob: 3, Edit: 1 }, null, 10)).toBe("investigation");
+    });
+    it("implementation: high write ratio", () => {
+        expect(classifyWorkType({ Edit: 10, Write: 3, Read: 5 }, null, 10)).toBe("implementation");
+    });
+    it("debugging: high bash ratio with some writes", () => {
+        expect(classifyWorkType({ Bash: 10, Edit: 3, Read: 2 }, null, 10)).toBe("debugging");
+    });
+    it("planning: plan mode with few turns and no writes", () => {
+        expect(classifyWorkType({ Read: 2 }, "plan", 3)).toBe("planning");
+    });
+    it("planning: empty tool breakdown with few turns", () => {
+        expect(classifyWorkType({}, null, 3)).toBe("planning"); // no tools and no plan mode still yields "planning"
+    });
+});
+describe("findCommonPathPrefix", () => { // Expected results are directory prefixes without a trailing slash, or "" when nothing is shared.
+    it("common prefix: returns shared directory", () => {
+        expect(findCommonPathPrefix(["src/a/b.ts", "src/a/c.ts"])).toBe("src/a");
+    });
+    it("root only: returns empty string", () => {
+        expect(findCommonPathPrefix(["src/a.ts", "lib/b.ts"])).toBe(""); // first segments already differ
+    });
+    it("single file: returns its directory", () => {
+        expect(findCommonPathPrefix(["src/a/b.ts"])).toBe("src/a"); // the file name itself is not part of the prefix
+    });
+    it("empty array: returns empty string", () => {
+        expect(findCommonPathPrefix([])).toBe("");
+    });
+});
+describe("keywords extraction", () => { // Covers the `keywords` array returned by generateSessionSummary.
+    it("extracts top keywords from prompts", () => {
+        const result = generateSessionSummary([
+            { userPrompt: "refactor authentication module component", permissionMode: "plan" },
+            { userPrompt: "refactor authentication service layer", permissionMode: "plan" },
+            { userPrompt: "refactor authentication controller handler", permissionMode: "plan" },
+        ], {
+            toolBreakdown: { Edit: 10 },
+            filesEdited: [],
+            permissionMode: null,
+            turnCount: 3,
+        });
+        expect(result.keywords.length).toBeGreaterThan(0);
+        expect(result.keywords.length).toBeLessThanOrEqual(5); // at most 5 keywords are expected
+        // "refactor" and "authentication" appear in all prompts, should be top keywords
+        expect(result.keywords).toContain("refactor");
+        expect(result.keywords).toContain("authentication");
+        // Every keyword must be non-blank. NOTE(review): stop-word exclusion is not actually asserted by this loop.
+        for (const kw of result.keywords) {
+            expect(kw.trim().length).toBeGreaterThan(0);
+        }
+    });
+});

package/dist-cli/__tests__/skill-server.test.js ADDED Viewed

@@ -0,0 +1,191 @@
+import { describe, it, expect } from "vitest";
+import { buildSynthesisPrompt } from "../skill-synthesizer.js";
+// Minimal mock TopicNode: returns a fresh object with fixed defaults; `overrides` is spread last, so any field can be replaced per test.
+function makeTopicNode(overrides = {}) {
+    return {
+        id: "topic-1",
+        label: "Test Topic",
+        keywords: ["testing", "mock"],
+        dominantRole: "code",
+        projects: ["project-a"],
+        project: "project-a",
+        sessionCount: 5,
+        totalDurationMinutes: 120,
+        totalToolCalls: 50,
+        toolSignature: [{ tool: "Bash", weight: 0.6 }, { tool: "Read", weight: 0.4 }],
+        representativePrompts: ["Run tests", "Check output"],
+        suggestedPrompt: "Run tests and check output",
+        reusabilityScore: { overall: 0.8, frequency: 0.7, timeCost: 0.9, crossProjectScore: 0.5, recency: 0.8 },
+        betweennessCentrality: 0.01, // default pair (0.01 / 0.1) is the one the "peripheral topic" test passes explicitly
+        degreeCentrality: 0.1,
+        ...overrides,
+    };
+}
+// Minimal mock SkillCandidate: fixed defaults, with `overrides` spread last so callers can replace any field.
+function makeSkillCandidate(overrides = {}) {
+    return {
+        topicId: "topic-1", // same id as the default makeTopicNode() result
+        reusabilityScore: 0.8,
+        skillMarkdown: "# Test Skill\nDo the thing.",
+        ...overrides,
+    };
+}
+describe("buildSynthesisPrompt", () => { // NOTE(review): this file is listed as skill-server.test.js, but the suite targets buildSynthesisPrompt imported from ../skill-synthesizer.js.
+    it("should NOT contain Graph Position or Connected Topics without graphContext", () => {
+        const prompt = buildSynthesisPrompt({
+            skillCandidate: makeSkillCandidate(),
+            topicNode: makeTopicNode(),
+        }); // graphContext deliberately omitted
+        expect(prompt).not.toContain("## Graph Position");
+        expect(prompt).not.toContain("## Connected Topics");
+    });
+    it("should contain bridge interpretation for high betweenness centrality", () => {
+        const prompt = buildSynthesisPrompt({
+            skillCandidate: makeSkillCandidate(),
+            topicNode: makeTopicNode({ betweennessCentrality: 0.25 }),
+            graphContext: {
+                connectedTopics: [],
+                isBridgeTopic: true,
+            },
+        });
+        expect(prompt).toContain("## Graph Position");
+        expect(prompt).toContain("critical bridge topic connecting multiple knowledge domains");
+        expect(prompt).toContain("bridge topic in the knowledge graph"); // this case is the only one that sets isBridgeTopic: true
+    });
+    it("should contain bridge interpretation for moderate betweenness", () => {
+        const prompt = buildSynthesisPrompt({
+            skillCandidate: makeSkillCandidate(),
+            topicNode: makeTopicNode({ betweennessCentrality: 0.1 }),
+            graphContext: {
+                connectedTopics: [],
+                isBridgeTopic: false,
+            },
+        });
+        expect(prompt).toContain("bridges several knowledge domains");
+    });
+    it("should contain hub interpretation for high degree centrality", () => {
+        const prompt = buildSynthesisPrompt({
+            skillCandidate: makeSkillCandidate(),
+            topicNode: makeTopicNode({ betweennessCentrality: 0.01, degreeCentrality: 0.6 }),
+            graphContext: {
+                connectedTopics: [],
+                isBridgeTopic: false,
+            },
+        });
+        expect(prompt).toContain("hub topic connected to many other topics");
+    });
+    it("should contain isolated interpretation for zero degree centrality", () => {
+        const prompt = buildSynthesisPrompt({
+            skillCandidate: makeSkillCandidate(),
+            topicNode: makeTopicNode({ betweennessCentrality: 0.01, degreeCentrality: 0 }),
+            graphContext: {
+                connectedTopics: [],
+                isBridgeTopic: false,
+            },
+        });
+        expect(prompt).toContain("isolated topic with no connections");
+    });
+    it("should contain peripheral interpretation for low centrality values", () => {
+        const prompt = buildSynthesisPrompt({
+            skillCandidate: makeSkillCandidate(),
+            topicNode: makeTopicNode({ betweennessCentrality: 0.01, degreeCentrality: 0.1 }),
+            graphContext: {
+                connectedTopics: [],
+                isBridgeTopic: false,
+            },
+        });
+        expect(prompt).toContain("peripheral topic");
+    });
+    it("should contain Prerequisite and Follow-up for workflow-continuation edges", () => {
+        const prompt = buildSynthesisPrompt({
+            skillCandidate: makeSkillCandidate(),
+            topicNode: makeTopicNode(),
+            graphContext: {
+                connectedTopics: [
+                    {
+                        id: "topic-2",
+                        label: "Setup Environment",
+                        keywords: ["setup", "env"],
+                        edgeType: "workflow-continuation",
+                        strength: 0.85,
+                        direction: "incoming", // incoming workflow edge is expected to render as "Prerequisite"
+                    },
+                    {
+                        id: "topic-3",
+                        label: "Deploy App",
+                        keywords: ["deploy", "production"],
+                        edgeType: "workflow-continuation",
+                        strength: 0.75,
+                        direction: "outgoing", // outgoing workflow edge is expected to render as "Follow-up"
+                    },
+                ],
+                isBridgeTopic: false,
+            },
+        });
+        expect(prompt).toContain("## Connected Topics");
+        expect(prompt).toContain("Prerequisite: Setup Environment [setup, env] (strength: 0.85)");
+        expect(prompt).toContain("Follow-up: Deploy App [deploy, production] (strength: 0.75)");
+        expect(prompt).toContain("requires"); // loose substring checks: these three words only need to appear somewhere in the prompt
+        expect(prompt).toContain("next");
+        expect(prompt).toContain("frontmatter");
+    });
+    it("should contain all edge type groups with mixed edge types", () => {
+        const prompt = buildSynthesisPrompt({
+            skillCandidate: makeSkillCandidate(),
+            topicNode: makeTopicNode(),
+            graphContext: {
+                connectedTopics: [
+                    {
+                        id: "t-wf",
+                        label: "Workflow Prev",
+                        keywords: ["wf"],
+                        edgeType: "workflow-continuation",
+                        strength: 0.9,
+                        direction: "incoming",
+                    },
+                    {
+                        id: "t-sm",
+                        label: "Shared Module Topic",
+                        keywords: ["shared"],
+                        edgeType: "shared-module",
+                        strength: 0.7,
+                        direction: "outgoing",
+                    },
+                    {
+                        id: "t-cp",
+                        label: "Cross Project Topic",
+                        keywords: ["cross"],
+                        edgeType: "cross-project-bridge",
+                        strength: 0.6,
+                        direction: "outgoing",
+                    },
+                    {
+                        id: "t-ss",
+                        label: "Similar Topic",
+                        keywords: ["similar"],
+                        edgeType: "semantic-similarity",
+                        strength: 0.5,
+                        direction: "outgoing",
+                    },
+                ],
+                isBridgeTopic: false,
+            },
+        });
+        expect(prompt).toContain("Prerequisite: Workflow Prev [wf] (strength: 0.9)"); // one expected line prefix per edgeType above
+        expect(prompt).toContain("Related (shared files): Shared Module Topic [shared]");
+        expect(prompt).toContain("Cross-project link: Cross Project Topic [cross]");
+        expect(prompt).toContain("Similar topic (differentiate from): Similar Topic [similar]");
+    });
+    it("should contain community label and member count", () => {
+        const prompt = buildSynthesisPrompt({
+            skillCandidate: makeSkillCandidate(),
+            topicNode: makeTopicNode(),
+            graphContext: {
+                connectedTopics: [],
+                community: { label: "Frontend Development", memberCount: 12 },
+                isBridgeTopic: false,
+            },
+        });
+        expect(prompt).toContain("Belongs to community: Frontend Development (12 topics)");
+    });
+});

package/dist-cli/__tests__/svd.test.js ADDED Viewed

@@ -0,0 +1,112 @@
+import { describe, it, expect } from "vitest";
+import { buildCombinedMatrix, truncatedSvd, } from "../knowledge-graph-builder.js";
+describe("buildCombinedMatrix", () => { // Checks the shape and per-segment scaling of the { matrix, totalDim } result.
+    it("output has correct dimensions (totalDim = textDim + toolDim + structDim, rows = sessionIds.length)", () => {
+        const sessionIds = ["s1", "s2", "s3"];
+        const textDim = 4;
+        const toolDim = 3;
+        const structDim = 2;
+        const textVectors = new Map([
+            ["s1", new Float64Array([1, 0, 0, 1])],
+            ["s2", new Float64Array([0, 1, 1, 0])],
+            ["s3", new Float64Array([1, 1, 0, 0])],
+        ]);
+        const toolVectors = new Map([
+            ["s1", new Float64Array([1, 0, 0])],
+            ["s2", new Float64Array([0, 1, 0])],
+            ["s3", new Float64Array([0, 0, 1])],
+        ]);
+        const structVectors = new Map([
+            ["s1", new Float64Array([1, 0])],
+            ["s2", new Float64Array([0, 1])],
+            ["s3", new Float64Array([1, 1])],
+        ]);
+        const { matrix, totalDim } = buildCombinedMatrix(sessionIds, textVectors, toolVectors, structVectors, textDim, toolDim, structDim);
+        expect(totalDim).toBe(textDim + toolDim + structDim);
+        expect(matrix.length).toBe(sessionIds.length); // one row per session id
+        for (const row of matrix) {
+            expect(row.length).toBe(totalDim);
+        }
+    });
+    it("weights are applied: text portion scaled by sqrt(0.5), tool/struct by sqrt(0.25)", () => {
+        const sessionIds = ["s1"];
+        const textDim = 2;
+        const toolDim = 2;
+        const structDim = 2;
+        const textVectors = new Map([
+            ["s1", new Float64Array([1, 2])],
+        ]);
+        const toolVectors = new Map([
+            ["s1", new Float64Array([3, 4])],
+        ]);
+        const structVectors = new Map([
+            ["s1", new Float64Array([5, 6])],
+        ]);
+        const { matrix } = buildCombinedMatrix(sessionIds, textVectors, toolVectors, structVectors, textDim, toolDim, structDim);
+        const row = matrix[0];
+        const sqrtText = Math.sqrt(0.5); // expected scale factors are hard-coded here, not read from the module
+        const sqrtTool = Math.sqrt(0.25);
+        const sqrtStruct = Math.sqrt(0.25);
+        // Text portion (indices 0-1)
+        expect(row[0]).toBeCloseTo(1 * sqrtText);
+        expect(row[1]).toBeCloseTo(2 * sqrtText);
+        // Tool portion (indices 2-3)
+        expect(row[2]).toBeCloseTo(3 * sqrtTool);
+        expect(row[3]).toBeCloseTo(4 * sqrtTool);
+        // Struct portion (indices 4-5)
+        expect(row[4]).toBeCloseTo(5 * sqrtStruct);
+        expect(row[5]).toBeCloseTo(6 * sqrtStruct);
+    });
+});
+describe("truncatedSvd", () => { // Checks result.k, result.sigma and result.sessionVectors for a fixed 5x10 input.
+    // Helper: create a test matrix with 5 sessions and 10 features
+    function makeTestData() {
+        const sessionIds = ["s1", "s2", "s3", "s4", "s5"];
+        const totalDim = 10;
+        const matrix = [];
+        // Create distinct patterns so SVD has structure to find
+        const patterns = [
+            [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
+            [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
+            [1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
+            [0, 0, 1, 1, 0, 0, 1, 1, 0, 0],
+            [1, 0.5, 0.5, 0, 1, 0.5, 0.5, 0, 1, 0.5],
+        ];
+        for (const p of patterns) {
+            matrix.push(new Float64Array(p)); // rows are Float64Array, in the same order as sessionIds
+        }
+        return { sessionIds, matrix, totalDim };
+    }
+    it("5 sessions of 10-dim features, k=3: returns sessionVectors with 3 dimensions", () => {
+        const { sessionIds, matrix, totalDim } = makeTestData();
+        const result = truncatedSvd(sessionIds, matrix, totalDim, 3); // last argument is the requested rank k
+        expect(result.k).toBe(3);
+        expect(result.sigma.length).toBe(3);
+        expect(result.sessionVectors.size).toBe(5); // sessionVectors is iterated as [key, vector] pairs below
+        for (const [, vec] of result.sessionVectors) {
+            expect(vec.length).toBe(3);
+        }
+    });
+    it("singular values are non-negative and in descending order", () => {
+        const { sessionIds, matrix, totalDim } = makeTestData();
+        const result = truncatedSvd(sessionIds, matrix, totalDim, 3);
+        for (let i = 0; i < result.sigma.length; i++) {
+            expect(result.sigma[i]).toBeGreaterThanOrEqual(0);
+        }
+        for (let i = 1; i < result.sigma.length; i++) {
+            expect(result.sigma[i - 1]).toBeGreaterThanOrEqual(result.sigma[i]); // non-increasing; ties are allowed
+        }
+    });
+    it("sessionVectors are L2-normalized (magnitude ~= 1.0)", () => {
+        const { sessionIds, matrix, totalDim } = makeTestData();
+        const result = truncatedSvd(sessionIds, matrix, totalDim, 3);
+        for (const [, vec] of result.sessionVectors) {
+            let norm = 0;
+            for (let i = 0; i < vec.length; i++) {
+                norm += vec[i] * vec[i];
+            }
+            norm = Math.sqrt(norm);
+            expect(norm).toBeCloseTo(1.0, 4); // second argument = number of decimal digits compared
+        }
+    });
+});

package/dist-cli/__tests__/tfidf.test.js ADDED Viewed

@@ -0,0 +1,88 @@
+import { describe, it, expect } from "vitest";
+import { buildTfidf } from "../knowledge-graph-builder.js";
+describe("buildTfidf", () => { // Input is a Map of doc id -> token array; the result exposes vocabulary, vocabIndex and vectors.
+    it("excludes terms that appear in only 1 document", () => {
+        const documents = new Map();
+        documents.set("doc1", ["alpha", "beta", "gamma"]);
+        documents.set("doc2", ["alpha", "beta", "delta"]);
+        documents.set("doc3", ["alpha", "gamma", "epsilon"]);
+        const result = buildTfidf(documents);
+        // n=3, maxDf = max(2, floor(3*0.8)) = max(2,2) = 2
+        // "alpha" df=3 > maxDf(2) => excluded (NOTE(review): stated here but not asserted below)
+        // "beta" df=2, "gamma" df=2 => kept (>=2 and <=2)
+        // "delta" df=1, "epsilon" df=1 => excluded (< 2)
+        expect(result.vocabulary).toContain("beta");
+        expect(result.vocabulary).toContain("gamma");
+        expect(result.vocabulary).not.toContain("delta");
+        expect(result.vocabulary).not.toContain("epsilon");
+    });
+    it("excludes terms appearing in >80% of docs when applicable", () => {
+        // With 10 docs, maxDf = max(2, floor(10*0.8)) = 8
+        const documents = new Map();
+        for (let i = 0; i < 10; i++) {
+            const tokens = ["ubiquitous"]; // appears in all 10
+            if (i < 5)
+                tokens.push("common"); // appears in 5 (<=8, >=2) => kept
+            if (i < 2)
+                tokens.push("rare"); // appears in 2 (>=2, <=8) => kept
+            documents.set(`doc${i}`, tokens);
+        }
+        const result = buildTfidf(documents);
+        // "ubiquitous" in 10 docs > maxDf(8) => excluded
+        expect(result.vocabulary).not.toContain("ubiquitous");
+        // "common" in 5 docs => kept
+        expect(result.vocabulary).toContain("common");
+        // "rare" in 2 docs => kept
+        expect(result.vocabulary).toContain("rare");
+    });
+    it("produces L2-normalized vectors (dot product with self ~= 1.0)", () => {
+        const documents = new Map();
+        documents.set("doc1", ["foo", "bar", "baz"]);
+        documents.set("doc2", ["foo", "bar", "qux"]);
+        documents.set("doc3", ["foo", "baz", "qux"]);
+        const result = buildTfidf(documents);
+        for (const [, vec] of result.vectors) {
+            let dotProduct = 0;
+            for (let i = 0; i < vec.length; i++) {
+                dotProduct += vec[i] * vec[i];
+            }
+            // If the vector is non-zero it should be normalized to 1
+            if (dotProduct > 0) {
+                expect(dotProduct).toBeCloseTo(1.0, 10); // compared to 10 decimal digits; all-zero vectors are skipped
+            }
+        }
+    });
+    it("gives rare term higher IDF weight than common term", () => {
+        const documents = new Map();
+        // "common" in 4 of 5 docs, "rare" in 2 of 5 docs
+        documents.set("doc1", ["common", "rare"]);
+        documents.set("doc2", ["common", "rare"]);
+        documents.set("doc3", ["common", "filler"]);
+        documents.set("doc4", ["common", "filler"]);
+        documents.set("doc5", ["filler", "filler"]);
+        const result = buildTfidf(documents);
+        // Both "common" (df=4) and "rare" (df=2) should be in vocabulary
+        // maxDf = max(2, floor(5*0.8)) = max(2,4) = 4, so common (df=4) is kept
+        expect(result.vocabulary).toContain("common");
+        expect(result.vocabulary).toContain("rare");
+        // Check IDF: log(5/2) > log(5/4) => rare's weight > common's weight
+        // Look at doc1 which has both terms once each => TF is same => difference is purely IDF
+        const vec = result.vectors.get("doc1");
+        const rareIdx = result.vocabIndex.get("rare"); // vocabIndex maps a term to its column in each vector
+        const commonIdx = result.vocabIndex.get("common");
+        // Before normalization, rare would have higher raw value.
+        // After L2-normalization, the ratio is preserved, so the rare component should be larger.
+        expect(vec[rareIdx]).toBeGreaterThan(vec[commonIdx]);
+    });
+    it("produces zero vector for empty token list", () => {
+        const documents = new Map();
+        documents.set("doc1", ["foo", "bar"]);
+        documents.set("doc2", ["foo", "bar"]);
+        documents.set("doc3", []);
+        const result = buildTfidf(documents);
+        const vec = result.vectors.get("doc3"); // a vector must exist even for the empty document
+        for (let i = 0; i < vec.length; i++) {
+            expect(vec[i]).toBe(0);
+        }
+    });
+});

package/dist-cli/__tests__/tokenizer.test.js ADDED Viewed

@@ -0,0 +1,125 @@
+import { describe, it, expect } from "vitest";
+import { splitCamelCase, extractPathTokens, isNoiseToken, tokenize, } from "../knowledge-graph-builder.js";
+describe("splitCamelCase", () => { // Every expected part is lower-cased, and the result is always an array.
+    it("splits camelCase into lowercase parts", () => {
+        expect(splitCamelCase("camelCase")).toEqual(["camel", "case"]);
+    });
+    it("splits PascalCase with uppercase acronym prefix", () => {
+        expect(splitCamelCase("XMLParser")).toEqual(["xml", "parser"]); // the acronym stays together as one part
+    });
+    it("returns single-element array for lowercase word", () => {
+        expect(splitCamelCase("lowercase")).toEqual(["lowercase"]);
+    });
+    it("splits multiple humps", () => {
+        expect(splitCamelCase("myVariableName")).toEqual([
+            "my",
+            "variable",
+            "name",
+        ]);
+    });
+    it("handles all-uppercase word", () => {
+        const result = splitCamelCase("HTML");
+        expect(result).toEqual(["html"]); // not split per letter
+    });
+});
+describe("extractPathTokens", () => { // Takes free text and returns lower-cased tokens taken from any file paths in it.
+    it("extracts tokens from a file path", () => {
+        const tokens = extractPathTokens("/src/components/App.tsx");
+        expect(tokens).toContain("src");
+        expect(tokens).toContain("components");
+        expect(tokens).toContain("app");
+        // Extension should be stripped
+        expect(tokens).not.toContain("tsx");
+    });
+    it("returns empty array when text has no paths", () => {
+        expect(extractPathTokens("just some regular text")).toEqual([]);
+    });
+    it("extracts tokens from multiple paths in text", () => {
+        const tokens = extractPathTokens("Edited /src/utils/helpers.ts and /lib/core/Engine.ts");
+        expect(tokens).toContain("src");
+        expect(tokens).toContain("utils");
+        expect(tokens).toContain("helpers");
+        expect(tokens).toContain("lib");
+        expect(tokens).toContain("core");
+        expect(tokens).toContain("engine");
+    });
+    it("skips short path segments (<=2 chars)", () => {
+        const tokens = extractPathTokens("/a/b/component.ts"); // NOTE(review): only 1-char segments are exercised; the 2-char boundary in the title is not covered
+        expect(tokens).not.toContain("a");
+        expect(tokens).not.toContain("b");
+        expect(tokens).toContain("component");
+    });
+});
+describe("isNoiseToken", () => { // Boolean predicate: true for UUIDs, long hex strings, pure numbers and very long tokens.
+    it("returns true for UUID strings", () => {
+        expect(isNoiseToken("550e8400-e29b-41d4-a716-446655440000")).toBe(true);
+    });
+    it("returns true for hex strings of 6+ chars", () => {
+        expect(isNoiseToken("abcdef12")).toBe(true); // 8 hex chars; the exact 6-char boundary is not exercised
+    });
+    it("returns true for pure numbers", () => {
+        expect(isNoiseToken("12345")).toBe(true);
+    });
+    it("returns true for extremely long tokens (>40 chars)", () => {
+        expect(isNoiseToken("a".repeat(41))).toBe(true); // 41 chars, the first length past the limit named in the title
+    });
+    it("returns false for normal words", () => {
+        expect(isNoiseToken("valid")).toBe(false);
+    });
+    it("returns false for short hex-like strings (<6 chars)", () => {
+        expect(isNoiseToken("abc")).toBe(false); // 3 hex-like chars; a 5-char case is not exercised
+    });
+});
+describe("tokenize", () => { // End-to-end checks on the token array that tokenize(text) returns.
+    it("tokenizes text with camelCase words", () => {
+        const tokens = tokenize("refactor camelCaseFunction");
+        expect(tokens).toContain("refactor");
+        expect(tokens).toContain("camel");
+        expect(tokens).toContain("case");
+        expect(tokens).toContain("function");
+    });
+    it("tokenizes text containing file paths", () => {
+        const tokens = tokenize("edited /src/components/App.tsx");
+        expect(tokens).toContain("src");
+        expect(tokens).toContain("components");
+        expect(tokens).toContain("app");
+    });
+    it("excludes stop words", () => {
+        const tokens = tokenize("the quick brown fox in the forest");
+        expect(tokens).not.toContain("the");
+        expect(tokens).not.toContain("in");
+        expect(tokens).toContain("quick");
+        expect(tokens).toContain("brown");
+        expect(tokens).toContain("fox");
+        expect(tokens).toContain("forest");
+    });
+    it("skips URLs", () => {
+        const tokens = tokenize("visit http://example.com/path for details");
+        // tokenize skips words starting with "http" but URL parts split by / may remain
+        expect(tokens).not.toContain("http");
+        expect(tokens).not.toContain("http://example.com/path"); // fragments such as "example" are deliberately not asserted either way
+    });
+    it("handles kebab-case and snake_case", () => {
+        const tokens = tokenize("my-component some_variable");
+        expect(tokens).toContain("component");
+        // "some" is a stop word, so it's excluded
+        expect(tokens).not.toContain("some");
+        expect(tokens).toContain("variable");
+    });
+    it("handles Japanese text without crashing", () => {
+        expect(() => tokenize("セッションの分析を実行する")).not.toThrow(); // only checks that no exception is thrown; the output is not inspected
+    });
+    it("filters out noise tokens", () => {
+        const tokens = tokenize("commit 550e8400-e29b-41d4-a716-446655440000 was good");
+        expect(tokens).not.toContain("550e8400-e29b-41d4-a716-446655440000");
+        expect(tokens).toContain("commit");
+        expect(tokens).toContain("good");
+    });
+    it("filters tokens with 2 or fewer characters", () => {
+        const tokens = tokenize("I am ok to go"); // NOTE(review): some of these words may also be stop words, so this may not isolate the length filter — confirm
+        expect(tokens).not.toContain("am");
+        expect(tokens).not.toContain("ok");
+        expect(tokens).not.toContain("to");
+        expect(tokens).not.toContain("go");
+    });
+});