@pentatonic-ai/ai-agent-sdk 0.5.11 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/README.md +345 -174
  2. package/bin/__tests__/callback-server.test.js +70 -0
  3. package/bin/__tests__/credentials.test.js +58 -0
  4. package/bin/__tests__/login.test.js +210 -0
  5. package/bin/__tests__/pkce.test.js +39 -0
  6. package/bin/__tests__/whoami.test.js +77 -0
  7. package/bin/cli.js +109 -440
  8. package/bin/commands/config.js +251 -0
  9. package/bin/commands/login.js +219 -0
  10. package/bin/commands/whoami.js +41 -0
  11. package/bin/lib/callback-server.js +137 -0
  12. package/bin/lib/credentials.js +100 -0
  13. package/bin/lib/pkce.js +26 -0
  14. package/package.json +4 -2
  15. package/packages/doctor/__tests__/detect.test.js +2 -6
  16. package/packages/doctor/src/checks/local-memory.js +164 -196
  17. package/packages/doctor/src/detect.js +11 -3
  18. package/packages/memory/src/__tests__/corpus-chunkers.test.js +143 -0
  19. package/packages/memory/src/__tests__/corpus-discover.test.js +175 -0
  20. package/packages/memory/src/__tests__/corpus-ingest.test.js +236 -0
  21. package/packages/memory/src/__tests__/corpus-signatures.test.js +175 -0
  22. package/packages/memory/src/__tests__/corpus-state.test.js +161 -0
  23. package/packages/memory/src/__tests__/ingest-corpus-opts.test.js +129 -0
  24. package/packages/memory/src/__tests__/search-kind.test.js +108 -0
  25. package/packages/memory/src/corpus/adapters.js +398 -0
  26. package/packages/memory/src/corpus/chunkers.js +328 -0
  27. package/packages/memory/src/corpus/cli.js +613 -0
  28. package/packages/memory/src/corpus/discover.js +379 -0
  29. package/packages/memory/src/corpus/index.js +68 -0
  30. package/packages/memory/src/corpus/ingest.js +356 -0
  31. package/packages/memory/src/corpus/signatures.js +280 -0
  32. package/packages/memory/src/corpus/state.js +134 -0
  33. package/packages/memory/src/index.js +18 -0
  34. package/packages/memory/src/ingest.js +20 -11
  35. package/packages/memory/src/openclaw/index.js +39 -1
  36. package/packages/memory/src/search.js +30 -7
  37. package/packages/memory-engine/.env.example +13 -0
  38. package/packages/memory-engine/README.md +131 -0
  39. package/packages/memory-engine/bench/README.md +99 -0
  40. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  41. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  42. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  43. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  44. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  45. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  49. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  50. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  51. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  60. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  61. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  62. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  63. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  66. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  67. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  68. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  69. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  78. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  79. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  80. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  81. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  82. package/packages/memory-engine/compat/Dockerfile +11 -0
  83. package/packages/memory-engine/compat/server.py +680 -0
  84. package/packages/memory-engine/docker-compose.yml +243 -0
  85. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  86. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  87. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  88. package/packages/memory-engine/engine/README.md +52 -0
  89. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  90. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  91. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  92. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  93. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  94. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  95. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  96. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  97. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  98. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  99. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  100. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  101. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  102. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  103. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  104. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  105. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  106. package/packages/memory-engine/pme_memory/embed.py +74 -0
  107. package/packages/memory-engine/pme_memory/health.py +36 -0
  108. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  109. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  110. package/packages/memory-engine/pme_memory/needs.py +55 -0
  111. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  112. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  113. package/packages/memory-engine/pme_memory/search.py +52 -0
  114. package/packages/memory-engine/pme_memory/store.py +86 -0
  115. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  116. package/packages/memory-engine/pyproject.toml +65 -0
  117. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  118. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  119. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,143 @@
1
+ /**
2
+ * Tests for file chunking. Code, prose, structured, and the fallback
3
+ * sliding window. Verifies metadata fidelity (line ranges, heading
4
+ * paths, chunk indices) — these power the "from README.md > Setup"
5
+ * surfacing on retrieval.
6
+ */
7
+
8
+ import { chunkFile } from "../corpus/chunkers.js";
9
+
10
describe("chunkFile — small files", () => {
  // A short markdown note is expected to come back as exactly one chunk
  // whose metadata marks it as chunk 0 of 1 with kind "prose".
  it("returns one chunk for a small file", () => {
    const chunks = chunkFile({
      relPath: "tiny.md",
      ext: ".md",
      content: "# Hi\n\nShort note.",
    });
    expect(chunks).toHaveLength(1);

    const { metadata } = chunks[0];
    expect(metadata.chunk_index).toBe(0);
    expect(metadata.total_chunks).toBe(1);
    expect(metadata.kind).toBe("prose");
  });

  // Both a zero-length string and a whitespace-only string must yield no chunks.
  it("returns empty array for empty content", () => {
    const blankInputs = ["", " \n "];
    for (const content of blankInputs) {
      expect(chunkFile({ relPath: "x.txt", ext: ".txt", content })).toEqual([]);
    }
  });
});
28
+
29
describe("chunkFile — markdown", () => {
  /**
   * Build a markdown document with one top-level heading and ten
   * second-level sections, each carrying a long paragraph.
   * @returns {string} the generated markdown text
   */
  function bigMd() {
    const sections = Array.from(
      { length: 10 },
      (_, i) => `\n## Section ${i}\n\n${"Paragraph content. ".repeat(80)}\n`
    );
    return ["# Top\n\nIntro.\n", ...sections].join("");
  }

  it("splits on headings and tracks heading_path", () => {
    const chunks = chunkFile({
      relPath: "doc.md",
      ext: ".md",
      content: bigMd(),
    });
    expect(chunks.length).toBeGreaterThan(1);

    // Every chunk is prose and carries a string heading path.
    chunks.forEach(({ metadata }) => {
      expect(metadata.kind).toBe("prose");
      expect(typeof metadata.heading_path).toBe("string");
    });

    // At least one chunk hangs off "Top > Section N".
    const nestedHeading = /^Top > Section \d+$/;
    const hasNested = chunks.some(({ metadata }) =>
      nestedHeading.test(metadata.heading_path)
    );
    expect(hasNested).toBe(true);
  });

  it("records total_chunks consistently", () => {
    const chunks = chunkFile({ relPath: "doc.md", ext: ".md", content: bigMd() });
    const expectedTotal = chunks[0].metadata.total_chunks;
    expect(expectedTotal).toBe(chunks.length);
    chunks.forEach(({ metadata }) => {
      expect(metadata.total_chunks).toBe(expectedTotal);
    });
  });
});
65
+
66
describe("chunkFile — code", () => {
  /**
   * Generate synthetic source text: one single-line function per row,
   * with an empty line inserted after every fifth function.
   * @param {number} [lines=200] number of function declarations to emit
   * @returns {string} newline-joined source text
   */
  function bigCode(lines = 200) {
    return Array.from({ length: lines }, (_, i) => {
      const declaration = `function fn${i}() { return ${i}; }`;
      return i % 5 === 4 ? [declaration, ""] : [declaration];
    })
      .flat()
      .join("\n");
  }

  it("emits multiple chunks for large code files", () => {
    const chunks = chunkFile({
      relPath: "big.ts",
      ext: ".ts",
      content: bigCode(800),
    });
    expect(chunks.length).toBeGreaterThan(1);
    chunks.forEach(({ metadata }) => {
      expect(metadata.kind).toBe("code");
      expect(metadata.ext).toBe(".ts");
      expect(typeof metadata.line_start).toBe("number");
      expect(typeof metadata.line_end).toBe("number");
      expect(metadata.line_end).toBeGreaterThanOrEqual(metadata.line_start);
    });
  });

  it("snaps chunk boundaries to blank lines when nearby", () => {
    const chunks = chunkFile({
      relPath: "snap.ts",
      ext: ".ts",
      content: bigCode(400),
    });
    // NOTE(review): despite the test name, this only verifies that at
    // least one non-final chunk exists; nothing here inspects where the
    // chunk boundaries actually fall. Consider asserting on the last
    // line of each non-final chunk once the intended contract is confirmed.
    const allButLast = chunks.slice(0, -1);
    expect(allButLast.length).toBeGreaterThan(0);
  });

  it("preserves chunk_index sequence", () => {
    const chunks = chunkFile({
      relPath: "seq.ts",
      ext: ".ts",
      content: bigCode(800),
    });
    for (const [position, chunk] of chunks.entries()) {
      expect(chunk.metadata.chunk_index).toBe(position);
    }
  });
});
112
+
113
describe("chunkFile — fallback", () => {
  // A large CSV (an extension with no dedicated chunker, per the test
  // name) must still produce at least one chunk with a recognised kind.
  it("uses sliding window for unknown extensions", () => {
    const csvBody = `col1,col2,col3\n${"1,2,3\n".repeat(2000)}`;
    const chunks = chunkFile({
      relPath: "data.csv",
      ext: ".csv",
      content: csvBody,
    });
    expect(chunks.length).toBeGreaterThan(0);

    const [first] = chunks;
    expect(first.metadata.kind).toMatch(/text|config|prose|code/);
  });
});
124
+
125
describe("chunkFile — security: chunk content is just text", () => {
  // Defense in depth: secrets are filtered by discovery, not by the
  // chunker. The chunker's job is to never mangle content (for example
  // by stripping characters that look like markup), because that would
  // silently lose meaning.
  it("preserves the original content byte-for-byte across chunks", () => {
    const content = [
      "# Doc\n\n",
      "x".repeat(5000),
      "\n\n## Section\n\nMore content ",
      "y".repeat(3000),
    ].join("");
    const chunks = chunkFile({ relPath: "x.md", ext: ".md", content });

    // NOTE(review): the assertion below is a length lower bound (95% of
    // the trimmed input), not a byte-for-byte comparison as the test
    // name suggests. Overlap between chunks can only add to the total.
    const totalLength = chunks.reduce(
      (joined, chunk) => joined + chunk.content,
      ""
    ).length;
    const minimumLength = content.trim().length * 0.95;
    expect(totalLength).toBeGreaterThanOrEqual(minimumLength);
  });
});
@@ -0,0 +1,175 @@
1
+ /**
2
+ * Tests for corpus discovery — the file walker, ignore-rule handling,
3
+ * and the security exclusions.
4
+ *
5
+ * Critical guarantee: NO secret-pattern file may EVER be yielded by
6
+ * discover(), even if .gitignore explicitly un-ignores it. This is the
7
+ * "Uber engineer" expectation — the credential walker is paranoid.
8
+ */
9
+
10
+ import { promises as fsp } from "node:fs";
11
+ import { join } from "node:path";
12
+ import { tmpdir } from "node:os";
13
+ import { mkdtemp, mkdir, writeFile } from "node:fs/promises";
14
+
15
+ import { discover, isPathEligible } from "../corpus/discover.js";
16
+
17
/**
 * Create a throwaway repository under the OS temp directory containing
 * ordinary source files, a node_modules tree, a set of secret-looking
 * files, and a .gitignore that tries to exclude src/util.ts and
 * re-include .env and secrets/.
 *
 * @returns {Promise<string>} absolute path of the fixture root
 */
async function makeFixture() {
  const root = await mkdtemp(join(tmpdir(), "tes-discover-"));
  const inRoot = (...segments) => join(root, ...segments);

  const directories = [
    ["src"],
    ["node_modules", "lodash"],
    ["secrets"],
    [".aws"],
    [".ssh"],
  ];
  for (const segments of directories) {
    await mkdir(inRoot(...segments), { recursive: true });
  }

  const files = [
    [["src", "index.ts"], "export const x = 1;\n"],
    [["src", "util.ts"], "// util\nexport const y = 2;\n"],
    [["README.md"], "# Project\n\nHello.\n"],
    [["package.json"], '{"name":"x"}'],
    [["node_modules", "lodash", "index.js"], "module.exports = {};"],

    // Files that MUST NEVER be returned
    [[".env"], "API_KEY=should_never_leak\n"],
    [[".env.local"], "DB_URL=secret\n"],
    [["id_rsa"], "PRIVATE KEY DATA\n"],
    [["server.pem"], "CERT DATA\n"],
    [["secrets", "api.json"], '{"key":"abc"}'],
    [[".aws", "credentials"], "[default]\naws_access_key_id=AKIA..."],
    [[".ssh", "config"], "Host github\n User git"],
    [["service-account.json"], '{"private_key":"..."}'],

    // .gitignore that tries to exclude src/util.ts and re-include .env
    [[".gitignore"], "src/util.ts\n!.env\n!secrets/\n"],
  ];
  for (const [segments, body] of files) {
    await writeFile(inRoot(...segments), body);
  }

  return root;
}
52
+
53
/**
 * Drain the async iterable returned by discover() into an array.
 *
 * @param {string} repoRoot - directory handed to discover()
 * @param {object} [opts={}] - options forwarded unchanged to discover()
 * @returns {Promise<Array>} every yielded entry, in yield order
 */
async function collectAll(repoRoot, opts = {}) {
  const collected = [];
  for await (const entry of discover(repoRoot, opts)) {
    collected.push(entry);
  }
  return collected;
}
58
+
59
describe("discover", () => {
  let repoRoot;

  // Walk the shared fixture and return just the relative paths yielded.
  const discoveredPaths = async (opts) => {
    const files = await collectAll(repoRoot, opts);
    return files.map((file) => file.relPath);
  };

  beforeAll(async () => {
    repoRoot = await makeFixture();
  });

  afterAll(async () => {
    await fsp.rm(repoRoot, { recursive: true, force: true });
  });

  it("yields code, prose, and config files", async () => {
    const paths = (await discoveredPaths()).sort();
    for (const expected of ["src/index.ts", "README.md", "package.json"]) {
      expect(paths).toContain(expected);
    }
  });

  it("NEVER yields .env files even if .gitignore un-ignores them", async () => {
    const paths = await discoveredPaths();
    for (const forbidden of [".env", ".env.local"]) {
      expect(paths).not.toContain(forbidden);
    }
  });

  it("NEVER yields private key or cert files", async () => {
    const paths = await discoveredPaths();
    for (const forbidden of ["id_rsa", "server.pem"]) {
      expect(paths).not.toContain(forbidden);
    }
  });

  it("NEVER yields files inside .aws/ or .ssh/", async () => {
    const paths = await discoveredPaths();
    paths.forEach((p) => {
      expect(p).not.toMatch(/^\.aws\//);
      expect(p).not.toMatch(/^\.ssh\//);
    });
  });

  it("NEVER yields files in secrets/ even with .gitignore !secrets/", async () => {
    const paths = await discoveredPaths();
    const leaked = paths.some((p) => p.startsWith("secrets/"));
    expect(leaked).toBe(false);
  });

  it("excludes service-account JSON heuristically", async () => {
    const paths = await discoveredPaths();
    expect(paths).not.toContain("service-account.json");
  });

  it("skips node_modules at directory level", async () => {
    const paths = await discoveredPaths();
    const vendored = paths.some((p) => p.startsWith("node_modules/"));
    expect(vendored).toBe(false);
  });

  it("respects .gitignore for non-secret paths", async () => {
    const paths = await discoveredPaths();
    expect(paths).not.toContain("src/util.ts");
  });

  it("can skip .gitignore honoring when asked", async () => {
    const paths = await discoveredPaths({ honorGitignore: false });
    expect(paths).toContain("src/util.ts");
    // But hard-excludes still apply
    expect(paths).not.toContain(".env");
  });

  it("attaches a content hash and size to each file", async () => {
    const files = await collectAll(repoRoot);
    files.forEach(({ hash, size }) => {
      // 64 lowercase hex characters
      expect(hash).toMatch(/^[0-9a-f]{64}$/);
      expect(typeof size).toBe("number");
      expect(size).toBeGreaterThan(0);
    });
  });

  it("emits warnings for hard-excluded files (visibility)", async () => {
    const warnings = [];
    const onWarning = (message) => warnings.push(message);
    await collectAll(repoRoot, { onWarning });
    // At least one secret-exclusion warning fired
    const mentionsEnv = warnings.some((w) => w.includes(".env"));
    expect(mentionsEnv).toBe(true);
  });
});
149
+
150
describe("isPathEligible", () => {
  // Assert, in order, that each path reports the given eligibility.
  const expectEligibility = (paths, expected) => {
    for (const candidate of paths) {
      expect(isPathEligible(candidate).eligible).toBe(expected);
    }
  };

  it("rejects hard-excluded paths", () => {
    expectEligibility(
      [".env", "config/.env.production", "server.key", ".aws/credentials"],
      false
    );
  });

  it("rejects paths inside skip dirs", () => {
    expectEligibility(
      ["node_modules/foo/index.js", "dist/bundle.js", "__pycache__/foo.pyc"],
      false
    );
  });

  it("rejects skipped extensions", () => {
    expectEligibility(["yarn.lock", "vendor.min.js", "logo.png"], false);
  });

  it("accepts normal source files", () => {
    expectEligibility(["src/index.ts", "README.md", "config/app.yaml"], true);
  });
});
@@ -0,0 +1,236 @@
1
+ /**
2
+ * Tests for the corpus ingest pipeline against a fake adapter.
3
+ *
4
+ * Covers:
5
+ * - End-to-end ingest of a fixture repo
6
+ * - Delta sync: unchanged files are skipped
7
+ * - Re-ingest: changed file's old chunks are deleted before new chunks
8
+ * are written
9
+ * - File deletion: vanished files have their chunks removed
10
+ * - State persistence: state.json reflects the ingest
11
+ * - maxChunks cap: aborts cleanly when exceeded
12
+ * - ingestPaths: handles a list of changed files (git hook path)
13
+ */
14
+
15
+ import { promises as fsp } from "node:fs";
16
+ import { join, resolve } from "node:path";
17
+ import { tmpdir } from "node:os";
18
+ import { mkdtemp, mkdir, writeFile, rm } from "node:fs/promises";
19
+
20
+ import { ingestCorpus, ingestPaths } from "../corpus/ingest.js";
21
+ import { loadState } from "../corpus/state.js";
22
+
23
/**
 * Build an in-memory stand-in for a corpus storage adapter.
 *
 * Chunks are grouped in a Map keyed by `metadata.corpus_file_key`; each
 * value is the array of `{ content, metadata }` records ingested under
 * that key. Running totals of ingested and deleted chunks are exposed
 * through `counts()`.
 *
 * @returns {{
 *   store: Map<string, Array<{content: *, metadata: object}>>,
 *   counts: () => {ingestCount: number, deleteCount: number},
 *   ingestChunk: (content: *, metadata: object) => Promise<{id: string}>,
 *   deleteByCorpusFile: (repoAbs: string, relPath: string) => Promise<number>
 * }}
 */
function makeFakeAdapter() {
  const store = new Map();
  const tally = { ingestCount: 0, deleteCount: 0 };

  return {
    store,

    // Snapshot of the counters; callers get a fresh object each time.
    counts: () => ({
      ingestCount: tally.ingestCount,
      deleteCount: tally.deleteCount,
    }),

    // Append one chunk under its file key and return a sequential id.
    async ingestChunk(content, metadata) {
      tally.ingestCount += 1;
      const fileKey = metadata.corpus_file_key;
      const bucket = store.get(fileKey) || [];
      bucket.push({ content, metadata });
      store.set(fileKey, bucket);
      return { id: `mem_${tally.ingestCount}` };
    },

    // Drop every chunk stored for `${repoAbs}::${relPath}`; resolves to
    // the number of chunks removed (0 when the key is unknown).
    async deleteByCorpusFile(repoAbs, relPath) {
      const fileKey = `${repoAbs}::${relPath}`;
      const bucket = store.get(fileKey);
      if (!bucket) {
        return 0;
      }
      const removed = bucket.length;
      tally.deleteCount += removed;
      store.delete(fileKey);
      return removed;
    },
  };
}
49
+
50
/**
 * Create a minimal temp repository: a README, two TypeScript files under
 * src/, and a .env file that the ingest pipeline is expected to exclude.
 *
 * @returns {Promise<string>} absolute path of the new repository root
 */
async function makeRepo() {
  const root = await mkdtemp(join(tmpdir(), "tes-corpus-"));
  await mkdir(join(root, "src"), { recursive: true });

  const files = [
    [["README.md"], "# Project\n\nIntro paragraph."],
    [["src", "index.ts"], "export const a = 1;\n"],
    [["src", "util.ts"], "export const b = 2;\n"],
    // Should be excluded
    [[".env"], "SECRET=do_not_ingest\n"],
  ];
  for (const [segments, body] of files) {
    await writeFile(join(root, ...segments), body);
  }

  return root;
}
60
+
61
/**
 * Allocate a fresh temp directory and return the path of a (not yet
 * created) state file inside it.
 *
 * The state file must live OUTSIDE the repo being scanned; otherwise it
 * would be re-ingested as a "new file" on subsequent runs.
 *
 * @returns {Promise<string>} path ending in "corpus.json"
 */
async function isolatedStatePath() {
  const prefix = join(tmpdir(), "tes-corpus-state-");
  const stateDir = await mkdtemp(prefix);
  return join(stateDir, "corpus.json");
}
67
+
68
describe("ingestCorpus", () => {
  let repo;
  let statePath;

  // Each test gets its own fixture repo and its own state file location.
  beforeEach(async () => {
    repo = await makeRepo();
    statePath = await isolatedStatePath();
  });

  afterEach(async () => {
    await rm(repo, { recursive: true, force: true });
    // isolatedStatePath() creates a dedicated temp directory for the
    // state file; remove it too so test runs do not accumulate
    // directories under the OS temp dir.
    if (statePath) {
      await rm(resolve(statePath, ".."), { recursive: true, force: true });
    }
  });

  it("ingests all eligible files on first run", async () => {
    const adapter = makeFakeAdapter();
    const totals = await ingestCorpus(adapter, repo, { statePath });
    expect(totals.filesIngested).toBe(3); // README.md + 2 .ts
    expect(totals.chunksCreated).toBeGreaterThanOrEqual(3);
    expect(totals.bytesProcessed).toBeGreaterThan(0);
    // .env never made it
    for (const [key] of adapter.store) {
      expect(key).not.toContain(".env");
    }
  });

  it("writes state with stats and file hashes", async () => {
    const adapter = makeFakeAdapter();
    await ingestCorpus(adapter, repo, { statePath });
    const state = await loadState(statePath);
    // Sources are looked up by the resolved absolute repo path.
    const src = state.sources[resolve(repo)];
    expect(src).toBeDefined();
    expect(src.stats.fileCount).toBe(3);
    expect(src.stats.chunkCount).toBeGreaterThanOrEqual(3);
    expect(src.files["README.md"].hash).toMatch(/^[0-9a-f]{64}$/);
    expect(src.lastSyncedAt).toBeTruthy();
  });

  it("skips unchanged files on re-ingest (delta sync)", async () => {
    const adapter = makeFakeAdapter();
    await ingestCorpus(adapter, repo, { statePath });
    const firstIngestCount = adapter.counts().ingestCount;

    const totals = await ingestCorpus(adapter, repo, { statePath });
    expect(totals.filesIngested).toBe(0);
    expect(totals.filesSkipped).toBe(3);
    expect(adapter.counts().ingestCount).toBe(firstIngestCount); // no new ingests
  });

  it("re-ingests a changed file and removes its old chunks first", async () => {
    const adapter = makeFakeAdapter();
    await ingestCorpus(adapter, repo, { statePath });
    const beforeDeletes = adapter.counts().deleteCount;

    await writeFile(
      join(repo, "src", "index.ts"),
      "export const a = 999; // changed\n"
    );

    const totals = await ingestCorpus(adapter, repo, { statePath });
    expect(totals.filesIngested).toBe(1);
    expect(totals.filesSkipped).toBe(2);
    // The changed file's chunks were deleted before new ones inserted
    expect(adapter.counts().deleteCount).toBeGreaterThan(beforeDeletes);
  });

  it("removes chunks for files that vanish from disk", async () => {
    const adapter = makeFakeAdapter();
    await ingestCorpus(adapter, repo, { statePath });

    await rm(join(repo, "src", "util.ts"));

    const beforeDeletes = adapter.counts().deleteCount;
    await ingestCorpus(adapter, repo, { statePath });
    expect(adapter.counts().deleteCount).toBeGreaterThan(beforeDeletes);

    const state = await loadState(statePath);
    const src = state.sources[resolve(repo)];
    expect(src.files["src/util.ts"]).toBeUndefined();
    expect(src.stats.fileCount).toBe(2);
  });

  it("aborts cleanly when maxChunks would be exceeded", async () => {
    const adapter = makeFakeAdapter();
    // Force concurrency=1 so the cap check is deterministic. With
    // higher concurrency the cap is "soft" — we may overshoot by up
    // to (concurrency-1) chunks before the abort propagates. That's
    // documented behavior; we test the deterministic path here.
    await expect(
      ingestCorpus(adapter, repo, { statePath, maxChunks: 1, concurrency: 1 })
    ).rejects.toThrow(/maxChunks/);
    expect(adapter.counts().ingestCount).toBe(1);
  });

  it("attaches source_file metadata to every chunk", async () => {
    const adapter = makeFakeAdapter();
    await ingestCorpus(adapter, repo, { statePath });
    for (const [key, chunks] of adapter.store) {
      for (const c of chunks) {
        expect(c.metadata.source_file).toBeTruthy();
        expect(c.metadata.source_repo).toBeTruthy();
        expect(c.metadata.source_file_hash).toMatch(/^[0-9a-f]{64}$/);
        expect(c.metadata.corpus_file_key).toBe(key);
      }
    }
  });

  it("propagates onWarning for hard-excluded secret files", async () => {
    const adapter = makeFakeAdapter();
    const warnings = [];
    await ingestCorpus(adapter, repo, {
      statePath,
      onWarning: (m) => warnings.push(m),
    });
    expect(warnings.some((w) => w.includes(".env"))).toBe(true);
  });
});
184
+
185
describe("ingestPaths (git hook fast path)", () => {
  let repo;
  let statePath;

  // Each test gets its own fixture repo and its own state file location.
  beforeEach(async () => {
    repo = await makeRepo();
    statePath = await isolatedStatePath();
  });

  afterEach(async () => {
    await rm(repo, { recursive: true, force: true });
    // isolatedStatePath() creates a dedicated temp directory for the
    // state file; remove it too so test runs do not accumulate
    // directories under the OS temp dir.
    if (statePath) {
      await rm(resolve(statePath, ".."), { recursive: true, force: true });
    }
  });

  it("ingests just the listed paths", async () => {
    const adapter = makeFakeAdapter();
    const totals = await ingestPaths(
      adapter,
      repo,
      ["src/index.ts", "README.md"],
      { statePath }
    );
    expect(totals.filesIngested).toBe(2);
    // src/util.ts was NOT ingested
    for (const [key] of adapter.store) {
      expect(key).not.toContain("util.ts");
    }
  });

  it("skips ineligible paths (secrets, lockfiles)", async () => {
    const adapter = makeFakeAdapter();
    await writeFile(join(repo, "yarn.lock"), "lockfile content\n");
    const totals = await ingestPaths(
      adapter,
      repo,
      [".env", "yarn.lock", "src/index.ts"],
      { statePath }
    );
    expect(totals.filesIngested).toBe(1);
    expect(totals.filesSkipped).toBe(2);
  });

  it("removes chunks for paths that no longer exist", async () => {
    const adapter = makeFakeAdapter();
    // First seed state by ingesting normally
    await ingestCorpus(adapter, repo, { statePath });
    // Then delete a file and run ingestPaths on it
    await rm(join(repo, "src", "util.ts"));
    const beforeDeletes = adapter.counts().deleteCount;
    await ingestPaths(adapter, repo, ["src/util.ts"], { statePath });
    expect(adapter.counts().deleteCount).toBeGreaterThan(beforeDeletes);
  });
});