@pentatonic-ai/ai-agent-sdk 0.5.11 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. package/README.md +345 -174
  2. package/bin/__tests__/callback-server.test.js +70 -0
  3. package/bin/__tests__/credentials.test.js +58 -0
  4. package/bin/__tests__/login.test.js +210 -0
  5. package/bin/__tests__/pkce.test.js +39 -0
  6. package/bin/__tests__/whoami.test.js +77 -0
  7. package/bin/cli.js +109 -440
  8. package/bin/commands/config.js +251 -0
  9. package/bin/commands/login.js +219 -0
  10. package/bin/commands/whoami.js +41 -0
  11. package/bin/lib/callback-server.js +137 -0
  12. package/bin/lib/credentials.js +100 -0
  13. package/bin/lib/pkce.js +26 -0
  14. package/package.json +4 -2
  15. package/packages/doctor/__tests__/detect.test.js +2 -6
  16. package/packages/doctor/src/checks/local-memory.js +164 -196
  17. package/packages/doctor/src/detect.js +11 -3
  18. package/packages/memory/src/__tests__/corpus-chunkers.test.js +143 -0
  19. package/packages/memory/src/__tests__/corpus-discover.test.js +175 -0
  20. package/packages/memory/src/__tests__/corpus-ingest.test.js +236 -0
  21. package/packages/memory/src/__tests__/corpus-signatures.test.js +175 -0
  22. package/packages/memory/src/__tests__/corpus-state.test.js +161 -0
  23. package/packages/memory/src/__tests__/ingest-corpus-opts.test.js +129 -0
  24. package/packages/memory/src/__tests__/search-kind.test.js +108 -0
  25. package/packages/memory/src/corpus/adapters.js +398 -0
  26. package/packages/memory/src/corpus/chunkers.js +328 -0
  27. package/packages/memory/src/corpus/cli.js +613 -0
  28. package/packages/memory/src/corpus/discover.js +379 -0
  29. package/packages/memory/src/corpus/index.js +68 -0
  30. package/packages/memory/src/corpus/ingest.js +356 -0
  31. package/packages/memory/src/corpus/signatures.js +280 -0
  32. package/packages/memory/src/corpus/state.js +134 -0
  33. package/packages/memory/src/index.js +18 -0
  34. package/packages/memory/src/ingest.js +20 -11
  35. package/packages/memory/src/openclaw/index.js +39 -1
  36. package/packages/memory/src/search.js +30 -7
  37. package/packages/memory-engine/.env.example +13 -0
  38. package/packages/memory-engine/README.md +131 -0
  39. package/packages/memory-engine/bench/README.md +99 -0
  40. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  41. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  42. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  43. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  44. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  45. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  49. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  50. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  51. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  60. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  61. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  62. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  63. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  66. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  67. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  68. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  69. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  78. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  79. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  80. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  81. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  82. package/packages/memory-engine/compat/Dockerfile +11 -0
  83. package/packages/memory-engine/compat/server.py +680 -0
  84. package/packages/memory-engine/docker-compose.yml +243 -0
  85. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  86. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  87. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  88. package/packages/memory-engine/engine/README.md +52 -0
  89. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  90. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  91. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  92. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  93. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  94. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  95. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  96. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  97. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  98. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  99. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  100. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  101. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  102. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  103. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  104. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  105. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  106. package/packages/memory-engine/pme_memory/embed.py +74 -0
  107. package/packages/memory-engine/pme_memory/health.py +36 -0
  108. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  109. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  110. package/packages/memory-engine/pme_memory/needs.py +55 -0
  111. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  112. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  113. package/packages/memory-engine/pme_memory/search.py +52 -0
  114. package/packages/memory-engine/pme_memory/store.py +86 -0
  115. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  116. package/packages/memory-engine/pyproject.toml +65 -0
  117. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  118. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  119. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,356 @@
1
+ /**
2
+ * Corpus ingest pipeline.
3
+ *
4
+ * Takes a discovered file → chunks it → ingests each chunk via an
5
+ * adapter. Two adapters ship: `localAdapter` (writes via the existing
6
+ * memory.ingest path against a Pool) and `hostedAdapter` (emits
7
+ * STORE_MEMORY events via the existing TES createModuleEvent mutation —
8
+ * no new server-side schema needed).
9
+ *
10
+ * This module knows nothing about HTTP or pg directly — adapters
11
+ * encapsulate that. Keeps it testable and lets us add e.g. a Cloudflare
12
+ * Worker adapter later without changing the pipeline.
13
+ */
14
+
15
+ import { chunkFile } from "./chunkers.js";
16
+ import { extractReferences } from "./signatures.js";
17
+ import {
18
+ loadState,
19
+ saveState,
20
+ upsertSource,
21
+ recordFile,
22
+ forgetFile,
23
+ recomputeStats,
24
+ } from "./state.js";
25
+ import { discover } from "./discover.js";
26
+ import { resolve, basename } from "node:path";
27
+
28
+ /**
29
+ * Adapter contract:
30
+ * adapter.ingestChunk(content, metadata) → Promise<{ id, skipped? }>
31
+ * adapter.deleteByCorpusFile(repoAbs, relPath) → Promise<number> // chunks removed
32
+ *
33
+ * Both methods MUST be safe to call repeatedly with the same args
34
+ * (idempotent on adapter side or via metadata content_hash).
35
+ */
36
+
37
/**
 * Ingest a single repo end-to-end. Walks the tree, chunks each file,
 * sends chunks through the adapter, and updates corpus state.
 *
 * Phases:
 *   1. Discovery — every file yielded by discover() is queued.
 *   2. Ingest — `concurrency` workers pull from the queue. Files whose
 *      hash matches corpus state are skipped; changed files have their
 *      old chunks deleted before the new ones are written.
 *   3. Deletion sweep — files recorded in state that discovery no
 *      longer returns have their chunks deleted and are forgotten.
 *
 * State is saved before a maxChunks abort is re-thrown, so work already
 * done is not repeated by the next run.
 *
 * Caller controls concurrency by setting `opts.concurrency` (default 4
 * — small enough not to swamp local Ollama, large enough to amortize
 * hosted GraphQL round trips).
 *
 * @param {object} adapter - { ingestChunk, deleteByCorpusFile }
 * @param {string} repoPath - Path to the repo (will be resolved to abs)
 * @param {object} [opts]
 * @param {number} [opts.concurrency=4] - Worker count; values that are
 *   not a number >= 1 fall back to the default.
 * @param {number} [opts.maxChunks=100000] - Hard cap per ingest run; abort if exceeded
 * @param {string} [opts.statePath] - Override state file path (tests)
 * @param {Function} [opts.onProgress] - ({phase, processed, total, file}) => void
 * @param {Function} [opts.onWarning] - (msg) => void
 * @param {string} [opts.sourceUrl] - Optional git remote URL to record
 * @param {object} [opts.discoverOpts] - Forwarded to discover()
 * @param {"references"|"content"} [opts.mode="references"] - Storage mode.
 *   "references" (default): store path + signature pointers; the agent
 *     reads source files at query time. Stale source = `Read` fails →
 *     loud, self-correcting failure mode.
 *   "content": store full chunk content (legacy behaviour). Stale chunks
 *     silently mislead retrieval until re-ingested. Kept for callers who
 *     explicitly want a self-contained index.
 * @returns {Promise<{filesProcessed, filesIngested, filesSkipped, chunksCreated, bytesProcessed}>}
 * @throws {Error} When the maxChunks cap stops the run (after state is saved).
 */
export async function ingestCorpus(adapter, repoPath, opts = {}) {
  const repoAbs = resolve(repoPath);
  const maxChunks = opts.maxChunks ?? 100000;
  const onProgress = opts.onProgress || (() => {});
  const onWarning = opts.onWarning || (() => {});

  // A negative or fractional value used to produce zero workers, which
  // left every file unvisited. Anything that is not >= 1 uses the default.
  const requestedConcurrency = Math.floor(Number(opts.concurrency));
  const concurrency = requestedConcurrency >= 1 ? requestedConcurrency : 4;

  // Default mode is "references" — store pointers, not chunks. Set
  // mode: "content" to opt back into the chunk-content behaviour for
  // callers who explicitly want a self-contained index (e.g. air-gapped
  // retrieval where the source isn't readable at query time).
  const mode = opts.mode === "content" ? "content" : "references";
  const extract = mode === "content" ? chunkFile : extractReferences;

  const state = await loadState(opts.statePath);
  const source = upsertSource(state, repoAbs, {
    sourceType: "directory",
    sourceUrl: opts.sourceUrl ?? null,
  });

  const totals = {
    filesProcessed: 0,
    filesIngested: 0,
    filesSkipped: 0,
    chunksCreated: 0,
    bytesProcessed: 0,
  };

  // Phase 1: discovery (counts files for progress)
  const queue = [];
  for await (const file of discover(repoAbs, {
    ...(opts.discoverOpts || {}),
    onWarning,
  })) {
    queue.push(file);
  }
  onProgress({ phase: "discovered", total: queue.length });

  // Everything discovery returned still exists on disk, whether or not a
  // worker gets to it. Building the set here (instead of as files are
  // visited) keeps the deletion sweep from wiping files that were simply
  // not reached because the run aborted on the maxChunks cap.
  const stillSeen = new Set(queue.map((file) => file.relPath));

  // Phase 2: ingest with bounded concurrency
  let cursor = 0;
  let aborted = null;

  async function worker() {
    while (cursor < queue.length && !aborted) {
      const file = queue[cursor++];
      totals.filesProcessed++;

      const prev = source.files[file.relPath];
      if (prev && prev.hash === file.hash) {
        totals.filesSkipped++;
        onProgress({
          phase: "file",
          processed: totals.filesProcessed,
          total: queue.length,
          file: file.relPath,
          status: "unchanged",
        });
        continue;
      }

      // Check the cap before touching the adapter, so a file that will
      // not be ingested does not lose its existing chunks first.
      const chunks = extract(file);
      if (totals.chunksCreated + chunks.length > maxChunks) {
        aborted = new Error(
          `corpus: maxChunks (${maxChunks}) exceeded — stopped at ${file.relPath}`
        );
        break;
      }

      // Changed file — drop existing chunks before re-ingesting
      if (prev) {
        try {
          await adapter.deleteByCorpusFile(repoAbs, file.relPath);
        } catch (err) {
          onWarning(
            `corpus: failed to delete stale chunks for ${file.relPath}: ${err.message}`
          );
        }
      }

      let chunksCreatedHere = 0;
      let truncated = false;
      for (const chunk of chunks) {
        // Per-chunk cap check: other workers may have advanced
        // totals.chunksCreated while this one was awaiting the adapter.
        if (totals.chunksCreated >= maxChunks) {
          aborted = new Error(
            `corpus: maxChunks (${maxChunks}) reached — stopped at ${file.relPath}`
          );
          truncated = true;
          break;
        }
        const metadata = {
          ...chunk.metadata,
          source_repo: repoAbs,
          source_repo_name: basename(repoAbs),
          source_file: file.relPath,
          source_file_hash: file.hash,
          corpus_file_key: `${repoAbs}::${file.relPath}`,
        };
        try {
          const result = await adapter.ingestChunk(chunk.content, metadata);
          if (!result?.skipped) {
            chunksCreatedHere++;
            totals.chunksCreated++;
          }
        } catch (err) {
          onWarning(
            `corpus: ingest failed for ${file.relPath} chunk ${chunk.metadata.chunk_index}: ${err.message}`
          );
        }
      }

      // A file cut short by the cap is not recorded: storing its new hash
      // would make the next run skip it and leave it partially indexed.
      if (chunksCreatedHere > 0 && !truncated) {
        recordFile(source, file.relPath, file.hash, chunksCreatedHere);
        totals.filesIngested++;
        totals.bytesProcessed += file.size;
      }
      if (aborted) break;

      onProgress({
        phase: "file",
        processed: totals.filesProcessed,
        total: queue.length,
        file: file.relPath,
        status: prev ? "updated" : "ingested",
        chunks: chunksCreatedHere,
      });
    }
  }

  // No point starting more workers than there are files.
  const workerCount = Math.min(concurrency, queue.length);
  const workers = Array.from({ length: workerCount }, () => worker());
  await Promise.all(workers);

  // Phase 3: detect deletions — files in state but no longer on disk
  const removed = [];
  for (const relPath of Object.keys(source.files)) {
    if (!stillSeen.has(relPath)) {
      try {
        await adapter.deleteByCorpusFile(repoAbs, relPath);
        forgetFile(source, relPath);
        removed.push(relPath);
      } catch (err) {
        onWarning(
          `corpus: failed to delete chunks for vanished ${relPath}: ${err.message}`
        );
      }
    }
  }
  if (removed.length) {
    onProgress({ phase: "removed", count: removed.length });
  }

  source.lastSyncedAt = new Date().toISOString();
  recomputeStats(source);
  await saveState(state, opts.statePath);

  if (aborted) throw aborted;

  return totals;
}
226
+
227
/**
 * Delta-sync a repo that has been ingested before.
 *
 * Delegates straight to ingestCorpus with the same arguments; it exists
 * so the CLI can expose a separate "resync" verb. Files whose content
 * hash matches state are skipped and chunks for removed files are
 * deleted, exactly as in a normal ingest run.
 *
 * @param {object} adapter - { ingestChunk, deleteByCorpusFile }
 * @param {string} repoPath - Path to the repo
 * @param {object} [opts] - Same options as ingestCorpus
 * @returns {Promise<object>} The totals object produced by ingestCorpus
 */
export async function syncCorpus(adapter, repoPath, opts = {}) {
  const totals = await ingestCorpus(adapter, repoPath, opts);
  return totals;
}
235
+
236
/**
 * Ingest a specific list of files (e.g. those changed in a git commit).
 * Cheaper than walking the whole tree.
 *
 * For each path: a file missing on disk has its chunks deleted and is
 * forgotten; an ineligible, binary (contains NUL) or unchanged file is
 * counted as skipped; anything else has its stale chunks dropped and is
 * re-ingested through the adapter. State is saved once at the end.
 *
 * @param {object} adapter - { ingestChunk, deleteByCorpusFile }
 * @param {string} repoPath - Path to the repo (will be resolved to abs)
 * @param {string[]} relPaths - Paths relative to repoPath
 * @param {object} [opts]
 * @param {string} [opts.statePath] - Override state file path (tests)
 * @param {Function} [opts.onWarning] - (msg) => void
 * @param {"references"|"content"} [opts.mode] - Pass "references" to
 *   store pointers the same way ingestCorpus does by default. When
 *   omitted, full chunk content is stored, as this function has always
 *   done. NOTE(review): that default differs from ingestCorpus —
 *   callers that ingested a repo in references mode should pass
 *   mode: "references" here too.
 * @returns {Promise<{filesProcessed, filesIngested, filesSkipped, chunksCreated, bytesProcessed}>}
 */
export async function ingestPaths(adapter, repoPath, relPaths, opts = {}) {
  const repoAbs = resolve(repoPath);
  const onWarning = opts.onWarning || (() => {});
  const extract = opts.mode === "references" ? extractReferences : chunkFile;
  const state = await loadState(opts.statePath);
  const source = upsertSource(state, repoAbs, {});

  const totals = {
    filesProcessed: 0,
    filesIngested: 0,
    filesSkipped: 0,
    chunksCreated: 0,
    bytesProcessed: 0,
  };

  const { join } = await import("node:path");
  const { promises: fsp, existsSync } = await import("node:fs");
  const { createHash } = await import("node:crypto");
  const { isPathEligible } = await import("./discover.js");

  for (const relPath of relPaths) {
    totals.filesProcessed++;
    const eligible = isPathEligible(relPath);
    const fullPath = join(repoAbs, relPath);

    // File deleted on disk — drop its chunks
    if (!existsSync(fullPath)) {
      try {
        await adapter.deleteByCorpusFile(repoAbs, relPath);
        forgetFile(source, relPath);
      } catch (err) {
        onWarning(`corpus: cleanup failed for ${relPath}: ${err.message}`);
      }
      continue;
    }

    if (!eligible.eligible) {
      totals.filesSkipped++;
      continue;
    }

    let content;
    try {
      content = await fsp.readFile(fullPath, "utf-8");
    } catch (err) {
      onWarning(`corpus: cannot read ${relPath}: ${err.message}`);
      continue;
    }
    // A NUL character means the file is binary, not text.
    if (content.includes("\0")) {
      totals.filesSkipped++;
      continue;
    }

    const hash = createHash("sha256").update(content).digest("hex");
    const prev = source.files[relPath];
    if (prev && prev.hash === hash) {
      totals.filesSkipped++;
      continue;
    }

    if (prev) {
      try {
        await adapter.deleteByCorpusFile(repoAbs, relPath);
      } catch (err) {
        onWarning(
          `corpus: failed to delete stale chunks for ${relPath}: ${err.message}`
        );
      }
    }

    // Take the extension from the file name only. Splitting the whole
    // path on "." turned "dir.v2/Makefile" into ".v2/makefile".
    const fileName = relPath.split(/[\\/]/).pop();
    const dotIndex = fileName.lastIndexOf(".");
    const ext = dotIndex === -1 ? "" : fileName.slice(dotIndex).toLowerCase();

    const chunks = extract({
      relPath,
      content,
      ext,
      basename: fileName,
    });

    let chunksCreatedHere = 0;
    for (const chunk of chunks) {
      const metadata = {
        ...chunk.metadata,
        source_repo: repoAbs,
        source_repo_name: basename(repoAbs),
        source_file: relPath,
        source_file_hash: hash,
        corpus_file_key: `${repoAbs}::${relPath}`,
      };
      try {
        const result = await adapter.ingestChunk(chunk.content, metadata);
        if (!result?.skipped) chunksCreatedHere++;
      } catch (err) {
        onWarning(`corpus: ingest failed for ${relPath}: ${err.message}`);
      }
    }

    if (chunksCreatedHere > 0) {
      recordFile(source, relPath, hash, chunksCreatedHere);
      totals.filesIngested++;
      totals.chunksCreated += chunksCreatedHere;
      // Count bytes, not UTF-16 code units: content.length under-reports
      // any file containing non-ASCII text.
      totals.bytesProcessed += Buffer.byteLength(content, "utf8");
    }
  }

  source.lastSyncedAt = new Date().toISOString();
  recomputeStats(source);
  await saveState(state, opts.statePath);

  return totals;
}
@@ -0,0 +1,280 @@
1
+ /**
2
+ * Signature / reference extraction for corpus ingest.
3
+ *
4
+ * The default mode for corpus ingest. Stores POINTERS to source content
5
+ * (path + line range + a short summary) rather than full chunk content.
6
+ *
7
+ * Why pointers instead of content?
8
+ *
9
+ * 1. Code rots. The repo on disk is the source of truth; an embedded
10
+ * chunk goes stale silently the moment a file is edited. Pointers
11
+ * "rot loudly" — a `Read` of a moved/deleted/changed file is a
12
+ * signal the LLM observes and adjusts to.
13
+ *
14
+ * 2. Privacy. Pentatonic-hosted retrieval needs only the signature
15
+ * and path, not the full source. Customer code stays on the
16
+ * customer's machine; only the index leaves.
17
+ *
18
+ * 3. Index size. ~50–200 chars per reference vs ~500–2000 per chunk.
19
+ * A 12k-chunk repo becomes a ~3k-reference index, often smaller.
20
+ *
21
+ * The extractor is regex-based, not AST. That means it covers ~80% of
22
+ * common cases and falls back to a single file-level reference for
23
+ * anything it doesn't recognise. The walker (discover.js) already
24
+ * filters out generated/binary noise, so the inputs here are
25
+ * generally well-formed text.
26
+ *
27
+ * Per-language strategies:
28
+ * - Markdown: per-section reference (`## Heading` + first paragraph)
29
+ * - JS/TS: per top-level `function`/`class`/`export` definition
30
+ * - Python: per `def`/`class` at top level (indent-aware)
31
+ * - JSON/YAML: top-level keys as a single reference
32
+ * - Other: single file-level reference
33
+ *
34
+ * Each reference shape:
35
+ * {
36
+ * content, // text that gets embedded (signature + brief body)
37
+ * metadata: {
38
+ * kind: 'code_reference',
39
+ * path, // relative path within the repo
40
+ * symbol?, // e.g. function/class name, when extractable
41
+ * start_line, // 1-indexed
42
+ * end_line, // 1-indexed
43
+ * language, // 'markdown' | 'javascript' | 'typescript' | 'python' | 'json' | 'yaml' | 'text'
44
+ * lines, // "<start>-<end>" — convenience for display
45
+ * },
46
+ * }
47
+ */
48
+
49
// File extensions (lower-case, with leading dot) grouped by language.
const MD_EXTS = new Set([".md", ".mdx", ".markdown"]);
const JS_EXTS = new Set([".js", ".jsx", ".mjs", ".cjs"]);
const TS_EXTS = new Set([".ts", ".tsx"]);
const PY_EXTS = new Set([".py"]);
const JSON_EXTS = new Set([".json"]);
const YAML_EXTS = new Set([".yaml", ".yml"]);

/**
 * Map a file extension to the language label used in reference metadata.
 *
 * @param {string} ext - Extension including the leading dot, e.g. ".ts"
 * @returns {string} One of 'markdown', 'javascript', 'typescript',
 *   'python', 'json', 'yaml', or 'text' when the extension is unknown
 */
function languageOf(ext) {
  const groups = [
    ["markdown", MD_EXTS],
    ["javascript", JS_EXTS],
    ["typescript", TS_EXTS],
    ["python", PY_EXTS],
    ["json", JSON_EXTS],
    ["yaml", YAML_EXTS],
  ];
  const hit = groups.find(([, extensions]) => extensions.has(ext));
  return hit ? hit[0] : "text";
}
65
+
66
/**
 * Build one reference record.
 *
 * The returned metadata starts from `kind: "code_reference"`, is then
 * overlaid with the caller's metadata (so a caller-supplied `kind`
 * wins), and finally gets `lines` set to "<start_line>-<end_line>",
 * replacing any `lines` value the caller passed.
 *
 * @param {string} content - Text that gets embedded
 * @param {object} metadata - Must carry start_line and end_line
 * @returns {{content: string, metadata: object}}
 */
function ref(content, metadata) {
  const { start_line: startLine, end_line: endLine } = metadata;
  const merged = Object.assign({ kind: "code_reference" }, metadata, {
    lines: `${startLine}-${endLine}`,
  });
  return { content, metadata: merged };
}
76
+
77
/**
 * Markdown: one reference per H1/H2 section. Body of the reference is
 * the heading + the first few non-empty lines under it (up to four
 * lines, trimmed to 240 characters).
 *
 * Lines inside fenced code blocks (``` or ~~~) are never treated as
 * headings, so a `# comment` in a shell snippet does not split the
 * section. A file without any H1/H2 heading yields a single file-level
 * reference.
 *
 * @param {{relPath: string, content: string}} file
 * @returns {Array<{content: string, metadata: object}>}
 */
function extractMarkdownReferences(file) {
  const lines = file.content.split(/\r?\n/);
  // A trailing newline leaves an empty last element that is not a real
  // line; leave it out so end_line never points past the end of the file.
  const lineCount =
    lines.length > 1 && lines[lines.length - 1] === ""
      ? lines.length - 1
      : lines.length;
  const refs = [];
  let currentHeading = null;
  let currentStart = 1;
  let currentBody = [];
  // Marker that opened the current fenced block, or null outside one.
  let openFence = null;

  function flush(endLine) {
    if (!currentHeading) return;
    const summary = `${currentHeading}\n\n${currentBody.slice(0, 4).join(" ").trim().slice(0, 240)}`;
    refs.push(
      ref(summary, {
        path: file.relPath,
        symbol: currentHeading.replace(/^#+\s*/, "").trim(),
        start_line: currentStart,
        end_line: endLine,
        language: "markdown",
      })
    );
  }

  for (let i = 0; i < lineCount; i++) {
    const line = lines[i];
    const trimmed = line.trim();

    // A backtick fence cannot have backticks after the marker; such a
    // line is inline code, not a fence.
    const fence = /^(`{3,}|~{3,})(.*)$/.exec(trimmed);
    const isFenceLine =
      fence !== null && !(fence[1][0] === "`" && fence[2].includes("`"));
    if (isFenceLine) {
      const [, marker, info] = fence;
      if (openFence === null) {
        openFence = marker;
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        info.trim() === ""
      ) {
        openFence = null;
      }
      continue; // fence delimiters are never part of the summary
    }

    if (openFence === null && /^(#{1,2})\s+(.+?)\s*$/.test(line)) {
      flush(i); // close previous section at the line above
      currentHeading = trimmed;
      currentStart = i + 1;
      currentBody = [];
      continue;
    }

    if (currentHeading && trimmed) currentBody.push(trimmed);
  }
  flush(lineCount);

  // No headings? Fall back to a single file-level reference.
  if (refs.length === 0) {
    refs.push(fileLevelReference(file, "markdown"));
  }
  return refs;
}
123
+
124
/**
 * JS / TS: per-top-level-symbol references. Regex over `function`
 * (including generators), `class`, and `const|let|var X =`
 * declarations that start at column 0, each optionally prefixed with
 * `export`, `export default` and/or `async`.
 * Not perfect (no AST), but catches the vast majority of public-API
 * surface. A file with no match yields a single file-level reference.
 *
 * Each reference covers the declaration line plus up to seven
 * following lines; no attempt is made to find the end of the body.
 *
 * @param {{relPath: string, content: string}} file
 * @param {string} language - 'javascript' or 'typescript', copied into metadata
 * @returns {Array<{content: string, metadata: object}>}
 */
function extractJsLikeReferences(file, language) {
  const lines = file.content.split(/\r?\n/);
  // A trailing newline leaves an empty last element that is not a real
  // line; leave it out so end_line never points past the end of the file.
  const lineCount =
    lines.length > 1 && lines[lines.length - 1] === ""
      ? lines.length - 1
      : lines.length;
  const refs = [];
  // Top-level only — anchored at start of line. After `function` either
  // a generator star with any spacing around it (`function* g`,
  // `function *g`, `function*g`) or plain whitespace must follow, so an
  // identifier such as `functionCall` is not mistaken for a declaration.
  const decl =
    /^(?:export\s+(?:default\s+)?)?(?:async\s+)?(?:function(?:\s*\*\s*|\s+)([A-Za-z_$][\w$]*)|class\s+([A-Za-z_$][\w$]*)|(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=)/;

  for (let i = 0; i < lineCount; i++) {
    const m = lines[i].match(decl);
    if (!m) continue;
    const symbol = m[1] || m[2] || m[3];
    if (!symbol) continue;

    // Fixed window: the declaration line and up to seven lines after it.
    const start = i + 1;
    const endIdx = Math.min(i + 8, lineCount);
    const snippet = lines.slice(i, endIdx).join("\n");
    const summary = `${file.relPath}:${start} — ${symbol}\n${snippet}`.slice(
      0,
      400
    );

    refs.push(
      ref(summary, {
        path: file.relPath,
        symbol,
        start_line: start,
        end_line: endIdx,
        language,
      })
    );
  }

  if (refs.length === 0) {
    refs.push(fileLevelReference(file, language));
  }
  return refs;
}
170
+
171
/**
 * Python: per top-level `def`, `async def` or `class`. Indent 0 only —
 * methods of a class are not separately yielded; the class entry covers
 * them. A file with no match yields a single file-level reference.
 *
 * Each reference covers the declaration line plus up to seven
 * following lines.
 *
 * @param {{relPath: string, content: string}} file
 * @returns {Array<{content: string, metadata: object}>}
 */
function extractPythonReferences(file) {
  const lines = file.content.split(/\r?\n/);
  // A trailing newline leaves an empty last element that is not a real
  // line; leave it out so end_line never points past the end of the file.
  const lineCount =
    lines.length > 1 && lines[lines.length - 1] === ""
      ? lines.length - 1
      : lines.length;
  const refs = [];
  // `async def` is listed explicitly: coroutines are top-level API too.
  const decl = /^(?:async\s+def|def|class)\s+([A-Za-z_][\w]*)/;

  for (let i = 0; i < lineCount; i++) {
    const m = lines[i].match(decl);
    if (!m) continue;
    const symbol = m[1];
    const start = i + 1;
    const endIdx = Math.min(i + 8, lineCount);
    const snippet = lines.slice(i, endIdx).join("\n");
    const summary = `${file.relPath}:${start} — ${symbol}\n${snippet}`.slice(
      0,
      400
    );
    refs.push(
      ref(summary, {
        path: file.relPath,
        symbol,
        start_line: start,
        end_line: endIdx,
        language: "python",
      })
    );
  }
  if (refs.length === 0) refs.push(fileLevelReference(file, "python"));
  return refs;
}
204
+
205
/**
 * JSON / YAML: collapse to a single reference whose body is the
 * top-level keys. Useful as "this config exists and contains X, Y, Z";
 * the agent reads the file for the actual values.
 *
 * JSON is parsed so the keys are the real top-level keys regardless of
 * indentation or minification. If parsing fails (for example a config
 * with comments or trailing commas), keys are taken from lines shaped
 * like `"key":` with at most four characters of leading whitespace.
 * YAML keys are taken from lines shaped like `key:` at column 0.
 * At most the first 12 keys appear in the summary.
 *
 * @param {{relPath: string, content: string}} file
 * @param {string} language - 'json' or 'yaml', copied into metadata
 * @returns {Array<{content: string, metadata: object}>} Always one element
 */
function extractConfigReferences(file, language) {
  const lines = file.content.split(/\r?\n/);
  // A trailing newline leaves an empty last element that is not a real
  // line; leave it out so end_line never points past the end of the file.
  const lineCount =
    lines.length > 1 && lines[lines.length - 1] === ""
      ? lines.length - 1
      : lines.length;
  let keys = [];

  if (language === "json") {
    let parsed;
    let parsedOk = false;
    try {
      parsed = JSON.parse(file.content);
      parsedOk = true;
    } catch {
      // Not strict JSON — use the line-based heuristic below.
    }

    if (parsedOk) {
      // Only a plain object has named top-level keys; arrays and
      // scalars contribute none.
      const isPlainObject =
        parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
      keys = isPlainObject ? Object.keys(parsed) : [];
    } else {
      const seen = new Set();
      for (const line of lines) {
        const m = line.match(/^\s{0,4}"([^"]+)"\s*:/);
        if (m && !seen.has(m[1])) {
          seen.add(m[1]);
          keys.push(m[1]);
        }
      }
    }
  } else {
    // YAML: top-level keys appear at indent 0.
    for (const line of lines) {
      const m = line.match(/^([A-Za-z_][\w-]*)\s*:/);
      if (m) keys.push(m[1]);
    }
    keys = [...new Set(keys)];
  }

  const summary = `${file.relPath} — top-level keys: ${keys.slice(0, 12).join(", ") || "(none extracted)"}`;
  return [
    ref(summary, {
      path: file.relPath,
      start_line: 1,
      end_line: lineCount,
      language,
    }),
  ];
}
243
+
244
/**
 * Fallback: a single reference that covers the whole file. The
 * embedded text is the relative path followed by the first 240
 * characters of the file with runs of whitespace collapsed to one space.
 *
 * @param {{relPath: string, content: string}} file
 * @param {string} language - Language label copied into metadata
 * @returns {{content: string, metadata: object}}
 */
function fileLevelReference(file, language) {
  const lines = file.content.split(/\r?\n/);
  // A trailing newline leaves an empty last element that is not a real
  // line; leave it out so end_line never points past the end of the file.
  const lineCount =
    lines.length > 1 && lines[lines.length - 1] === ""
      ? lines.length - 1
      : lines.length;
  const head = file.content.slice(0, 240).replace(/\s+/g, " ").trim();
  const summary = `${file.relPath} — ${head}`;
  return ref(summary, {
    path: file.relPath,
    start_line: 1,
    end_line: lineCount,
    language,
  });
}
255
+
256
/**
 * Public entry point. Returns an array of references, each with a
 * `content` field suitable for embedding and a `metadata` field with
 * the pointer back to the source.
 *
 * The extractor is chosen from `file.ext`, compared case-insensitively
 * so that e.g. "README.MD" is still handled as markdown. A missing or
 * unrecognised extension produces a single file-level reference.
 *
 * @param {{relPath: string, content: string, ext: string}} file
 * @returns {Array<{content: string, metadata: object}>}
 */
export function extractReferences(file) {
  const language = languageOf((file.ext || "").toLowerCase());
  switch (language) {
    case "markdown":
      return extractMarkdownReferences(file);
    case "javascript":
    case "typescript":
      return extractJsLikeReferences(file, language);
    case "python":
      return extractPythonReferences(file);
    case "json":
    case "yaml":
      return extractConfigReferences(file, language);
    default:
      return [fileLevelReference(file, language)];
  }
}