@pentatonic-ai/ai-agent-sdk 0.5.11 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/README.md +345 -174
  2. package/bin/__tests__/callback-server.test.js +70 -0
  3. package/bin/__tests__/credentials.test.js +58 -0
  4. package/bin/__tests__/login.test.js +210 -0
  5. package/bin/__tests__/pkce.test.js +39 -0
  6. package/bin/__tests__/whoami.test.js +77 -0
  7. package/bin/cli.js +109 -440
  8. package/bin/commands/config.js +251 -0
  9. package/bin/commands/login.js +219 -0
  10. package/bin/commands/whoami.js +41 -0
  11. package/bin/lib/callback-server.js +137 -0
  12. package/bin/lib/credentials.js +100 -0
  13. package/bin/lib/pkce.js +26 -0
  14. package/package.json +4 -2
  15. package/packages/doctor/__tests__/detect.test.js +2 -6
  16. package/packages/doctor/src/checks/local-memory.js +164 -196
  17. package/packages/doctor/src/detect.js +11 -3
  18. package/packages/memory/src/__tests__/corpus-chunkers.test.js +143 -0
  19. package/packages/memory/src/__tests__/corpus-discover.test.js +175 -0
  20. package/packages/memory/src/__tests__/corpus-ingest.test.js +236 -0
  21. package/packages/memory/src/__tests__/corpus-signatures.test.js +175 -0
  22. package/packages/memory/src/__tests__/corpus-state.test.js +161 -0
  23. package/packages/memory/src/__tests__/ingest-corpus-opts.test.js +129 -0
  24. package/packages/memory/src/__tests__/search-kind.test.js +108 -0
  25. package/packages/memory/src/corpus/adapters.js +398 -0
  26. package/packages/memory/src/corpus/chunkers.js +328 -0
  27. package/packages/memory/src/corpus/cli.js +613 -0
  28. package/packages/memory/src/corpus/discover.js +379 -0
  29. package/packages/memory/src/corpus/index.js +68 -0
  30. package/packages/memory/src/corpus/ingest.js +356 -0
  31. package/packages/memory/src/corpus/signatures.js +280 -0
  32. package/packages/memory/src/corpus/state.js +134 -0
  33. package/packages/memory/src/index.js +18 -0
  34. package/packages/memory/src/ingest.js +20 -11
  35. package/packages/memory/src/openclaw/index.js +39 -1
  36. package/packages/memory/src/search.js +30 -7
  37. package/packages/memory-engine/.env.example +13 -0
  38. package/packages/memory-engine/README.md +131 -0
  39. package/packages/memory-engine/bench/README.md +99 -0
  40. package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
  41. package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
  42. package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
  43. package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
  44. package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
  45. package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
  46. package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
  47. package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
  48. package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
  49. package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
  50. package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
  51. package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
  52. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
  53. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
  54. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
  55. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
  56. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
  57. package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
  58. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
  59. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
  60. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
  61. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
  62. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
  63. package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
  64. package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
  65. package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
  66. package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
  67. package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
  68. package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
  69. package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
  70. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
  71. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
  72. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
  73. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
  74. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
  75. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
  76. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
  77. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
  78. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
  79. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
  80. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
  81. package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
  82. package/packages/memory-engine/compat/Dockerfile +11 -0
  83. package/packages/memory-engine/compat/server.py +680 -0
  84. package/packages/memory-engine/docker-compose.yml +243 -0
  85. package/packages/memory-engine/docs/MIGRATION.md +178 -0
  86. package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
  87. package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
  88. package/packages/memory-engine/engine/README.md +52 -0
  89. package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
  90. package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
  91. package/packages/memory-engine/engine/l6-document-store.py +1018 -0
  92. package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
  93. package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
  94. package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
  95. package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
  96. package/packages/memory-engine/engine/services/l4/server.py +235 -0
  97. package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
  98. package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
  99. package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
  100. package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
  101. package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
  102. package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
  103. package/packages/memory-engine/pme_memory/__init__.py +0 -0
  104. package/packages/memory-engine/pme_memory/__main__.py +129 -0
  105. package/packages/memory-engine/pme_memory/artifacts.py +95 -0
  106. package/packages/memory-engine/pme_memory/embed.py +74 -0
  107. package/packages/memory-engine/pme_memory/health.py +36 -0
  108. package/packages/memory-engine/pme_memory/hygiene.py +159 -0
  109. package/packages/memory-engine/pme_memory/indexer.py +200 -0
  110. package/packages/memory-engine/pme_memory/needs.py +55 -0
  111. package/packages/memory-engine/pme_memory/provenance.py +80 -0
  112. package/packages/memory-engine/pme_memory/scoring.py +168 -0
  113. package/packages/memory-engine/pme_memory/search.py +52 -0
  114. package/packages/memory-engine/pme_memory/store.py +86 -0
  115. package/packages/memory-engine/pme_memory/synthesis.py +114 -0
  116. package/packages/memory-engine/pyproject.toml +65 -0
  117. package/packages/memory-engine/scripts/kg-extractor.py +557 -0
  118. package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
  119. package/packages/memory-engine/tests/test_api_contract.sh +57 -0
@@ -0,0 +1,379 @@
1
+ /**
2
+ * Repository discovery — walk a directory and yield files eligible for
3
+ * ingest into the memory layer. Honors .gitignore and .tesignore. Hard-
4
+ * excludes secrets and binary/generated artifacts regardless of ignore
5
+ * files (defense in depth).
6
+ *
7
+ * Pure Node — no external deps. Streams via async iterator so callers
8
+ * can show progress without buffering the whole tree.
9
+ */
10
+
11
+ import { readdir, readFile, stat } from "node:fs/promises";
12
+ import { existsSync } from "node:fs";
13
+ import { createHash } from "node:crypto";
14
+ import { join, relative, basename, extname, sep } from "node:path";
15
+
16
/**
 * Hard-exclude patterns. Each pattern is tested against the "/"-separated
 * path relative to the repo root; the `(^|\/)` anchors let a pattern match
 * a file or directory name at any depth. A match CANNOT be re-included by
 * .gitignore overrides or by .tesignore "!pattern" lines — the rule is:
 * secrets and credentials never leave the developer's machine.
 *
 * Update with care. Each addition should have a justification comment.
 */
const HARD_EXCLUDE_PATTERNS = [
  // Environment files (anything matching .env or .env.*)
  /(^|\/)\.env(\.|$)/,
  // Private keys and certificates
  /\.(pem|key|crt|cer|p12|pfx|jks|keystore)$/i,
  // SSH and cloud credential dirs
  /(^|\/)\.ssh(\/|$)/,
  /(^|\/)\.aws(\/|$)/,
  /(^|\/)\.gcp(\/|$)/,
  /(^|\/)\.azure(\/|$)/,
  // Package registry credentials
  /(^|\/)\.npmrc$/,
  /(^|\/)\.pypirc$/,
  /(^|\/)\.netrc$/,
  // SSH private keys (common ssh-keygen defaults; private has no extension)
  /(^|\/)id_(rsa|dsa|ecdsa|ed25519|xmss)($|\.(?!pub$))/i,
  // Common secret filenames AND directories — `secrets/foo.json` must
  // be excluded too, not just `secrets.json`
  /(^|\/)secrets?(\/|\.|$)/i,
  /(^|\/)credentials?(\/|\.|$)/i,
  /(^|\/)\.htpasswd$/,
  // Suffix stems accept the plural as well, consistent with `secrets?` and
  // `credentials?` above. `client_secrets.json` is the conventional name of
  // Google OAuth client files and must never be ingested.
  /_secrets?(\.|$)/i,
  /_tokens?(\.|$)/i,
  /_passwords?(\.|$)/i,
  // Service account JSON (heuristic — files with these stems are almost
  // always GCP service account keys)
  /(^|\/)service[-_]account(\.|$)/i,
];
52
+
53
/**
 * Directory names that are never descended into, whatever the ignore
 * files say. Pruning them at directory level keeps the walk fast and
 * prevents generated artifacts from being ingested by accident.
 */
const DEFAULT_SKIP_DIRS = new Set([
  // Version-control metadata
  ".git", ".svn", ".hg",
  // JavaScript package managers
  "node_modules", ".pnpm", ".yarn",
  // Python environments and caches
  "venv", ".venv", "__pycache__", ".pytest_cache",
  // Build output ("target" is Rust/Maven)
  "target", "dist", "build", "out",
  // Framework and tooling caches
  ".next", ".nuxt", ".cache", ".turbo", ".vercel",
  // Editor settings
  ".idea", ".vscode",
  // Test coverage
  "coverage", ".nyc_output",
  // Build-system and infrastructure state
  ".gradle", ".terraform", ".serverless",
]);
86
+
87
/**
 * File extensions skipped by default: lockfiles, binaries, media and
 * other generated output with negligible signal-to-noise for memory
 * retrieval. Entries are lower-case and include the leading dot.
 */
const DEFAULT_SKIP_EXTENSIONS = new Set([
  // Lockfiles
  ".lock",
  // Compiled / minified
  ".min.js", ".min.css", ".map",
  // Binaries
  ".so", ".dylib", ".dll", ".exe", ".bin", ".o", ".a",
  ".class", ".jar", ".war", ".pyc", ".pyo",
  // Images / media
  ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".ico", ".bmp", ".tiff",
  ".pdf", ".mp3", ".mp4", ".mov", ".avi", ".webm", ".wav", ".ogg",
  // Archives
  ".zip", ".tar", ".gz", ".bz2", ".7z", ".rar",
  // Fonts
  ".woff", ".woff2", ".ttf", ".otf", ".eot",
  // Datasets (often huge, low signal)
  ".parquet", ".arrow",
]);
146
+
147
/**
 * Upper bound on the size of a single file, in bytes. Anything larger
 * is skipped as likely generated, vendored, or otherwise low signal.
 * Callers can override it per call.
 */
const DEFAULT_MAX_FILE_BYTES = 2 ** 19; // 512 KB (524,288 bytes)
153
+
154
/**
 * Compile one ignore-file line into a RegExp that is tested against a
 * "/"-separated path relative to the repo root. Directory paths are
 * expected to carry a trailing "/" (that is how walk() passes them).
 *
 * Supported gitignore subset:
 *   - `*`   any run of characters within one path segment
 *   - `?`   exactly one character within one path segment
 *   - a segment-leading double star followed by "/" matches zero or more
 *     whole directories; any other double star matches across "/"
 *   - leading `/`  anchors the pattern to the repo root
 *   - trailing `/` restricts the pattern to directories and their contents
 *   - leading `!`  is stripped here; negation is recorded by the caller
 *
 * A pattern without a trailing `/` matches a file of that name, a
 * directory of that name, and everything underneath that directory.
 *
 * Not a full gitignore implementation: character classes and backslash
 * escapes are treated as literal text.
 *
 * @param {string} pattern - One raw line from an ignore file.
 * @returns {RegExp|null} Compiled matcher, or null for blank lines,
 *   comments, and patterns that are empty once `!` and `/` are stripped.
 */
function globToRegex(pattern) {
  let glob = pattern.trim();
  if (!glob || glob.startsWith("#")) return null;
  // Negation handled by caller
  if (glob.startsWith("!")) glob = glob.slice(1);
  const dirOnly = glob.endsWith("/");
  if (dirOnly) glob = glob.slice(0, -1);
  const anchored = glob.startsWith("/");
  if (anchored) glob = glob.slice(1);
  // A bare "/" or "!" carries no pattern. Compiling it would produce a
  // regex that matches every directory path.
  if (!glob) return null;

  let body = "";
  for (let i = 0; i < glob.length; i++) {
    const ch = glob[i];
    if (ch === "*" && glob[i + 1] === "*") {
      const atSegmentStart = i === 0 || glob[i - 1] === "/";
      if (atSegmentStart && glob[i + 2] === "/") {
        // Segment-leading double star plus "/": zero or more directories,
        // so a match directly at this level is also accepted.
        body += "(?:.*/)?";
        i += 2;
      } else {
        body += ".*";
        i += 1;
      }
    } else if (ch === "*") {
      body += "[^/]*";
    } else if (ch === "?") {
      body += "[^/]";
    } else if ("\\^$.+|()[]{}".includes(ch)) {
      body += "\\" + ch;
    } else {
      body += ch;
    }
  }

  const prefix = anchored ? "^" : "(^|/)";
  // Directory-only patterns require the "/" that walk() appends to
  // directory paths, so they never match a regular file of the same name.
  // Other patterns match the entry itself and anything beneath it.
  const suffix = dirOnly ? "/.*$" : "(/.*)?$";
  return new RegExp(prefix + body + suffix);
}
194
+
195
/**
 * Read an ignore file (.gitignore, .tesignore) and return its rules in
 * file order, so that the last matching rule wins when they are applied.
 *
 * Blank lines and lines starting with "#" are skipped. A leading "!"
 * marks the rule as a negation.
 *
 * @param {string} filePath - Path of the ignore file.
 * @returns {Promise<Array<{regex: RegExp, negate: boolean}>>} Parsed
 *   rules; an empty array when the file does not exist.
 * @throws Any read error other than the file being absent.
 */
async function readIgnoreFile(filePath) {
  if (!existsSync(filePath)) return [];

  let content;
  try {
    content = await readFile(filePath, "utf-8");
  } catch (err) {
    // The file can vanish between the existence check and the read;
    // treat that exactly like a file that was never there.
    if (err.code === "ENOENT" || err.code === "ENOTDIR") return [];
    throw err;
  }

  const rules = [];
  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) continue;
    const regex = globToRegex(line);
    if (regex) rules.push({ regex, negate: line.startsWith("!") });
  }
  return rules;
}
212
+
213
/**
 * Decide whether a path is ignored by an ordered rule list.
 *
 * Rules are folded in order and every matching rule overwrites the
 * verdict, so a later negation can un-ignore an earlier match and the
 * last matching rule wins (gitignore semantics).
 *
 * @param {string} relativePath - "/"-separated path relative to the root.
 * @param {Array<{regex: RegExp, negate: boolean}>} rules - Ordered rules.
 * @returns {boolean} True when the path ends up ignored.
 */
function isIgnored(relativePath, rules) {
  return rules.reduce(
    (verdict, rule) => (rule.regex.test(relativePath) ? !rule.negate : verdict),
    false
  );
}
227
+
228
/**
 * Report whether a path matches any entry of HARD_EXCLUDE_PATTERNS.
 * A hit is final: callers skip the path without consulting ignore rules.
 *
 * @param {string} relativePath - "/"-separated path relative to the root.
 * @returns {boolean} True when at least one hard-exclude pattern matches.
 */
function isHardExcluded(relativePath) {
  for (const pattern of HARD_EXCLUDE_PATTERNS) {
    if (pattern.test(relativePath)) return true;
  }
  return false;
}
235
+
236
/**
 * Return the SHA-256 digest of `content` as a lower-case hex string.
 * The digest is attached to each discovered file so callers can detect
 * unchanged content (for example to skip re-embedding on delta sync).
 *
 * @param {string|Buffer} content - Data to hash.
 * @returns {string} 64-character hex digest.
 */
function hashContent(content) {
  const hasher = createHash("sha256");
  hasher.update(content);
  return hasher.digest("hex");
}
243
+
244
/**
 * Walk a repository root and lazily yield every ingest-eligible file.
 *
 * Ignore rules are loaded once from the .gitignore and .tesignore files
 * located directly in `repoRoot` (in that order) before the walk starts.
 *
 * @param {string} repoRoot - Absolute path to the repo root.
 * @param {object} [opts]
 * @param {Set<string>} [opts.skipDirs] - Override default skip directories
 * @param {Set<string>} [opts.skipExtensions] - Override default skip extensions
 * @param {number} [opts.maxFileBytes] - Override default max file size
 * @param {boolean} [opts.honorGitignore=true] - Honor .gitignore
 * @param {boolean} [opts.honorTesignore=true] - Honor .tesignore
 * @param {Function} [opts.onWarning] - (msg) => void for non-fatal issues
 * @returns {AsyncIterable<{path: string, relPath: string, size: number, hash: string, content: string, ext: string, basename: string}>}
 */
export async function* discover(repoRoot, opts = {}) {
  const ctx = {
    skipDirs: opts.skipDirs || DEFAULT_SKIP_DIRS,
    skipExtensions: opts.skipExtensions || DEFAULT_SKIP_EXTENSIONS,
    maxFileBytes: opts.maxFileBytes ?? DEFAULT_MAX_FILE_BYTES,
    ignoreRules: [],
    onWarning: opts.onWarning || (() => {}),
  };

  // Only an explicit `false` disables an ignore file; .gitignore rules
  // come first so .tesignore rules can override them.
  const ignoreSources = [
    [opts.honorGitignore, ".gitignore"],
    [opts.honorTesignore, ".tesignore"],
  ];
  for (const [flag, fileName] of ignoreSources) {
    if (flag === false) continue;
    const rules = await readIgnoreFile(join(repoRoot, fileName));
    ctx.ignoreRules.push(...rules);
  }

  yield* walk(repoRoot, repoRoot, ctx);
}
279
+
280
/**
 * Stat and read one candidate file, applying the size cap and the
 * NUL-byte binary heuristic. Problems are reported through
 * `ctx.onWarning`; empty files are dropped silently.
 *
 * @returns {Promise<{size: number, content: string}|null>} File data, or
 *   null when the file must be skipped.
 */
async function readTextCandidate(fullPath, relPath, ctx) {
  let info;
  try {
    info = await stat(fullPath);
  } catch (err) {
    ctx.onWarning(`discover: cannot stat ${relPath}: ${err.message}`);
    return null;
  }

  const { size } = info;
  if (size === 0) return null;
  if (size > ctx.maxFileBytes) {
    ctx.onWarning(
      `discover: skipping ${relPath} (${size} bytes > ${ctx.maxFileBytes} cap)`
    );
    return null;
  }

  let content;
  try {
    content = await readFile(fullPath, "utf-8");
  } catch (err) {
    ctx.onWarning(`discover: cannot read ${relPath}: ${err.message}`);
    return null;
  }

  // Reject likely-binary content (NUL byte heuristic)
  if (content.includes("\0")) {
    ctx.onWarning(`discover: skipping ${relPath} (binary content)`);
    return null;
  }

  return { size, content };
}

/**
 * Recursive worker behind discover(): yields eligible files under
 * `currentDir`, depth-first, in the order readdir() returns entries.
 *
 * Directories are pruned by skip list, hard-exclude patterns, then ignore
 * rules. Files go through hard-exclude (with a warning), ignore rules and
 * the extension filter before being read. Entries that are neither a
 * directory nor a regular file are skipped.
 *
 * @param {string} currentDir - Directory being listed.
 * @param {string} repoRoot - Root against which relative paths are computed.
 * @param {{skipDirs: Set<string>, skipExtensions: Set<string>, maxFileBytes: number, ignoreRules: Array, onWarning: Function}} ctx
 */
async function* walk(currentDir, repoRoot, ctx) {
  let entries;
  try {
    entries = await readdir(currentDir, { withFileTypes: true });
  } catch (err) {
    ctx.onWarning(`discover: cannot read ${currentDir}: ${err.message}`);
    return;
  }

  for (const entry of entries) {
    const fullPath = join(currentDir, entry.name);
    // Always use "/" in relative paths so patterns behave the same on
    // every platform.
    const relPath = relative(repoRoot, fullPath).split(sep).join("/");

    if (entry.isDirectory()) {
      const dirPath = relPath + "/";
      const pruned =
        ctx.skipDirs.has(entry.name) ||
        isHardExcluded(dirPath) ||
        isIgnored(dirPath, ctx.ignoreRules);
      if (!pruned) yield* walk(fullPath, repoRoot, ctx);
      continue;
    }

    if (!entry.isFile()) continue;

    if (isHardExcluded(relPath)) {
      ctx.onWarning(`discover: hard-excluded ${relPath} (secret pattern)`);
      continue;
    }
    if (isIgnored(relPath, ctx.ignoreRules)) continue;

    const ext = extname(entry.name).toLowerCase();
    // extname() only sees the last suffix, so the ".min.X" chain is
    // checked against the full file name.
    const minified =
      entry.name.endsWith(".min.js") || entry.name.endsWith(".min.css");
    if (minified || ctx.skipExtensions.has(ext)) continue;

    const file = await readTextCandidate(fullPath, relPath, ctx);
    if (file === null) continue;

    yield {
      path: fullPath,
      relPath,
      size: file.size,
      hash: hashContent(file.content),
      content: file.content,
      ext,
      basename: entry.name,
    };
  }
}
354
+
355
/**
 * Check a single repo-relative file path against the static eligibility
 * rules without touching the filesystem. Exported for tests and for
 * callers that already have a list of paths (e.g. a git-hook handler
 * that receives changed files).
 *
 * Checks, in order: hard-exclude patterns, skip directories, skip
 * extensions. Ignore files, file size and file content are NOT consulted.
 *
 * Only the directory segments of the path are compared with the skip-dir
 * set; the final segment is the file name, so a file such as
 * `scripts/build` stays eligible, matching what the directory walk does.
 *
 * @param {string} relPath - "/"-separated file path relative to the repo root.
 * @param {object} [opts]
 * @param {Set<string>} [opts.skipDirs] - Override default skip directories
 * @param {Set<string>} [opts.skipExtensions] - Override default skip extensions
 * @returns {{eligible: boolean, reason?: string}} `reason` is one of
 *   "hard_excluded", "skip_dir" or "skip_extension" when not eligible.
 */
export function isPathEligible(relPath, opts = {}) {
  const skipDirs = opts.skipDirs || DEFAULT_SKIP_DIRS;
  const skipExtensions = opts.skipExtensions || DEFAULT_SKIP_EXTENSIONS;

  if (isHardExcluded(relPath)) return { eligible: false, reason: "hard_excluded" };

  // Drop the last segment: it names the file, not a directory.
  const dirSegments = relPath.split("/").slice(0, -1);
  if (dirSegments.some((segment) => skipDirs.has(segment))) {
    return { eligible: false, reason: "skip_dir" };
  }

  const ext = extname(relPath).toLowerCase();
  // extname() only sees the last suffix, so ".min.X" is checked on the
  // whole path.
  const isMin = relPath.endsWith(".min.js") || relPath.endsWith(".min.css");
  if (skipExtensions.has(ext) || isMin) {
    return { eligible: false, reason: "skip_extension" };
  }

  return { eligible: true };
}
378
+
379
+ export { HARD_EXCLUDE_PATTERNS, DEFAULT_SKIP_DIRS, DEFAULT_SKIP_EXTENSIONS };
@@ -0,0 +1,68 @@
1
+ /**
2
+ * Corpus ingest — public entry point.
3
+ *
4
+ * Onboards a developer's repos into the memory layer so retrieval has
5
+ * something to return on the first prompt. Solves the cold-start
6
+ * problem where a freshly-installed plugin returns nothing useful for
7
+ * days.
8
+ *
9
+ * Usage from the CLI is the primary path; this module exposes the
10
+ * underlying functions for programmatic use (tests, IDE plugins, the
11
+ * OpenClaw onboarding hook).
12
+ *
13
+ * @example
14
+ * import { ingestCorpus, hostedAdapter } from "@pentatonic-ai/ai-agent-sdk/memory/corpus";
15
+ *
16
+ * const adapter = hostedAdapter({
17
+ * endpoint: "https://acme.api.pentatonic.com",
18
+ * clientId: "acme",
19
+ * apiKey: process.env.TES_API_KEY,
20
+ * });
21
+ * const totals = await ingestCorpus(adapter, "/Users/me/code/my-app", {
22
+ * onProgress: (p) => console.log(p),
23
+ * });
24
+ * console.log(`Ingested ${totals.chunksCreated} chunks from ${totals.filesIngested} files`);
25
+ */
26
+
27
+ export { discover, isPathEligible } from "./discover.js";
28
+ export { chunkFile } from "./chunkers.js";
29
+ export { ingestCorpus, syncCorpus, ingestPaths } from "./ingest.js";
30
+ export { localAdapter, hostedAdapter, engineAdapter } from "./adapters.js";
31
+ export {
32
+ loadState,
33
+ saveState,
34
+ defaultStatePath,
35
+ emptyState,
36
+ upsertSource,
37
+ removeSource,
38
+ getSource,
39
+ recomputeStats,
40
+ } from "./state.js";
41
+
42
/**
 * Dry-run an ingest: discover and chunk a repo, but only count what
 * would be produced. Useful for the `tes onboard` cost preview before
 * commit.
 *
 * The discover and chunker modules are loaded on demand when the
 * function is called.
 *
 * @param {string} repoPath
 * @param {object} [opts] - Forwarded to discover()
 * @returns {Promise<{fileCount: number, totalBytes: number, estimatedChunks: number, estimatedTokens: number}>}
 */
export async function estimateCorpus(repoPath, opts = {}) {
  const { discover: discoverFiles } = await import("./discover.js");
  const { chunkFile: toChunks, approxTokens: countTokens } = await import("./chunkers.js");

  const totals = {
    fileCount: 0,
    totalBytes: 0,
    estimatedChunks: 0,
    estimatedTokens: 0,
  };

  for await (const file of discoverFiles(repoPath, opts)) {
    totals.fileCount += 1;
    totals.totalBytes += file.size;

    const chunks = toChunks(file);
    totals.estimatedChunks += chunks.length;
    for (const chunk of chunks) {
      totals.estimatedTokens += countTokens(chunk.content);
    }
  }

  return totals;
}