@pentatonic-ai/ai-agent-sdk 0.5.11 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +345 -174
- package/bin/__tests__/callback-server.test.js +70 -0
- package/bin/__tests__/credentials.test.js +58 -0
- package/bin/__tests__/login.test.js +210 -0
- package/bin/__tests__/pkce.test.js +39 -0
- package/bin/__tests__/whoami.test.js +77 -0
- package/bin/cli.js +109 -440
- package/bin/commands/config.js +251 -0
- package/bin/commands/login.js +219 -0
- package/bin/commands/whoami.js +41 -0
- package/bin/lib/callback-server.js +137 -0
- package/bin/lib/credentials.js +100 -0
- package/bin/lib/pkce.js +26 -0
- package/package.json +4 -2
- package/packages/doctor/__tests__/detect.test.js +2 -6
- package/packages/doctor/src/checks/local-memory.js +164 -196
- package/packages/doctor/src/detect.js +11 -3
- package/packages/memory/src/__tests__/corpus-chunkers.test.js +143 -0
- package/packages/memory/src/__tests__/corpus-discover.test.js +175 -0
- package/packages/memory/src/__tests__/corpus-ingest.test.js +236 -0
- package/packages/memory/src/__tests__/corpus-signatures.test.js +175 -0
- package/packages/memory/src/__tests__/corpus-state.test.js +161 -0
- package/packages/memory/src/__tests__/ingest-corpus-opts.test.js +129 -0
- package/packages/memory/src/__tests__/search-kind.test.js +108 -0
- package/packages/memory/src/corpus/adapters.js +398 -0
- package/packages/memory/src/corpus/chunkers.js +328 -0
- package/packages/memory/src/corpus/cli.js +613 -0
- package/packages/memory/src/corpus/discover.js +379 -0
- package/packages/memory/src/corpus/index.js +68 -0
- package/packages/memory/src/corpus/ingest.js +356 -0
- package/packages/memory/src/corpus/signatures.js +280 -0
- package/packages/memory/src/corpus/state.js +134 -0
- package/packages/memory/src/index.js +18 -0
- package/packages/memory/src/ingest.js +20 -11
- package/packages/memory/src/openclaw/index.js +39 -1
- package/packages/memory/src/search.js +30 -7
- package/packages/memory-engine/.env.example +13 -0
- package/packages/memory-engine/README.md +131 -0
- package/packages/memory-engine/bench/README.md +99 -0
- package/packages/memory-engine/bench/scorecards-engine/agent-coding__pentatonic-baseline__20260427-142523.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine/chat-recall__pentatonic-baseline__20260427-142648.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine/circular-economy__pentatonic-baseline__20260427-142757.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine/customer-support__pentatonic-baseline__20260427-142900.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine/marketplace-ops__pentatonic-baseline__20260427-142957.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine/product-catalogue__pentatonic-baseline__20260427-143122.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/agent-coding__pentatonic-memory__20260427-161812.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/chat-recall__pentatonic-memory__20260427-161701.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/circular-economy__pentatonic-memory__20260427-161713.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/customer-support__pentatonic-memory__20260427-161723.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/marketplace-ops__pentatonic-memory__20260427-161732.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-docker/product-catalogue__pentatonic-memory__20260427-161741.json +937 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/agent-coding__pentatonic-memory__20260427-184718.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/chat-recall__pentatonic-memory__20260427-184614.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/circular-economy__pentatonic-memory__20260427-184809.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/customer-support__pentatonic-memory__20260427-184854.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/marketplace-ops__pentatonic-memory__20260427-184929.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-7-layer-populated/product-catalogue__pentatonic-memory__20260427-185015.json +961 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/agent-coding__pentatonic-memory__20260427-175252.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/chat-recall__pentatonic-memory__20260427-175312.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/circular-economy__pentatonic-memory__20260427-175335.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/customer-support__pentatonic-memory__20260427-175355.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/marketplace-ops__pentatonic-memory__20260427-175413.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-l2-empty-layers/product-catalogue__pentatonic-memory__20260427-175430.json +883 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/agent-coding__pentatonic-memory__20260427-155409.json +1115 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/chat-recall__pentatonic-memory__20260427-155421.json +819 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/circular-economy__pentatonic-memory__20260427-155433.json +1278 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/customer-support__pentatonic-memory__20260427-155443.json +1018 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/marketplace-ops__pentatonic-memory__20260427-155453.json +1038 -0
- package/packages/memory-engine/bench/scorecards-engine-via-shim/product-catalogue__pentatonic-memory__20260427-155503.json +937 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory-latest__20260427-145103.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/agent-coding__pentatonic-memory__20260427-144909.json +1115 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory-latest__20260427-145153.json +819 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/chat-recall__pentatonic-memory__20260427-145120.json +542 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory-latest__20260427-145313.json +1278 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/circular-economy__pentatonic-memory__20260427-145207.json +894 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory-latest__20260427-145412.json +1018 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/customer-support__pentatonic-memory__20260427-145327.json +680 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory-latest__20260427-145517.json +1038 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/marketplace-ops__pentatonic-memory__20260427-145422.json +693 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory-latest__20260427-145616.json +961 -0
- package/packages/memory-engine/bench/scorecards-pentatonic-baseline/product-catalogue__pentatonic-memory__20260427-145528.json +727 -0
- package/packages/memory-engine/compat/Dockerfile +11 -0
- package/packages/memory-engine/compat/server.py +680 -0
- package/packages/memory-engine/docker-compose.yml +243 -0
- package/packages/memory-engine/docs/MIGRATION.md +178 -0
- package/packages/memory-engine/docs/RUNBOOK-AWS.md +375 -0
- package/packages/memory-engine/docs/why-v05-underperforms.md +138 -0
- package/packages/memory-engine/engine/README.md +52 -0
- package/packages/memory-engine/engine/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/l5-comms-layer.py +663 -0
- package/packages/memory-engine/engine/l6-document-store.py +1018 -0
- package/packages/memory-engine/engine/services/l2/Dockerfile +41 -0
- package/packages/memory-engine/engine/services/l2/init_databases.py +81 -0
- package/packages/memory-engine/engine/services/l2/l2-hybridrag-proxy.py +1543 -0
- package/packages/memory-engine/engine/services/l4/Dockerfile +15 -0
- package/packages/memory-engine/engine/services/l4/server.py +235 -0
- package/packages/memory-engine/engine/services/l5/Dockerfile +9 -0
- package/packages/memory-engine/engine/services/l5/l5-comms-layer.py +678 -0
- package/packages/memory-engine/engine/services/l6/Dockerfile +11 -0
- package/packages/memory-engine/engine/services/l6/l6-document-store.py +1016 -0
- package/packages/memory-engine/engine/services/nv-embed/Dockerfile +28 -0
- package/packages/memory-engine/engine/services/nv-embed/server.py +152 -0
- package/packages/memory-engine/pme_memory/__init__.py +0 -0
- package/packages/memory-engine/pme_memory/__main__.py +129 -0
- package/packages/memory-engine/pme_memory/artifacts.py +95 -0
- package/packages/memory-engine/pme_memory/embed.py +74 -0
- package/packages/memory-engine/pme_memory/health.py +36 -0
- package/packages/memory-engine/pme_memory/hygiene.py +159 -0
- package/packages/memory-engine/pme_memory/indexer.py +200 -0
- package/packages/memory-engine/pme_memory/needs.py +55 -0
- package/packages/memory-engine/pme_memory/provenance.py +80 -0
- package/packages/memory-engine/pme_memory/scoring.py +168 -0
- package/packages/memory-engine/pme_memory/search.py +52 -0
- package/packages/memory-engine/pme_memory/store.py +86 -0
- package/packages/memory-engine/pme_memory/synthesis.py +114 -0
- package/packages/memory-engine/pyproject.toml +65 -0
- package/packages/memory-engine/scripts/kg-extractor.py +557 -0
- package/packages/memory-engine/scripts/kg-preflexor-v2.py +738 -0
- package/packages/memory-engine/tests/test_api_contract.sh +57 -0
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Repository discovery — walk a directory and yield files eligible for
|
|
3
|
+
* ingest into the memory layer. Honors .gitignore and .tesignore. Hard-
|
|
4
|
+
* excludes secrets and binary/generated artifacts regardless of ignore
|
|
5
|
+
* files (defense in depth).
|
|
6
|
+
*
|
|
7
|
+
* Pure Node — no external deps. Streams via async iterator so callers
|
|
8
|
+
* can show progress without buffering the whole tree.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { readdir, readFile, stat } from "node:fs/promises";
|
|
12
|
+
import { existsSync } from "node:fs";
|
|
13
|
+
import { createHash } from "node:crypto";
|
|
14
|
+
import { join, relative, basename, extname, sep } from "node:path";
|
|
15
|
+
|
|
16
|
+
/**
 * Non-overridable exclusion list for secret-bearing paths.
 *
 * Each regex is tested against a forward-slash relative path, so a pattern
 * may key off a file name, a directory segment, or a suffix. This list is
 * consulted separately from the .gitignore / .tesignore rules, which means a
 * "!pattern" line in those files cannot re-include a match: secrets and
 * credentials never leave the developer's machine.
 *
 * Update with care. Each addition should have a justification comment.
 */
const HARD_EXCLUDE_PATTERNS = [
  // dotenv: `.env` itself and every `.env.<suffix>` variant
  /(^|\/)\.env(\.|$)/,

  // private keys, certificates and keystores, recognised by extension
  /\.(pem|key|crt|cer|p12|pfx|jks|keystore)$/i,

  // per-user SSH / cloud CLI credential directories (and everything below)
  /(^|\/)\.ssh(\/|$)/,
  /(^|\/)\.aws(\/|$)/,
  /(^|\/)\.gcp(\/|$)/,
  /(^|\/)\.azure(\/|$)/,

  // registry / network login files
  /(^|\/)\.npmrc$/,
  /(^|\/)\.pypirc$/,
  /(^|\/)\.netrc$/,

  // ssh-keygen default private key names; the negative lookahead lets the
  // matching `.pub` public key through
  /(^|\/)id_(rsa|dsa|ecdsa|ed25519|xmss)($|\.(?!pub$))/i,

  // `secret(s)` / `credential(s)` as a file stem OR a directory segment, so
  // `secrets/foo.json` is caught as well as `secrets.json`
  /(^|\/)secrets?(\/|\.|$)/i,
  /(^|\/)credentials?(\/|\.|$)/i,
  /(^|\/)\.htpasswd$/,

  // stems ending in _secret / _token / _password
  /_secret(\.|$)/i,
  /_token(\.|$)/i,
  /_password(\.|$)/i,

  // heuristic: `service-account` / `service_account` stems are almost always
  // GCP service account keys
  /(^|\/)service[-_]account(\.|$)/i,
];
|
|
52
|
+
|
|
53
|
+
/**
 * Directory names that are pruned by default. A directory whose own name is
 * in this set is not recursed into, independent of any ignore file, which
 * keeps the walk fast and prevents accidental ingest of generated artifacts.
 *
 * Grouped by origin for readability; the flattened order is the Set's
 * insertion order.
 */
const DEFAULT_SKIP_DIRS = new Set(
  [
    // version-control metadata
    [".git", ".svn", ".hg"],
    // JavaScript package managers
    ["node_modules", ".pnpm", ".yarn"],
    // Python environments and caches
    ["venv", ".venv", "__pycache__", ".pytest_cache"],
    // Rust/Maven
    ["target"],
    // generic build output
    ["dist", "build", "out"],
    // framework / tooling caches and deploy state
    [".next", ".nuxt", ".cache", ".turbo", ".vercel"],
    // editor settings
    [".idea", ".vscode"],
    // coverage reports
    ["coverage", ".nyc_output"],
    // build and infrastructure tooling state
    [".gradle", ".terraform", ".serverless"],
  ].flat()
);
|
|
86
|
+
|
|
87
|
+
/**
 * File extensions skipped by default: lockfiles, binaries, media and other
 * generated outputs with negligible signal-to-noise for memory retrieval.
 *
 * Note that `path.extname()` only ever returns the final extension (".js"
 * for "app.min.js"), so the two ".min.*" entries cannot be hit through an
 * extname lookup; code that wants to skip minified files needs its own
 * suffix check.
 *
 * Grouped by category for readability; the flattened order is the Set's
 * insertion order.
 */
const DEFAULT_SKIP_EXTENSIONS = new Set(
  [
    // lockfiles
    [".lock"],
    // compiled / minified
    [".min.js", ".min.css", ".map"],
    // binaries
    [".so", ".dylib", ".dll", ".exe", ".bin", ".o", ".a"],
    [".class", ".jar", ".war", ".pyc", ".pyo"],
    // images / media
    [".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".ico", ".bmp", ".tiff"],
    [".pdf", ".mp3", ".mp4", ".mov", ".avi", ".webm", ".wav", ".ogg"],
    // archives
    [".zip", ".tar", ".gz", ".bz2", ".7z", ".rar"],
    // fonts
    [".woff", ".woff2", ".ttf", ".otf", ".eot"],
    // datasets (often huge, low signal)
    [".parquet", ".arrow"],
  ].flat()
);
|
|
146
|
+
|
|
147
|
+
/**
 * Per-file size ceiling used when the caller does not supply one. Anything
 * bigger is skipped, on the assumption that it is generated, vendored, or
 * otherwise low signal. Configurable per call.
 */
const BYTES_PER_KIB = 1024;
const DEFAULT_MAX_FILE_BYTES = 512 * BYTES_PER_KIB; // 512 KB
|
|
153
|
+
|
|
154
|
+
/**
 * Compile one ignore-file line into a RegExp that is tested against a
 * forward-slash relative path (directories are expected to be passed with a
 * trailing "/").
 *
 * Supported subset of the gitignore pattern language:
 *   - `*`   any run of characters inside one path segment
 *   - `?`   exactly one character inside one path segment
 *   - `**` followed by "/" at the start of a segment: zero or more whole
 *           directories; any other `**`: any run of characters, slashes included
 *   - leading `/`   anchor the pattern at the root
 *   - trailing `/`  accepted and stripped
 *   - leading `!`   stripped here; the caller records the negation
 *
 * A matched name also matches everything beneath it, so `logs` covers
 * `logs/` and `logs/app.txt` as well as a file called `logs`.
 *
 * Known divergences from git (this is not a full gitignore implementation):
 * bracket classes such as `[abc]` are treated as literal text, a pattern
 * with an inner slash is not implicitly anchored to the root, and a
 * trailing-slash pattern is not restricted to directories because the regex
 * cannot know what kind of entry a path names.
 *
 * @param {string} pattern - Raw line from an ignore file.
 * @returns {RegExp|null} Compiled matcher, or null for blank lines, comments,
 *   and lines that contain no glob once `!` and `/` markers are stripped.
 */
function globToRegex(pattern) {
  let glob = pattern.trim();
  if (!glob || glob.startsWith("#")) return null;

  // Negation is the caller's concern; only the glob itself is compiled.
  if (glob.startsWith("!")) glob = glob.slice(1);
  if (glob.endsWith("/")) glob = glob.slice(0, -1);
  const anchored = glob.startsWith("/");
  if (anchored) glob = glob.slice(1);

  // Lines such as "!" or "/" leave nothing to compile. Building a regex from
  // the empty glob would match every directory path, so reject them instead.
  if (!glob) return null;

  let body = "";
  for (let i = 0; i < glob.length; i++) {
    const ch = glob[i];
    if (ch === "*") {
      if (glob[i + 1] !== "*") {
        body += "[^/]*";
      } else if (glob[i + 2] === "/" && (i === 0 || glob[i - 1] === "/")) {
        // "**/" occupying a whole segment spans zero or more directories, so
        // "**/tmp" must match a root-level "tmp" too. Consume the slash.
        body += "(?:.*/)?";
        i += 2;
      } else {
        body += ".*";
        i += 1;
      }
    } else if (ch === "?") {
      body += "[^/]";
    } else if ("\\^$.+|()[]{}".includes(ch)) {
      body += "\\" + ch;
    } else {
      body += ch;
    }
  }

  const prefix = anchored ? "^" : "(^|/)";
  // Optional "/..." tail: a match on a directory name also covers the
  // directory itself when spelled with a trailing slash and all of its
  // contents, which is what lets callers prune whole subtrees.
  const suffix = "(/.*)?$";
  return new RegExp(prefix + body + suffix);
}
|
|
194
|
+
|
|
195
|
+
/**
 * Load an ignore file (.gitignore, .tesignore) and turn it into an ordered
 * rule list.
 *
 * Blank lines and `#` comment lines are dropped. Every remaining line is
 * compiled with globToRegex(); lines it rejects (null) are dropped too.
 * Rules keep file order, which matters to consumers that apply
 * "last match wins".
 *
 * @param {string} filePath - Path of the ignore file.
 * @returns {Promise<Array<{regex: RegExp, negate: boolean}>>} Parsed rules;
 *   an empty array when the file does not exist.
 */
async function readIgnoreFile(filePath) {
  if (!existsSync(filePath)) {
    return [];
  }

  const text = await readFile(filePath, "utf-8");
  return text
    .split(/\r?\n/)
    .map((rawLine) => rawLine.trim())
    .filter((entry) => entry !== "" && !entry.startsWith("#"))
    .flatMap((entry) => {
      const regex = globToRegex(entry);
      // A leading "!" marks a re-include rule.
      return regex ? [{ regex, negate: entry.startsWith("!") }] : [];
    });
}
|
|
212
|
+
|
|
213
|
+
/**
 * Decide whether a path is ignored by an ordered rule list.
 *
 * Every rule whose regex matches overwrites the verdict, so the last
 * matching rule wins: a later negated rule un-ignores an earlier match and
 * a later plain rule re-ignores it (gitignore semantics).
 *
 * @param {string} relativePath - Forward-slash path to test.
 * @param {Array<{regex: RegExp, negate: boolean}>} rules - Rules in file order.
 * @returns {boolean} true when the path ends up ignored; false when no rule
 *   matches.
 */
function isIgnored(relativePath, rules) {
  return rules.reduce(
    (verdict, rule) => (rule.regex.test(relativePath) ? !rule.negate : verdict),
    false
  );
}
|
|
227
|
+
|
|
228
|
+
/**
 * Report whether a path trips any entry of HARD_EXCLUDE_PATTERNS. There is
 * no override for this check: secrets and credentials never get ingested.
 *
 * @param {string} relativePath - Forward-slash path to test.
 * @returns {boolean} true on the first matching pattern, false otherwise.
 */
function isHardExcluded(relativePath) {
  for (const pattern of HARD_EXCLUDE_PATTERNS) {
    if (pattern.test(relativePath)) {
      return true;
    }
  }
  return false;
}
|
|
235
|
+
|
|
236
|
+
/**
 * SHA-256 digest of the given content as a lowercase hex string. Serves as
 * the change fingerprint for delta sync: an unchanged hash since the last
 * ingest means the file need not be re-embedded.
 *
 * @param {string|Buffer} content - Data to fingerprint.
 * @returns {string} 64-character hex digest.
 */
function hashContent(content) {
  const hasher = createHash("sha256");
  hasher.update(content);
  return hasher.digest("hex");
}
|
|
243
|
+
|
|
244
|
+
/**
 * Enumerate the ingest-eligible files beneath a repository root.
 *
 * Resolves the options, loads the root-level .gitignore and .tesignore
 * (in that order, so .tesignore rules are applied last), then delegates the
 * traversal to walk().
 *
 * @param {string} repoRoot - Absolute path to the repo root.
 * @param {object} [opts]
 * @param {Set<string>} [opts.skipDirs] - Replaces the default skip-directory set
 * @param {Set<string>} [opts.skipExtensions] - Replaces the default skip-extension set
 * @param {number} [opts.maxFileBytes] - Replaces the default max file size
 * @param {boolean} [opts.honorGitignore=true] - Pass false to skip .gitignore
 * @param {boolean} [opts.honorTesignore=true] - Pass false to skip .tesignore
 * @param {Function} [opts.onWarning] - (msg) => void for non-fatal issues
 * @returns {AsyncIterable<{path: string, relPath: string, size: number, hash: string, content: string, ext: string, basename: string}>}
 */
export async function* discover(repoRoot, opts = {}) {
  const settings = {
    skipDirs: opts.skipDirs || DEFAULT_SKIP_DIRS,
    skipExtensions: opts.skipExtensions || DEFAULT_SKIP_EXTENSIONS,
    maxFileBytes: opts.maxFileBytes ?? DEFAULT_MAX_FILE_BYTES,
    onWarning: opts.onWarning || (() => {}),
  };

  // Each ignore file is honored unless its flag is explicitly false.
  const ignoreSources = [
    [opts.honorGitignore, ".gitignore"],
    [opts.honorTesignore, ".tesignore"],
  ];
  const ignoreRules = [];
  for (const [flag, fileName] of ignoreSources) {
    if (flag === false) continue;
    const parsed = await readIgnoreFile(join(repoRoot, fileName));
    ignoreRules.push(...parsed);
  }

  const ctx = {
    skipDirs: settings.skipDirs,
    skipExtensions: settings.skipExtensions,
    maxFileBytes: settings.maxFileBytes,
    ignoreRules,
    onWarning: settings.onWarning,
  };
  yield* walk(repoRoot, repoRoot, ctx);
}
|
|
279
|
+
|
|
280
|
+
/**
 * Recursive worker behind discover(): lists one directory and yields a
 * record for every file that survives all filters, descending into
 * sub-directories in readdir order.
 *
 * Filter order for files: hard-exclude, ignore rules, extension / minified
 * suffix, empty file, size cap, NUL-byte binary heuristic. Non-fatal
 * problems are reported through ctx.onWarning and the entry is skipped.
 * Entries that are neither directories nor regular files are skipped.
 *
 * @param {string} currentDir - Directory being listed.
 * @param {string} repoRoot - Root that relPath values are computed against.
 * @param {object} ctx - { skipDirs, skipExtensions, maxFileBytes, ignoreRules, onWarning }
 */
async function* walk(currentDir, repoRoot, ctx) {
  let dirents;
  try {
    dirents = await readdir(currentDir, { withFileTypes: true });
  } catch (err) {
    ctx.onWarning(`discover: cannot read ${currentDir}: ${err.message}`);
    return;
  }

  for (const dirent of dirents) {
    const { name } = dirent;
    const absPath = join(currentDir, name);
    // Normalise to forward slashes so patterns behave the same on every OS.
    const relPath = relative(repoRoot, absPath).split(sep).join("/");

    if (dirent.isDirectory()) {
      // Directories are matched with a trailing slash.
      const dirKey = `${relPath}/`;
      const pruned =
        ctx.skipDirs.has(name) ||
        isHardExcluded(dirKey) ||
        isIgnored(dirKey, ctx.ignoreRules);
      if (!pruned) {
        yield* walk(absPath, repoRoot, ctx);
      }
      continue;
    }

    if (!dirent.isFile()) {
      continue;
    }

    if (isHardExcluded(relPath)) {
      ctx.onWarning(`discover: hard-excluded ${relPath} (secret pattern)`);
      continue;
    }
    if (isIgnored(relPath, ctx.ignoreRules)) {
      continue;
    }

    const ext = extname(name).toLowerCase();
    // extname() only sees the last extension, so ".min.js" / ".min.css"
    // need an explicit suffix test on the full file name.
    const minified = [".min.js", ".min.css"].some((suffix) => name.endsWith(suffix));
    if (ctx.skipExtensions.has(ext) || minified) {
      continue;
    }

    let info;
    try {
      info = await stat(absPath);
    } catch (err) {
      ctx.onWarning(`discover: cannot stat ${relPath}: ${err.message}`);
      continue;
    }

    const { size } = info;
    if (size === 0) {
      continue;
    }
    if (size > ctx.maxFileBytes) {
      ctx.onWarning(
        `discover: skipping ${relPath} (${size} bytes > ${ctx.maxFileBytes} cap)`
      );
      continue;
    }

    let content;
    try {
      content = await readFile(absPath, "utf-8");
    } catch (err) {
      ctx.onWarning(`discover: cannot read ${relPath}: ${err.message}`);
      continue;
    }

    // A NUL character in the decoded text is taken as a sign of binary data.
    if (content.indexOf("\0") !== -1) {
      ctx.onWarning(`discover: skipping ${relPath} (binary content)`);
      continue;
    }

    yield {
      path: absPath,
      relPath,
      size,
      hash: hashContent(content),
      content,
      ext,
      basename: name,
    };
  }
}
|
|
354
|
+
|
|
355
|
+
/**
 * Validate a single relative file path without touching the filesystem.
 *
 * Exported for tests and for callers that already hold a list of paths
 * (e.g. a git-hook handler given the changed files) and need to know which
 * of them are eligible. Checks, in order: hard-exclude patterns, skip
 * directories, skip extensions / minified suffix. Ignore files, file size
 * and file content are NOT consulted here.
 *
 * @param {string} relPath - Forward-slash path relative to the repo root.
 * @param {object} [opts]
 * @param {Set<string>} [opts.skipDirs] - Replaces the default skip-directory set
 * @param {Set<string>} [opts.skipExtensions] - Replaces the default skip-extension set
 * @returns {{eligible: boolean, reason?: string}} reason is one of
 *   "hard_excluded", "skip_dir", "skip_extension" when eligible is false.
 */
export function isPathEligible(relPath, opts = {}) {
  const skipDirs = opts.skipDirs || DEFAULT_SKIP_DIRS;
  const skipExtensions = opts.skipExtensions || DEFAULT_SKIP_EXTENSIONS;

  if (isHardExcluded(relPath)) return { eligible: false, reason: "hard_excluded" };

  // Compare only the directory segments against skipDirs. The last segment
  // is the file name, and a regular file that happens to be called "build",
  // "out" or "target" must not be reported as a skipped directory.
  // A path ending in "/" has an empty last segment, so every directory
  // segment is still checked.
  const dirSegments = relPath.split("/").slice(0, -1);
  if (dirSegments.some((segment) => skipDirs.has(segment))) {
    return { eligible: false, reason: "skip_dir" };
  }

  const ext = extname(relPath).toLowerCase();
  // extname() only sees the last extension, so minified bundles need an
  // explicit suffix test.
  const isMin = relPath.endsWith(".min.js") || relPath.endsWith(".min.css");
  if (skipExtensions.has(ext) || isMin) {
    return { eligible: false, reason: "skip_extension" };
  }

  return { eligible: true };
}
|
|
378
|
+
|
|
379
|
+
export { HARD_EXCLUDE_PATTERNS, DEFAULT_SKIP_DIRS, DEFAULT_SKIP_EXTENSIONS };
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Corpus ingest — public entry point.
|
|
3
|
+
*
|
|
4
|
+
* Onboards a developer's repos into the memory layer so retrieval has
|
|
5
|
+
* something to return on the first prompt. Solves the cold-start
|
|
6
|
+
* problem where a freshly-installed plugin returns nothing useful for
|
|
7
|
+
* days.
|
|
8
|
+
*
|
|
9
|
+
* Usage from the CLI is the primary path; this module exposes the
|
|
10
|
+
* underlying functions for programmatic use (tests, IDE plugins, the
|
|
11
|
+
* OpenClaw onboarding hook).
|
|
12
|
+
*
|
|
13
|
+
* @example
|
|
14
|
+
* import { ingestCorpus, hostedAdapter } from "@pentatonic-ai/ai-agent-sdk/memory/corpus";
|
|
15
|
+
*
|
|
16
|
+
* const adapter = hostedAdapter({
|
|
17
|
+
* endpoint: "https://acme.api.pentatonic.com",
|
|
18
|
+
* clientId: "acme",
|
|
19
|
+
* apiKey: process.env.TES_API_KEY,
|
|
20
|
+
* });
|
|
21
|
+
* const totals = await ingestCorpus(adapter, "/Users/me/code/my-app", {
|
|
22
|
+
* onProgress: (p) => console.log(p),
|
|
23
|
+
* });
|
|
24
|
+
* console.log(`Ingested ${totals.chunksCreated} chunks from ${totals.filesIngested} files`);
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
export { discover, isPathEligible } from "./discover.js";
|
|
28
|
+
export { chunkFile } from "./chunkers.js";
|
|
29
|
+
export { ingestCorpus, syncCorpus, ingestPaths } from "./ingest.js";
|
|
30
|
+
export { localAdapter, hostedAdapter, engineAdapter } from "./adapters.js";
|
|
31
|
+
export {
|
|
32
|
+
loadState,
|
|
33
|
+
saveState,
|
|
34
|
+
defaultStatePath,
|
|
35
|
+
emptyState,
|
|
36
|
+
upsertSource,
|
|
37
|
+
removeSource,
|
|
38
|
+
getSource,
|
|
39
|
+
recomputeStats,
|
|
40
|
+
} from "./state.js";
|
|
41
|
+
|
|
42
|
+
/**
 * Dry-run sizing of a repo ingest: discovers and chunks the files but does
 * not ingest anything. Useful for the `tes onboard` cost preview before
 * commit.
 *
 * @param {string} repoPath
 * @param {object} [opts] - Forwarded to discover()
 * @returns {Promise<{fileCount: number, totalBytes: number, estimatedChunks: number, estimatedTokens: number}>}
 */
export async function estimateCorpus(repoPath, opts = {}) {
  const { discover } = await import("./discover.js");
  const { chunkFile, approxTokens } = await import("./chunkers.js");

  const totals = {
    fileCount: 0,
    totalBytes: 0,
    estimatedChunks: 0,
    estimatedTokens: 0,
  };

  for await (const file of discover(repoPath, opts)) {
    totals.fileCount += 1;
    totals.totalBytes += file.size;

    const chunks = chunkFile(file);
    totals.estimatedChunks += chunks.length;
    for (const chunk of chunks) {
      totals.estimatedTokens += approxTokens(chunk.content);
    }
  }

  return totals;
}
|