akm-cli 0.5.0 → 0.6.0-rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +32 -5
- package/dist/asset-registry.js +29 -5
- package/dist/asset-spec.js +12 -5
- package/dist/cli-hints.js +300 -0
- package/dist/cli.js +218 -1357
- package/dist/common.js +147 -50
- package/dist/config.js +224 -13
- package/dist/create-provider-registry.js +1 -1
- package/dist/curate.js +258 -0
- package/dist/{local-search.js → db-search.js} +30 -19
- package/dist/db.js +168 -62
- package/dist/embedder.js +49 -273
- package/dist/embedders/cache.js +47 -0
- package/dist/embedders/local.js +152 -0
- package/dist/embedders/remote.js +121 -0
- package/dist/embedders/types.js +39 -0
- package/dist/errors.js +14 -3
- package/dist/frontmatter.js +61 -7
- package/dist/indexer.js +38 -7
- package/dist/info.js +2 -2
- package/dist/install-audit.js +16 -1
- package/dist/{installed-kits.js → installed-stashes.js} +48 -22
- package/dist/llm-client.js +92 -0
- package/dist/llm.js +14 -126
- package/dist/lockfile.js +28 -1
- package/dist/matchers.js +1 -1
- package/dist/metadata-enhance.js +53 -0
- package/dist/migration-help.js +75 -44
- package/dist/output-context.js +77 -0
- package/dist/output-shapes.js +198 -0
- package/dist/output-text.js +520 -0
- package/dist/paths.js +4 -4
- package/dist/providers/index.js +11 -0
- package/dist/providers/skills-sh.js +1 -1
- package/dist/providers/static-index.js +47 -45
- package/dist/registry-build-index.js +36 -29
- package/dist/registry-factory.js +2 -2
- package/dist/registry-resolve.js +8 -4
- package/dist/registry-search.js +62 -5
- package/dist/remember.js +172 -0
- package/dist/renderers.js +52 -0
- package/dist/search-source.js +73 -42
- package/dist/setup-steps.js +45 -0
- package/dist/setup.js +149 -76
- package/dist/stash-add.js +94 -38
- package/dist/stash-clone.js +4 -4
- package/dist/stash-provider-factory.js +2 -2
- package/dist/stash-provider.js +3 -1
- package/dist/stash-providers/filesystem.js +31 -1
- package/dist/stash-providers/git.js +209 -8
- package/dist/stash-providers/index.js +1 -0
- package/dist/stash-providers/npm.js +159 -0
- package/dist/stash-providers/provider-utils.js +162 -0
- package/dist/stash-providers/sync-from-ref.js +45 -0
- package/dist/stash-providers/tar-utils.js +151 -0
- package/dist/stash-providers/website.js +80 -4
- package/dist/stash-resolve.js +5 -5
- package/dist/stash-search.js +4 -4
- package/dist/stash-show.js +3 -3
- package/dist/wiki.js +6 -6
- package/dist/workflow-authoring.js +12 -4
- package/dist/workflow-markdown.js +9 -0
- package/dist/workflow-runs.js +12 -2
- package/docs/README.md +30 -0
- package/docs/migration/release-notes/0.0.13.md +4 -0
- package/docs/migration/release-notes/0.1.0.md +6 -0
- package/docs/migration/release-notes/0.2.0.md +6 -0
- package/docs/migration/release-notes/0.3.0.md +5 -0
- package/docs/migration/release-notes/0.5.0.md +6 -0
- package/docs/migration/release-notes/0.6.0.md +29 -0
- package/docs/migration/release-notes/README.md +21 -0
- package/package.json +3 -2
- package/dist/registry-install.js +0 -532
- package/dist/{kit-include.js → stash-include.js} +0 -0
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Npm-source stash provider.
|
|
3
|
+
*
|
|
4
|
+
* `sync()` resolves the npm package tarball, downloads it, verifies its
|
|
5
|
+
* integrity, extracts it securely (via `extractTarGzSecure`), detects the
|
|
6
|
+
* stash root inside the package, and applies any nested `.akm-include`
|
|
7
|
+
* configuration. Cache hits short-circuit the fetch.
|
|
8
|
+
*
|
|
9
|
+
* Audit is intentionally NOT performed here — `akmAdd` calls
|
|
10
|
+
* `auditInstallCandidate` after `sync()` so the policy decision lives at
|
|
11
|
+
* the orchestrator layer where the `--trust` flag is known.
|
|
12
|
+
*/
|
|
13
|
+
import fs from "node:fs";
|
|
14
|
+
import path from "node:path";
|
|
15
|
+
import { ConfigError, UsageError } from "../errors";
|
|
16
|
+
import { getRegistryCacheDir } from "../paths";
|
|
17
|
+
import { parseRegistryRef, resolveRegistryArtifact } from "../registry-resolve";
|
|
18
|
+
import { registerStashProvider } from "../stash-provider-factory";
|
|
19
|
+
import { applyAkmIncludeConfig, buildInstallCacheDir, computeFileHash, detectStashRoot, downloadArchive, isDirectory, } from "./provider-utils";
|
|
20
|
+
import { extractTarGzSecure, verifyArchiveIntegrity } from "./tar-utils";
|
|
21
|
+
class NpmStashProvider {
    type = "npm";
    kind = "syncable";
    name;
    /** Display name falls back from config.name to config.url to "npm". */
    constructor(config) {
        this.name = config.name ?? config.url ?? "npm";
    }
    /** Content is indexed through the standard FTS5 pipeline. */
    async search(_options) {
        return { hits: [] };
    }
    /** Content is local files, shown via showLocal. */
    async show(_ref, _view) {
        throw new Error("NPM provider content is shown via local index");
    }
    canShow(_ref) {
        return false;
    }
    /** Delegates to the module-level sync helpers so the logic is reusable. */
    async sync(config, options) {
        return syncNpmRef(npmRefFromConfig(config), options);
    }
    getContentDir(config) {
        if (!config.path) {
            throw new ConfigError("npm stash entry missing resolved content path");
        }
        return config.path;
    }
    /** Best-effort removal of the versioned cache directory for this entry. */
    async remove(config) {
        const contentPath = config.path;
        if (!contentPath || !isDirectory(contentPath)) {
            return;
        }
        // Remove the whole versioned cache dir if we know the parent layout.
        try {
            fs.rmSync(path.dirname(contentPath), { recursive: true, force: true });
        }
        catch {
            /* best-effort */
        }
    }
}
|
|
61
|
+
// Register under type "npm" so the stash-provider factory can construct this
// provider from persisted config entries.
registerStashProvider("npm", (config) => new NpmStashProvider(config));
|
|
62
|
+
/**
 * Derive the `npm:<pkg>[@version]` ref for a stash config entry.
 * Prefers the explicit ref-bearing field (set by akmAdd when persisting),
 * then falls back to url / options.package / name so a hand-rolled config
 * still works. Throws UsageError when no usable string is present.
 */
function npmRefFromConfig(config) {
    const raw = config.options?.ref ?? config.url ?? config.options?.package ?? config.name;
    if (typeof raw !== "string" || !raw) {
        throw new UsageError('npm stash entry must include an `options.ref` (e.g. "npm:my-pkg@1.2.3")');
    }
    if (raw.startsWith("npm:")) {
        return raw;
    }
    return `npm:${raw}`;
}
|
|
71
|
+
/**
 * Fetch and extract an npm tarball, returning a populated `StashLockData`.
 *
 * Mirrors the historical `installRegistryRef()` path for npm sources:
 * resolve artifact URL + integrity from the npm registry, reuse a cached
 * extraction when present, otherwise download, verify, extract securely,
 * detect the stash root, and honour `.akm-include` filters.
 *
 * @throws UsageError when `ref` does not parse to an npm source.
 */
export async function syncNpmRef(ref, options) {
    const parsed = parseRegistryRef(ref);
    if (parsed.source === "npm") {
        return doSyncNpm(parsed, options);
    }
    throw new UsageError(`syncNpmRef requires an npm: ref, got "${ref}"`);
}
|
|
87
|
+
/**
 * Core npm sync: resolve the artifact, reuse the cache when valid, otherwise
 * download → verify → extract → detect stash root → apply include filters.
 * On any failure the partially-populated cache dir is removed so stale
 * extractions cannot cause false cache hits later.
 */
async function doSyncNpm(parsed, options) {
    const resolved = await resolveRegistryArtifact(parsed);
    const syncedAt = (options?.now ?? new Date()).toISOString();
    const cacheRootDir = options?.cacheRootDir ?? getRegistryCacheDir();
    const cacheDir = buildInstallCacheDir(cacheRootDir, resolved.source, resolved.id, resolved.resolvedVersion ?? resolved.resolvedRevision);
    const archivePath = path.join(cacheDir, "artifact.tar.gz");
    const extractedDir = path.join(cacheDir, "extracted");
    // Both the cache-hit and fresh-sync paths return the same lock shape.
    const toLockData = (contentDir, integrity) => ({
        id: resolved.id,
        source: resolved.source,
        ref: resolved.ref,
        artifactUrl: resolved.artifactUrl,
        resolvedVersion: resolved.resolvedVersion,
        resolvedRevision: resolved.resolvedRevision,
        contentDir,
        cacheDir,
        extractedDir,
        integrity,
        writable: options?.writable,
        syncedAt,
    });
    // Cache hit: a previously extracted tree with a detectable stash root is reused.
    if (!options?.force && isDirectory(extractedDir)) {
        try {
            const cachedStashRoot = detectStashRoot(extractedDir);
            if (cachedStashRoot) {
                const integrity = fs.existsSync(archivePath) ? await computeFileHash(archivePath) : undefined;
                return toLockData(cachedStashRoot, integrity);
            }
        }
        catch {
            // Cache invalid, re-download
        }
    }
    fs.mkdirSync(cacheDir, { recursive: true });
    try {
        await downloadArchive(resolved.artifactUrl, archivePath);
        verifyArchiveIntegrity(archivePath, resolved.resolvedRevision, resolved.source);
        const integrity = await computeFileHash(archivePath);
        extractTarGzSecure(archivePath, extractedDir);
        const provisionalRoot = detectStashRoot(extractedDir);
        const installRoot = applyAkmIncludeConfig(provisionalRoot, cacheDir, extractedDir) ?? provisionalRoot;
        return toLockData(detectStashRoot(installRoot), integrity);
    }
    catch (err) {
        // Clean up so stale or partial extractions don't cause false cache hits.
        try {
            fs.rmSync(cacheDir, { recursive: true, force: true });
        }
        catch {
            /* best-effort */
        }
        throw err;
    }
}
|
|
159
|
+
// Exported for direct construction in tests and by the provider registry.
export { NpmStashProvider };
|
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { TYPE_DIRS } from "../asset-spec";
|
|
5
|
+
import { fetchWithRetry } from "../common";
|
|
6
|
+
import { copyIncludedPaths, findNearestIncludeConfig } from "../stash-include";
|
|
7
|
+
const REGISTRY_STASH_DIR_NAMES = new Set(Object.values(TYPE_DIRS));
|
|
1
8
|
/** Strip terminal control characters from untrusted strings. */
|
|
2
9
|
export function sanitizeString(value, maxLength = 255) {
|
|
3
10
|
if (typeof value !== "string")
|
|
@@ -9,3 +16,158 @@ export function sanitizeString(value, maxLength = 255) {
|
|
|
9
16
|
/** True when `ttlMs` has elapsed since the given mtime (epoch millis). */
export function isExpired(mtimeMs, ttlMs) {
    const ageMs = Date.now() - mtimeMs;
    return ageMs > ttlMs;
}
|
|
19
|
+
/**
 * Find the directory inside `extractedDir` that should be treated as the
 * stash root: a `.stash` marker or well-known type dirs at the top level win;
 * otherwise a BFS locates the shallowest candidate; failing that, the
 * resolved root itself is returned.
 */
export function detectStashRoot(extractedDir) {
    const root = path.resolve(extractedDir);
    if (isDirectory(path.join(root, ".stash")) || hasStashDirs(root)) {
        return root;
    }
    return findShallowestStashRoot(root) ?? root;
}
|
|
38
|
+
/**
 * Build a per-source cache directory under `cacheRootDir`.
 *
 * Versioned sources get `${source}-${id}/${version}` so repeat installs can
 * reuse the cache; `local` sources (and sources with no version) get a unique
 * timestamped slug so each install is isolated.
 */
export function buildInstallCacheDir(cacheRootDir, source, id, version) {
    const cleanedId = id.replace(/[^a-zA-Z0-9_.-]+/g, "-").replace(/^-+|-+$/g, "");
    const slug = `${source}-${cleanedId}`;
    const uniqueSlug = () => `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
    let versionSlug;
    if (source === "local") {
        versionSlug = uniqueSlug();
    }
    else if (version != null) {
        versionSlug = version.replace(/[^a-zA-Z0-9_.-]+/g, "-");
    }
    else {
        versionSlug = uniqueSlug();
    }
    return path.join(cacheRootDir, slug || source, versionSlug);
}
|
|
51
|
+
/**
 * Apply an `.akm-include` config (if any) by copying the selected paths into
 * a fresh sibling `selected/` directory and returning that path. Returns
 * undefined when no include config is found between `sourceRoot` and
 * `searchRoot`.
 */
export function applyAkmIncludeConfig(sourceRoot, cacheDir, searchRoot = sourceRoot) {
    const includeConfig = findNearestIncludeConfig(sourceRoot, searchRoot);
    if (!includeConfig) {
        return undefined;
    }
    const selectedDir = path.join(cacheDir, "selected");
    // Rebuild from scratch so a previous selection never leaks stale files.
    fs.rmSync(selectedDir, { recursive: true, force: true });
    fs.mkdirSync(selectedDir, { recursive: true });
    copyIncludedPaths(includeConfig.include, includeConfig.baseDir, selectedDir);
    return selectedDir;
}
|
|
66
|
+
/**
 * Stream a remote archive to disk.
 *
 * Prefers Bun.write (handles Response streaming natively). On plain Node it
 * streams `response.body` through a write stream so large tarballs are never
 * buffered fully in memory — the previous fallback called
 * `response.arrayBuffer()`, which held the whole archive in RAM. A final
 * arrayBuffer fallback remains for mocked responses without a body stream
 * (e.g. tests).
 *
 * @throws Error when the HTTP response is not ok.
 */
export async function downloadArchive(url, destination) {
    const response = await fetchWithRetry(url, undefined, { timeout: 120_000 });
    if (!response.ok) {
        throw new Error(`Failed to download archive (${response.status}) from ${url}`);
    }
    const BunRuntime = globalThis.Bun;
    if (BunRuntime?.write) {
        await BunRuntime.write(destination, response);
        return;
    }
    if (response.body) {
        // Node 18+: adapt the web ReadableStream and pipe it to disk.
        const { Readable } = await import("node:stream");
        const { pipeline } = await import("node:stream/promises");
        await pipeline(Readable.fromWeb(response.body), fs.createWriteStream(destination));
        return;
    }
    // Last resort for responses without a body stream: buffer in memory.
    const arrayBuffer = await response.arrayBuffer();
    fs.writeFileSync(destination, Buffer.from(arrayBuffer));
}
|
|
85
|
+
/**
 * SHA-256 of a file, returned as `sha256:<hex>`.
 *
 * Streams the file through the hash instead of `readFileSync` so large
 * tarballs are not loaded into memory in one piece.
 */
export async function computeFileHash(filePath) {
    const hash = createHash("sha256");
    for await (const chunk of fs.createReadStream(filePath)) {
        hash.update(chunk);
    }
    return `sha256:${hash.digest("hex")}`;
}
|
|
91
|
+
/**
 * Recursively copy directory contents, excluding `.git` at every depth.
 *
 * The previous implementation skipped `.git` only at the top level; nested
 * `.git` directories were still copied by the recursive `cpSync`, which
 * contradicted the documented contract. The `filter` option excludes them
 * throughout the tree.
 */
export function copyDirectoryContents(sourceDir, destinationDir) {
    for (const entry of fs.readdirSync(sourceDir, { withFileTypes: true })) {
        if (entry.name === ".git")
            continue;
        const src = path.join(sourceDir, entry.name);
        const dest = path.join(destinationDir, entry.name);
        fs.mkdirSync(path.dirname(dest), { recursive: true });
        if (entry.isDirectory()) {
            // filter returning false for a directory skips the whole subtree.
            fs.cpSync(src, dest, {
                recursive: true,
                force: true,
                filter: (from) => path.basename(from) !== ".git",
            });
        }
        else {
            fs.copyFileSync(src, dest);
        }
    }
}
|
|
107
|
+
/** True when `target` exists and is a directory; false on any stat failure. */
export function isDirectory(target) {
    try {
        const stats = fs.statSync(target);
        return stats.isDirectory();
    }
    catch {
        return false;
    }
}
|
|
115
|
+
/** True when `dirPath` directly contains at least one well-known type dir. */
function hasStashDirs(dirPath) {
    if (!isDirectory(dirPath)) {
        return false;
    }
    for (const child of fs.readdirSync(dirPath, { withFileTypes: true })) {
        if (child.isDirectory() && REGISTRY_STASH_DIR_NAMES.has(child.name)) {
            return true;
        }
    }
    return false;
}
|
|
121
|
+
/** Number of well-known type dirs directly inside `dirPath` (0 if not a dir). */
function countStashDirs(dirPath) {
    if (!isDirectory(dirPath)) {
        return 0;
    }
    let total = 0;
    for (const child of fs.readdirSync(dirPath, { withFileTypes: true })) {
        if (child.isDirectory() && REGISTRY_STASH_DIR_NAMES.has(child.name)) {
            total += 1;
        }
    }
    return total;
}
|
|
127
|
+
/**
 * BFS to find the shallowest directory that looks like a stash root.
 *
 * A `.stash` directory is a strong marker; otherwise a candidate must contain
 * 2+ well-known type directories (scripts/, skills/, …) because a single
 * "scripts/" is too common to be meaningful. `root` itself is never a
 * candidate — the caller already checked it via `hasStashDirs`. Traversal is
 * capped at BFS_MAX_DEPTH and skips `.git` / `node_modules`.
 */
const BFS_MAX_DEPTH = 5;
function findShallowestStashRoot(root) {
    // Index-pointer BFS: same visit order as a shift()-based queue without
    // repeatedly re-indexing the array.
    const queue = [{ dir: root, depth: 0 }];
    for (let head = 0; head < queue.length; head += 1) {
        const { dir: current, depth } = queue[head];
        if (current !== root) {
            if (isDirectory(path.join(current, ".stash"))) {
                return current;
            }
            if (countStashDirs(current) >= 2) {
                return current;
            }
        }
        if (depth >= BFS_MAX_DEPTH) {
            continue;
        }
        let children;
        try {
            children = fs.readdirSync(current, { withFileTypes: true });
        }
        catch {
            continue;
        }
        for (const child of children) {
            if (!child.isDirectory()) {
                continue;
            }
            if (child.name === ".git" || child.name === "node_modules") {
                continue;
            }
            queue.push({ dir: path.join(current, child.name), depth: depth + 1 });
        }
    }
    return undefined;
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unified install-ref dispatcher.
|
|
3
|
+
*
|
|
4
|
+
* Replaces the historical `installRegistryRef()` entry point. Given an
|
|
5
|
+
* unparsed install ref, this resolves the right syncable provider and
|
|
6
|
+
* invokes its `sync()` method.
|
|
7
|
+
*
|
|
8
|
+
* Audit is intentionally NOT performed here; callers (`akmAdd`,
|
|
9
|
+
* `akmUpdate`) decide whether to run `auditInstallCandidate` on the
|
|
10
|
+
* synced `contentDir` because they own the `--trust` flag.
|
|
11
|
+
*/
|
|
12
|
+
import { UsageError } from "../errors";
|
|
13
|
+
import { parseRegistryRef } from "../registry-resolve";
|
|
14
|
+
import { detectStashRoot } from "./provider-utils";
|
|
15
|
+
/**
 * Dispatch an unparsed install ref to the matching syncable provider's
 * `sync()` implementation. Audit is intentionally NOT performed here — the
 * callers own the `--trust` flag.
 *
 * @throws UsageError for an unrecognised source.
 */
export async function syncFromRef(ref, options) {
    const parsed = parseRegistryRef(ref);
    switch (parsed.source) {
        case "local":
            return syncLocalRef(parsed, options);
        case "npm": {
            const { syncNpmRef } = await import("./npm");
            return syncNpmRef(ref, options);
        }
        case "git":
        case "github": {
            const { syncRegistryGitRef } = await import("./git");
            return syncRegistryGitRef(ref, options);
        }
        default:
            // Exhaustiveness — `parseRegistryRef` only emits the four sources above.
            throw new UsageError(`No syncable provider for ref: ${ref} (source=${parsed.source})`);
    }
}
|
|
31
|
+
/**
 * "Sync" for a local path ref: nothing is fetched — the source path doubles
 * as cache and extraction dir, and the stash root is detected in place.
 */
function syncLocalRef(parsed, options) {
    const sourcePath = parsed.sourcePath;
    return {
        id: parsed.id,
        source: "local",
        ref: parsed.ref,
        artifactUrl: sourcePath,
        contentDir: detectStashRoot(sourcePath),
        cacheDir: sourcePath,
        extractedDir: sourcePath,
        writable: options?.writable,
        syncedAt: (options?.now ?? new Date()).toISOString(),
    };
}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tar archive extraction and integrity verification utilities.
|
|
3
|
+
*
|
|
4
|
+
* These helpers are security-critical: they validate archive entries to
|
|
5
|
+
* prevent path traversal, run a post-extraction scan for symlink escapes,
|
|
6
|
+
* and verify integrity hashes (SRI or hex shasum) before extraction.
|
|
7
|
+
*
|
|
8
|
+
* Extracted from `registry-install.ts` and shared by all syncable
|
|
9
|
+
* providers that fetch tarballs (currently `NpmStashProvider` and the
|
|
10
|
+
* registry index builder).
|
|
11
|
+
*/
|
|
12
|
+
import { spawnSync } from "node:child_process";
|
|
13
|
+
import { createHash } from "node:crypto";
|
|
14
|
+
import fs from "node:fs";
|
|
15
|
+
import path from "node:path";
|
|
16
|
+
import { isWithin } from "../common";
|
|
17
|
+
import { warn } from "../warn";
|
|
18
|
+
/**
 * Verify an archive's integrity against a known hash. Throws and removes
 * the archive when verification fails.
 *
 * Supports SRI hashes (sha256-/sha384-/sha512-) and hex SHA-1 from npm.
 * sha384 is a valid SRI algorithm and previously fell through to the
 * warn-and-skip path, silently weakening verification. Skips verification
 * for git/github sources (revisions are commit SHAs, not content hashes).
 *
 * @throws Error when the digest does not match (the archive is unlinked first).
 */
export function verifyArchiveIntegrity(archivePath, expected, source) {
    if (!expected)
        return;
    // For GitHub and git sources, resolvedRevision is a commit SHA, not a content hash.
    // Content integrity cannot be verified from a commit hash, so skip verification.
    if (source === "github" || source === "git")
        return;
    const fileBuffer = fs.readFileSync(archivePath);
    // SRI hash format: <algorithm>-<base64>
    const sriMatch = /^(sha256|sha384|sha512)-(.+)$/.exec(expected);
    if (sriMatch) {
        const [, algorithm, expectedBase64] = sriMatch;
        const actualBase64 = createHash(algorithm).update(fileBuffer).digest("base64");
        if (actualBase64 !== expectedBase64) {
            fs.unlinkSync(archivePath);
            throw new Error(`Integrity check failed for ${archivePath}: expected ${algorithm} digest ${expectedBase64}, got ${actualBase64}`);
        }
        return;
    }
    // Hex shasum (SHA-1 from npm)
    if (/^[0-9a-f]{40}$/i.test(expected)) {
        const actualHex = createHash("sha1").update(fileBuffer).digest("hex");
        if (actualHex.toLowerCase() !== expected.toLowerCase()) {
            fs.unlinkSync(archivePath);
            throw new Error(`Integrity check failed for ${archivePath}: expected sha1 ${expected}, got ${actualHex}`);
        }
        return;
    }
    // Unrecognized format — warn and skip verification
    warn("Unrecognized integrity format: %s — verification skipped", expected);
}
|
|
58
|
+
/**
 * Extract a tar.gz archive into `destinationDir`, validating the listing
 * first (no absolute paths, no `..` traversal, no NUL bytes), invoking tar
 * with `--no-same-owner --strip-components=1`, and finally scanning the
 * extracted tree for symlinks that would escape the destination.
 */
export function extractTarGzSecure(archivePath, destinationDir) {
    const describeFailure = (result) => result.stderr?.trim() || result.error?.message || "unknown error";
    const listed = spawnSync("tar", ["tzf", archivePath], { encoding: "utf8" });
    if (listed.status !== 0) {
        throw new Error(`Failed to inspect archive ${archivePath}: ${describeFailure(listed)}`);
    }
    validateTarEntries(listed.stdout);
    // Start from an empty destination so stale files never survive re-extraction.
    fs.rmSync(destinationDir, { recursive: true, force: true });
    fs.mkdirSync(destinationDir, { recursive: true });
    const extracted = spawnSync("tar", ["xzf", archivePath, "--no-same-owner", "--strip-components=1", "-C", destinationDir], { encoding: "utf8" });
    if (extracted.status !== 0) {
        throw new Error(`Failed to extract archive ${archivePath}: ${describeFailure(extracted)}`);
    }
    // Post-extraction scan: verify all extracted files are within destinationDir.
    // This mitigates TOCTOU between the listing validation and the extraction.
    scanExtractedFiles(destinationDir, destinationDir);
}
|
|
82
|
+
/**
 * Recursively verify that every extracted entry stays inside `root`.
 *
 * Symlinks are resolved and rejected when their target escapes the
 * destination. `fs.realpathSync` throws ENOENT for dangling symlinks (which
 * npm tarballs can legitimately contain), which previously aborted extraction
 * with an opaque error; we now fall back to resolving the literal link text
 * against the link's directory and apply the same containment check.
 *
 * @throws Error on any entry or symlink target that escapes `root`.
 */
function scanExtractedFiles(dir, root) {
    let entries;
    try {
        entries = fs.readdirSync(dir, { withFileTypes: true });
    }
    catch {
        return;
    }
    for (const entry of entries) {
        const fullPath = path.join(dir, entry.name);
        // Reject only entries whose name is exactly the parent-traversal segment
        // (or `.`). Substring matches (`foo..bar`) are legitimate filenames.
        if (entry.name === ".." || entry.name === ".") {
            throw new Error(`Post-extraction scan: suspicious entry name: ${fullPath}`);
        }
        // Symlinks: resolve and confirm the target stays inside the destination.
        if (entry.isSymbolicLink()) {
            const target = resolveSymlinkTarget(fullPath);
            if (!isWithin(target, root)) {
                throw new Error(`Post-extraction scan: symlink escapes destination directory: ${fullPath} -> ${target}`);
            }
        }
        // Belt-and-suspenders: any regular entry whose resolved path lands outside
        // the destination root is rejected, regardless of how its name looks.
        if (!entry.isSymbolicLink() && !isWithin(fullPath, root)) {
            throw new Error(`Post-extraction scan: entry escapes destination directory: ${fullPath}`);
        }
        if (entry.isDirectory()) {
            scanExtractedFiles(fullPath, root);
        }
    }
}
/**
 * Resolve a symlink's effective target: realpath when the target exists,
 * otherwise (dangling link) the literal link text resolved against the
 * link's own directory so the containment check can still run.
 */
function resolveSymlinkTarget(linkPath) {
    try {
        return fs.realpathSync(linkPath);
    }
    catch {
        return path.resolve(path.dirname(linkPath), fs.readlinkSync(linkPath));
    }
}
|
|
117
|
+
/**
 * Validate the line-oriented `tar tzf` listing for unsafe entries.
 *
 * Rejects: whitespace-only or NUL-containing entries, absolute paths,
 * parent traversal (`..` / `../`), and any entry that would still escape
 * after `--strip-components=1`.
 *
 * @throws Error describing the first offending entry.
 */
export function validateTarEntries(listOutput) {
    for (const rawLine of listOutput.split(/\r?\n/)) {
        if (!rawLine) {
            continue; // blank separator lines are harmless
        }
        const entry = rawLine.trim();
        if (!entry || entry.includes("\0")) {
            throw new Error(`Archive contains an invalid entry: ${JSON.stringify(rawLine)}`);
        }
        if (entry.startsWith("/")) {
            throw new Error(`Archive contains an absolute path entry: ${entry}`);
        }
        const normalized = path.posix.normalize(entry);
        if (normalized === ".." || normalized.startsWith("../")) {
            throw new Error(`Archive contains a path traversal entry: ${entry}`);
        }
        // Simulate --strip-components=1: drop the first path component.
        const stripped = normalized.split("/").filter(Boolean).slice(1).join("/");
        if (!stripped) {
            continue; // whole entry is consumed by the strip
        }
        const strippedNorm = path.posix.normalize(stripped);
        if (strippedNorm === ".." ||
            strippedNorm.startsWith("../") ||
            path.posix.isAbsolute(strippedNorm)) {
            throw new Error(`Archive contains an unsafe entry after strip-components: ${entry}`);
        }
    }
}
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import { createHash } from "node:crypto";
|
|
2
2
|
import fs from "node:fs";
|
|
3
3
|
import path from "node:path";
|
|
4
|
-
import { fetchWithRetry } from "../common";
|
|
4
|
+
import { fetchWithRetry, ResponseTooLargeError, readBodyWithByteCap } from "../common";
|
|
5
5
|
import { ConfigError, UsageError } from "../errors";
|
|
6
6
|
import { getRegistryIndexCacheDir } from "../paths";
|
|
7
7
|
import { registerStashProvider } from "../stash-provider-factory";
|
|
8
|
-
import { isExpired, sanitizeString } from "./provider-utils";
|
|
8
|
+
import { isDirectory, isExpired, sanitizeString } from "./provider-utils";
|
|
9
9
|
/** Refresh website snapshots every 12 hours to balance freshness with scraping load. */
|
|
10
10
|
const CACHE_REFRESH_INTERVAL_MS = 12 * 60 * 60 * 1000;
|
|
11
11
|
/** Allow up to 7 days of stale snapshots when refresh fails so search remains available during outages. */
|
|
@@ -14,10 +14,31 @@ const CACHE_STALE_MS = 7 * 24 * 60 * 60 * 1000;
|
|
|
14
14
|
const QUEUE_EXPANSION_FACTOR = 5;
|
|
15
15
|
const MAX_PAGES_DEFAULT = 50;
|
|
16
16
|
const MAX_DEPTH_DEFAULT = 3;
|
|
17
|
+
/**
|
|
18
|
+
* Per-page body cap for website scraping. HTML pages this large are
|
|
19
|
+
* almost never useful as agent knowledge sources and a runaway server
|
|
20
|
+
* streaming tens of megabytes would blow memory with no upside.
|
|
21
|
+
*/
|
|
22
|
+
const WEBSITE_PAGE_BYTE_CAP = 5 * 1024 * 1024;
|
|
23
|
+
/**
|
|
24
|
+
* Wall-clock cap for a full crawl (10 minutes). With per-request timeouts
|
|
25
|
+
* of 15s and a `maxPages` default of 50, an unresponsive site could
|
|
26
|
+
* otherwise stall `akm add` for 12.5 minutes with no feedback. Cap the
|
|
27
|
+
* whole crawl and return what we have when time runs out.
|
|
28
|
+
*/
|
|
29
|
+
const WEBSITE_CRAWL_WALL_CLOCK_MS = 10 * 60 * 1000;
|
|
30
|
+
/**
|
|
31
|
+
* Website stash provider. Implements {@link SyncableStashProvider} (which
|
|
32
|
+
* extends LiveStashProvider) — scrapes pages into a local mirror so the FTS5
|
|
33
|
+
* indexer can walk them.
|
|
34
|
+
*/
|
|
17
35
|
class WebsiteStashProvider {
|
|
18
36
|
type = "website";
|
|
37
|
+
kind = "syncable";
|
|
19
38
|
name;
|
|
39
|
+
config;
|
|
20
40
|
constructor(config) {
|
|
41
|
+
this.config = config;
|
|
21
42
|
this.name = config.name ?? "website";
|
|
22
43
|
validateWebsiteUrl(config.url ?? "");
|
|
23
44
|
}
|
|
@@ -33,6 +54,40 @@ class WebsiteStashProvider {
|
|
|
33
54
|
canShow(_ref) {
|
|
34
55
|
return false;
|
|
35
56
|
}
|
|
57
|
+
async sync(config, options) {
|
|
58
|
+
const cachePaths = await ensureWebsiteMirror(config, { requireStashDir: true, force: options?.force });
|
|
59
|
+
const syncedAt = (options?.now ?? new Date()).toISOString();
|
|
60
|
+
const url = config.url ?? "";
|
|
61
|
+
// #123 added "website" to the StashSource union, so we can use it directly.
|
|
62
|
+
return {
|
|
63
|
+
id: url,
|
|
64
|
+
source: "website",
|
|
65
|
+
ref: url,
|
|
66
|
+
artifactUrl: url,
|
|
67
|
+
contentDir: cachePaths.stashDir,
|
|
68
|
+
cacheDir: cachePaths.rootDir,
|
|
69
|
+
extractedDir: cachePaths.stashDir,
|
|
70
|
+
syncedAt,
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
getContentDir(config) {
|
|
74
|
+
const url = config.url ?? "";
|
|
75
|
+
return getCachePaths(url).stashDir;
|
|
76
|
+
}
|
|
77
|
+
async remove(config) {
|
|
78
|
+
const url = config.url;
|
|
79
|
+
if (!url)
|
|
80
|
+
return;
|
|
81
|
+
const paths = getCachePaths(url);
|
|
82
|
+
if (isDirectory(paths.rootDir)) {
|
|
83
|
+
try {
|
|
84
|
+
fs.rmSync(paths.rootDir, { recursive: true, force: true });
|
|
85
|
+
}
|
|
86
|
+
catch {
|
|
87
|
+
/* best-effort */
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
|
36
91
|
}
|
|
37
92
|
registerStashProvider("website", (config) => new WebsiteStashProvider(config));
|
|
38
93
|
function getCachePaths(siteUrl) {
|
|
@@ -49,6 +104,7 @@ async function ensureWebsiteMirror(config, options) {
|
|
|
49
104
|
const normalizedUrl = validateWebsiteUrl(rawUrl);
|
|
50
105
|
const cachePaths = getCachePaths(normalizedUrl);
|
|
51
106
|
const requireStashDir = options?.requireStashDir === true;
|
|
107
|
+
const force = options?.force === true;
|
|
52
108
|
let mtime = 0;
|
|
53
109
|
try {
|
|
54
110
|
mtime = fs.statSync(cachePaths.manifestPath).mtimeMs;
|
|
@@ -56,7 +112,8 @@ async function ensureWebsiteMirror(config, options) {
|
|
|
56
112
|
catch {
|
|
57
113
|
/* no cached manifest */
|
|
58
114
|
}
|
|
59
|
-
if (
|
|
115
|
+
if (!force &&
|
|
116
|
+
mtime &&
|
|
60
117
|
!isExpired(mtime, CACHE_REFRESH_INTERVAL_MS) &&
|
|
61
118
|
(!requireStashDir || hasExtractedSite(cachePaths.stashDir))) {
|
|
62
119
|
return cachePaths;
|
|
@@ -124,7 +181,13 @@ async function crawlWebsite(startUrl, options) {
|
|
|
124
181
|
const queue = [{ url: start.toString(), depth: 0 }];
|
|
125
182
|
const visited = new Set();
|
|
126
183
|
const pages = [];
|
|
184
|
+
const deadline = Date.now() + WEBSITE_CRAWL_WALL_CLOCK_MS;
|
|
185
|
+
let stoppedAtDeadline = false;
|
|
127
186
|
while (queue.length > 0 && pages.length < options.maxPages) {
|
|
187
|
+
if (Date.now() > deadline) {
|
|
188
|
+
stoppedAtDeadline = true;
|
|
189
|
+
break;
|
|
190
|
+
}
|
|
128
191
|
const next = queue.shift();
|
|
129
192
|
if (!next)
|
|
130
193
|
break;
|
|
@@ -149,6 +212,9 @@ async function crawlWebsite(startUrl, options) {
|
|
|
149
212
|
queue.push({ url: candidate, depth: next.depth + 1 });
|
|
150
213
|
}
|
|
151
214
|
}
|
|
215
|
+
if (stoppedAtDeadline) {
|
|
216
|
+
console.warn(`[akm] website crawl stopped at the ${WEBSITE_CRAWL_WALL_CLOCK_MS / 1000}s wall-clock cap with ${pages.length}/${options.maxPages} pages collected from ${startUrl}.`);
|
|
217
|
+
}
|
|
152
218
|
return pages;
|
|
153
219
|
}
|
|
154
220
|
async function fetchWebsitePage(pageUrl) {
|
|
@@ -164,7 +230,17 @@ async function fetchWebsitePage(pageUrl) {
|
|
|
164
230
|
throw new Error(`Failed to fetch website content (${response.status}) from ${pageUrl}`);
|
|
165
231
|
}
|
|
166
232
|
const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";
|
|
167
|
-
|
|
233
|
+
let body;
|
|
234
|
+
try {
|
|
235
|
+
body = await readBodyWithByteCap(response, WEBSITE_PAGE_BYTE_CAP);
|
|
236
|
+
}
|
|
237
|
+
catch (err) {
|
|
238
|
+
if (err instanceof ResponseTooLargeError) {
|
|
239
|
+
// Skip oversized pages rather than aborting the whole crawl.
|
|
240
|
+
return null;
|
|
241
|
+
}
|
|
242
|
+
throw err;
|
|
243
|
+
}
|
|
168
244
|
const finalUrl = normalizeCrawlUrl(response.url || pageUrl) ?? pageUrl;
|
|
169
245
|
if (contentType.includes("text/html") || contentType.includes("application/xhtml+xml") || looksLikeMarkup(body)) {
|
|
170
246
|
const title = extractHtmlTitle(body) || new URL(finalUrl).hostname;
|