akm-cli 0.5.0 → 0.6.0-rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/CHANGELOG.md +32 -5
  2. package/dist/asset-registry.js +29 -5
  3. package/dist/asset-spec.js +12 -5
  4. package/dist/cli-hints.js +300 -0
  5. package/dist/cli.js +218 -1357
  6. package/dist/common.js +147 -50
  7. package/dist/config.js +224 -13
  8. package/dist/create-provider-registry.js +1 -1
  9. package/dist/curate.js +258 -0
  10. package/dist/{local-search.js → db-search.js} +30 -19
  11. package/dist/db.js +168 -62
  12. package/dist/embedder.js +49 -273
  13. package/dist/embedders/cache.js +47 -0
  14. package/dist/embedders/local.js +152 -0
  15. package/dist/embedders/remote.js +121 -0
  16. package/dist/embedders/types.js +39 -0
  17. package/dist/errors.js +14 -3
  18. package/dist/frontmatter.js +61 -7
  19. package/dist/indexer.js +38 -7
  20. package/dist/info.js +2 -2
  21. package/dist/install-audit.js +16 -1
  22. package/dist/{installed-kits.js → installed-stashes.js} +48 -22
  23. package/dist/llm-client.js +92 -0
  24. package/dist/llm.js +14 -126
  25. package/dist/lockfile.js +28 -1
  26. package/dist/matchers.js +1 -1
  27. package/dist/metadata-enhance.js +53 -0
  28. package/dist/migration-help.js +75 -44
  29. package/dist/output-context.js +77 -0
  30. package/dist/output-shapes.js +198 -0
  31. package/dist/output-text.js +520 -0
  32. package/dist/paths.js +4 -4
  33. package/dist/providers/index.js +11 -0
  34. package/dist/providers/skills-sh.js +1 -1
  35. package/dist/providers/static-index.js +47 -45
  36. package/dist/registry-build-index.js +36 -29
  37. package/dist/registry-factory.js +2 -2
  38. package/dist/registry-resolve.js +8 -4
  39. package/dist/registry-search.js +62 -5
  40. package/dist/remember.js +172 -0
  41. package/dist/renderers.js +52 -0
  42. package/dist/search-source.js +73 -42
  43. package/dist/setup-steps.js +45 -0
  44. package/dist/setup.js +149 -76
  45. package/dist/stash-add.js +94 -38
  46. package/dist/stash-clone.js +4 -4
  47. package/dist/stash-provider-factory.js +2 -2
  48. package/dist/stash-provider.js +3 -1
  49. package/dist/stash-providers/filesystem.js +31 -1
  50. package/dist/stash-providers/git.js +209 -8
  51. package/dist/stash-providers/index.js +1 -0
  52. package/dist/stash-providers/npm.js +159 -0
  53. package/dist/stash-providers/provider-utils.js +162 -0
  54. package/dist/stash-providers/sync-from-ref.js +45 -0
  55. package/dist/stash-providers/tar-utils.js +151 -0
  56. package/dist/stash-providers/website.js +80 -4
  57. package/dist/stash-resolve.js +5 -5
  58. package/dist/stash-search.js +4 -4
  59. package/dist/stash-show.js +3 -3
  60. package/dist/wiki.js +6 -6
  61. package/dist/workflow-authoring.js +12 -4
  62. package/dist/workflow-markdown.js +9 -0
  63. package/dist/workflow-runs.js +12 -2
  64. package/docs/README.md +30 -0
  65. package/docs/migration/release-notes/0.0.13.md +4 -0
  66. package/docs/migration/release-notes/0.1.0.md +6 -0
  67. package/docs/migration/release-notes/0.2.0.md +6 -0
  68. package/docs/migration/release-notes/0.3.0.md +5 -0
  69. package/docs/migration/release-notes/0.5.0.md +6 -0
  70. package/docs/migration/release-notes/0.6.0.md +29 -0
  71. package/docs/migration/release-notes/README.md +21 -0
  72. package/package.json +3 -2
  73. package/dist/registry-install.js +0 -532
  74. /package/dist/{kit-include.js → stash-include.js} +0 -0
@@ -0,0 +1,159 @@
1
+ /**
2
+ * Npm-source stash provider.
3
+ *
4
+ * `sync()` resolves the npm package tarball, downloads it, verifies its
5
+ * integrity, extracts it securely (via `extractTarGzSecure`), detects the
6
+ * stash root inside the package, and applies any nested `.akm-include`
7
+ * configuration. Cache hits short-circuit the fetch.
8
+ *
9
+ * Audit is intentionally NOT performed here — `akmAdd` calls
10
+ * `auditInstallCandidate` after `sync()` so the policy decision lives at
11
+ * the orchestrator layer where the `--trust` flag is known.
12
+ */
13
+ import fs from "node:fs";
14
+ import path from "node:path";
15
+ import { ConfigError, UsageError } from "../errors";
16
+ import { getRegistryCacheDir } from "../paths";
17
+ import { parseRegistryRef, resolveRegistryArtifact } from "../registry-resolve";
18
+ import { registerStashProvider } from "../stash-provider-factory";
19
+ import { applyAkmIncludeConfig, buildInstallCacheDir, computeFileHash, detectStashRoot, downloadArchive, isDirectory, } from "./provider-utils";
20
+ import { extractTarGzSecure, verifyArchiveIntegrity } from "./tar-utils";
21
/**
 * Stash provider backed by the npm registry.
 *
 * Live search/show are intentionally inert: npm-sourced content is synced
 * to disk and then served through the local FTS5 index.
 */
class NpmStashProvider {
    type = "npm";
    kind = "syncable";
    name;
    constructor(config) {
        this.name = config.name ?? config.url ?? "npm";
    }
    /** No live search — synced files go through the standard FTS5 pipeline. */
    async search(_options) {
        return { hits: [] };
    }
    /** No live show — content lives on disk and is rendered via showLocal. */
    async show(_ref, _view) {
        throw new Error("NPM provider content is shown via local index");
    }
    canShow(_ref) {
        return false;
    }
    /** Resolve the configured ref, then delegate to the shared npm sync path. */
    async sync(config, options) {
        return syncNpmRef(npmRefFromConfig(config), options);
    }
    /** The resolved content path is persisted on the config by akmAdd. */
    getContentDir(config) {
        if (!config.path) {
            throw new ConfigError("npm stash entry missing resolved content path");
        }
        return config.path;
    }
    /** Best-effort removal of the whole versioned cache directory. */
    async remove(config) {
        if (!config.path || !isDirectory(config.path)) {
            return;
        }
        // config.path sits inside the versioned cache dir; remove the parent
        // so the archive and extraction are cleaned up together.
        const versionedCacheDir = path.dirname(config.path);
        try {
            fs.rmSync(versionedCacheDir, { recursive: true, force: true });
        }
        catch {
            /* best-effort */
        }
    }
}
registerStashProvider("npm", (config) => new NpmStashProvider(config));
62
/**
 * Derive the canonical `npm:` ref for a stash entry.
 *
 * Precedence: `options.ref` (persisted by akmAdd) → `url` → `options.package`
 * → `name`, so hand-written configs stay usable without a stored ref.
 */
function npmRefFromConfig(config) {
    const raw = config.options?.ref ?? config.url ?? config.options?.package ?? config.name;
    if (typeof raw !== "string" || raw.length === 0) {
        throw new UsageError('npm stash entry must include an `options.ref` (e.g. "npm:my-pkg@1.2.3")');
    }
    return raw.startsWith("npm:") ? raw : `npm:${raw}`;
}
71
/**
 * Fetch and extract an npm tarball, returning a populated `StashLockData`.
 *
 * Mirrors the historical `installRegistryRef()` path for npm sources:
 * resolve the artifact URL + integrity, reuse the cached extraction when
 * present, otherwise download/verify/extract and detect the stash root,
 * honouring any `.akm-include` filters along the way.
 */
export async function syncNpmRef(ref, options) {
    const parsed = parseRegistryRef(ref);
    if (parsed.source === "npm") {
        return doSyncNpm(parsed, options);
    }
    throw new UsageError(`syncNpmRef requires an npm: ref, got "${ref}"`);
}
87
/**
 * Resolve, download, verify, extract, and describe an npm artifact.
 *
 * @param parsed  parsed `npm:` registry ref (from `parseRegistryRef`)
 * @param options optional `{ force, now, cacheRootDir, writable }`
 * @returns lock data pointing at the extracted content on disk
 */
async function doSyncNpm(parsed, options) {
    const resolved = await resolveRegistryArtifact(parsed);
    // `now` is injectable so callers/tests can pin the timestamp.
    const syncedAt = (options?.now ?? new Date()).toISOString();
    const cacheRootDir = options?.cacheRootDir ?? getRegistryCacheDir();
    // Versioned cache key: the same package+version reuses the same directory.
    const cacheDir = buildInstallCacheDir(cacheRootDir, resolved.source, resolved.id, resolved.resolvedVersion ?? resolved.resolvedRevision);
    const archivePath = path.join(cacheDir, "artifact.tar.gz");
    const extractedDir = path.join(cacheDir, "extracted");
    // Cache hit: extracted dir already valid → reuse it without re-downloading.
    if (!options?.force && isDirectory(extractedDir)) {
        try {
            const cachedStashRoot = detectStashRoot(extractedDir);
            if (cachedStashRoot) {
                // Recompute integrity from the cached tarball when it is still present.
                const integrity = fs.existsSync(archivePath) ? await computeFileHash(archivePath) : undefined;
                return {
                    id: resolved.id,
                    source: resolved.source,
                    ref: resolved.ref,
                    artifactUrl: resolved.artifactUrl,
                    resolvedVersion: resolved.resolvedVersion,
                    resolvedRevision: resolved.resolvedRevision,
                    contentDir: cachedStashRoot,
                    cacheDir,
                    extractedDir,
                    integrity,
                    writable: options?.writable,
                    syncedAt,
                };
            }
        }
        catch {
            // Cache invalid, re-download below.
        }
    }
    fs.mkdirSync(cacheDir, { recursive: true });
    let integrity;
    let provisionalKitRoot;
    let installRoot;
    let stashRoot;
    try {
        await downloadArchive(resolved.artifactUrl, archivePath);
        // The registry-resolved revision is passed as the expected content hash;
        // verifyArchiveIntegrity skips sources whose revision is a commit SHA.
        verifyArchiveIntegrity(archivePath, resolved.resolvedRevision, resolved.source);
        integrity = await computeFileHash(archivePath);
        extractTarGzSecure(archivePath, extractedDir);
        // Root detection runs twice: once on the raw extraction, and again after
        // `.akm-include` filtering may have narrowed the tree to `selected/`.
        provisionalKitRoot = detectStashRoot(extractedDir);
        installRoot = applyAkmIncludeConfig(provisionalKitRoot, cacheDir, extractedDir) ?? provisionalKitRoot;
        stashRoot = detectStashRoot(installRoot);
    }
    catch (err) {
        // Clean up so stale or partial extractions don't cause false cache hits.
        try {
            fs.rmSync(cacheDir, { recursive: true, force: true });
        }
        catch {
            /* best-effort */
        }
        throw err;
    }
    return {
        id: resolved.id,
        source: resolved.source,
        ref: resolved.ref,
        artifactUrl: resolved.artifactUrl,
        resolvedVersion: resolved.resolvedVersion,
        resolvedRevision: resolved.resolvedRevision,
        contentDir: stashRoot,
        cacheDir,
        extractedDir,
        integrity,
        writable: options?.writable,
        syncedAt,
    };
}
159
+ export { NpmStashProvider };
@@ -1,3 +1,10 @@
1
+ import { createHash } from "node:crypto";
2
+ import fs from "node:fs";
3
+ import path from "node:path";
4
+ import { TYPE_DIRS } from "../asset-spec";
5
+ import { fetchWithRetry } from "../common";
6
+ import { copyIncludedPaths, findNearestIncludeConfig } from "../stash-include";
7
+ const REGISTRY_STASH_DIR_NAMES = new Set(Object.values(TYPE_DIRS));
1
8
  /** Strip terminal control characters from untrusted strings. */
2
9
  export function sanitizeString(value, maxLength = 255) {
3
10
  if (typeof value !== "string")
@@ -9,3 +16,158 @@ export function sanitizeString(value, maxLength = 255) {
9
16
/** True when a timestamp (`mtimeMs`, epoch ms) is older than `ttlMs`. */
export function isExpired(mtimeMs, ttlMs) {
    const ageMs = Date.now() - mtimeMs;
    return ageMs > ttlMs;
}
19
/**
 * Find the directory inside `extractedDir` that should be treated as the
 * stash root. A `.stash` marker or well-known type dirs at the top level win;
 * otherwise a BFS looks for the shallowest nested candidate, falling back to
 * the extraction root itself.
 */
export function detectStashRoot(extractedDir) {
    const root = path.resolve(extractedDir);
    const rootLooksLikeStash = isDirectory(path.join(root, ".stash")) || hasStashDirs(root);
    if (rootLooksLikeStash) {
        return root;
    }
    return findShallowestStashRoot(root) ?? root;
}
38
/**
 * Build a per-source cache directory under `cacheRootDir`.
 *
 * Versioned sources produce `${source}-${id}/${version}` so repeated syncs
 * of the same version share a cache; `local` sources (and sources with no
 * version) get a unique timestamped slug so each install is isolated.
 */
export function buildInstallCacheDir(cacheRootDir, source, id, version) {
    const sanitize = (value) => value.replace(/[^a-zA-Z0-9_.-]+/g, "-");
    const uniqueSlug = () => `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
    const idSlug = sanitize(id).replace(/^-+|-+$/g, "");
    const sourceSlug = `${source}-${idSlug}`;
    let versionSlug;
    if (source === "local" || version == null) {
        versionSlug = uniqueSlug();
    }
    else {
        versionSlug = sanitize(version);
    }
    return path.join(cacheRootDir, sourceSlug || source, versionSlug);
}
51
/**
 * Apply an `.akm-include` config (if any) by copying the selected paths into
 * a sibling `selected/` directory and returning that path. Returns undefined
 * when no include config is found near `sourceRoot`.
 */
export function applyAkmIncludeConfig(sourceRoot, cacheDir, searchRoot = sourceRoot) {
    const includeConfig = findNearestIncludeConfig(sourceRoot, searchRoot);
    if (!includeConfig) {
        return undefined;
    }
    // Rebuild `selected/` from scratch so entries removed from the include
    // config don't linger between syncs.
    const selectedDir = path.join(cacheDir, "selected");
    fs.rmSync(selectedDir, { recursive: true, force: true });
    fs.mkdirSync(selectedDir, { recursive: true });
    copyIncludedPaths(includeConfig.include, includeConfig.baseDir, selectedDir);
    return selectedDir;
}
66
/** Stream a remote archive to disk, preferring Bun's native Response writer. */
export async function downloadArchive(url, destination) {
    const response = await fetchWithRetry(url, undefined, { timeout: 120_000 });
    if (!response.ok) {
        throw new Error(`Failed to download archive (${response.status}) from ${url}`);
    }
    // Bun.write streams a Response straight to disk without buffering it all.
    const bun = globalThis.Bun;
    if (bun?.write) {
        await bun.write(destination, response);
        return;
    }
    // Non-Bun runtimes (e.g. tests): buffer the body and write it in one shot.
    const body = Buffer.from(await response.arrayBuffer());
    fs.writeFileSync(destination, body);
}
85
/**
 * SHA-256 of a file, returned as `sha256:<hex>`.
 *
 * Streams the file through the hash instead of reading it wholesale into
 * memory (the previous `readFileSync` buffered entire archives, which can be
 * hundreds of MB for large tarballs).
 *
 * @param filePath file to hash
 * @returns promise of `sha256:<hex digest>`
 */
export async function computeFileHash(filePath) {
    const hash = createHash("sha256");
    await new Promise((resolve, reject) => {
        const stream = fs.createReadStream(filePath);
        stream.on("data", (chunk) => hash.update(chunk));
        stream.on("end", resolve);
        stream.on("error", reject);
    });
    return `sha256:${hash.digest("hex")}`;
}
91
/**
 * Recursively copy directory contents, excluding `.git` directories at every
 * depth. (Previously only a top-level `.git` was skipped; nested repos leaked
 * VCS metadata into the cache via the unfiltered `cpSync`.)
 *
 * @param sourceDir      directory whose contents are copied
 * @param destinationDir target directory (created lazily per entry)
 */
export function copyDirectoryContents(sourceDir, destinationDir) {
    // cpSync filter receives source paths; drop any path segment named `.git`.
    const notGit = (src) => path.basename(src) !== ".git";
    for (const entry of fs.readdirSync(sourceDir, { withFileTypes: true })) {
        if (entry.name === ".git")
            continue;
        const src = path.join(sourceDir, entry.name);
        const dest = path.join(destinationDir, entry.name);
        fs.mkdirSync(path.dirname(dest), { recursive: true });
        if (entry.isDirectory()) {
            fs.cpSync(src, dest, { recursive: true, force: true, filter: notGit });
        }
        else {
            fs.copyFileSync(src, dest);
        }
    }
}
107
/** True when `target` exists and is a directory; false on any stat failure. */
export function isDirectory(target) {
    try {
        const stats = fs.statSync(target);
        return stats.isDirectory();
    }
    catch {
        return false;
    }
}
115
/**
 * True when `dirPath` directly contains at least one well-known stash type
 * directory. Delegates to `countStashDirs` (which returns 0 for non-dirs)
 * instead of duplicating the readdir/filter logic.
 */
function hasStashDirs(dirPath) {
    return countStashDirs(dirPath) > 0;
}
121
/** Number of well-known stash type dirs directly inside `dirPath` (0 when not a directory). */
function countStashDirs(dirPath) {
    if (!isDirectory(dirPath)) {
        return 0;
    }
    let count = 0;
    for (const entry of fs.readdirSync(dirPath, { withFileTypes: true })) {
        if (entry.isDirectory() && REGISTRY_STASH_DIR_NAMES.has(entry.name)) {
            count += 1;
        }
    }
    return count;
}
127
/**
 * BFS for the shallowest directory that looks like a stash root.
 *
 * A `.stash` directory is a strong marker; otherwise a candidate needs 2+
 * well-known type dirs (a lone `scripts/` is too common in skill dirs and
 * npm packages to trust). `root` itself is never returned — the caller has
 * already checked it via `hasStashDirs`. `.git` and `node_modules` subtrees
 * are skipped, and descent stops at `BFS_MAX_DEPTH`.
 */
const BFS_MAX_DEPTH = 5;
function findShallowestStashRoot(root) {
    // Index-cursor queue: same FIFO order as shift(), without re-slicing.
    const queue = [{ dir: root, depth: 0 }];
    for (let head = 0; head < queue.length; head += 1) {
        const { dir: current, depth } = queue[head];
        if (current !== root) {
            if (isDirectory(path.join(current, ".stash"))) {
                return current;
            }
            if (countStashDirs(current) >= 2) {
                return current;
            }
        }
        if (depth >= BFS_MAX_DEPTH) {
            continue;
        }
        let children;
        try {
            children = fs.readdirSync(current, { withFileTypes: true });
        }
        catch {
            continue;
        }
        for (const child of children) {
            if (!child.isDirectory()) {
                continue;
            }
            if (child.name === ".git" || child.name === "node_modules") {
                continue;
            }
            queue.push({ dir: path.join(current, child.name), depth: depth + 1 });
        }
    }
    return undefined;
}
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Unified install-ref dispatcher.
3
+ *
4
+ * Replaces the historical `installRegistryRef()` entry point. Given an
5
+ * unparsed install ref, this resolves the right syncable provider and
6
+ * invokes its `sync()` method.
7
+ *
8
+ * Audit is intentionally NOT performed here; callers (`akmAdd`,
9
+ * `akmUpdate`) decide whether to run `auditInstallCandidate` on the
10
+ * synced `contentDir` because they own the `--trust` flag.
11
+ */
12
+ import { UsageError } from "../errors";
13
+ import { parseRegistryRef } from "../registry-resolve";
14
+ import { detectStashRoot } from "./provider-utils";
15
/**
 * Dispatch an unparsed install ref to the matching syncable provider's
 * `sync()` path. Audit is deliberately left to the callers (`akmAdd`,
 * `akmUpdate`), which own the `--trust` flag.
 */
export async function syncFromRef(ref, options) {
    const parsed = parseRegistryRef(ref);
    switch (parsed.source) {
        case "local":
            return syncLocalRef(parsed, options);
        case "npm": {
            const { syncNpmRef } = await import("./npm");
            return syncNpmRef(ref, options);
        }
        case "git":
        case "github": {
            const { syncRegistryGitRef } = await import("./git");
            return syncRegistryGitRef(ref, options);
        }
        default:
            // Exhaustiveness — `parseRegistryRef` only emits the four sources above.
            throw new UsageError(`No syncable provider for ref: ${ref} (source=${parsed.source})`);
    }
}
/** Local refs need no fetch: point the lock data straight at the source path. */
function syncLocalRef(parsed, options) {
    const sourcePath = parsed.sourcePath;
    const syncedAt = (options?.now ?? new Date()).toISOString();
    return {
        id: parsed.id,
        source: "local",
        ref: parsed.ref,
        artifactUrl: sourcePath,
        contentDir: detectStashRoot(sourcePath),
        cacheDir: sourcePath,
        extractedDir: sourcePath,
        writable: options?.writable,
        syncedAt,
    };
}
@@ -0,0 +1,151 @@
1
+ /**
2
+ * Tar archive extraction and integrity verification utilities.
3
+ *
4
+ * These helpers are security-critical: they validate archive entries to
5
+ * prevent path traversal, run a post-extraction scan for symlink escapes,
6
+ * and verify integrity hashes (SRI or hex shasum) before extraction.
7
+ *
8
+ * Extracted from `registry-install.ts` and shared by all syncable
9
+ * providers that fetch tarballs (currently `NpmStashProvider` and the
10
+ * registry index builder).
11
+ */
12
+ import { spawnSync } from "node:child_process";
13
+ import { createHash } from "node:crypto";
14
+ import fs from "node:fs";
15
+ import path from "node:path";
16
+ import { isWithin } from "../common";
17
+ import { warn } from "../warn";
18
/**
 * Verify an archive's integrity against a known hash. Throws (and removes
 * the offending archive) when verification fails.
 *
 * Supports SRI hashes (`sha256-`/`sha384-`/`sha512-` + base64 — the previous
 * version omitted the valid `sha384` SRI algorithm) and bare hex SHA-1
 * shasums as published by npm. Verification is skipped for git/github
 * sources, whose "revision" is a commit SHA rather than a content hash, and
 * for unrecognized formats (with a warning).
 *
 * @param archivePath path of the downloaded archive on disk
 * @param expected    expected integrity string (SRI or hex sha1), if any
 * @param source      ref source ("npm" | "git" | "github" | ...)
 */
export function verifyArchiveIntegrity(archivePath, expected, source) {
    if (!expected)
        return;
    // Commit SHAs identify a revision, not the tarball bytes — nothing to check.
    if (source === "github" || source === "git")
        return;
    const fileBuffer = fs.readFileSync(archivePath);
    // SRI hash format: <algorithm>-<base64 digest>
    const sriMatch = /^(sha256|sha384|sha512)-(.+)$/.exec(expected);
    if (sriMatch) {
        const [, algorithm, expectedBase64] = sriMatch;
        const actualBase64 = createHash(algorithm).update(fileBuffer).digest("base64");
        if (actualBase64 !== expectedBase64) {
            fs.unlinkSync(archivePath);
            throw new Error(`Integrity check failed for ${archivePath}: expected ${algorithm} digest ${expectedBase64}, got ${actualBase64}`);
        }
        return;
    }
    // Hex shasum (legacy SHA-1 from npm)
    if (/^[0-9a-f]{40}$/i.test(expected)) {
        const actualHex = createHash("sha1").update(fileBuffer).digest("hex");
        if (actualHex.toLowerCase() !== expected.toLowerCase()) {
            fs.unlinkSync(archivePath);
            throw new Error(`Integrity check failed for ${archivePath}: expected sha1 ${expected}, got ${actualHex}`);
        }
        return;
    }
    // Unrecognized format — warn and skip verification
    warn("Unrecognized integrity format: %s — verification skipped", expected);
}
58
/**
 * Extract a tar.gz archive into `destinationDir` with defense in depth:
 * entries are validated before extraction (no absolute paths, no `..`
 * traversal, no NUL bytes), tar runs with `--no-same-owner
 * --strip-components=1`, and the extracted tree is scanned afterwards for
 * symlinks that would escape the destination.
 */
export function extractTarGzSecure(archivePath, destinationDir) {
    const describeFailure = (result) => result.stderr?.trim() || result.error?.message || "unknown error";
    // Pre-flight: list entries and reject unsafe names before touching disk.
    const listing = spawnSync("tar", ["tzf", archivePath], { encoding: "utf8" });
    if (listing.status !== 0) {
        throw new Error(`Failed to inspect archive ${archivePath}: ${describeFailure(listing)}`);
    }
    validateTarEntries(listing.stdout);
    // Extract into a clean destination with ownership dropped and the single
    // top-level package directory stripped.
    fs.rmSync(destinationDir, { recursive: true, force: true });
    fs.mkdirSync(destinationDir, { recursive: true });
    const extraction = spawnSync("tar", ["xzf", archivePath, "--no-same-owner", "--strip-components=1", "-C", destinationDir], { encoding: "utf8" });
    if (extraction.status !== 0) {
        throw new Error(`Failed to extract archive ${archivePath}: ${describeFailure(extraction)}`);
    }
    // Post-extraction scan mitigates the TOCTOU window between listing and extraction.
    scanExtractedFiles(destinationDir, destinationDir);
}
82
/**
 * Recursively walk `dir`, rejecting suspicious entry names, symlinks whose
 * resolved target leaves `root`, and any other entry whose path falls
 * outside `root`. Unreadable directories are skipped silently.
 */
function scanExtractedFiles(dir, root) {
    let entries;
    try {
        entries = fs.readdirSync(dir, { withFileTypes: true });
    }
    catch {
        return;
    }
    for (const entry of entries) {
        const fullPath = path.join(dir, entry.name);
        // Only names that are exactly `.` or `..` are suspicious; substring
        // matches (`foo..bar`, `archive..2024.tar`) are legitimate filenames.
        if (entry.name === ".." || entry.name === ".") {
            throw new Error(`Post-extraction scan: suspicious entry name: ${fullPath}`);
        }
        if (entry.isSymbolicLink()) {
            // Resolve the link and confirm the target stays inside the destination.
            const target = fs.realpathSync(fullPath);
            if (!isWithin(target, root)) {
                throw new Error(`Post-extraction scan: symlink escapes destination directory: ${fullPath} -> ${target}`);
            }
        }
        else if (!isWithin(fullPath, root)) {
            // Belt-and-suspenders for anything the tar pre-validation missed.
            throw new Error(`Post-extraction scan: entry escapes destination directory: ${fullPath}`);
        }
        if (entry.isDirectory()) {
            scanExtractedFiles(fullPath, root);
        }
    }
}
117
/**
 * Validate the line-oriented `tar tzf` listing for unsafe entries.
 *
 * Rejects empty/NUL-containing entries, absolute paths, parent traversal
 * (`..` / `../`), and any entry that would still escape the destination
 * after `--strip-components=1` removes its first path segment.
 */
export function validateTarEntries(listOutput) {
    const rawLines = listOutput.split(/\r?\n/).filter((line) => line.length > 0);
    for (const rawLine of rawLines) {
        const entry = rawLine.trim();
        if (entry === "" || entry.includes("\0")) {
            throw new Error(`Archive contains an invalid entry: ${JSON.stringify(rawLine)}`);
        }
        if (entry.startsWith("/")) {
            throw new Error(`Archive contains an absolute path entry: ${entry}`);
        }
        const normalized = path.posix.normalize(entry);
        if (normalized === ".." || normalized.startsWith("../")) {
            throw new Error(`Archive contains a path traversal entry: ${entry}`);
        }
        // Simulate --strip-components=1: the entry must still be safe after
        // its first path segment is removed.
        const [, ...restParts] = normalized.split("/").filter(Boolean);
        const stripped = restParts.join("/");
        if (stripped === "") {
            continue;
        }
        const normalizedStripped = path.posix.normalize(stripped);
        if (normalizedStripped === ".."
            || normalizedStripped.startsWith("../")
            || path.posix.isAbsolute(normalizedStripped)) {
            throw new Error(`Archive contains an unsafe entry after strip-components: ${entry}`);
        }
    }
}
@@ -1,11 +1,11 @@
1
1
  import { createHash } from "node:crypto";
2
2
  import fs from "node:fs";
3
3
  import path from "node:path";
4
- import { fetchWithRetry } from "../common";
4
+ import { fetchWithRetry, ResponseTooLargeError, readBodyWithByteCap } from "../common";
5
5
  import { ConfigError, UsageError } from "../errors";
6
6
  import { getRegistryIndexCacheDir } from "../paths";
7
7
  import { registerStashProvider } from "../stash-provider-factory";
8
- import { isExpired, sanitizeString } from "./provider-utils";
8
+ import { isDirectory, isExpired, sanitizeString } from "./provider-utils";
9
9
  /** Refresh website snapshots every 12 hours to balance freshness with scraping load. */
10
10
  const CACHE_REFRESH_INTERVAL_MS = 12 * 60 * 60 * 1000;
11
11
  /** Allow up to 7 days of stale snapshots when refresh fails so search remains available during outages. */
@@ -14,10 +14,31 @@ const CACHE_STALE_MS = 7 * 24 * 60 * 60 * 1000;
14
14
  const QUEUE_EXPANSION_FACTOR = 5;
15
15
  const MAX_PAGES_DEFAULT = 50;
16
16
  const MAX_DEPTH_DEFAULT = 3;
17
+ /**
18
+ * Per-page body cap for website scraping. HTML pages this large are
19
+ * almost never useful as agent knowledge sources and a runaway server
20
+ * streaming tens of megabytes would blow memory with no upside.
21
+ */
22
+ const WEBSITE_PAGE_BYTE_CAP = 5 * 1024 * 1024;
23
+ /**
24
+ * Wall-clock cap for a full crawl (10 minutes). With per-request timeouts
25
+ * of 15s and a `maxPages` default of 50, an unresponsive site could
26
+ * otherwise stall `akm add` for 12.5 minutes with no feedback. Cap the
27
+ * whole crawl and return what we have when time runs out.
28
+ */
29
+ const WEBSITE_CRAWL_WALL_CLOCK_MS = 10 * 60 * 1000;
30
+ /**
31
+ * Website stash provider. Implements {@link SyncableStashProvider} (which
32
+ * extends LiveStashProvider) — scrapes pages into a local mirror so the FTS5
33
+ * indexer can walk them.
34
+ */
17
35
  class WebsiteStashProvider {
18
36
  type = "website";
37
+ kind = "syncable";
19
38
  name;
39
+ config;
20
40
  constructor(config) {
41
+ this.config = config;
21
42
  this.name = config.name ?? "website";
22
43
  validateWebsiteUrl(config.url ?? "");
23
44
  }
@@ -33,6 +54,40 @@ class WebsiteStashProvider {
33
54
  canShow(_ref) {
34
55
  return false;
35
56
  }
57
+ async sync(config, options) {
58
+ const cachePaths = await ensureWebsiteMirror(config, { requireStashDir: true, force: options?.force });
59
+ const syncedAt = (options?.now ?? new Date()).toISOString();
60
+ const url = config.url ?? "";
61
+ // #123 added "website" to the StashSource union, so we can use it directly.
62
+ return {
63
+ id: url,
64
+ source: "website",
65
+ ref: url,
66
+ artifactUrl: url,
67
+ contentDir: cachePaths.stashDir,
68
+ cacheDir: cachePaths.rootDir,
69
+ extractedDir: cachePaths.stashDir,
70
+ syncedAt,
71
+ };
72
+ }
73
+ getContentDir(config) {
74
+ const url = config.url ?? "";
75
+ return getCachePaths(url).stashDir;
76
+ }
77
+ async remove(config) {
78
+ const url = config.url;
79
+ if (!url)
80
+ return;
81
+ const paths = getCachePaths(url);
82
+ if (isDirectory(paths.rootDir)) {
83
+ try {
84
+ fs.rmSync(paths.rootDir, { recursive: true, force: true });
85
+ }
86
+ catch {
87
+ /* best-effort */
88
+ }
89
+ }
90
+ }
36
91
  }
37
92
  registerStashProvider("website", (config) => new WebsiteStashProvider(config));
38
93
  function getCachePaths(siteUrl) {
@@ -49,6 +104,7 @@ async function ensureWebsiteMirror(config, options) {
49
104
  const normalizedUrl = validateWebsiteUrl(rawUrl);
50
105
  const cachePaths = getCachePaths(normalizedUrl);
51
106
  const requireStashDir = options?.requireStashDir === true;
107
+ const force = options?.force === true;
52
108
  let mtime = 0;
53
109
  try {
54
110
  mtime = fs.statSync(cachePaths.manifestPath).mtimeMs;
@@ -56,7 +112,8 @@ async function ensureWebsiteMirror(config, options) {
56
112
  catch {
57
113
  /* no cached manifest */
58
114
  }
59
- if (mtime &&
115
+ if (!force &&
116
+ mtime &&
60
117
  !isExpired(mtime, CACHE_REFRESH_INTERVAL_MS) &&
61
118
  (!requireStashDir || hasExtractedSite(cachePaths.stashDir))) {
62
119
  return cachePaths;
@@ -124,7 +181,13 @@ async function crawlWebsite(startUrl, options) {
124
181
  const queue = [{ url: start.toString(), depth: 0 }];
125
182
  const visited = new Set();
126
183
  const pages = [];
184
+ const deadline = Date.now() + WEBSITE_CRAWL_WALL_CLOCK_MS;
185
+ let stoppedAtDeadline = false;
127
186
  while (queue.length > 0 && pages.length < options.maxPages) {
187
+ if (Date.now() > deadline) {
188
+ stoppedAtDeadline = true;
189
+ break;
190
+ }
128
191
  const next = queue.shift();
129
192
  if (!next)
130
193
  break;
@@ -149,6 +212,9 @@ async function crawlWebsite(startUrl, options) {
149
212
  queue.push({ url: candidate, depth: next.depth + 1 });
150
213
  }
151
214
  }
215
+ if (stoppedAtDeadline) {
216
+ console.warn(`[akm] website crawl stopped at the ${WEBSITE_CRAWL_WALL_CLOCK_MS / 1000}s wall-clock cap with ${pages.length}/${options.maxPages} pages collected from ${startUrl}.`);
217
+ }
152
218
  return pages;
153
219
  }
154
220
  async function fetchWebsitePage(pageUrl) {
@@ -164,7 +230,17 @@ async function fetchWebsitePage(pageUrl) {
164
230
  throw new Error(`Failed to fetch website content (${response.status}) from ${pageUrl}`);
165
231
  }
166
232
  const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";
167
- const body = await response.text();
233
+ let body;
234
+ try {
235
+ body = await readBodyWithByteCap(response, WEBSITE_PAGE_BYTE_CAP);
236
+ }
237
+ catch (err) {
238
+ if (err instanceof ResponseTooLargeError) {
239
+ // Skip oversized pages rather than aborting the whole crawl.
240
+ return null;
241
+ }
242
+ throw err;
243
+ }
168
244
  const finalUrl = normalizeCrawlUrl(response.url || pageUrl) ?? pageUrl;
169
245
  if (contentType.includes("text/html") || contentType.includes("application/xhtml+xml") || looksLikeMarkup(body)) {
170
246
  const title = extractHtmlTitle(body) || new URL(finalUrl).hostname;