akm-cli 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -553,17 +553,19 @@ const searchCommand = defineCommand({
  const addCommand = defineCommand({
  meta: {
  name: "add",
- description: "Add a source (local directory, npm package, GitHub repo, git URL, or remote provider)",
+ description: "Add a source (local directory, website, npm package, GitHub repo, git URL, or remote provider)",
  },
  args: {
  ref: {
  type: "positional",
- description: "Path, URL, or registry ref (npm package, owner/repo, git URL, or local directory)",
+ description: "Path, URL, or registry ref (website URL, npm package, owner/repo, git URL, or local directory)",
  required: true,
  },
  provider: { type: "string", description: "Provider type (e.g. openviking). Required for URL sources." },
  options: { type: "string", description: 'Provider options as JSON (e.g. \'{"apiKey":"key"}\').' },
  name: { type: "string", description: "Human-friendly name for the source" },
+ "max-pages": { type: "string", description: "Maximum pages to crawl for website sources (default: 50)" },
+ "max-depth": { type: "string", description: "Maximum crawl depth for website sources (default: 3)" },
  },
  async run({ args }) {
  await runWithJsonErrors(async () => {
@@ -580,7 +582,7 @@ const addCommand = defineCommand({
  }
  // URL with --provider → stash source (remote or git provider)
  if (args.provider) {
- if (ref.startsWith("http://")) {
+ if (shouldWarnOnPlainHttp(ref)) {
  warn("Warning: source URL uses plain HTTP (not HTTPS). For security, prefer https:// to protect against eavesdropping and tampering.");
  }
  let parsedOptions;
@@ -607,7 +609,19 @@ const addCommand = defineCommand({
  output("stash-add", result);
  return;
  }
- const result = await akmAdd({ ref });
+ if (shouldWarnOnPlainHttp(ref)) {
+ warn("Warning: source URL uses plain HTTP (not HTTPS). For security, prefer https:// to protect against eavesdropping and tampering.");
+ }
+ const websiteOptions = {};
+ if (args["max-pages"])
+ websiteOptions.maxPages = args["max-pages"];
+ if (args["max-depth"])
+ websiteOptions.maxDepth = args["max-depth"];
+ const result = await akmAdd({
+ ref,
+ name: args.name,
+ options: Object.keys(websiteOptions).length > 0 ? websiteOptions : undefined,
+ });
  output("add", result);
  });
  },
@@ -624,6 +638,22 @@ function parseKindFilter(raw) {
  }
  return kinds;
  }
+ function shouldWarnOnPlainHttp(ref) {
+ if (!ref.startsWith("http://"))
+ return false;
+ try {
+ const hostname = new URL(ref).hostname.toLowerCase();
+ return (hostname !== "localhost" &&
+ hostname !== "127.0.0.1" &&
+ hostname !== "0.0.0.0" &&
+ hostname !== "::1" &&
+ hostname !== "[::1]" &&
+ !hostname.endsWith(".localhost"));
+ }
+ catch {
+ return true;
+ }
+ }
  const listCommand = defineCommand({
  meta: { name: "list", description: "List all sources (local directories, managed packages, remote providers)" },
  args: {
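
For reference, a minimal sketch of how the new shouldWarnOnPlainHttp guard above behaves (the URLs are hypothetical; loopback and *.localhost hosts are exempt, and an unparseable http:// ref falls through to the warning):

shouldWarnOnPlainHttp("http://localhost:3000/docs"); // false: loopback exempt
shouldWarnOnPlainHttp("http://127.0.0.1:8080");      // false: loopback exempt
shouldWarnOnPlainHttp("http://dev.localhost/kit");   // false: .localhost suffix exempt
shouldWarnOnPlainHttp("https://docs.example.com");   // false: not plain HTTP
shouldWarnOnPlainHttp("http://docs.example.com");    // true: warns before the add proceeds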
package/dist/indexer.js CHANGED
@@ -18,8 +18,8 @@ export async function akmIndex(options) {
  const config = loadConfig();
  // Ensure git stash caches are extracted before resolving stash dirs,
  // so their content directories exist on disk for the walker to discover.
- const { ensureGitCaches, resolveAllStashDirs } = await import("./search-source.js");
- await ensureGitCaches(config);
+ const { ensureStashCaches, resolveAllStashDirs } = await import("./search-source.js");
+ await ensureStashCaches(config);
  const allStashDirs = resolveAllStashDirs(stashDir);
  const t0 = Date.now();
  // Open database — pass embedding dimension from config if available
package/dist/search-source.js CHANGED
@@ -3,6 +3,7 @@ import path from "node:path";
  import { resolveStashDir } from "./common";
  import { loadConfig } from "./config";
  import { ensureGitMirror, getCachePaths, parseGitRepoUrl } from "./stash-providers/git";
+ import { ensureWebsiteMirror, getCachePaths as getWebsiteCachePaths } from "./stash-providers/website";
  import { warn } from "./warn";
  // ── Resolution ──────────────────────────────────────────────────────────────
  /**
@@ -54,6 +55,19 @@ export function resolveStashSources(overrideStashDir, existingConfig) {
  }
  }
  }
+ // Website stash entries: resolve cache directory so the indexer can walk
+ // the scraped markdown snapshots.
+ for (const entry of config.stashes ?? []) {
+ if (entry.type === "website" && entry.url && entry.enabled !== false) {
+ try {
+ const cachePaths = getWebsiteCachePaths(entry.url);
+ addSource(cachePaths.stashDir, entry.name ?? entry.url);
+ }
+ catch (err) {
+ warn(`Warning: failed to resolve website stash cache for "${entry.url}": ${err instanceof Error ? err.message : String(err)}`);
+ }
+ }
+ }
  // Installed kits (registry and local)
  for (const entry of config.installed ?? []) {
  addSource(entry.stashRoot, entry.id);
@@ -153,11 +167,12 @@ function isValidDirectory(dir) {
  // ── Git stash cache integration ──────────────────────────────────────────────
  const GIT_STASH_TYPES = new Set(["context-hub", "github", "git"]);
  /**
- * Ensure all git stash mirrors are refreshed so their cache directories
- * exist on disk. Must be called (async) before `resolveStashSources()` so
- * the content directories pass the `isValidDirectory()` check.
+ * Ensure all cache-backed stash providers are refreshed so their cache
+ * directories exist on disk. Must be called (async) before
+ * `resolveStashSources()` so the content directories pass the
+ * `isValidDirectory()` check.
  */
- export async function ensureGitCaches(config) {
+ export async function ensureStashCaches(config) {
  const cfg = config ?? loadConfig();
  for (const entry of cfg.stashes ?? []) {
  if (!GIT_STASH_TYPES.has(entry.type) || !entry.url || entry.enabled === false)
@@ -171,6 +186,18 @@ export async function ensureGitCaches(config) {
  warn(`Warning: failed to refresh git mirror for "${entry.url}": ${err instanceof Error ? err.message : String(err)}`);
  }
  }
+ for (const entry of cfg.stashes ?? []) {
+ if (entry.type !== "website" || !entry.url || entry.enabled === false)
+ continue;
+ try {
+ await ensureWebsiteMirror(entry, { requireStashDir: true });
+ }
+ catch (err) {
+ warn(`Warning: failed to refresh website stash for "${entry.url}": ${err instanceof Error ? err.message : String(err)}`);
+ }
+ }
  }
- /** @deprecated Use ensureGitCaches instead. */
- export const ensureContextHubCaches = ensureGitCaches;
+ /** @deprecated Use ensureStashCaches instead. */
+ export const ensureGitCaches = ensureStashCaches;
+ /** @deprecated Use ensureStashCaches instead. */
+ export const ensureContextHubCaches = ensureStashCaches;
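
The rewritten doc comment above pins down a call-order contract. A minimal consumer sketch, mirroring what dist/indexer.js now does (variable names are illustrative):

const config = loadConfig();
await ensureStashCaches(config);        // refresh git mirrors and website snapshots onto disk first
const sources = resolveStashSources();  // cache directories now exist and pass isValidDirectory()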
package/dist/stash-add.js CHANGED
@@ -1,18 +1,22 @@
  import fs from "node:fs";
  import path from "node:path";
- import { resolveStashDir } from "./common";
+ import { isHttpUrl, resolveStashDir } from "./common";
  import { loadConfig, saveConfig } from "./config";
  import { UsageError } from "./errors";
  import { akmIndex } from "./indexer";
  import { upsertLockEntry } from "./lockfile";
  import { detectStashRoot, installRegistryRef, upsertInstalledRegistryEntry } from "./registry-install";
  import { parseRegistryRef } from "./registry-resolve";
+ import { ensureWebsiteMirror, validateWebsiteInputUrl } from "./stash-providers/website";
  export async function akmAdd(input) {
  const ref = input.ref.trim();
  if (!ref)
  throw new UsageError("Install ref or local directory is required. " +
  "Examples: `akm add @scope/kit`, `akm add github:owner/repo`, `akm add ./local/path`");
  const stashDir = resolveStashDir();
+ if (shouldAddAsWebsiteUrl(ref)) {
+ return addWebsiteStashSource(ref, stashDir, input.name, input.options);
+ }
  // Detect local directory refs and route them to stashes[] instead of installed[]
  try {
  const parsed = parseRegistryRef(ref);
@@ -69,6 +73,50 @@ async function addLocalStashSource(ref, sourcePath, stashDir) {
  },
  };
  }
+ async function addWebsiteStashSource(ref, stashDir, name, options) {
+ const normalizedUrl = validateWebsiteInputUrl(ref);
+ const config = loadConfig();
+ const stashes = [...(config.stashes ?? [])];
+ let entry = stashes.find((stash) => stash.type === "website" && stash.url === normalizedUrl);
+ if (!entry) {
+ entry = {
+ type: "website",
+ url: normalizedUrl,
+ name: name ?? toWebsiteName(normalizedUrl),
+ ...(options && Object.keys(options).length > 0 ? { options } : {}),
+ };
+ stashes.push(entry);
+ saveConfig({ ...config, stashes });
+ }
+ else if (options && Object.keys(options).length > 0) {
+ entry.options = { ...entry.options, ...options };
+ saveConfig({ ...config, stashes });
+ }
+ const cachePaths = await ensureWebsiteMirror(entry, { requireStashDir: true });
+ const index = await akmIndex({ stashDir });
+ const updatedConfig = loadConfig();
+ return {
+ schemaVersion: 1,
+ stashDir,
+ ref,
+ stashSource: {
+ type: "website",
+ url: normalizedUrl,
+ name: entry.name,
+ stashRoot: cachePaths.stashDir,
+ },
+ config: {
+ stashCount: updatedConfig.stashes?.length ?? 0,
+ installedKitCount: updatedConfig.installed?.length ?? 0,
+ },
+ index: {
+ mode: index.mode,
+ totalEntries: index.totalEntries,
+ directoriesScanned: index.directoriesScanned,
+ directoriesSkipped: index.directoriesSkipped,
+ },
+ };
+ }
  /**
  * Install a kit from a registry (npm, github, git).
  */
@@ -139,3 +187,26 @@ function toReadableId(resolvedPath) {
  }
  return resolvedPath;
  }
+ // Keep this list limited to widely-used git hosts for the non-breaking
+ // "repo-like URL" fast-path; everything else continues to default to website snapshots.
+ const KNOWN_GIT_HOSTS = new Set(["github.com", "gitlab.com", "bitbucket.org", "codeberg.org", "git.sr.ht"]);
+ export function shouldAddAsWebsiteUrl(ref) {
+ return isHttpUrl(ref) && !isLikelyGitRepositoryUrl(ref);
+ }
+ function isLikelyGitRepositoryUrl(ref) {
+ try {
+ const parsed = new URL(ref);
+ return KNOWN_GIT_HOSTS.has(parsed.hostname.toLowerCase()) || parsed.pathname.endsWith(".git");
+ }
+ catch {
+ return false;
+ }
+ }
+ function toWebsiteName(siteUrl) {
+ try {
+ return new URL(siteUrl).hostname;
+ }
+ catch {
+ return siteUrl;
+ }
+ }
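
A quick sketch of the routing rule added above, on hypothetical refs (assuming isHttpUrl from ./common accepts only http(s) URLs, as its name suggests):

shouldAddAsWebsiteUrl("https://docs.example.com/guide");  // true:  scraped as a website snapshot
shouldAddAsWebsiteUrl("https://github.com/owner/repo");   // false: known git host, registry install path
shouldAddAsWebsiteUrl("https://git.example.com/kit.git"); // false: ".git" path suffix
shouldAddAsWebsiteUrl("@scope/kit");                      // false: not an http(s) URL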
@@ -8,3 +8,4 @@
  import "./filesystem";
  import "./git";
  import "./openviking";
+ import "./website";
package/dist/stash-providers/website.js ADDED
@@ -0,0 +1,443 @@
+ import { createHash } from "node:crypto";
+ import fs from "node:fs";
+ import path from "node:path";
+ import { fetchWithRetry } from "../common";
+ import { ConfigError, UsageError } from "../errors";
+ import { getRegistryIndexCacheDir } from "../paths";
+ import { registerStashProvider } from "../stash-provider-factory";
+ import { isExpired, sanitizeString } from "./provider-utils";
+ /** Refresh website snapshots every 12 hours to balance freshness with scraping load. */
+ const CACHE_REFRESH_INTERVAL_MS = 12 * 60 * 60 * 1000;
+ /** Allow up to 7 days of stale snapshots when refresh fails so search remains available during outages. */
+ const CACHE_STALE_MS = 7 * 24 * 60 * 60 * 1000;
+ /** Allow limited breadth-first expansion without letting the crawl queue grow unbounded. */
+ const QUEUE_EXPANSION_FACTOR = 5;
+ const MAX_PAGES_DEFAULT = 50;
+ const MAX_DEPTH_DEFAULT = 3;
+ class WebsiteStashProvider {
+ type = "website";
+ name;
+ constructor(config) {
+ this.name = config.name ?? "website";
+ validateWebsiteUrl(config.url ?? "");
+ }
+ /** Content is indexed through the standard FTS5 pipeline. */
+ async search(_options) {
+ return { hits: [] };
+ }
+ /** Content is local files, shown via showLocal. */
+ async show(_ref, _view) {
+ throw new Error("Website provider content is shown via local index");
+ }
+ /** Content is local; no remote show needed. */
+ canShow(_ref) {
+ return false;
+ }
+ }
+ registerStashProvider("website", (config) => new WebsiteStashProvider(config));
+ function getCachePaths(siteUrl) {
+ const key = createHash("sha256").update(normalizeSiteUrl(siteUrl)).digest("hex").slice(0, 16);
+ const rootDir = path.join(getRegistryIndexCacheDir(), `website-${key}`);
+ return {
+ rootDir,
+ stashDir: path.join(rootDir, "stash"),
+ manifestPath: path.join(rootDir, "manifest.json"),
+ };
+ }
+ async function ensureWebsiteMirror(config, options) {
+ const rawUrl = config.url ?? "";
+ const normalizedUrl = validateWebsiteUrl(rawUrl);
+ const cachePaths = getCachePaths(normalizedUrl);
+ const requireStashDir = options?.requireStashDir === true;
+ let mtime = 0;
+ try {
+ mtime = fs.statSync(cachePaths.manifestPath).mtimeMs;
+ }
+ catch {
+ /* no cached manifest */
+ }
+ if (mtime &&
+ !isExpired(mtime, CACHE_REFRESH_INTERVAL_MS) &&
+ (!requireStashDir || hasExtractedSite(cachePaths.stashDir))) {
+ return cachePaths;
+ }
+ try {
+ fs.mkdirSync(cachePaths.rootDir, { recursive: true });
+ await scrapeWebsiteToStash(normalizedUrl, cachePaths.stashDir, {
+ maxPages: coercePositiveInt(config.options?.maxPages, MAX_PAGES_DEFAULT),
+ maxDepth: coercePositiveInt(config.options?.maxDepth, MAX_DEPTH_DEFAULT),
+ });
+ fs.writeFileSync(cachePaths.manifestPath, `${JSON.stringify({ url: normalizedUrl, fetchedAt: new Date().toISOString() }, null, 2)}\n`, { encoding: "utf8", mode: 0o600 });
+ return cachePaths;
+ }
+ catch (err) {
+ if (mtime && !isExpired(mtime, CACHE_STALE_MS) && (!requireStashDir || hasExtractedSite(cachePaths.stashDir))) {
+ return cachePaths;
+ }
+ throw err;
+ }
+ }
+ function hasExtractedSite(stashDir) {
+ try {
+ const knowledgeDir = path.join(stashDir, "knowledge");
+ if (!fs.statSync(stashDir).isDirectory() || !fs.statSync(knowledgeDir).isDirectory())
+ return false;
+ // Check top-level and one level of subdirectories for .md files
+ for (const entry of fs.readdirSync(knowledgeDir, { withFileTypes: true })) {
+ if (entry.isFile() && entry.name.endsWith(".md"))
+ return true;
+ if (entry.isDirectory()) {
+ const subEntries = fs.readdirSync(path.join(knowledgeDir, entry.name));
+ if (subEntries.some((e) => e.endsWith(".md")))
+ return true;
+ }
+ }
+ return false;
+ }
+ catch {
+ return false;
+ }
+ }
+ async function scrapeWebsiteToStash(startUrl, stashDir, options) {
+ const pages = await crawlWebsite(startUrl, options);
+ if (pages.length === 0) {
+ throw new Error(`No content could be scraped from ${startUrl}`);
+ }
+ fs.rmSync(stashDir, { recursive: true, force: true });
+ const knowledgeDir = path.join(stashDir, "knowledge");
+ fs.mkdirSync(knowledgeDir, { recursive: true });
+ const usedPaths = new Set();
+ for (const page of pages) {
+ const relPath = urlToRelativePath(page.url);
+ const uniquePath = uniqueSlug(relPath, usedPaths);
+ const filePath = path.join(knowledgeDir, `${uniquePath}.md`);
+ const dir = path.dirname(filePath);
+ if (dir !== knowledgeDir)
+ fs.mkdirSync(dir, { recursive: true });
+ const slug = uniquePath.split("/").pop() ?? "index";
+ fs.writeFileSync(filePath, buildMarkdownSnapshot(page, slug), "utf8");
+ }
+ }
+ async function crawlWebsite(startUrl, options) {
+ const start = new URL(normalizeSiteUrl(startUrl));
+ const allowedOrigin = start.origin;
+ const queue = [{ url: start.toString(), depth: 0 }];
+ const visited = new Set();
+ const pages = [];
+ while (queue.length > 0 && pages.length < options.maxPages) {
+ const next = queue.shift();
+ if (!next)
+ break;
+ const normalized = normalizeCrawlUrl(next.url);
+ if (!normalized || visited.has(normalized))
+ continue;
+ visited.add(normalized);
+ const fetched = await fetchWebsitePage(normalized);
+ if (!fetched)
+ continue;
+ pages.push(fetched.page);
+ if (next.depth >= options.maxDepth)
+ continue;
+ for (const link of fetched.links) {
+ if (queue.length + pages.length >= options.maxPages * QUEUE_EXPANSION_FACTOR)
+ break;
+ if (link.origin !== allowedOrigin)
+ continue;
+ const candidate = normalizeCrawlUrl(link.toString());
+ if (!candidate || visited.has(candidate) || isAssetLikePath(link.pathname))
+ continue;
+ queue.push({ url: candidate, depth: next.depth + 1 });
+ }
+ }
+ return pages;
+ }
+ async function fetchWebsitePage(pageUrl) {
+ const response = await fetchWithRetry(pageUrl, {
+ headers: {
+ Accept: "text/html, text/markdown, text/plain;q=0.9, application/xhtml+xml;q=0.8",
+ "User-Agent": "akm-cli website provider",
+ },
+ }, { timeout: 15_000, retries: 1 });
+ if (!response.ok) {
+ if (response.status === 404)
+ return null;
+ throw new Error(`Failed to fetch website content (${response.status}) from ${pageUrl}`);
+ }
+ const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";
+ const body = await response.text();
+ const finalUrl = normalizeCrawlUrl(response.url || pageUrl) ?? pageUrl;
+ if (contentType.includes("text/html") || contentType.includes("application/xhtml+xml") || looksLikeMarkup(body)) {
+ const title = extractHtmlTitle(body) || new URL(finalUrl).hostname;
+ return {
+ page: {
+ url: finalUrl,
+ title,
+ markdown: htmlToMarkdown(body, finalUrl),
+ },
+ links: extractSameDocumentLinks(body, finalUrl),
+ };
+ }
+ return {
+ page: {
+ url: finalUrl,
+ title: extractTextTitle(body) || new URL(finalUrl).hostname,
+ markdown: body.trim(),
+ },
+ links: [],
+ };
+ }
+ function buildMarkdownSnapshot(page, slug) {
+ const title = sanitizeString(page.title, 200) || slug;
+ const description = sanitizeString(`Snapshot of ${page.url}`, 500);
+ const host = sanitizeString(new URL(page.url).hostname, 120);
+ const content = page.markdown.trim() || `Source: ${page.url}`;
+ return [
+ "---",
+ `name: ${JSON.stringify(slug)}`,
+ `description: ${JSON.stringify(description)}`,
+ `sourceUrl: ${JSON.stringify(page.url)}`,
+ `title: ${JSON.stringify(title)}`,
+ "tags:",
+ ` - ${JSON.stringify("website")}`,
+ ` - ${JSON.stringify(host)}`,
+ "---",
+ "",
+ `# ${title}`,
+ "",
+ `Source: ${page.url}`,
+ "",
+ content,
+ "",
+ ].join("\n");
+ }
+ function validateWebsiteUrl(rawUrl) {
+ return validateWebsiteUrlWithError(rawUrl, ConfigError);
+ }
+ function validateWebsiteInputUrl(rawUrl) {
+ return validateWebsiteUrlWithError(rawUrl, UsageError);
+ }
+ function validateWebsiteUrlWithError(rawUrl, ErrorType) {
+ if (!rawUrl) {
+ throw new ErrorType("Website provider requires a URL");
+ }
+ let parsed;
+ try {
+ parsed = new URL(rawUrl);
+ }
+ catch {
+ throw new ErrorType(`Website URL is not valid: "${rawUrl}"`);
+ }
+ if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
+ throw new ErrorType(`Website URL must use http:// or https://, got "${parsed.protocol}" in "${rawUrl}"`);
+ }
+ if (parsed.username || parsed.password) {
+ throw new ErrorType("Website URL must not contain embedded credentials");
+ }
+ parsed.hash = "";
+ return normalizeSiteUrl(parsed.toString());
+ }
+ function normalizeSiteUrl(rawUrl) {
+ const parsed = new URL(rawUrl);
+ parsed.hash = "";
+ if (parsed.pathname !== "/" && parsed.pathname.endsWith("/")) {
+ parsed.pathname = parsed.pathname.replace(/\/+$/, "");
+ }
+ return parsed.toString();
+ }
+ function normalizeCrawlUrl(rawUrl) {
+ try {
+ const parsed = new URL(rawUrl);
+ if (parsed.protocol !== "http:" && parsed.protocol !== "https:")
+ return null;
+ parsed.hash = "";
+ if (parsed.pathname !== "/" && parsed.pathname.endsWith("/")) {
+ parsed.pathname = parsed.pathname.replace(/\/+$/, "");
+ }
+ return parsed.toString();
+ }
+ catch {
+ return null;
+ }
+ }
+ /** Convert a page URL into a relative file path preserving the URL hierarchy.
+ * e.g. https://example.com/docs/guide → docs/guide
+ * https://example.com/ → index
+ */
+ function urlToRelativePath(rawUrl) {
+ const parsed = new URL(rawUrl);
+ const segments = parsed.pathname
+ .split("/")
+ .filter(Boolean)
+ .map((segment) => slugifySegment(segment))
+ .filter(Boolean);
+ if (parsed.search) {
+ const querySuffix = slugifySegment(parsed.search.slice(1));
+ if (querySuffix && segments.length > 0) {
+ segments[segments.length - 1] = `${segments[segments.length - 1]}_${querySuffix}`;
+ }
+ }
+ return segments.length > 0 ? segments.join("/") : "index";
+ }
+ function slugifySegment(value) {
+ return sanitizeString(value, 200)
+ .toLowerCase()
+ .replace(/[^a-z0-9._-]+/g, "-")
+ .replace(/^-+|-+$/g, "");
+ }
+ function uniqueSlug(base, used) {
+ const seed = base || "website";
+ let candidate = seed;
+ let i = 2;
+ while (used.has(candidate)) {
+ candidate = `${seed}-${i}`;
+ i += 1;
+ }
+ used.add(candidate);
+ return candidate;
+ }
+ function coercePositiveInt(value, fallback) {
+ if (typeof value === "number" && Number.isInteger(value) && value > 0)
+ return value;
+ if (typeof value === "string") {
+ const parsed = Number.parseInt(value, 10);
+ if (Number.isInteger(parsed) && parsed > 0)
+ return parsed;
+ }
+ return fallback;
+ }
+ function looksLikeMarkup(body) {
+ return /<html[\s>]|<body[\s>]|<\/[a-z][\w:-]*>/i.test(body);
+ }
+ function extractHtmlTitle(html) {
+ const title = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1];
+ if (title)
+ return decodeHtmlEntities(stripTags(title)).trim();
+ const h1 = html.match(/<h1[^>]*>([\s\S]*?)<\/h1>/i)?.[1];
+ if (h1)
+ return decodeHtmlEntities(stripTags(h1)).trim();
+ return undefined;
+ }
+ function extractTextTitle(text) {
+ for (const line of text.split(/\r?\n/)) {
+ const trimmed = line.trim();
+ if (!trimmed)
+ continue;
+ if (trimmed.startsWith("#"))
+ return trimmed.replace(/^#+\s*/, "");
+ return trimmed.slice(0, 120);
+ }
+ return undefined;
+ }
+ function extractSameDocumentLinks(html, pageUrl) {
+ const links = [];
+ const hrefPattern = /<a\b[^>]*href\s*=\s*(['"])(.*?)\1[^>]*>/gi;
+ for (const match of html.matchAll(hrefPattern)) {
+ const href = match[2]?.trim();
+ if (!href || href.startsWith("#"))
+ continue;
+ try {
+ const resolved = new URL(href, pageUrl);
+ if (!isSafeLinkUrl(resolved))
+ continue;
+ links.push(resolved);
+ }
+ catch {
+ /* ignore malformed links */
+ }
+ }
+ return links;
+ }
+ function htmlToMarkdown(html, pageUrl) {
+ let text = html;
+ text = stripDangerousBlockTag(text, "script");
+ text = stripDangerousBlockTag(text, "style");
+ text = stripDangerousBlockTag(text, "noscript");
+ text = stripDangerousBlockTag(text, "template");
+ text = text.replace(/<pre\b[^>]*><code\b[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (_match, code) => {
+ const decoded = decodeHtmlEntities(stripTags(code)).trim();
+ return decoded ? `\n\n\`\`\`\n${decoded}\n\`\`\`\n\n` : "\n\n";
+ });
+ text = text.replace(/<code\b[^>]*>([\s\S]*?)<\/code>/gi, (_match, code) => {
+ const decoded = decodeHtmlEntities(stripTags(code)).trim();
+ return decoded ? `\`${decoded}\`` : "";
+ });
+ text = text.replace(/<a\b[^>]*href\s*=\s*(['"])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, (_match, _q, href, body) => {
+ const label = decodeHtmlEntities(stripTags(body)).trim();
+ if (!label)
+ return "";
+ try {
+ const resolved = new URL(href, pageUrl);
+ if (!isSafeLinkUrl(resolved))
+ return label;
+ return `[${label}](${resolved})`;
+ }
+ catch {
+ return label;
+ }
+ });
+ text = text.replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, (_match, level, body) => {
+ const heading = decodeHtmlEntities(stripTags(body)).trim();
+ return heading ? `\n\n${"#".repeat(Number(level))} ${heading}\n\n` : "\n\n";
+ });
+ text = text.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, (_match, body) => {
+ const item = decodeHtmlEntities(stripTags(body)).trim();
+ return item ? `\n- ${item}` : "";
+ });
+ text = text.replace(/<(p|div|section|article|main|header|footer|blockquote|table|tr)\b[^>]*>/gi, "\n\n");
+ text = text.replace(/<\/(p|div|section|article|main|header|footer|blockquote|table|tr)>/gi, "\n\n");
+ text = text.replace(/<br\s*\/?>/gi, "\n");
+ text = text.replace(/<\/?(ul|ol)\b[^>]*>/gi, "\n");
+ text = decodeHtmlEntities(stripTags(text));
+ text = text
+ .replace(/\r/g, "")
+ .replace(/[ \t]+\n/g, "\n")
+ .replace(/\n{3,}/g, "\n\n")
+ .trim();
+ return text;
+ }
+ function stripTags(value) {
+ return value.replace(/<[^>]+>/g, " ");
+ }
+ function decodeHtmlEntities(value) {
+ const namedEntities = {
+ nbsp: " ",
+ amp: "&",
+ lt: "<",
+ gt: ">",
+ quot: '"',
+ apos: "'",
+ };
+ return value.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match, entity) => {
+ const normalized = String(entity).toLowerCase();
+ if (normalized.startsWith("#x")) {
+ return safeCodePointToString(Number.parseInt(normalized.slice(2), 16)) ?? match;
+ }
+ if (normalized.startsWith("#")) {
+ return safeCodePointToString(Number.parseInt(normalized.slice(1), 10)) ?? match;
+ }
+ return namedEntities[normalized] ?? match;
+ });
+ }
+ function isAssetLikePath(pathname) {
+ // Keep this list intentionally conservative so docs paths are still crawled
+ // unless they clearly point at static assets/binaries.
+ return /\.(css|js|json|png|jpe?g|gif|svg|ico|webp|pdf|zip|tar|gz|mp4|mp3|woff2?)$/i.test(pathname);
+ }
+ function isSafeLinkUrl(url) {
+ return url.protocol === "http:" || url.protocol === "https:";
+ }
+ function stripDangerousBlockTag(value, tagName) {
+ const pattern = new RegExp(`<${tagName}\\b[^>]*>[\\s\\S]*?<\\/${tagName}\\s*>`, "gi");
+ return value.replace(pattern, "");
+ }
+ function safeCodePointToString(value) {
+ if (!Number.isFinite(value) || value < 0 || value > 0x10ffff)
+ return undefined;
+ try {
+ return String.fromCodePoint(value);
+ }
+ catch {
+ return undefined;
+ }
+ }
+ export { ensureWebsiteMirror, getCachePaths, validateWebsiteInputUrl, validateWebsiteUrl, WebsiteStashProvider };
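
A sketch of where snapshots land on disk, following getCachePaths above (the URL is hypothetical; the base directory from getRegistryIndexCacheDir varies per machine):

import { createHash } from "node:crypto";
// normalizeSiteUrl drops the fragment and trims trailing path slashes, and the URL
// serializer gives "https://docs.example.com" and "https://docs.example.com/" the
// same canonical form, so both spellings share one cache directory.
const canonical = new URL("https://docs.example.com").toString(); // "https://docs.example.com/"
const key = createHash("sha256").update(canonical).digest("hex").slice(0, 16);
// Snapshots land under <cacheRoot>/website-<key>/stash/knowledge/**/*.md,
// alongside manifest.json recording { url, fetchedAt }.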
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "akm-cli",
- "version": "0.3.0",
+ "version": "0.3.1",
  "type": "module",
  "description": "akm (Agent Kit Manager) — A package manager for AI agent skills, commands, tools, and knowledge. Works with Claude Code, OpenCode, Cursor, and any AI coding assistant.",
  "keywords": [