akm-cli 0.7.1 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +35 -0
- package/dist/cli.js +62 -16
- package/dist/commands/history.js +2 -7
- package/dist/commands/info.js +2 -2
- package/dist/commands/installed-stashes.js +45 -1
- package/dist/commands/search.js +2 -2
- package/dist/commands/show.js +4 -19
- package/dist/commands/source-add.js +1 -1
- package/dist/core/common.js +16 -1
- package/dist/core/config.js +18 -3
- package/dist/indexer/db-search.js +33 -39
- package/dist/indexer/db.js +51 -1
- package/dist/indexer/graph-extraction.js +5 -3
- package/dist/indexer/indexer.js +334 -121
- package/dist/indexer/manifest.js +18 -23
- package/dist/indexer/memory-inference.js +47 -58
- package/dist/indexer/metadata.js +253 -21
- package/dist/indexer/search-source.js +11 -5
- package/dist/llm/client.js +61 -1
- package/dist/llm/embedder.js +8 -5
- package/dist/llm/embedders/local.js +8 -2
- package/dist/llm/embedders/remote.js +4 -2
- package/dist/llm/graph-extract.js +4 -4
- package/dist/llm/memory-infer.js +61 -33
- package/dist/llm/metadata-enhance.js +2 -2
- package/dist/output/cli-hints.js +5 -2
- package/dist/output/renderers.js +22 -49
- package/dist/registry/build-index.js +13 -18
- package/dist/setup/setup.js +238 -96
- package/dist/sources/providers/git.js +14 -2
- package/dist/sources/providers/website.js +4 -460
- package/dist/sources/website-ingest.js +470 -0
- package/dist/wiki/wiki.js +11 -1
- package/dist/workflows/parser.js +19 -4
- package/dist/workflows/runs.js +3 -3
- package/docs/README.md +10 -3
- package/docs/migration/release-notes/0.7.0.md +22 -0
- package/package.json +5 -2
|
@@ -0,0 +1,470 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import fs from "node:fs";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import { fetchWithRetry, ResponseTooLargeError, readBodyWithByteCap } from "../core/common";
|
|
5
|
+
import { ConfigError, UsageError } from "../core/errors";
|
|
6
|
+
import { getRegistryIndexCacheDir } from "../core/paths";
|
|
7
|
+
import { warn } from "../core/warn";
|
|
8
|
+
import { isExpired, sanitizeString } from "./providers/provider-utils";
|
|
9
|
+
/** Refresh website snapshots every 12 hours to balance freshness with scraping load. */
const CACHE_REFRESH_INTERVAL_MS = 12 * 60 * 60 * 1000;
/** Allow up to 7 days of stale snapshots when refresh fails so search remains available during outages. */
const CACHE_STALE_MS = 7 * 24 * 60 * 60 * 1000;
/** Allow limited breadth-first expansion without letting the crawl queue grow unbounded. */
const QUEUE_EXPANSION_FACTOR = 5;
/** Default page budget for a crawl; overridable via source `options.maxPages`. */
const MAX_PAGES_DEFAULT = 50;
/** Default link-follow depth; overridable via source `options.maxDepth`. */
const MAX_DEPTH_DEFAULT = 3;
/**
 * Per-page body cap for website scraping. HTML pages this large are
 * almost never useful as agent knowledge sources and a runaway server
 * streaming tens of megabytes would blow memory with no upside.
 */
const WEBSITE_PAGE_BYTE_CAP = 5 * 1024 * 1024;
/**
 * Wall-clock cap for a full crawl (10 minutes). With per-request timeouts
 * of 15s and a `maxPages` default of 50, an unresponsive site could
 * otherwise stall `akm add` for 12.5 minutes with no feedback. Cap the
 * whole crawl and return what we have when time runs out.
 */
const WEBSITE_CRAWL_WALL_CLOCK_MS = 10 * 60 * 1000;
|
|
30
|
+
/**
 * Derive the on-disk cache locations for a website mirror.
 * The directory name embeds a short SHA-256 prefix of the normalized site
 * URL so distinct sites never collide while the path stays readable.
 *
 * @param siteUrl website URL (normalized before hashing)
 * @returns { rootDir, stashDir, manifestPath }
 */
export function getWebsiteCachePaths(siteUrl) {
    const digest = createHash("sha256")
        .update(normalizeSiteUrl(siteUrl))
        .digest("hex");
    const root = path.join(getRegistryIndexCacheDir(), `website-${digest.slice(0, 16)}`);
    return {
        rootDir: root,
        stashDir: path.join(root, "stash"),
        manifestPath: path.join(root, "manifest.json"),
    };
}
|
|
39
|
+
/**
 * Ensure a local markdown mirror of the configured website exists and is fresh.
 *
 * Cache policy (keyed off the manifest file's mtime):
 *  - fresher than CACHE_REFRESH_INTERVAL_MS and not forced -> reuse as-is;
 *  - otherwise re-scrape; if scraping fails, fall back to a snapshot up to
 *    CACHE_STALE_MS old so search keeps working through site outages.
 *
 * @param config  source config; `config.url` is required, `config.options`
 *                may carry `maxPages` / `maxDepth` overrides
 * @param options optional flags: `force` (skip the freshness check) and
 *                `requireStashDir` (only reuse cache when extracted .md exists)
 * @returns the cache paths object from getWebsiteCachePaths
 * @throws ConfigError for an invalid URL; rethrows scrape errors when no
 *         acceptable stale snapshot is available
 */
export async function ensureWebsiteMirror(config, options) {
    const rawUrl = config.url ?? "";
    const normalizedUrl = validateWebsiteUrl(rawUrl);
    const cachePaths = getWebsiteCachePaths(normalizedUrl);
    const requireStashDir = options?.requireStashDir === true;
    const force = options?.force === true;
    // mtime === 0 means "no usable cache": the fresh-hit branch and the
    // stale-fallback branch both treat it as absent.
    let mtime = 0;
    try {
        mtime = fs.statSync(cachePaths.manifestPath).mtimeMs;
    }
    catch {
        /* no cached manifest */
    }
    // Fresh-cache fast path: reuse the mirror without any network activity.
    if (!force &&
        mtime &&
        !isExpired(mtime, CACHE_REFRESH_INTERVAL_MS) &&
        (!requireStashDir || hasExtractedSite(cachePaths.stashDir))) {
        return cachePaths;
    }
    try {
        fs.mkdirSync(cachePaths.rootDir, { recursive: true });
        await scrapeWebsiteToStash(normalizedUrl, cachePaths.stashDir, {
            maxPages: coercePositiveInt(config.options?.maxPages, MAX_PAGES_DEFAULT),
            maxDepth: coercePositiveInt(config.options?.maxDepth, MAX_DEPTH_DEFAULT),
        });
        // The manifest is written last so its mtime marks a completed scrape.
        fs.writeFileSync(cachePaths.manifestPath, `${JSON.stringify({ url: normalizedUrl, fetchedAt: new Date().toISOString() }, null, 2)}\n`, { encoding: "utf8", mode: 0o600 });
        return cachePaths;
    }
    catch (err) {
        // Scrape failed: serve a stale (but not too stale) snapshot if we can.
        if (mtime && !isExpired(mtime, CACHE_STALE_MS) && (!requireStashDir || hasExtractedSite(cachePaths.stashDir))) {
            return cachePaths;
        }
        throw err;
    }
}
|
|
74
|
+
/**
 * Check whether a stash directory contains at least one extracted markdown
 * page under `knowledge/` (directly or one directory level deep).
 * Any filesystem error is treated as "no extracted site".
 */
function hasExtractedSite(stashDir) {
    try {
        const knowledgeDir = path.join(stashDir, "knowledge");
        if (!fs.statSync(stashDir).isDirectory())
            return false;
        if (!fs.statSync(knowledgeDir).isDirectory())
            return false;
        const entries = fs.readdirSync(knowledgeDir, { withFileTypes: true });
        return entries.some((entry) => {
            if (entry.isFile())
                return entry.name.endsWith(".md");
            if (!entry.isDirectory())
                return false;
            const children = fs.readdirSync(path.join(knowledgeDir, entry.name));
            return children.some((child) => child.endsWith(".md"));
        });
    }
    catch {
        return false;
    }
}
|
|
94
|
+
/**
 * Crawl a site and materialize every scraped page as a markdown file under
 * `<stashDir>/knowledge/`, mirroring the URL path structure. The stash is
 * rebuilt from scratch on every run.
 *
 * @throws Error when the crawl yields zero pages
 */
async function scrapeWebsiteToStash(startUrl, stashDir, options) {
    const pages = await crawlWebsite(startUrl, options);
    if (pages.length === 0) {
        throw new Error(`No content could be scraped from ${startUrl}`);
    }
    // Replace any previous snapshot wholesale so deleted pages disappear.
    fs.rmSync(stashDir, { recursive: true, force: true });
    const knowledgeDir = path.join(stashDir, "knowledge");
    fs.mkdirSync(knowledgeDir, { recursive: true });
    const taken = new Set();
    for (const page of pages) {
        const assignedPath = uniqueSlug(urlToRelativePath(page.url), taken);
        const target = path.join(knowledgeDir, `${assignedPath}.md`);
        const parentDir = path.dirname(target);
        if (parentDir !== knowledgeDir)
            fs.mkdirSync(parentDir, { recursive: true });
        const slug = assignedPath.split("/").pop() ?? "index";
        fs.writeFileSync(target, buildMarkdownSnapshot(page, slug), "utf8");
    }
}
|
|
114
|
+
/**
 * One-shot ingest: fetch a single URL, convert it to markdown, and return
 * both the raw page fields and a frontmatter-wrapped snapshot document.
 * No crawling and no persistent source registration happens here.
 *
 * @param rawUrl user-supplied URL
 * @returns { url, title, markdown, preferredName, content }
 * @throws UsageError for invalid URLs or when nothing could be fetched
 */
export async function fetchWebsiteMarkdownSnapshot(rawUrl) {
    const normalizedUrl = validateWebsiteInputUrl(rawUrl);
    const fetched = await fetchWebsitePage(normalizedUrl);
    if (!fetched) {
        throw new UsageError(`No content could be fetched from ${normalizedUrl}`);
    }
    const { page } = fetched;
    const preferredName = deriveImportPath(page.url);
    const slug = preferredName.split("/").pop() ?? preferredName;
    return {
        url: page.url,
        title: page.title,
        markdown: page.markdown,
        preferredName,
        content: buildMarkdownSnapshot(page, slug || "website"),
    };
}
|
|
130
|
+
/**
 * Breadth-first crawl from `startUrl`, restricted to the starting page's
 * origin, returning up to `options.maxPages` scraped pages.
 *
 * Safety limits:
 *  - `options.maxDepth` stops link expansion beyond that depth;
 *  - the queue plus collected pages may not exceed
 *    maxPages * QUEUE_EXPANSION_FACTOR pending URLs;
 *  - the whole crawl is bounded by WEBSITE_CRAWL_WALL_CLOCK_MS — on timeout
 *    we warn and return whatever pages were collected so far.
 *
 * @param startUrl entry URL (normalized before crawling)
 * @param options  { maxPages, maxDepth }
 * @returns array of { url, title, markdown } page records
 */
async function crawlWebsite(startUrl, options) {
    const start = new URL(normalizeSiteUrl(startUrl));
    const allowedOrigin = start.origin;
    const queue = [{ url: start.toString(), depth: 0 }];
    const visited = new Set();
    const pages = [];
    const deadline = Date.now() + WEBSITE_CRAWL_WALL_CLOCK_MS;
    while (queue.length > 0 && pages.length < options.maxPages) {
        // Wall-clock check first so a slow site cannot stall the CLI indefinitely.
        if (Date.now() > deadline)
            break;
        const next = queue.shift();
        if (!next)
            break;
        const normalized = normalizeCrawlUrl(next.url);
        if (!normalized || visited.has(normalized))
            continue;
        visited.add(normalized);
        const fetched = await fetchWebsitePage(normalized);
        // fetchWebsitePage returns null for 404s and oversized bodies; skip those.
        if (!fetched)
            continue;
        pages.push(fetched.page);
        if (next.depth >= options.maxDepth)
            continue;
        for (const link of fetched.links) {
            // Bound the pending queue so link-dense pages cannot grow it unbounded.
            if (queue.length + pages.length >= options.maxPages * QUEUE_EXPANSION_FACTOR)
                break;
            // Same-origin only: never crawl off the starting site.
            if (link.origin !== allowedOrigin)
                continue;
            const candidate = normalizeCrawlUrl(link.toString());
            if (!candidate || visited.has(candidate) || isAssetLikePath(link.pathname))
                continue;
            queue.push({ url: candidate, depth: next.depth + 1 });
        }
    }
    if (Date.now() > deadline) {
        // NOTE(review): assumes warn() supports util.format-style %d/%s
        // placeholders — confirm against core/warn.
        warn("[akm] website crawl stopped at the %ds wall-clock cap with %d/%d pages collected from %s.", WEBSITE_CRAWL_WALL_CLOCK_MS / 1000, pages.length, options.maxPages, startUrl);
    }
    return pages;
}
|
|
169
|
+
/**
 * Fetch a single page and convert it for ingestion.
 *
 * Returns `{ page: { url, title, markdown }, links }` on success, or `null`
 * when the page should simply be skipped (HTTP 404, or a body exceeding
 * WEBSITE_PAGE_BYTE_CAP). Any other HTTP failure throws so callers can
 * surface a real error.
 *
 * @param pageUrl absolute URL to fetch
 * @throws Error on non-404 HTTP failures; rethrows non-size read errors
 */
async function fetchWebsitePage(pageUrl) {
    const response = await fetchWithRetry(pageUrl, {
        headers: {
            Accept: "text/html, text/markdown, text/plain;q=0.9, application/xhtml+xml;q=0.8",
            "User-Agent": "akm-cli website provider",
        },
    }, { timeout: 15_000, retries: 1 });
    if (!response.ok) {
        // A missing page is normal during a crawl; anything else is an error.
        if (response.status === 404)
            return null;
        throw new Error(`Failed to fetch website content (${response.status}) from ${pageUrl}`);
    }
    const contentType = response.headers.get("content-type")?.toLowerCase() ?? "";
    let body;
    try {
        body = await readBodyWithByteCap(response, WEBSITE_PAGE_BYTE_CAP);
    }
    catch (err) {
        // Oversized pages are skipped rather than failing the whole crawl.
        if (err instanceof ResponseTooLargeError)
            return null;
        throw err;
    }
    // Prefer the post-redirect URL so dedupe and link resolution track the
    // real location.
    const finalUrl = normalizeCrawlUrl(response.url || pageUrl) ?? pageUrl;
    if (contentType.includes("text/html") || contentType.includes("application/xhtml+xml") || looksLikeMarkup(body)) {
        // NOTE(review): the looksLikeMarkup sniff means plain-text or markdown
        // responses containing inline HTML tags are converted as HTML — confirm
        // this is intended.
        const title = extractHtmlTitle(body) || new URL(finalUrl).hostname;
        return {
            page: {
                url: finalUrl,
                title,
                markdown: htmlToMarkdown(body, finalUrl),
            },
            links: extractSameDocumentLinks(body, finalUrl),
        };
    }
    // Non-HTML (markdown / plain text) bodies are kept verbatim; such pages
    // contribute no further crawl links.
    return {
        page: {
            url: finalUrl,
            title: extractTextTitle(body) || new URL(finalUrl).hostname,
            markdown: body.trim(),
        },
        links: [],
    };
}
|
|
212
|
+
/**
 * Wrap a scraped page in a stash-compatible markdown document: YAML-style
 * frontmatter (name/description/sourceUrl/title/tags) followed by an H1,
 * a source line, and the page body. All frontmatter values are JSON-quoted
 * so scraped text can never break the frontmatter syntax.
 */
function buildMarkdownSnapshot(page, slug) {
    const title = sanitizeString(page.title, 200) || slug;
    const description = sanitizeString(`Snapshot of ${page.url}`, 500);
    const host = sanitizeString(new URL(page.url).hostname, 120);
    // Guarantee a non-empty body so downstream indexing always has content.
    const body = page.markdown.trim() || `Source: ${page.url}`;
    const frontmatter = [
        "---",
        `name: ${JSON.stringify(slug)}`,
        `description: ${JSON.stringify(description)}`,
        `sourceUrl: ${JSON.stringify(page.url)}`,
        `title: ${JSON.stringify(title)}`,
        "tags:",
        ` - ${JSON.stringify("website")}`,
        ` - ${JSON.stringify(host)}`,
        "---",
    ];
    const document = [...frontmatter, "", `# ${title}`, "", `Source: ${page.url}`, "", body, ""];
    return document.join("\n");
}
|
|
236
|
+
/**
 * Validate a website URL coming from stored configuration.
 * Failures surface as ConfigError (bad stored config, not bad user input).
 */
export function validateWebsiteUrl(rawUrl) {
    return validateWebsiteUrlWithError(rawUrl, ConfigError);
}
|
|
239
|
+
/**
 * Validate a website URL typed directly by the user on the CLI.
 * Failures surface as UsageError (bad invocation, not bad config).
 */
export function validateWebsiteInputUrl(rawUrl) {
    return validateWebsiteUrlWithError(rawUrl, UsageError);
}
|
|
242
|
+
/**
 * Shared URL validation: requires a parseable http(s) URL with no embedded
 * credentials, then returns the normalized form (no fragment, no trailing
 * slash on non-root paths).
 *
 * @param rawUrl    URL string to validate
 * @param ErrorType error constructor used for every rejection
 * @returns the normalized URL string
 */
function validateWebsiteUrlWithError(rawUrl, ErrorType) {
    if (!rawUrl) {
        throw new ErrorType("Website provider requires a URL");
    }
    let parsed;
    try {
        parsed = new URL(rawUrl);
    }
    catch {
        throw new ErrorType(`Website URL is not valid: "${rawUrl}"`);
    }
    const { protocol } = parsed;
    if (protocol !== "http:" && protocol !== "https:") {
        throw new ErrorType(`Website URL must use http:// or https://, got "${protocol}" in "${rawUrl}"`);
    }
    // Credentials in the URL would leak into cache keys and logs.
    if (parsed.username || parsed.password) {
        throw new ErrorType("Website URL must not contain embedded credentials");
    }
    parsed.hash = "";
    return normalizeSiteUrl(parsed.toString());
}
|
|
262
|
+
/**
 * Canonicalize a site URL for hashing and comparison: drop the fragment and
 * strip trailing slashes from non-root paths so `/docs` and `/docs/` map to
 * the same cache entry.
 */
function normalizeSiteUrl(rawUrl) {
    const url = new URL(rawUrl);
    url.hash = "";
    const trimmable = url.pathname !== "/" && url.pathname.endsWith("/");
    if (trimmable) {
        url.pathname = url.pathname.replace(/\/+$/, "");
    }
    return url.toString();
}
|
|
270
|
+
/**
 * Normalize a candidate crawl URL for dedupe, or return null when it is not
 * a valid http(s) URL. Same canonical form as normalizeSiteUrl: fragment
 * removed, non-root trailing slashes stripped.
 */
function normalizeCrawlUrl(rawUrl) {
    let url;
    try {
        url = new URL(rawUrl);
    }
    catch {
        return null;
    }
    if (url.protocol !== "http:" && url.protocol !== "https:") {
        return null;
    }
    url.hash = "";
    if (url.pathname !== "/" && url.pathname.endsWith("/")) {
        url.pathname = url.pathname.replace(/\/+$/, "");
    }
    return url.toString();
}
|
|
285
|
+
/**
 * Map a URL to a filesystem-safe relative path: each path segment is
 * slugified, a query string (if any) is folded into the last segment with
 * an underscore, and a path-less URL becomes "index".
 */
function urlToRelativePath(rawUrl) {
    const parsed = new URL(rawUrl);
    const segments = [];
    for (const rawSegment of parsed.pathname.split("/")) {
        if (!rawSegment)
            continue;
        const slug = slugifySegment(rawSegment);
        if (slug)
            segments.push(slug);
    }
    if (parsed.search && segments.length > 0) {
        const querySuffix = slugifySegment(parsed.search.slice(1));
        if (querySuffix) {
            const last = segments.length - 1;
            segments[last] = `${segments[last]}_${querySuffix}`;
        }
    }
    return segments.length > 0 ? segments.join("/") : "index";
}
|
|
300
|
+
/**
 * Derive a default asset name for a one-shot URL import. Non-root URLs use
 * their slugified path; root URLs fall back to the hostname, with any query
 * string appended as a dashed suffix.
 */
function deriveImportPath(rawUrl) {
    const parsed = new URL(rawUrl);
    const relativePath = urlToRelativePath(rawUrl);
    if (relativePath !== "index") {
        return relativePath;
    }
    const host = slugifySegment(parsed.hostname) || "website";
    if (!parsed.search) {
        return host;
    }
    const querySuffix = slugifySegment(parsed.search.slice(1));
    if (querySuffix) {
        return `${host}-${querySuffix}`;
    }
    return host;
}
|
|
311
|
+
/**
 * Reduce a path/host/query segment to a lowercase slug: sanitize, collapse
 * disallowed character runs to single dashes, and trim edge dashes. May
 * return "" for input with no usable characters.
 */
function slugifySegment(value) {
    const lowered = sanitizeString(value, 200).toLowerCase();
    const dashed = lowered.replace(/[^a-z0-9._-]+/g, "-");
    return dashed.replace(/^-+|-+$/g, "");
}
|
|
317
|
+
/**
 * Reserve a unique name in `used`: the base itself if free, otherwise
 * `base-2`, `base-3`, ... . An empty base falls back to "website".
 * Mutates `used` by adding the chosen name.
 */
function uniqueSlug(base, used) {
    const seed = base || "website";
    let candidate = seed;
    for (let suffix = 2; used.has(candidate); suffix += 1) {
        candidate = `${seed}-${suffix}`;
    }
    used.add(candidate);
    return candidate;
}
|
|
328
|
+
/**
 * Coerce a config value to a positive integer. Accepts positive integer
 * numbers and base-10 numeric strings; everything else (zero, negatives,
 * floats, other types) yields `fallback`.
 */
function coercePositiveInt(value, fallback) {
    let candidate;
    if (typeof value === "number") {
        candidate = value;
    }
    else if (typeof value === "string") {
        candidate = Number.parseInt(value, 10);
    }
    if (candidate !== undefined && Number.isInteger(candidate) && candidate > 0) {
        return candidate;
    }
    return fallback;
}
|
|
338
|
+
/**
 * Heuristic markup sniff: an <html>/<body> opening tag or any closing tag
 * marks the body as HTML even when the content-type header says otherwise.
 */
function looksLikeMarkup(body) {
    const markupPattern = /<html[\s>]|<body[\s>]|<\/[a-z][\w:-]*>/i;
    return markupPattern.test(body);
}
|
|
341
|
+
/**
 * Extract a page title from HTML: the first non-empty <title>, falling back
 * to the first non-empty <h1>, with tags stripped and entities decoded.
 * Returns undefined when neither yields text.
 */
function extractHtmlTitle(html) {
    const candidates = [
        /<title[^>]*>([\s\S]*?)<\/title>/i,
        /<h1[^>]*>([\s\S]*?)<\/h1>/i,
    ];
    for (const pattern of candidates) {
        const inner = html.match(pattern)?.[1];
        if (inner) {
            return decodeHtmlEntities(stripTags(inner)).trim();
        }
    }
    return undefined;
}
|
|
350
|
+
/**
 * Derive a title from plain text / markdown: the first non-blank line wins.
 * A markdown heading has its marker stripped; any other line is capped at
 * 120 characters. Returns undefined for all-blank input.
 */
function extractTextTitle(text) {
    for (const rawLine of text.split(/\r?\n/)) {
        const line = rawLine.trim();
        if (line === "")
            continue;
        return line.startsWith("#") ? line.replace(/^#+\s*/, "") : line.slice(0, 120);
    }
    return undefined;
}
|
|
361
|
+
/**
 * Collect candidate crawl links from anchor tags: quoted hrefs only,
 * resolved against `pageUrl`, skipping pure fragments, malformed URLs, and
 * non-http(s) schemes. Returns URL objects (origin filtering is the
 * caller's job).
 */
function extractSameDocumentLinks(html, pageUrl) {
    const hrefPattern = /<a\b[^>]*href\s*=\s*(['"])(.*?)\1[^>]*>/gi;
    const links = [];
    for (const match of html.matchAll(hrefPattern)) {
        const href = match[2]?.trim();
        if (!href || href.startsWith("#"))
            continue;
        let resolved;
        try {
            resolved = new URL(href, pageUrl);
        }
        catch {
            /* ignore malformed links */
            continue;
        }
        if (isSafeLinkUrl(resolved)) {
            links.push(resolved);
        }
    }
    return links;
}
|
|
380
|
+
/**
 * Convert raw HTML to readable markdown via ordered regex passes.
 *
 * Pass order matters: dangerous blocks are removed first, then code (so
 * later passes never rewrite code contents), then links/headings/list
 * items, then remaining block-level tags become paragraph breaks, and
 * finally leftover tags are stripped and entities decoded once.
 *
 * @param html    raw page markup
 * @param pageUrl base URL used to resolve relative link hrefs
 * @returns markdown text with normalized whitespace
 */
function htmlToMarkdown(html, pageUrl) {
    let text = html;
    // Remove non-content blocks wholesale.
    text = stripDangerousBlockTag(text, "script");
    text = stripDangerousBlockTag(text, "style");
    text = stripDangerousBlockTag(text, "noscript");
    text = stripDangerousBlockTag(text, "template");
    // Fenced code blocks. NOTE(review): requires <pre><code> / </code></pre>
    // to be directly adjacent with nothing between the tags — confirm that
    // matches the sites being scraped.
    text = text.replace(/<pre\b[^>]*><code\b[^>]*>([\s\S]*?)<\/code><\/pre>/gi, (_match, code) => {
        const decoded = decodeHtmlEntities(stripTags(code)).trim();
        return decoded ? `\n\n\`\`\`\n${decoded}\n\`\`\`\n\n` : "\n\n";
    });
    // Inline code spans.
    text = text.replace(/<code\b[^>]*>([\s\S]*?)<\/code>/gi, (_match, code) => {
        const decoded = decodeHtmlEntities(stripTags(code)).trim();
        return decoded ? `\`${decoded}\`` : "";
    });
    // Anchors -> [label](resolved-url); unsafe/unresolvable hrefs keep only
    // the label text.
    text = text.replace(/<a\b[^>]*href\s*=\s*(['"])(.*?)\1[^>]*>([\s\S]*?)<\/a>/gi, (_match, _q, href, body) => {
        const label = decodeHtmlEntities(stripTags(body)).trim();
        if (!label)
            return "";
        try {
            const resolved = new URL(href, pageUrl);
            if (!isSafeLinkUrl(resolved))
                return label;
            return `[${label}](${resolved})`;
        }
        catch {
            return label;
        }
    });
    // h1..h6 -> markdown headings of matching level.
    text = text.replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>/gi, (_match, level, body) => {
        const heading = decodeHtmlEntities(stripTags(body)).trim();
        return heading ? `\n\n${"#".repeat(Number(level))} ${heading}\n\n` : "\n\n";
    });
    // List items -> dash bullets (ordered lists also become dashes).
    text = text.replace(/<li\b[^>]*>([\s\S]*?)<\/li>/gi, (_match, body) => {
        const item = decodeHtmlEntities(stripTags(body)).trim();
        return item ? `\n- ${item}` : "";
    });
    // Remaining block-level tags become paragraph breaks.
    text = text.replace(/<(p|div|section|article|main|header|footer|blockquote|table|tr)\b[^>]*>/gi, "\n\n");
    text = text.replace(/<\/(p|div|section|article|main|header|footer|blockquote|table|tr)>/gi, "\n\n");
    text = text.replace(/<br\s*\/?>/gi, "\n");
    text = text.replace(/<\/?(ul|ol)\b[^>]*>/gi, "\n");
    // Strip whatever tags remain, then decode entities once at the end.
    text = decodeHtmlEntities(stripTags(text));
    // Normalize whitespace: drop CRs, trailing spaces, and blank-line runs.
    text = text
        .replace(/\r/g, "")
        .replace(/[ \t]+\n/g, "\n")
        .replace(/\n{3,}/g, "\n\n")
        .trim();
    return text;
}
|
|
428
|
+
/**
 * Replace every HTML tag with a single space so words separated only by
 * tags do not run together.
 */
function stripTags(value) {
    const tagPattern = /<[^>]+>/g;
    return value.replace(tagPattern, " ");
}
|
|
431
|
+
/**
 * Decode numeric (&#123; / &#x1f600;) and a small set of named HTML
 * entities. Unknown entities and out-of-range code points are left as-is.
 */
function decodeHtmlEntities(value) {
    // NOTE(review): the `nbsp` literal below renders as a plain space in this
    // build output — confirm whether it is (or should be) U+00A0.
    const namedEntities = {
        nbsp: " ",
        amp: "&",
        lt: "<",
        gt: ">",
        quot: '"',
        apos: "'",
    };
    return value.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match, entity) => {
        const normalized = String(entity).toLowerCase();
        if (normalized.startsWith("#x")) {
            // Hexadecimal numeric entity.
            return safeCodePointToString(Number.parseInt(normalized.slice(2), 16)) ?? match;
        }
        if (normalized.startsWith("#")) {
            // Decimal numeric entity.
            return safeCodePointToString(Number.parseInt(normalized.slice(1), 10)) ?? match;
        }
        return namedEntities[normalized] ?? match;
    });
}
|
|
451
|
+
/**
 * True when a URL path ends in a non-page asset extension (styles, scripts,
 * images, archives, media, fonts) that the crawler should never follow.
 */
function isAssetLikePath(pathname) {
    const assetExtensions = /\.(css|js|json|png|jpe?g|gif|svg|ico|webp|pdf|zip|tar|gz|mp4|mp3|woff2?)$/i;
    return assetExtensions.test(pathname);
}
|
|
454
|
+
/**
 * Only http(s) links are safe to follow or emit in markdown; javascript:,
 * data:, mailto: and friends are rejected.
 */
function isSafeLinkUrl(url) {
    return ["http:", "https:"].includes(url.protocol);
}
|
|
457
|
+
/**
 * Remove every `<tagName ...>...</tagName>` block (including contents),
 * case-insensitively and tolerating whitespace before the closing bracket.
 * Intended for non-content tags like script/style/noscript/template.
 */
function stripDangerousBlockTag(value, tagName) {
    const blockPattern = new RegExp(`<${tagName}\\b[^>]*>[\\s\\S]*?<\\/${tagName}\\s*>`, "gi");
    return value.replace(blockPattern, "");
}
|
|
461
|
+
/**
 * Convert a numeric code point to a string, or undefined when the value is
 * out of Unicode range or rejected by String.fromCodePoint (e.g. a
 * non-integer parse artifact).
 */
function safeCodePointToString(value) {
    const inUnicodeRange = Number.isFinite(value) && value >= 0 && value <= 0x10ffff;
    if (!inUnicodeRange) {
        return undefined;
    }
    try {
        return String.fromCodePoint(value);
    }
    catch {
        return undefined;
    }
}
|
package/dist/wiki/wiki.js
CHANGED
|
@@ -894,7 +894,17 @@ export function regenerateWikiIndex(stashDir, name) {
|
|
|
894
894
|
}
|
|
895
895
|
lines.push("");
|
|
896
896
|
}
|
|
897
|
-
|
|
897
|
+
const next = `${lines.join("\n")}\n`;
|
|
898
|
+
const indexPath = path.join(wikiDir, INDEX_MD);
|
|
899
|
+
try {
|
|
900
|
+
const current = fs.readFileSync(indexPath, "utf8");
|
|
901
|
+
if (current === next)
|
|
902
|
+
return false;
|
|
903
|
+
}
|
|
904
|
+
catch {
|
|
905
|
+
/* missing file -> write below */
|
|
906
|
+
}
|
|
907
|
+
fs.writeFileSync(indexPath, next, "utf8");
|
|
898
908
|
return true;
|
|
899
909
|
}
|
|
900
910
|
catch {
|
package/dist/workflows/parser.js
CHANGED
|
@@ -24,10 +24,25 @@ const SUBSECTION_COMPLETION_CRITERIA = "Completion Criteria";
|
|
|
24
24
|
* the matcher and parser cannot drift.
|
|
25
25
|
*/
|
|
26
26
|
export function looksLikeWorkflow(body) {
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
27
|
+
const structuralBody = stripFencedCodeBlocks(body);
|
|
28
|
+
return (/^#\s+Workflow:\s+/m.test(structuralBody) &&
|
|
29
|
+
/^##\s+Step:\s+/m.test(structuralBody) &&
|
|
30
|
+
/^Step ID:\s+/m.test(structuralBody) &&
|
|
31
|
+
/^###\s+Instructions\s*$/m.test(structuralBody));
|
|
32
|
+
}
|
|
33
|
+
function stripFencedCodeBlocks(body) {
|
|
34
|
+
let inFence = false;
|
|
35
|
+
const lines = body.split(/\r?\n/);
|
|
36
|
+
const stripped = [];
|
|
37
|
+
for (const line of lines) {
|
|
38
|
+
if (/^\s*```/.test(line) || /^\s*~~~/.test(line)) {
|
|
39
|
+
inFence = !inFence;
|
|
40
|
+
stripped.push("");
|
|
41
|
+
continue;
|
|
42
|
+
}
|
|
43
|
+
stripped.push(inFence ? "" : line);
|
|
44
|
+
}
|
|
45
|
+
return stripped.join("\n");
|
|
31
46
|
}
|
|
32
47
|
export function parseWorkflow(markdown, source) {
|
|
33
48
|
const errors = [];
|
package/dist/workflows/runs.js
CHANGED
|
@@ -5,7 +5,7 @@ import { loadConfig } from "../core/config";
|
|
|
5
5
|
import { NotFoundError, UsageError } from "../core/errors";
|
|
6
6
|
import { appendEvent } from "../core/events";
|
|
7
7
|
import { getDbPath } from "../core/paths";
|
|
8
|
-
import { closeDatabase,
|
|
8
|
+
import { closeDatabase, openExistingDatabase } from "../indexer/db";
|
|
9
9
|
import { resolveSourceEntries } from "../indexer/search-source";
|
|
10
10
|
import { resolveSourcesForOrigin } from "../registry/origin-resolve";
|
|
11
11
|
import { resolveAssetPath } from "../sources/resolve";
|
|
@@ -264,7 +264,7 @@ function readWorkflowDocumentFromIndex(sourcePath, ref) {
|
|
|
264
264
|
const dbPath = getDbPath();
|
|
265
265
|
if (!fs.existsSync(dbPath))
|
|
266
266
|
return null;
|
|
267
|
-
const db =
|
|
267
|
+
const db = openExistingDatabase(dbPath);
|
|
268
268
|
try {
|
|
269
269
|
const parsed = parseAssetRef(ref);
|
|
270
270
|
const entryKey = `${sourcePath}:${parsed.type}:${parsed.name}`;
|
|
@@ -315,7 +315,7 @@ function resolveWorkflowEntryId(sourcePath, ref) {
|
|
|
315
315
|
const dbPath = getDbPath();
|
|
316
316
|
if (!fs.existsSync(dbPath))
|
|
317
317
|
return null;
|
|
318
|
-
const db =
|
|
318
|
+
const db = openExistingDatabase(dbPath);
|
|
319
319
|
try {
|
|
320
320
|
const parsed = parseAssetRef(ref);
|
|
321
321
|
const entryKey = `${sourcePath}:${parsed.type}:${parsed.name}`;
|
package/docs/README.md
CHANGED
|
@@ -10,8 +10,8 @@
|
|
|
10
10
|
|
|
11
11
|
## Upgrading
|
|
12
12
|
|
|
13
|
-
- [v1 migration guide](migration/v1.md) -- The path from 0.x to v1.0
|
|
14
|
-
- [Release notes (latest: 0.7.0)](migration/release-notes/0.7.0.md) -- Per-release notes drop into `migration/release-notes
|
|
13
|
+
- [v1 migration guide](migration/v1.md) -- The path from 0.x to v1.0, including the `.stash.json` removal scheduled for v0.8.0
|
|
14
|
+
- [Release notes (latest: 0.7.0)](migration/release-notes/0.7.0.md) -- Per-release notes drop into `migration/release-notes/`, including current pre-release removals
|
|
15
15
|
- [v0.5 → v0.6 migration guide](migration/v0.5-to-v0.6.md) -- Every breaking change with before/after code, publisher checklist, and troubleshooting
|
|
16
16
|
|
|
17
17
|
## Reference
|
|
@@ -19,7 +19,14 @@
|
|
|
19
19
|
- [CLI](cli.md) -- All `akm` commands and flags
|
|
20
20
|
- [Registry](registry.md) -- Registries, search, hosting, and managing sources
|
|
21
21
|
- [Configuration](configuration.md) -- Providers, settings, and Ollama setup
|
|
22
|
-
- [Filesystem](technical/filesystem.md) -- Directory layout
|
|
22
|
+
- [Filesystem](technical/filesystem.md) -- Directory layout plus `.stash.json` deprecation and migration notes
|
|
23
|
+
|
|
24
|
+
## Official Ecosystem Repositories
|
|
25
|
+
|
|
26
|
+
- [itlackey/akm-stash](https://github.com/itlackey/akm-stash) -- the official onboarding stash with ready-made assets you can install with `akm add`
|
|
27
|
+
- [itlackey/akm-registry](https://github.com/itlackey/akm-registry) -- the official registry index that powers built-in discovery
|
|
28
|
+
- [itlackey/akm-plugins](https://github.com/itlackey/akm-plugins) -- optional integrations for tools like OpenCode
|
|
29
|
+
- [itlackey/akm-bench](https://github.com/itlackey/akm-bench) -- the standalone benchmark and evaluation repo for akm
|
|
23
30
|
|
|
24
31
|
## Internals
|
|
25
32
|
|
|
@@ -17,6 +17,14 @@ If you are coming from 0.6.x, the
|
|
|
17
17
|
canonical upgrade reference. This file is the executive summary of
|
|
18
18
|
what's new in 0.7.0.
|
|
19
19
|
|
|
20
|
+
For stash authors on the 0.7.x pre-release line: `.stash.json` remains supported
|
|
21
|
+
for compatibility in this release, but it is deprecated and will be removed in
|
|
22
|
+
v0.8.0. That timeline is intentional: during this aggressive pre-release
|
|
23
|
+
phase-out window, compatibility shims do not stay around until 1.0 unless they
|
|
24
|
+
still earn their cost. Prefer frontmatter for markdown assets and structured
|
|
25
|
+
code comments for scripts, and migrate any remaining `.stash.json` metadata
|
|
26
|
+
before taking the 0.8 upgrade.
|
|
27
|
+
|
|
20
28
|
## Major new surfaces in 0.7.0
|
|
21
29
|
|
|
22
30
|
### Proposal queue (`akm proposal *`) — new (#225, #226, #233)
|
|
@@ -150,6 +158,20 @@ This is the surface 0.7.0 commits to maintain through 0.8.x / 0.9.x.
|
|
|
150
158
|
Renaming or removing any of these commands at 1.0 GA would be a major
|
|
151
159
|
version bump.
|
|
152
160
|
|
|
161
|
+
## Post-0.7.0 additive surfaces
|
|
162
|
+
|
|
163
|
+
These landed after the 0.7.0 cut without changing the existing command grammar:
|
|
164
|
+
|
|
165
|
+
- `akm import <url>` now fetches one HTTP/HTTPS URL, converts it to markdown,
|
|
166
|
+
and writes it into `knowledge/` using a URL-path-derived default name.
|
|
167
|
+
- `akm wiki stash <name> <url>` now fetches one HTTP/HTTPS URL, converts it to
|
|
168
|
+
markdown, and writes it into `wikis/<name>/raw/`.
|
|
169
|
+
- Both flows are one-shot ingest only: they do not register a persistent
|
|
170
|
+
website source and they do not crawl linked pages.
|
|
171
|
+
- Website source sync (`akm add <url> --provider website`) and one-shot URL
|
|
172
|
+
ingest now share the same `src/sources/website-ingest.ts` module for URL
|
|
173
|
+
validation, fetch/convert, and mirror generation behavior.
|
|
174
|
+
|
|
153
175
|
## Pre-prod hardening (PR #275)
|
|
154
176
|
|
|
155
177
|
PR #275 batched five issues plus the bench tmp follow-up that operators
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "akm-cli",
|
|
3
|
-
"version": "0.7.
|
|
3
|
+
"version": "0.7.3",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "akm (Agent Kit Manager) — A package manager for AI agent skills, commands, tools, and knowledge. Works with Claude Code, OpenCode, Cursor, and any AI coding assistant.",
|
|
6
6
|
"keywords": [
|
|
@@ -46,6 +46,9 @@
|
|
|
46
46
|
"check": "bun run lint && bunx tsc --noEmit && bun test ./tests",
|
|
47
47
|
"check:changed": "bun test tests/output-baseline.test.ts tests/e2e.test.ts tests/stash-search.test.ts && bun run lint && bunx tsc --noEmit",
|
|
48
48
|
"test": "bun test ./tests",
|
|
49
|
+
"lint:devto-posts": "bun scripts/lint-devto-posts.ts",
|
|
50
|
+
"lint:devto-posts:fix": "bun scripts/lint-devto-posts.ts --fix",
|
|
51
|
+
"publish:devto": "npx -y @sinedied/devto-cli push \"docs/posts/**/*.md\" --token \"$DEVTO_TOKEN\" --repo \"$GITHUB_REPOSITORY\" --branch \"${GITHUB_REF_NAME:-main}\" --reconcile",
|
|
49
52
|
"release:check": "./tests/release-check.sh",
|
|
50
53
|
"lint": "bunx biome check src/ tests/",
|
|
51
54
|
"lint:fix": "bunx biome check --write src/ tests/",
|
|
@@ -63,7 +66,7 @@
|
|
|
63
66
|
"typescript": "^5.9.3"
|
|
64
67
|
},
|
|
65
68
|
"optionalDependencies": {
|
|
66
|
-
"@huggingface/transformers": "
|
|
69
|
+
"@huggingface/transformers": "^4.2.0",
|
|
67
70
|
"sqlite-vec": "0.1.7-alpha.2"
|
|
68
71
|
},
|
|
69
72
|
"engines": {
|