okfy-ai 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +115 -0
- package/assets/demo.gif +0 -0
- package/assets/logo.svg +14 -0
- package/dist/chunk-C46QXZDU.js +1013 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +151 -0
- package/dist/index.d.ts +179 -0
- package/dist/index.js +40 -0
- package/docs/mcp-clients.md +278 -0
- package/examples/README.md +98 -0
- package/examples/bundles/okfy-docs/concepts/index.md +14 -0
- package/examples/bundles/okfy-docs/concepts/okf-bundle.md +33 -0
- package/examples/bundles/okfy-docs/concepts/progressive-disclosure.md +26 -0
- package/examples/bundles/okfy-docs/guides/import-local-markdown.md +31 -0
- package/examples/bundles/okfy-docs/guides/index.md +14 -0
- package/examples/bundles/okfy-docs/guides/serve-over-mcp.md +29 -0
- package/examples/bundles/okfy-docs/index.md +22 -0
- package/examples/bundles/okfy-docs/okfy-example.json +10 -0
- package/examples/bundles/okfy-docs/reference/index.md +13 -0
- package/examples/bundles/okfy-docs/reference/mcp-tools.md +36 -0
- package/examples/bundles/stripe-checkout-small/index.md +21 -0
- package/examples/bundles/stripe-checkout-small/okfy-example.json +11 -0
- package/examples/bundles/stripe-checkout-small/quickstart.md +26 -0
- package/examples/bundles/stripe-checkout-small/sessions.md +20 -0
- package/examples/bundles/stripe-checkout-small/webhooks.md +19 -0
- package/examples/local-markdown/concepts/okf-bundle.md +19 -0
- package/examples/local-markdown/concepts/progressive-disclosure.md +15 -0
- package/examples/local-markdown/guides/import-local-markdown.md +20 -0
- package/examples/local-markdown/guides/serve-over-mcp.md +17 -0
- package/examples/local-markdown/index.md +11 -0
- package/examples/local-markdown/okfy-example.json +10 -0
- package/examples/local-markdown/reference/mcp-tools.md +25 -0
- package/package.json +71 -0
|
@@ -0,0 +1,1013 @@
|
|
|
1
|
+
// src/normalize.ts
|
|
2
|
+
import * as cheerio from "cheerio";
|
|
3
|
+
import TurndownService from "turndown";
|
|
4
|
+
|
|
5
|
+
// src/util/path.ts
|
|
6
|
+
import path from "path";
|
|
7
|
+
/**
 * Convert a path that uses the host separator into a forward-slash path.
 *
 * @param {string} input - Path written with `path.sep` as its separator.
 * @returns {string} The same path with every `path.sep` replaced by "/".
 */
function toPosixPath(input) {
  const segments = input.split(path.sep);
  return segments.join("/");
}
|
|
10
|
+
/**
 * Drop a trailing ".md" extension (case-insensitive) from a path or file name.
 *
 * @param {string} input - Path or file name.
 * @returns {string} `input` without its final ".md"; unchanged if it has none.
 */
function stripMdExtension(input) {
  const withoutExtension = input.replace(/\.md$/i, "");
  return withoutExtension;
}
|
|
13
|
+
/**
 * Turn one path segment (or heading text) into a lowercase, filesystem-safe slug.
 *
 * The input is percent-decoded when possible, NFKD-normalised, every run of
 * characters outside `[A-Za-z0-9_.~-]` is collapsed to a single "-", and
 * leading/trailing dashes are removed.
 *
 * @param {string} input - Raw segment, possibly percent-encoded.
 * @returns {string} The slug, or "index" when nothing usable remains.
 */
function safeSegment(input) {
  let decoded = input;
  try {
    decoded = decodeURIComponent(input);
  } catch {
    // Malformed percent-escape: keep the raw input and sanitise that instead.
    decoded = input;
  }
  const cleaned = decoded
    .normalize("NFKD")
    .replace(/[^\w.\-~]+/g, "-")
    .replace(/^-+|-+$/g, "")
    .replace(/-{2,}/g, "-")
    .toLowerCase();
  // "." and ".." are directory references, not names. Input such as "%2e%2e%20"
  // sanitises to "..", which would let a generated path climb out of the output
  // directory once segments are joined, so treat both like an empty segment.
  if (cleaned === "." || cleaned === "..") return "index";
  return cleaned || "index";
}
|
|
23
|
+
/**
 * Map an arbitrary source path onto a relative ".md" output path.
 *
 * Leading and trailing slashes are dropped, each segment is passed through
 * `safeSegment`, and the last segment gets a ".md" extension: a known
 * extension (.md, .mdx, .html, .htm, .txt) is replaced, anything else has
 * ".md" appended.
 *
 * @param {string} input - Source path or URL pathname.
 * @returns {string} Relative Markdown path; "index.md" for empty or root input.
 */
function ensureMarkdownPath(input) {
  if (!input || input === "/") return "index.md";
  const trimmed = input.replace(/^\/+/, "").replace(/\/+$/, "");
  if (!trimmed) return "index.md";
  const parts = trimmed.split("/").map(safeSegment);
  const lastIndex = parts.length - 1;
  const last = parts[lastIndex] ?? "index";
  const hasKnownExtension = /\.(md|mdx|html?|txt)$/i.test(last);
  parts[lastIndex] = hasKnownExtension
    ? last.replace(/\.(mdx|html?|txt)$/i, ".md")
    : `${last}.md`;
  return parts.join("/");
}
|
|
36
|
+
/**
 * Derive the bundle-relative Markdown path for a crawled URL.
 *
 * The site root maps to "index.md"; a pathname ending in "/" maps to
 * "<sanitised segments>/index.md"; any other pathname is delegated to
 * `ensureMarkdownPath`.
 *
 * @param {string} url - Absolute URL.
 * @returns {string} Relative output path ending in ".md".
 * @throws {TypeError} If `url` cannot be parsed by `URL`.
 */
function urlToOutputPath(url) {
  const { pathname } = new URL(url);
  if (pathname === "/" || pathname === "") return "index.md";
  if (!pathname.endsWith("/")) return ensureMarkdownPath(pathname);
  const segments = pathname
    .replace(/^\/+|\/+$/g, "")
    .split("/")
    .map(safeSegment);
  return `${segments.join("/")}/index.md`;
}
|
|
46
|
+
/**
 * Build the relative link that points from one bundle file to another.
 *
 * @param {string} fromPath - Path of the file containing the link.
 * @param {string} toPath - Path of the link target.
 * @returns {string} POSIX relative path; prefixed with "./" unless it already
 *   starts with ".".
 */
function relativeMarkdownLink(fromPath, toPath) {
  const fromDir = path.posix.dirname(toPosixPath(fromPath));
  const rel = path.posix.relative(fromDir, toPosixPath(toPath));
  return rel.startsWith(".") ? rel : `./${rel}`;
}
|
|
52
|
+
|
|
53
|
+
// src/normalize.ts
|
|
54
|
+
// Shared HTML-to-Markdown converter used by normalizeDocument. Configured for
// fenced code blocks, ATX ("#") headings and "-" bullet markers.
var turndown = new TurndownService({
  codeBlockStyle: "fenced",
  headingStyle: "atx",
  bulletListMarker: "-"
});
// Registers <table> with Turndown's keep() so tables are passed through as HTML
// (per Turndown's documented `keep` behaviour — confirm against the installed version).
turndown.keep(["table"]);
|
|
60
|
+
/**
 * Collect every ATX heading ("#" through "######") found in a Markdown string.
 *
 * @param {string} markdown - Markdown source.
 * @returns {{depth: number, text: string, slug: string}[]} One entry per
 *   heading line, in document order; `slug` is `safeSegment` of the raw text.
 */
function extractHeadings(markdown) {
  const headings = [];
  for (const match of markdown.matchAll(/^(#{1,6})\s+(.+)$/gm)) {
    const [, hashes, rawText] = match;
    headings.push({
      depth: hashes?.length ?? 1,
      text: (rawText ?? "").trim(),
      slug: safeSegment(rawText ?? "")
    });
  }
  return headings;
}
|
|
67
|
+
/**
 * Find inline Markdown links of the form `[text](href)` or `[text](href "title")`.
 *
 * @param {string} markdown - Markdown source.
 * @returns {{text: string, href: string}[]} Links in document order; the
 *   optional quoted title is not included.
 */
function extractMarkdownLinks(markdown) {
  const links = [];
  for (const [, text, href] of markdown.matchAll(/\[([^\]]*)\]\(([^)\s]+)(?:\s+"[^"]*")?\)/g)) {
    links.push({ text: text ?? "", href: href ?? "" });
  }
  return links;
}
|
|
73
|
+
/**
 * Guess a document type label from keywords in the title, the source
 * identifier and the first 2000 characters of the Markdown.
 *
 * Rules are tried in order and the first match wins.
 *
 * @param {string} title - Document title.
 * @param {string} sourceId - URL or path the document came from.
 * @param {string} markdown - Document body.
 * @returns {string} "README", "API Reference", "Guide", "Documentation Page",
 *   or "Concept" when no rule matches.
 */
function inferType(title, sourceId, markdown) {
  const haystack = `${title} ${sourceId} ${markdown.slice(0, 2e3)}`.toLowerCase();
  const rules = [
    [/\breadme\b/, "README"],
    [/\b(api|reference|sdk|endpoint|parameter|request|response)\b/, "API Reference"],
    [/\b(quickstart|guide|tutorial|walkthrough|get started)\b/, "Guide"],
    [/\bdocs?\b/, "Documentation Page"]
  ];
  for (const [pattern, label] of rules) {
    if (pattern.test(haystack)) return label;
  }
  return "Concept";
}
|
|
81
|
+
/**
 * Derive up to six lowercase tags from the source identifier, the title and
 * the text of the first three headings.
 *
 * A leading `http(s)://host` is removed, the rest is split on
 * non-alphanumerics, and words shorter than 3 or longer than 24 characters, or
 * found in a small stop list, are dropped.
 *
 * @param {string} title - Document title.
 * @param {string} sourceId - URL or path the document came from.
 * @param {{text: string}[]} headings - Headings in document order.
 * @returns {string[]} Unique tags in first-seen order, at most six.
 */
function inferTags(title, sourceId, headings) {
  const stopWords = ["html", "markdown", "index", "docs", "page", "guide"];
  const headingText = headings.slice(0, 3).map((h) => h.text).join(" ");
  const raw = `${sourceId} ${title} ${headingText}`;
  const candidates = raw.toLowerCase().replace(/https?:\/\/[^/]+/g, "").split(/[^a-z0-9]+/);
  const tags = new Set();
  for (const word of candidates) {
    if (word.length < 3 || word.length > 24) continue;
    if (stopWords.includes(word)) continue;
    tags.add(word);
  }
  return [...tags].slice(0, 6);
}
|
|
86
|
+
/**
 * Use the first level-1 heading ("# ...") of a Markdown document as its title.
 *
 * @param {string} markdown - Markdown source.
 * @param {string} fallback - Returned when no level-1 heading has text.
 * @returns {string} The heading cleaned by `plainTitle`, or `fallback`.
 */
function titleFromMarkdown(markdown, fallback) {
  const match = /^#\s+(.+)$/m.exec(markdown);
  const heading = match?.[1]?.trim();
  return heading ? plainTitle(heading) : fallback;
}
|
|
91
|
+
/**
 * Strip Markdown markup from a title: links become their label, the
 * characters `` ` * _ # `` are removed, and whitespace is collapsed.
 *
 * @param {string} title - Title that may contain inline Markdown.
 * @returns {string} Plain-text title.
 */
function plainTitle(title) {
  const withoutLinks = title.replace(/\[([^\]]+)]\([^)]+\)/g, "$1");
  const withoutMarkup = withoutLinks.replace(/[`*_#]/g, "");
  return withoutMarkup.replace(/\s+/g, " ").trim();
}
|
|
94
|
+
/**
 * Build a human-readable title from the last path segment of a URL or file
 * path, for documents that carry no heading of their own.
 *
 * The query string and fragment are discarded first, so that
 * "https://host/docs/page?id=3" is titled "Page" and not "Id=3". The file
 * extension is dropped and the remaining words (split on "-", "_" and
 * whitespace) are capitalised.
 *
 * @param {string} sourceId - URL or path identifying the document.
 * @returns {string} Title-cased leaf name, or "Index" when no usable name exists.
 */
function fallbackTitle(sourceId) {
  // Cut at the first "?" or "#": whatever follows is not part of the path.
  const pathOnly = sourceId.split(/[?#]/, 1)[0] ?? "";
  const leaf = pathOnly.split("/").filter(Boolean).pop() ?? "Index";
  const words = leaf
    .replace(/\.[a-z0-9]+$/i, "")
    .split(/[-_\s]+/)
    .filter(Boolean);
  // A leaf such as ".md" leaves no words; fall back rather than return "".
  if (words.length === 0) return "Index";
  return words.map((word) => word.slice(0, 1).toUpperCase() + word.slice(1)).join(" ");
}
|
|
98
|
+
/**
 * Convert a raw fetched or imported document into the normalised shape used by
 * the writer: Markdown body, title, headings, links, tags and inferred type.
 *
 * - "html": script/style/noscript/svg/header/footer/nav/aside elements are
 *   removed, the first main-like container (or <body>) is converted with
 *   Turndown, and the title comes from the first <h1>, then <title>.
 * - "text": the trimmed text is wrapped in a fenced `text` block under a
 *   generated heading.
 * - anything else: the raw content is used as Markdown as-is.
 *
 * @param {object} raw - Raw document with `raw` (content string), `contentType`,
 *   and at least one of `url`, `filePath`, `sourceId`.
 * @returns {object} `{ sourceId, title, markdown, resource, sourcePath,
 *   headings, links, tags, type }`.
 */
function normalizeDocument(raw) {
  const sourceId = raw.url ?? raw.filePath ?? raw.sourceId;
  let title = fallbackTitle(sourceId);
  let markdown = raw.raw;

  switch (raw.contentType) {
    case "html": {
      const $ = cheerio.load(raw.raw);
      // Drop page chrome and non-content elements before conversion.
      $("script,style,noscript,svg,header,footer,nav,aside").remove();
      const h1Text = $("h1").first().text().trim();
      title = h1Text || $("title").first().text().trim() || title;
      const main = $("main, article, [role='main'], .markdown-body, .docs-content").first();
      const container = main.length ? main : $("body");
      markdown = turndown.turndown(container.html() ?? raw.raw).trim();
      break;
    }
    case "text":
      markdown = [`# ${title}`, "", "```text", `${raw.raw.trim()}`, "```"].join("\n");
      break;
    default:
      break;
  }

  markdown = markdown.replace(/\r\n/g, "\n").trim();
  // A level-1 heading in the final Markdown takes precedence over the title found so far.
  title = titleFromMarkdown(markdown, plainTitle(title)).replace(/\s+/g, " ").trim();
  const headings = extractHeadings(markdown);
  const links = extractMarkdownLinks(markdown);

  return {
    sourceId,
    title,
    markdown,
    resource: raw.url,
    sourcePath: raw.filePath,
    headings,
    links,
    tags: inferTags(title, sourceId, headings),
    type: inferType(title, sourceId, markdown)
  };
}
|
|
132
|
+
/**
 * Produce a short plain-text description (at most 180 characters) from a
 * Markdown document.
 *
 * A YAML front-matter block is removed only when it opens the document. The
 * previous multi-line match also fired between any two `---` horizontal rules
 * later in the text and deleted the content between them. Headings, fenced
 * code blocks and Markdown punctuation are then stripped and whitespace is
 * collapsed.
 *
 * @param {string} markdown - Markdown source.
 * @returns {string} Description text, or "Generated OKF concept." if nothing remains.
 */
function descriptionFromMarkdown(markdown) {
  const text = markdown
    // Front matter: "---" on the very first line up to the next line that is just "---".
    .replace(/^\s*---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/, "")
    .replace(/^#{1,6}\s+.+$/gm, "")
    .replace(/```[\s\S]*?```/g, "")
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
    .replace(/[`*_>#-]/g, "")
    .replace(/\s+/g, " ")
    .trim();
  return text.slice(0, 180) || "Generated OKF concept.";
}
|
|
136
|
+
|
|
137
|
+
// src/writer.ts
|
|
138
|
+
import fs from "fs/promises";
|
|
139
|
+
import path2 from "path";
|
|
140
|
+
|
|
141
|
+
// src/util/url.ts
|
|
142
|
+
import net from "net";
|
|
143
|
+
// Query-parameter name patterns that are removed during canonicalisation.
var TRACKING_PARAMS = [/^utm_/i, /^fbclid$/i, /^gclid$/i, /^mc_/i];
/**
 * Normalise a URL so equivalent links compare equal: the fragment is dropped,
 * tracking parameters are removed, repeated slashes in the path are collapsed
 * and the host is lowercased. A trailing slash on a non-root path is removed
 * unless the `input` string itself ends in "/".
 *
 * @param {string} input - Absolute or relative URL.
 * @param {string} [base] - Base URL used to resolve a relative `input`.
 * @returns {string} The canonical absolute URL.
 * @throws {TypeError} If the URL cannot be parsed.
 */
function canonicalizeUrl(input, base) {
  const url = new URL(input, base);
  url.hash = "";

  const isTracking = (key) => TRACKING_PARAMS.some((pattern) => pattern.test(key));
  // Take a snapshot of the keys first; deleting while iterating the live list would skip entries.
  const trackingKeys = Array.from(url.searchParams.keys()).filter(isTracking);
  for (const key of trackingKeys) {
    url.searchParams.delete(key);
  }

  url.pathname = url.pathname.replace(/\/{2,}/g, "/");
  const hasStraySlash = url.pathname !== "/" && url.pathname.endsWith("/") && !input.endsWith("/");
  if (hasStraySlash) {
    url.pathname = url.pathname.replace(/\/+$/, "");
  }
  url.hostname = url.hostname.toLowerCase();
  return url.toString();
}
|
|
157
|
+
/**
 * Report whether two absolute URLs share scheme, host and port.
 *
 * @param {string} a - First absolute URL.
 * @param {string} b - Second absolute URL.
 * @returns {boolean} True when both URLs have the same `origin`.
 * @throws {TypeError} If either URL cannot be parsed.
 */
function sameOrigin(a, b) {
  return new URL(a).origin === new URL(b).origin;
}
|
|
162
|
+
/**
 * Check whether a string parses as an absolute http or https URL.
 *
 * @param {string} input - Candidate URL.
 * @returns {boolean} True for http:/https: URLs; false for other schemes or
 *   unparseable input.
 */
function isHttpUrl(input) {
  let protocol;
  try {
    ({ protocol } = new URL(input));
  } catch {
    return false;
  }
  return ["http:", "https:"].includes(protocol);
}
|
|
170
|
+
/**
 * Classify dotted-quad octets as loopback, private, link-local or "this network".
 *
 * @param {number[]} octets - The four IPv4 octets.
 * @returns {boolean} True for 0/8, 10/8, 127/8, 169.254/16, 172.16/12 and 192.168/16.
 */
function isPrivateIpv4Octets(octets) {
  const [a = 0, b = 0] = octets;
  return a === 0 || a === 10 || a === 127 || a === 172 && b >= 16 && b <= 31 || a === 192 && b === 168 || a === 169 && b === 254;
}
/**
 * Decide whether a URL targets the local machine or a private network, so the
 * crawler can refuse it by default.
 *
 * Only the literal host is inspected; a public hostname that resolves to a
 * private address is not detected here.
 *
 * @param {string} input - Absolute URL.
 * @returns {boolean} True for localhost names and loopback, private,
 *   link-local or unique-local IP literals (IPv4 and IPv6).
 * @throws {TypeError} If `input` cannot be parsed.
 */
function isPrivateNetworkUrl(input) {
  const url = new URL(input);
  let host = url.hostname.toLowerCase();
  // URL.hostname keeps the square brackets around IPv6 literals ("[::1]").
  // net.isIP() rejects that form, so without unwrapping no IPv6 rule ever fires.
  if (host.startsWith("[") && host.endsWith("]")) host = host.slice(1, -1);
  // "localhost." (fully-qualified form) names the same host as "localhost".
  if (host.endsWith(".")) host = host.slice(0, -1);
  if (host === "localhost" || host.endsWith(".localhost")) return true;

  const ipKind = net.isIP(host);
  if (ipKind === 4) return isPrivateIpv4Octets(host.split(".").map(Number));
  if (ipKind === 6) {
    if (host === "::1" || host === "::") return true;
    // IPv4-mapped addresses: the URL parser serialises ::ffff:127.0.0.1 as ::ffff:7f00:1.
    const mappedHex = /^::ffff:([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(host);
    if (mappedHex) {
      const high = Number.parseInt(mappedHex[1], 16);
      const low = Number.parseInt(mappedHex[2], 16);
      return isPrivateIpv4Octets([high >> 8, high & 255, low >> 8, low & 255]);
    }
    const mappedDotted = /^::ffff:(\d+\.\d+\.\d+\.\d+)$/.exec(host);
    if (mappedDotted) return isPrivateIpv4Octets(mappedDotted[1].split(".").map(Number));
    const firstGroup = Number.parseInt(host.split(":")[0] || "0", 16);
    // fc00::/7 is unique-local, fe80::/10 is link-local.
    return (firstGroup & 0xfe00) === 0xfc00 || (firstGroup & 0xffc0) === 0xfe80;
  }
  return false;
}
|
|
184
|
+
|
|
185
|
+
// src/writer.ts
|
|
186
|
+
/**
 * Encode a value for use as a YAML scalar by serialising it as JSON.
 *
 * @param {*} value - Value to encode.
 * @returns {string} The result of `JSON.stringify(value)`.
 */
function yamlScalar(value) {
  const encoded = JSON.stringify(value);
  return encoded;
}
|
|
189
|
+
/**
 * Render the YAML front-matter block written at the top of a concept file.
 *
 * @param {object} doc - Normalised document; reads `type`, `title`, `markdown`,
 *   `tags`, and `resource` / `sourcePath` / `sourceId` (first one that is set).
 * @param {string} timestamp - Timestamp recorded in the `timestamp` field.
 * @returns {string} Front matter from the opening "---" through the closing
 *   "---", followed by a newline.
 */
function frontmatter(doc, timestamp) {
  const tagLines = doc.tags.length > 0
    ? doc.tags.map((tag) => ` - ${yamlScalar(tag)}`)
    : [" []"];
  const resource = doc.resource ?? doc.sourcePath ?? doc.sourceId;
  return [
    "---",
    `type: ${yamlScalar(doc.type)}`,
    `title: ${yamlScalar(doc.title)}`,
    `description: ${yamlScalar(descriptionFromMarkdown(doc.markdown))}`,
    `resource: ${yamlScalar(resource)}`,
    "tags:",
    ...tagLines,
    `timestamp: ${yamlScalar(timestamp)}`,
    "---",
    ""
  ].join("\n");
}
|
|
204
|
+
/**
 * Make sure a Markdown body opens with a level-1 heading.
 *
 * @param {string} title - Title used when a heading has to be added.
 * @param {string} markdown - Markdown body.
 * @returns {string} The trimmed body, prefixed with "# <title>" and a blank
 *   line unless it already starts with "# ".
 */
function withTitle(title, markdown) {
  const body = markdown.trim();
  const startsWithH1 = /^#\s+/.test(body);
  return startsWithH1 ? body : `# ${title}\n\n${body}`;
}
|
|
211
|
+
/**
 * Compute the lookup key under which a document's output path is stored.
 *
 * @param {object} doc - Normalised document.
 * @returns {string} The canonical URL when `doc.resource` is set, otherwise the
 *   POSIX form of `doc.sourcePath` (or `doc.sourceId`).
 */
function sourceKey(doc) {
  return doc.resource
    ? canonicalizeUrl(doc.resource)
    : toPosixPath(doc.sourcePath ?? doc.sourceId);
}
|
|
215
|
+
/**
 * Assign a unique output path to every document and record it on the document
 * as `outputPath`.
 *
 * When two documents map to the same path, later ones get a numeric suffix
 * before the extension ("name-2.md", "name-3.md", ...).
 *
 * @param {object[]} docs - Normalised documents; each is mutated.
 * @returns {Map<string, string>} Map from `sourceKey(doc)` to the assigned path.
 */
function assignOutputPaths(docs) {
  const taken = new Set();
  const sourceToOutput = new Map();
  for (const doc of docs) {
    const base = doc.resource
      ? urlToOutputPath(doc.resource)
      : ensureMarkdownPath(doc.sourcePath ?? doc.sourceId);
    const { dir, name, ext } = path2.posix.parse(base);
    let candidate = base;
    // Suffixes are always derived from the base name, so they never stack ("x-2-3").
    for (let suffix = 2; taken.has(candidate); suffix += 1) {
      candidate = path2.posix.join(dir, `${name}-${suffix}${ext}`);
    }
    taken.add(candidate);
    sourceToOutput.set(sourceKey(doc), candidate);
    doc.outputPath = candidate;
  }
  return sourceToOutput;
}
|
|
233
|
+
/**
 * Rewrite the inline links of a document so that links to other documents in
 * the bundle become relative links to their output files.
 *
 * Resolution order for each `[text](href ...)`:
 *  1. Pure fragment links ("#section") are left untouched.
 *  2. Absolute or protocol-relative http(s) links are looked up by canonical URL.
 *  3. For crawled documents (`doc.resource` set) the href is resolved against the
 *     page URL; a bundle hit becomes a relative link, anything else becomes the
 *     absolute canonical URL.
 *  4. For local documents (`doc.sourcePath` set) the href is resolved against the
 *     source file's directory.
 *
 * The `#fragment` of the original href is carried over to the rewritten link.
 * Canonicalisation strips it for lookup purposes, and it was previously lost
 * from the output as well.
 *
 * @param {object} doc - Normalised document with `markdown` and `outputPath`.
 * @param {Map<string, string>} sourceToOutput - Map from source key to output path.
 * @returns {string} Markdown with rewritten links.
 */
function rewriteLinks(doc, sourceToOutput) {
  const bundleLink = (text, target, fragment, suffix) =>
    `[${text}](${relativeMarkdownLink(doc.outputPath, target)}${fragment}${suffix})`;

  return doc.markdown.replace(/\[([^\]]*)\]\(([^)\s]+)([^)]*)\)/g, (full, text, href, suffix) => {
    if (href.startsWith("#")) return full;

    const hashIndex = href.indexOf("#");
    const fragment = hashIndex === -1 ? "" : href.slice(hashIndex);

    if (/^(https?:)?\/\//.test(href)) {
      try {
        const target = sourceToOutput.get(canonicalizeUrl(href));
        if (target && doc.outputPath) return bundleLink(text, target, fragment, suffix);
      } catch {
        // Unparseable URL (e.g. protocol-relative with no base): keep the link as written.
        return full;
      }
    }

    if (doc.resource) {
      try {
        const key = canonicalizeUrl(href, doc.resource);
        const target = sourceToOutput.get(key);
        if (target && doc.outputPath) return bundleLink(text, target, fragment, suffix);
        // Not part of the bundle: make it absolute so it still works offline from the page.
        return `[${text}](${key}${fragment}${suffix})`;
      } catch {
        return full;
      }
    }

    if (doc.sourcePath) {
      const pathPart = hashIndex === -1 ? href : href.slice(0, hashIndex);
      const resolved = toPosixPath(
        path2.posix.normalize(path2.posix.join(path2.posix.dirname(doc.sourcePath), pathPart))
      );
      const target = sourceToOutput.get(resolved);
      if (target && doc.outputPath) return bundleLink(text, target, fragment, suffix);
    }

    return full;
  });
}
|
|
265
|
+
/**
 * Prepare the output directory: it must not exist, be empty, or be cleared
 * with `force`.
 *
 * @param {string} outDir - Output directory path.
 * @param {boolean} [force] - When true, a non-empty directory is removed
 *   recursively before being recreated.
 * @returns {Promise<void>}
 * @throws {Error} If the directory is non-empty and `force` is falsy, or on any
 *   filesystem error other than the directory not existing.
 */
async function ensureCleanOutDir(outDir, force) {
  let entries = [];
  try {
    entries = await fs.readdir(outDir);
  } catch (error) {
    // A missing directory is the normal first-run case; anything else is a real failure.
    if (error?.code !== "ENOENT") throw error;
  }
  if (entries.length > 0) {
    if (!force) throw new Error(`Output directory is not empty: ${outDir}. Use --force to overwrite.`);
    await fs.rm(outDir, { recursive: true, force: true });
  }
  await fs.mkdir(outDir, { recursive: true });
}
|
|
277
|
+
/**
 * Write a set of normalised documents to disk as an OKF bundle.
 *
 * Each document is written with front matter and rewritten links. A root
 * "index.md" is generated if no document produced one, and an "index.md" is
 * generated for every directory that directly contains documents but lacks one.
 *
 * @param {object[]} docs - Normalised documents (mutated: `outputPath` is set).
 * @param {object} options - `outDir`, plus optional `force`, `timestamp`,
 *   `title` and `sourceName`.
 * @returns {Promise<string[]>} Sorted bundle-relative paths of all files written.
 * @throws {Error} If `docs` is empty or the output directory cannot be prepared.
 */
async function writeOkfBundle(docs, options) {
  if (docs.length === 0) throw new Error("No documents to write.");
  const { outDir } = options;
  await ensureCleanOutDir(outDir, options.force);
  const timestamp = options.timestamp ?? (/* @__PURE__ */ new Date()).toISOString();
  const sourceToOutput = assignOutputPaths(docs);

  // Shared layout of the generated root and folder index files.
  const renderIndex = (typeLine, title, description, resource, list) => [
    "---",
    typeLine,
    `title: ${yamlScalar(title)}`,
    `description: ${yamlScalar(description)}`,
    `resource: ${yamlScalar(resource)}`,
    "tags:",
    ' - "index"',
    `timestamp: ${yamlScalar(timestamp)}`,
    "---",
    "",
    `# ${title}`,
    "",
    list,
    ""
  ].join("\n");

  const written = [];
  for (const doc of docs) {
    const relPath = doc.outputPath ?? "index.md";
    const absolute = path2.join(outDir, relPath);
    await fs.mkdir(path2.dirname(absolute), { recursive: true });
    const body = withTitle(doc.title, rewriteLinks(doc, sourceToOutput));
    await fs.writeFile(absolute, `${frontmatter(doc, timestamp)}${body}\n`, "utf8");
    written.push(relPath);
  }

  if (!written.includes("index.md")) {
    const title = options.title ?? options.sourceName ?? "OKF Bundle";
    const list = written
      .sort()
      .map((file) => `- [${file.replace(/\.md$/, "")}](./${file})`)
      .join("\n");
    const indexDoc = renderIndex('type: "Bundle Index"', title, `Index for ${title}.`, options.sourceName ?? title, list);
    await fs.writeFile(path2.join(outDir, "index.md"), indexDoc, "utf8");
    written.unshift("index.md");
  }

  const dirs = [
    ...new Set(written.map((file) => path2.posix.dirname(file)).filter((dir) => dir !== "."))
  ].sort();
  for (const dir of dirs) {
    const indexPath = path2.posix.join(dir, "index.md");
    if (written.includes(indexPath)) continue;
    const children = written
      .filter((file) => path2.posix.dirname(file) === dir && path2.posix.basename(file) !== "index.md")
      .sort();
    if (children.length === 0) continue;
    const prettyDir = dir
      .split("/")
      .map((segment) => segment.slice(0, 1).toUpperCase() + segment.slice(1))
      .join(" / ");
    const title = `${prettyDir} Index`;
    const list = children
      .map((file) => `- [${path2.posix.basename(file, ".md")}](./${path2.posix.basename(file)})`)
      .join("\n");
    const folderIndex = renderIndex('type: "Folder Index"', title, `Index for ${dir}.`, options.sourceName ?? dir, list);
    await fs.mkdir(path2.join(outDir, dir), { recursive: true });
    await fs.writeFile(path2.join(outDir, indexPath), folderIndex, "utf8");
    written.push(indexPath);
  }
  return written.sort();
}
|
|
344
|
+
|
|
345
|
+
// src/crawler.ts
|
|
346
|
+
import robotsParser from "robots-parser";
|
|
347
|
+
import pLimit from "p-limit";
|
|
348
|
+
import * as cheerio2 from "cheerio";
|
|
349
|
+
|
|
350
|
+
// src/util/match.ts
|
|
351
|
+
import { minimatch } from "minimatch";
|
|
352
|
+
/**
 * Test a value against a single include/exclude pattern.
 *
 * A pattern written as "/.../" (longer than two characters) is treated as a
 * regular expression; anything else is matched as a glob via `minimatch` with
 * `dot: true`.
 *
 * @param {string} value - URL or path to test.
 * @param {string} pattern - Glob, or regular expression in "/.../" form.
 * @returns {boolean} True on a match; false on no match or an invalid pattern.
 */
function matchesPattern(value, pattern) {
  const isRegexLiteral = pattern.startsWith("/") && pattern.endsWith("/") && pattern.length > 2;
  try {
    return isRegexLiteral
      ? new RegExp(pattern.slice(1, -1)).test(value)
      : minimatch(value, pattern, { dot: true });
  } catch {
    // An invalid pattern matches nothing rather than aborting the run.
    return false;
  }
}
|
|
366
|
+
/**
 * Test a value against a list of patterns.
 *
 * @param {string} value - URL or path to test.
 * @param {string[]} [patterns] - Patterns understood by `matchesPattern`.
 * @returns {boolean} True when at least one pattern matches; false when the
 *   list is missing or empty.
 */
function matchesAnyPattern(value, patterns) {
  if (patterns == null) return false;
  return Boolean(patterns.some((pattern) => matchesPattern(value, pattern)));
}
|
|
369
|
+
|
|
370
|
+
// src/crawler.ts
|
|
371
|
+
// User-Agent header sent on page and robots.txt requests; also the agent name
// passed to the robots.txt rules check.
var USER_AGENT = "okfy/0.1 (+https://github.com/0dust/OKFy)";
|
|
372
|
+
// Largest response body fetchText accepts (5 * 1024 * 1024 bytes); larger responses are rejected.
var MAX_RESPONSE_BYTES = 5 * 1024 * 1024;
|
|
373
|
+
/**
 * Fetch a URL as text with a 15-second overall timeout and up to three attempts.
 *
 * Only transient failures are retried (network errors, HTTP 5xx and 429), with
 * a 250 ms / 500 ms backoff. Permanent failures — any other non-2xx status or
 * an oversized body — are thrown immediately. They used to be caught by the
 * retry handler and retried as well. Retrying also stops once the overall
 * timeout has aborted the request, since every further attempt would fail.
 *
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<{text: string, contentType: string}>} Body text and the
 *   raw Content-Type header ("" when absent).
 * @throws {Error} On a non-OK status, a body larger than MAX_RESPONSE_BYTES,
 *   timeout, or a network error that persists across all attempts.
 */
async function fetchText(url) {
  const maxAttempts = 3;
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 15e3);
  // Errors recorded here must not be retried.
  const permanent = new WeakSet();
  const permanentError = (message) => {
    const error = new Error(message);
    permanent.add(error);
    return error;
  };
  const backoff = (attempt) => new Promise((resolve) => setTimeout(resolve, 250 * 2 ** attempt));

  try {
    let lastError;
    for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
      try {
        const response = await fetch(url, {
          signal: controller.signal,
          headers: { "user-agent": USER_AGENT, accept: "text/html,text/markdown,text/plain,*/*" },
          redirect: "follow"
        });
        if (!response.ok) {
          try {
            // Best effort: release the connection instead of leaving the body unread.
            await response.body?.cancel();
          } catch {
            // Nothing useful can be done if discarding the body fails.
          }
          const message = `Fetch failed ${response.status} for ${url}`;
          const transient = response.status >= 500 || response.status === 429;
          throw transient ? new Error(message) : permanentError(message);
        }
        const declaredLength = Number(response.headers.get("content-length") ?? "0");
        if (declaredLength > MAX_RESPONSE_BYTES) throw permanentError(`Response too large for ${url}`);
        const text = await response.text();
        // Content-Length may be absent or wrong, so check the real size too.
        if (Buffer.byteLength(text, "utf8") > MAX_RESPONSE_BYTES) throw permanentError(`Response too large for ${url}`);
        return { text, contentType: response.headers.get("content-type") ?? "" };
      } catch (error) {
        if (permanent.has(error) || controller.signal.aborted) throw error;
        lastError = error;
        if (attempt < maxAttempts - 1) await backoff(attempt);
      }
    }
    throw lastError ?? new Error(`Fetch failed for ${url}`);
  } finally {
    clearTimeout(timeout);
  }
}
|
|
407
|
+
/**
 * Load and parse robots.txt for the origin of the seed URL.
 *
 * A missing, non-OK or unreachable robots.txt is parsed as an empty file.
 *
 * @param {string} seedUrl - Absolute URL whose origin hosts robots.txt.
 * @param {boolean} enabled - When falsy, no request is made.
 * @returns {Promise<object|undefined>} The parsed robots rules, or `undefined`
 *   when disabled.
 */
async function loadRobots(seedUrl, enabled) {
  if (!enabled) return void 0;
  const robotsUrl = `${new URL(seedUrl).origin}/robots.txt`;
  const parse = (body) => robotsParser(robotsUrl, body);
  try {
    const response = await fetch(robotsUrl, { headers: { "user-agent": USER_AGENT } });
    return parse(response.ok ? await response.text() : "");
  } catch {
    // Unreachable robots.txt: fall back to an empty rule set.
    return parse("");
  }
}
|
|
418
|
+
/**
 * Decide whether the crawler may fetch a URL.
 *
 * Checks run in this order and the first failure rejects the URL: http(s)
 * scheme, same origin as the seed (unless `options.sameOrigin` is false),
 * private-network guard (unless `options.allowPrivateNetwork`), include
 * patterns, exclude patterns, robots.txt rules.
 *
 * @param {string} url - Candidate URL.
 * @param {string} seed - Canonical seed URL.
 * @param {object} options - Crawl options (`sameOrigin`, `allowPrivateNetwork`,
 *   `include`, `exclude`).
 * @param {object} [robots] - Parsed robots rules, if robots.txt is respected.
 * @returns {boolean} True when the URL passes every check.
 */
function shouldVisit(url, seed, options, robots) {
  const requireSameOrigin = options.sameOrigin ?? true;
  const hasIncludeList = Boolean(options.include?.length);
  return isHttpUrl(url)
    && !(requireSameOrigin && !sameOrigin(url, seed))
    && !(!options.allowPrivateNetwork && isPrivateNetworkUrl(url))
    && !(hasIncludeList && !matchesAnyPattern(url, options.include))
    && !matchesAnyPattern(url, options.exclude)
    && !(robots && !robots.isAllowed(url, USER_AGENT));
}
|
|
427
|
+
/**
 * Map a Content-Type header value onto the crawler's content kinds.
 *
 * @param {string} header - Raw Content-Type header ("" when absent).
 * @returns {"html"|"markdown"|"text"|undefined} The content kind; an empty
 *   header is treated as "html"; unsupported types give `undefined`.
 */
function contentTypeFromHeader(header) {
  const lower = header.toLowerCase();
  if (!lower) return "html";
  const kinds = [
    ["text/html", "html"],
    ["markdown", "markdown"],
    ["text/plain", "text"]
  ];
  for (const [needle, kind] of kinds) {
    if (lower.includes(needle)) return kind;
  }
  return void 0;
}
|
|
435
|
+
/**
 * Collect every anchor with a non-empty href from raw HTML.
 *
 * @param {string} raw - HTML source.
 * @returns {{href: string, text: string}[]} Links in document order, with
 *   trimmed anchor text.
 */
function extractRawHtmlLinks(raw) {
  const $ = cheerio2.load(raw);
  const links = [];
  $("a[href]").each((_, element) => {
    const anchor = $(element);
    const href = String(anchor.attr("href") ?? "");
    if (href.length > 0) links.push({ href, text: anchor.text().trim() });
  });
  return links;
}
|
|
442
|
+
/**
 * Crawl a website breadth-first from `options.seedUrl`, normalise each fetched
 * page and (unless `options.dryRun`) write the result as an OKF bundle.
 *
 * Defaults visible here: maxPages 100, maxDepth 4, respectRobots true,
 * concurrency 4.
 *
 * @param {object} options - Crawl options: `seedUrl`, `outDir`, and optional
 *   `maxPages`, `maxDepth`, `concurrency`, `respectRobots`, `allowPrivateNetwork`,
 *   `dryRun`, `timestamp`, `title`, `force` (plus the filters read by shouldVisit).
 * @returns {Promise<object>} `{ pagesFetched, skipped, failed, written, documents }`;
 *   a dry run also returns `dryRunPages` and leaves `written`/`documents` empty.
 * @throws {Error} If the seed is a private-network URL and that is not allowed,
 *   or if a real run produced no documents.
 */
async function crawlWebsite(options) {
  const seed = canonicalizeUrl(options.seedUrl);
  if (!options.allowPrivateNetwork && isPrivateNetworkUrl(seed)) {
    throw new Error("Private network crawl target rejected. Use --allow-private-network for trusted local fixtures.");
  }
  const maxPages = options.maxPages ?? 100;
  const maxDepth = options.maxDepth ?? 4;
  const robots = await loadRobots(seed, options.respectRobots ?? true);
  const queue = [{ url: seed, depth: 0 }];
  // `queued` holds every URL ever enqueued; `visited` holds every URL a task has started on.
  const queued = /* @__PURE__ */ new Set([seed]);
  const visited = /* @__PURE__ */ new Set();
  const planned = [];
  const documents = [];
  let skipped = 0;
  let failed = 0;
  const limit = pLimit(options.concurrency ?? 4);
  while (queue.length > 0 && visited.size < maxPages) {
    // Take at most as many queued items as the remaining page budget allows.
    const batch = queue.splice(0, Math.min(queue.length, maxPages - visited.size));
    const results = await Promise.all(
      batch.map(
        (item) => limit(async () => {
          if (visited.has(item.url)) return;
          // Marked as visited before the filter check, so filtered-out URLs
          // also count toward `visited.size` and therefore toward maxPages.
          visited.add(item.url);
          if (!shouldVisit(item.url, seed, options, robots)) {
            skipped += 1;
            return;
          }
          planned.push(item.url);
          try {
            const fetched = await fetchText(item.url);
            const contentType = contentTypeFromHeader(fetched.contentType);
            if (!contentType) {
              // Unsupported content type.
              skipped += 1;
              return;
            }
            const raw = {
              sourceId: item.url,
              url: item.url,
              contentType,
              raw: fetched.text,
              discoveredAt: options.timestamp ?? (/* @__PURE__ */ new Date()).toISOString()
            };
            const doc = normalizeDocument(raw);
            if (!options.dryRun) documents.push(doc);
            if (item.depth < maxDepth) {
              // Dry runs on HTML follow the anchors of the raw page; otherwise
              // the links found in the normalised Markdown are followed.
              const links = options.dryRun && contentType === "html" ? extractRawHtmlLinks(fetched.text) : doc.links;
              for (const link of links) {
                try {
                  const next = canonicalizeUrl(link.href, item.url);
                  // `queued` is capped at four times maxPages to bound the frontier.
                  if (!queued.has(next) && shouldVisit(next, seed, options, robots) && queued.size < maxPages * 4) {
                    queued.add(next);
                    queue.push({ url: next, depth: item.depth + 1 });
                  }
                } catch {
                  // The href could not be resolved to a URL.
                  skipped += 1;
                }
              }
            }
          } catch {
            // Any fetch or normalisation error counts as one failed page; the crawl continues.
            failed += 1;
          }
        })
      )
    );
    // The tasks return nothing; results are collected via the shared arrays above.
    void results;
  }
  if (options.dryRun) {
    return { pagesFetched: planned.length, skipped, failed, written: [], documents: [], dryRunPages: planned.slice(0, maxPages) };
  }
  if (documents.length === 0) throw new Error("Crawl generated zero concepts.");
  const written = await writeOkfBundle(documents, {
    outDir: options.outDir,
    title: options.title,
    sourceName: seed,
    force: options.force,
    timestamp: options.timestamp
  });
  return { pagesFetched: documents.length, skipped, failed, written, documents };
}
|
|
521
|
+
|
|
522
|
+
// src/importer.ts
|
|
523
|
+
import fs2 from "fs/promises";
|
|
524
|
+
import path3 from "path";
|
|
525
|
+
/**
 * Determine the content kind of a local file from its extension.
 *
 * @param {string} file - File path.
 * @returns {"markdown"|"mdx"|"html"|"text"|undefined} Content kind, or
 *   `undefined` for unsupported extensions.
 */
function contentTypeFor(file) {
  switch (path3.extname(file).toLowerCase()) {
    case ".md":
      return "markdown";
    case ".mdx":
      return "mdx";
    case ".html":
    case ".htm":
      return "html";
    case ".txt":
      return "text";
    default:
      return void 0;
  }
}
|
|
533
|
+
/**
 * List every regular file under a root, skipping ".git", "node_modules" and
 * "dist" directories.
 *
 * @param {string} root - File or directory path.
 * @returns {Promise<string[]>} `[root]` when root is a file; otherwise the
 *   sorted paths of all files found beneath it.
 */
async function listFiles(root) {
  const rootStat = await fs2.stat(root);
  if (rootStat.isFile()) return [root];

  const ignoredDirs = [".git", "node_modules", "dist"];
  const found = [];
  // Explicit stack instead of recursion; the final sort makes traversal order irrelevant.
  const pending = [root];
  while (pending.length > 0) {
    const dir = pending.pop();
    const entries = await fs2.readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
      const absolute = path3.join(dir, entry.name);
      if (entry.isDirectory()) {
        if (!ignoredDirs.includes(entry.name)) pending.push(absolute);
      } else if (entry.isFile()) {
        found.push(absolute);
      }
    }
  }
  return found.sort();
}
|
|
550
|
+
/**
 * Import local Markdown, MDX, HTML and text files and write them as an OKF bundle.
 *
 * @param {object} options - `inputPath`, `outDir`, and optional `include`,
 *   `exclude`, `sourceName`, `force`, `timestamp`.
 * @returns {Promise<{written: string[], documents: object[]}>} Paths written
 *   and the normalised documents.
 * @throws {Error} If no supported file passes the include/exclude filters.
 */
async function importLocal(options) {
  const root = path3.resolve(options.inputPath);
  const docs = [];
  for (const file of await listFiles(root)) {
    // Patterns are matched against the root-relative path in POSIX form.
    const rel = path3.relative(root, file).split(path3.sep).join("/");
    const included = !options.include?.length || matchesAnyPattern(rel, options.include);
    if (!included) continue;
    if (matchesAnyPattern(rel, options.exclude)) continue;
    const contentType = contentTypeFor(file);
    if (!contentType) continue;
    const content = await fs2.readFile(file, "utf8");
    docs.push(normalizeDocument({
      sourceId: rel,
      filePath: rel,
      contentType,
      raw: content,
      discoveredAt: options.timestamp ?? (/* @__PURE__ */ new Date()).toISOString()
    }));
  }
  if (docs.length === 0) throw new Error("No supported Markdown, MDX, HTML, or text files found.");
  const written = await writeOkfBundle(docs, {
    outDir: options.outDir,
    title: options.sourceName,
    sourceName: options.sourceName ?? options.inputPath,
    force: options.force,
    timestamp: options.timestamp
  });
  return { written, documents: docs };
}
|
|
579
|
+
|
|
580
|
+
// src/graph.ts
|
|
581
|
+
import path4 from "path";
|
|
582
|
+
/**
 * List the bundle-internal concept ids a concept links to.
 *
 * External links (http, https, protocol-relative), mailto links and pure
 * fragment links are ignored. Each remaining href is resolved against the
 * concept's directory, with the fragment and a trailing ".md" removed.
 *
 * @param {{path: string, body: string}} concept - Concept with its
 *   bundle-relative path and Markdown body.
 * @returns {string[]} Unique link targets, sorted.
 */
function extractInternalLinks(concept) {
  const baseDir = path4.posix.dirname(concept.path);
  const isExternal = (href) =>
    /^(https?:)?\/\//.test(href) || href.startsWith("mailto:") || href.startsWith("#");
  const targets = [...concept.body.matchAll(/\[[^\]]*]\(([^)\s]+)(?:\s+"[^"]*")?\)/g)]
    .map((match) => match[1] ?? "")
    .filter((href) => !isExternal(href))
    .map((href) => href.split("#")[0] ?? href)
    .filter(Boolean)
    .map((target) => stripMdExtension(path4.posix.normalize(path4.posix.join(baseDir, target))));
  return [...new Set(targets)].sort();
}
|
|
594
|
+
/**
 * Build the link graph of a bundle.
 *
 * @param {Map<string, object>} conceptsByAnyKey - Concepts keyed by id and/or
 *   path; duplicates are collapsed by `concept.id`.
 * @returns {{concepts: Map<string, object>, outbound: Map<string, string[]>,
 *   backlinks: Map<string, string[]>}} Concepts by id, plus outgoing and
 *   incoming link lists (ids only, restricted to concepts in the bundle).
 *   Every concept has an entry in both maps.
 */
function buildGraph(conceptsByAnyKey) {
  const concepts = new Map([...conceptsByAnyKey.values()].map((concept) => [concept.id, concept]));
  const outbound = new Map();
  const backlinks = new Map();

  for (const concept of concepts.values()) {
    const targets = extractInternalLinks(concept).filter((id) => concepts.has(id));
    outbound.set(concept.id, targets);
    for (const target of targets) {
      const sources = backlinks.get(target) ?? [];
      backlinks.set(target, [...sources, concept.id].sort());
    }
  }

  // Concepts with no links in one direction still get an empty list.
  for (const id of concepts.keys()) {
    if (!backlinks.has(id)) backlinks.set(id, []);
    if (!outbound.has(id)) outbound.set(id, []);
  }
  return { concepts, outbound, backlinks };
}
|
|
612
|
+
|
|
613
|
+
// src/reader.ts
|
|
614
|
+
import fs3 from "fs/promises";
|
|
615
|
+
import path5 from "path";
|
|
616
|
+
import matter from "gray-matter";
|
|
617
|
+
/**
 * List every ".md" file beneath a directory, descending into all subdirectories.
 *
 * @param {string} dir - Directory to scan.
 * @returns {Promise<string[]>} Sorted paths of the Markdown files found.
 */
async function listMarkdownFiles(dir) {
  const found = [];
  // Explicit stack instead of recursion; the final sort makes traversal order irrelevant.
  const pending = [dir];
  while (pending.length > 0) {
    const current = pending.pop();
    const entries = await fs3.readdir(current, { withFileTypes: true });
    for (const entry of entries) {
      const absolute = path5.join(current, entry.name);
      if (entry.isDirectory()) {
        pending.push(absolute);
      } else if (entry.isFile() && entry.name.endsWith(".md")) {
        found.push(absolute);
      }
    }
  }
  return found.sort();
}
|
|
629
|
+
/**
 * Coerce an unknown front-matter value into an array of strings.
 *
 * @param {*} value - Value to inspect.
 * @returns {string[]} The string elements of `value` when it is an array;
 *   otherwise an empty array.
 */
function stringArray(value) {
  return Array.isArray(value)
    ? value.filter((item) => typeof item === "string")
    : [];
}
|
|
633
|
+
/**
 * Read one concept file from a bundle and parse its front matter.
 *
 * @param {string} bundleDir - Bundle root directory.
 * @param {string} absolutePath - Path of the Markdown file to read.
 * @returns {Promise<object>} Concept with `id` (bundle-relative path without
 *   ".md"), `path`, raw `frontmatter`, `type` ("" when missing), optional
 *   `title` / `description` / `resource`, `tags` and the trimmed `body`.
 */
async function readConceptFile(bundleDir, absolutePath) {
  const { data, content } = matter(await fs3.readFile(absolutePath, "utf8"));
  const relPath = toPosixPath(path5.relative(bundleDir, absolutePath));
  // Front-matter fields are untyped; anything that is not a string is ignored.
  const optionalString = (value) => (typeof value === "string" ? value : void 0);
  return {
    id: stripMdExtension(relPath),
    path: relPath,
    frontmatter: data,
    type: typeof data.type === "string" ? data.type : "",
    title: optionalString(data.title),
    description: optionalString(data.description),
    resource: optionalString(data.resource),
    tags: stringArray(data.tags),
    body: content.trim()
  };
}
|
|
651
|
+
/**
 * Load every concept of a bundle.
 *
 * @param {string} bundleDir - Bundle root directory.
 * @returns {Promise<Map<string, object>>} Concepts, each registered under both
 *   its id and its path.
 */
async function readBundle(bundleDir) {
  const concepts = new Map();
  for (const file of await listMarkdownFiles(bundleDir)) {
    const concept = await readConceptFile(bundleDir, file);
    for (const key of [concept.id, concept.path]) {
      concepts.set(key, concept);
    }
  }
  return concepts;
}
|
|
661
|
+
|
|
662
|
+
// src/search.ts
|
|
663
|
+
import MiniSearch from "minisearch";
|
|
664
|
+
/**
 * Builds a short plain-text excerpt of a concept to show next to a search hit.
 *
 * The description and body are joined and whitespace runs are collapsed to a
 * single space. The excerpt starts up to 80 characters before the first
 * case-insensitive occurrence of the query's first word, or at the beginning
 * of the text when there is no such word or no occurrence.
 *
 * @param {{ description?: string, body: string }} concept - Concept to excerpt.
 * @param {string | null | undefined} query - Search query; only its first
 *   whitespace-separated word is used. A missing query is treated as empty,
 *   matching `BundleSearch.search`, which also accepts a falsy query.
 * @param {number} [max=240] - Maximum excerpt length in UTF-16 code units.
 * @returns {string} The excerpt, at most `max` characters long.
 */
function snippet(concept, query, max = 240) {
  const text = `${concept.description ?? ""} ${concept.body}`.replace(/\s+/g, " ").trim();
  // `query ?? ""` keeps a null/undefined query from throwing on `.toLowerCase()`.
  const term = (query ?? "").toLowerCase().split(/\s+/).find(Boolean) ?? "";
  const index = term ? text.toLowerCase().indexOf(term) : -1;
  // A miss (-1) clamps to 0, so the excerpt falls back to the start of the text.
  const start = Math.max(0, index - 80);
  return text.slice(start, start + max);
}
|
|
672
|
+
/**
 * Full-text search and lookup over the concepts of one bundle.
 *
 * Holds the link graph produced by `buildGraph` and a MiniSearch index over
 * each concept's title, description, tags, type and body.
 */
var BundleSearch = class _BundleSearch {
  /** Link graph returned by `buildGraph`; exposes `concepts`, `outbound` and `backlinks`. */
  graph;
  /** MiniSearch index holding one document per concept in `graph.concepts`. */
  index;

  /**
   * @param {Map<string, object>} conceptsByAnyKey - Concept map as returned by `readBundle`.
   */
  constructor(conceptsByAnyKey) {
    this.graph = buildGraph(conceptsByAnyKey);
    this.index = new MiniSearch({
      fields: ["title", "description", "tags", "type", "body"],
      storeFields: ["id"],
      searchOptions: { boost: { title: 4, tags: 3, type: 2, description: 2 }, fuzzy: 0.2, prefix: true }
    });
    const documents = [];
    for (const concept of this.graph.concepts.values()) {
      documents.push({
        id: concept.id,
        // Untitled concepts are indexed under their id.
        title: concept.title ?? concept.id,
        type: concept.type,
        description: concept.description ?? "",
        tags: concept.tags.join(" "),
        body: concept.body
      });
    }
    this.index.addAll(documents);
  }

  /**
   * Reads a bundle from disk and builds a search instance for it.
   *
   * @param {string} bundleDir - Bundle root directory.
   * @returns {Promise<BundleSearch>}
   */
  static async fromBundle(bundleDir) {
    const conceptsByAnyKey = await readBundle(bundleDir);
    return new _BundleSearch(conceptsByAnyKey);
  }

  /**
   * Searches the bundle.
   *
   * @param {string} query - Query text; a falsy query matches every document.
   * @param {{ type?: string, tags?: string[], limit?: number }} [options] -
   *   Optional exact `type` filter, `tags` filter (a concept matches when it
   *   carries at least one listed tag) and result `limit` (10 when omitted).
   * @returns {object[]} Matches with id, title, type, description, tags,
   *   resource, snippet and score.
   */
  search(query, options = {}) {
    // Only the 100 best-ranked hits are considered before any filtering.
    const rankedHits = this.index.search(query || MiniSearch.wildcard, { combineWith: "AND" }).slice(0, 100);
    const wantedTags = new Set(options.tags ?? []);
    const matchesType = (concept) => !options.type || concept.type === options.type;
    const matchesTags = (concept) => wantedTags.size === 0 || concept.tags.some((tag) => wantedTags.has(tag));

    const candidates = [];
    for (const hit of rankedHits) {
      const concept = this.graph.concepts.get(hit.id);
      if (concept && matchesType(concept) && matchesTags(concept)) {
        candidates.push({ hit, concept });
      }
    }

    const limited = candidates.slice(0, options.limit ?? 10);
    return limited.map(({ hit, concept }) => ({
      id: concept.id,
      title: concept.title,
      type: concept.type,
      description: concept.description,
      tags: concept.tags,
      resource: concept.resource,
      snippet: snippet(concept, query),
      score: hit.score
    }));
  }

  /**
   * Looks a concept up by id or by path.
   *
   * @param {string} idOrPath - Concept id, or a path ending in `.md` (any letter case).
   * @returns {object | undefined} The concept, or `undefined` when nothing matches.
   */
  getConcept(idOrPath) {
    const direct = this.graph.concepts.get(idOrPath.replace(/\.md$/i, ""));
    if (direct != null) return direct;
    // Fall back to an exact comparison against each concept's stored path.
    for (const concept of this.graph.concepts.values()) {
      if (concept.path === idOrPath) return concept;
    }
    return void 0;
  }
};
|
|
715
|
+
|
|
716
|
+
// src/validate.ts
|
|
717
|
+
import fs4 from "fs/promises";
|
|
718
|
+
import path6 from "path";
|
|
719
|
+
import matter2 from "gray-matter";
|
|
720
|
+
/**
 * Recursively lists the `.md` files below a directory (validator's copy of the walker).
 *
 * @param {string} dir - Directory to scan.
 * @returns {Promise<string[]>} Paths of regular files whose name ends with `.md`,
 *   each built by joining `dir` with the entry names, sorted with the default
 *   string ordering.
 */
async function listMarkdownFiles2(dir) {
  const collect = async (folder) => {
    const paths = [];
    const entries = await fs4.readdir(folder, { withFileTypes: true });
    for (const dirent of entries) {
      const target = path6.join(folder, dirent.name);
      if (dirent.isDirectory()) {
        for (const nested of await collect(target)) paths.push(nested);
      } else if (dirent.isFile() && dirent.name.endsWith(".md")) {
        paths.push(target);
      }
    }
    return paths;
  };
  const markdownPaths = await collect(dir);
  return markdownPaths.sort();
}
|
|
732
|
+
/**
 * Creates one validation issue record.
 *
 * @param {string} severity - Severity label, e.g. "error" or "warning".
 * @param {string} code - Machine-readable issue code.
 * @param {string} message - Human-readable explanation.
 * @param {string} [file] - Path the issue refers to; stored under `path`.
 * @returns {{ severity: string, code: string, message: string, path: (string|undefined) }}
 */
function issue(severity, code, message, file) {
  const record = { severity, code, message };
  // The `path` key is always present, even when no file was given.
  record.path = file;
  return record;
}
|
|
735
|
+
/**
 * Validates every Markdown concept file of a bundle.
 *
 * Issue codes reported as errors: `bundle_unreadable`, `unsafe_path`,
 * `missing_frontmatter`, `malformed_frontmatter`, `missing_type`,
 * `bad_field_shape`, `empty_concept`, `duplicate_concept_id` and
 * `broken_internal_link`. A folder holding concepts but no `index.md`
 * produces the warning `missing_folder_index`.
 *
 * @param {string} bundleDir - Bundle root directory.
 * @returns {Promise<{ valid: boolean, issues: object[], conceptCount: number }>}
 *   `valid` is false as soon as one issue has severity "error";
 *   `conceptCount` is the number of Markdown files found.
 * @throws Propagates the file-system error when an individual concept file cannot be read.
 */
async function validateBundle(bundleDir) {
  const issues = [];
  let files = [];
  try {
    files = await listMarkdownFiles2(bundleDir);
  } catch (error) {
    return {
      valid: false,
      issues: [issue("error", "bundle_unreadable", error?.message ?? "Bundle cannot be read.")],
      conceptCount: 0
    };
  }
  const seenIds = /* @__PURE__ */ new Set();
  for (const file of files) {
    const rel = path6.relative(bundleDir, file).split(path6.sep).join("/");
    // Compare whole path segments: a substring test would wrongly reject
    // harmless file names that merely contain two dots, such as "v1..2.md".
    if (rel.split("/").includes("..") || path6.isAbsolute(rel)) {
      issues.push(issue("error", "unsafe_path", "Concept path is unsafe.", rel));
    }
    const raw = await fs4.readFile(file, "utf8");
    if (!raw.startsWith("---")) {
      issues.push(issue("error", "missing_frontmatter", "Concept file must start with YAML frontmatter.", rel));
      continue;
    }
    let parsed;
    try {
      parsed = matter2(raw);
    } catch (error) {
      issues.push(issue("error", "malformed_frontmatter", error?.message ?? "Malformed YAML frontmatter.", rel));
      continue;
    }
    const data = parsed.data;
    if (typeof data.type !== "string" || data.type.trim() === "") {
      issues.push(issue("error", "missing_type", "Frontmatter type must be a non-empty string.", rel));
    }
    for (const key of ["title", "description", "resource", "timestamp"]) {
      if (data[key] !== void 0 && typeof data[key] !== "string") {
        issues.push(issue("error", "bad_field_shape", `${key} must be a string when present.`, rel));
      }
    }
    if (data.tags !== void 0 && (!Array.isArray(data.tags) || data.tags.some((tag) => typeof tag !== "string"))) {
      issues.push(issue("error", "bad_field_shape", "tags must be an array of strings when present.", rel));
    }
    if (parsed.content.trim().length === 0) {
      issues.push(issue("error", "empty_concept", "Concept body must not be empty.", rel));
    }
    const id = rel.replace(/\.md$/i, "");
    if (seenIds.has(id)) issues.push(issue("error", "duplicate_concept_id", `Duplicate concept id: ${id}`, rel));
    seenIds.add(id);
  }
  // Link checking is best effort: if the bundle cannot be loaded as a whole
  // (for example because of frontmatter already reported above), it is skipped.
  const concepts = await readBundle(bundleDir).catch(() => /* @__PURE__ */ new Map());
  const canonicalIds = new Set([...concepts.values()].map((concept) => concept.id));
  // readBundle stores each concept under two keys; re-keying by id visits each one once.
  for (const concept of new Map([...concepts.values()].map((concept2) => [concept2.id, concept2])).values()) {
    for (const target of extractInternalLinks(concept)) {
      if (!canonicalIds.has(target)) {
        issues.push(issue("error", "broken_internal_link", `Broken internal link to ${target}.`, concept.path));
      }
    }
  }
  // Set lookup keeps the per-folder index check linear in the number of files.
  const knownFiles = new Set(files);
  const dirs = new Set(files.map((file) => path6.dirname(file)));
  for (const dir of dirs) {
    const index = path6.join(dir, "index.md");
    if (!knownFiles.has(index)) {
      issues.push(
        issue(
          "warning",
          "missing_folder_index",
          "Folder has concepts but no index.md.",
          path6.relative(bundleDir, dir).split(path6.sep).join("/") || "."
        )
      );
    }
  }
  return {
    valid: !issues.some((item) => item.severity === "error"),
    issues,
    conceptCount: files.length
  };
}
|
|
813
|
+
/**
 * Computes summary statistics for a bundle.
 *
 * @param {string} bundleDir - Bundle root directory.
 * @returns {Promise<object>} Object with:
 *   - `title`: title of the concept whose id is "index", else the directory's base name;
 *   - `conceptCount`: number of concepts in the link graph;
 *   - `typeDistribution` / `tagDistribution`: occurrence counts per type / tag;
 *   - `linkCount`: total number of outbound links;
 *   - `brokenLinks`: number of `broken_internal_link` issues found by `validateBundle`;
 *   - `orphanConcepts`: sorted ids of concepts other than "index" that have no backlinks;
 *   - `topLinkedConcepts`: up to 10 concepts ordered by backlink count, ties broken by id;
 *   - `sourceDomains`: counts per hostname of `resource` values that start with "http".
 */
async function inspectBundle(bundleDir) {
  const conceptsByAnyKey = await readBundle(bundleDir);
  const graph = buildGraph(conceptsByAnyKey);
  const concepts = [...graph.concepts.values()];

  // Counting in Maps keeps keys such as "constructor" or "__proto__" from
  // colliding with members inherited from Object.prototype.
  const typeCounts = /* @__PURE__ */ new Map();
  const tagCounts = /* @__PURE__ */ new Map();
  const domainCounts = /* @__PURE__ */ new Map();
  const bump = (counts, key) => counts.set(key, (counts.get(key) ?? 0) + 1);
  // A malformed resource must not abort the whole inspection, so it yields no hostname.
  const hostnameOf = (resource) => {
    try {
      return new URL(resource).hostname;
    } catch {
      return void 0;
    }
  };

  for (const concept of concepts) {
    bump(typeCounts, concept.type);
    for (const tag of concept.tags) bump(tagCounts, tag);
    if (concept.resource?.startsWith("http")) {
      const domain = hostnameOf(concept.resource);
      if (domain !== void 0) bump(domainCounts, domain);
    }
  }

  const backlinkCount = (concept) => (graph.backlinks.get(concept.id) ?? []).length;
  const topLinkedConcepts = concepts.map((concept) => ({
    id: concept.id,
    title: concept.title,
    count: backlinkCount(concept)
  })).sort((a, b) => b.count - a.count || a.id.localeCompare(b.id)).slice(0, 10);
  const linkCount = [...graph.outbound.values()].reduce((sum, links) => sum + links.length, 0);
  const validation = await validateBundle(bundleDir);

  return {
    title: concepts.find((concept) => concept.id === "index")?.title ?? path6.basename(bundleDir),
    conceptCount: concepts.length,
    // Object.fromEntries defines own properties, so the results stay plain objects.
    typeDistribution: Object.fromEntries(typeCounts),
    tagDistribution: Object.fromEntries(tagCounts),
    linkCount,
    brokenLinks: validation.issues.filter((item) => item.code === "broken_internal_link").length,
    orphanConcepts: concepts.filter((concept) => concept.id !== "index").filter((concept) => backlinkCount(concept) === 0).map((concept) => concept.id).sort(),
    topLinkedConcepts,
    sourceDomains: Object.fromEntries(domainCounts)
  };
}
|
|
847
|
+
|
|
848
|
+
// src/mcp.ts
|
|
849
|
+
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
850
|
+
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
851
|
+
import { CallToolRequestSchema, ListToolsRequestSchema } from "@modelcontextprotocol/sdk/types.js";
|
|
852
|
+
import { z } from "zod";
|
|
853
|
+
/**
 * Wraps a value as an MCP text result containing pretty-printed JSON.
 *
 * @param {unknown} value - Value to serialise with two-space indentation.
 * @param {number} [maxChars=12000] - Longest serialised text returned as is;
 *   longer text is cut to this length and suffixed with a "...truncated" line.
 * @returns {{ content: Array<{ type: "text", text: string }> }}
 */
function json(value, maxChars = 12e3) {
  const serialized = JSON.stringify(value, null, 2);
  const text = serialized.length > maxChars
    ? `${serialized.slice(0, maxChars)}\n...truncated`
    : serialized;
  return { content: [{ type: "text", text }] };
}
|
|
859
|
+
// Arguments accepted by the `search_concepts` tool.
var searchSchema = z.object({
  query: z.string(),
  type: z.string().optional(),
  tags: z.array(z.string()).optional(),
  // Positive integer, capped at 50 results.
  limit: z.number().int().positive().max(50).optional()
});

// Arguments accepted by the `read_concept` tool.
var readSchema = z.object({
  id: z.string(),
  max_chars: z.number().int().positive().optional()
});

// Arguments accepted by the `get_neighbors` tool; depth is limited to 1 or 2.
var neighborsSchema = z.object({
  id: z.string(),
  depth: z.number().int().min(1).max(2).optional()
});
|
|
867
|
+
/**
 * Creates an MCP server exposing one bundle through six tools:
 * `search_concepts`, `read_concept`, `get_neighbors`, `list_types`,
 * `list_tags` and `bundle_summary`.
 *
 * The bundle is read and indexed once, when the server is created. The three
 * statistics tools re-inspect the bundle directory on every call.
 *
 * @param {{ bundleDir: string, name?: string, maxResultChars?: number }} options -
 *   `bundleDir` is the bundle root; `name` is the advertised server name
 *   ("okfy" when omitted); `maxResultChars` caps the serialised size of every
 *   tool result (12000 when omitted).
 * @returns {Promise<Server>} The configured server, not yet connected to a transport.
 */
async function createMcpServer(options) {
  const search = await BundleSearch.fromBundle(options.bundleDir);
  const server = new Server(
    { name: options.name ?? "okfy", version: "0.1.0" },
    { capabilities: { tools: {} } }
  );
  const maxResultChars = options.maxResultChars ?? 12e3;
  // Every result, including error payloads, goes through the configured size cap.
  const respond = (value) => json(value, maxResultChars);
  const fail = (code, message) => respond({ error: { code, message } });

  // The advertised JSON schemas mirror the zod validators applied to the arguments.
  server.setRequestHandler(ListToolsRequestSchema, async () => ({
    tools: [
      {
        name: "search_concepts",
        description: "Search OKF concepts by query, type, and tags.",
        inputSchema: {
          type: "object",
          properties: {
            query: { type: "string" },
            type: { type: "string" },
            tags: { type: "array", items: { type: "string" } },
            limit: { type: "integer", minimum: 1, maximum: 50, default: 10 }
          },
          required: ["query"]
        }
      },
      {
        name: "read_concept",
        description: "Read one OKF concept by id or path.",
        inputSchema: {
          type: "object",
          properties: { id: { type: "string" }, max_chars: { type: "integer", minimum: 1 } },
          required: ["id"]
        }
      },
      {
        name: "get_neighbors",
        description: "Return outbound links and backlinks for a concept.",
        inputSchema: {
          type: "object",
          properties: { id: { type: "string" }, depth: { type: "integer", minimum: 1, maximum: 2, default: 1 } },
          required: ["id"]
        }
      },
      { name: "list_types", description: "List concept types and counts.", inputSchema: { type: "object", properties: {} } },
      { name: "list_tags", description: "List concept tags and counts.", inputSchema: { type: "object", properties: {} } },
      { name: "bundle_summary", description: "Return bundle stats and validation status.", inputSchema: { type: "object", properties: {} } }
    ]
  }));

  server.setRequestHandler(CallToolRequestSchema, async (request) => {
    const args = request.params.arguments ?? {};
    const toolName = request.params.name;
    try {
      if (toolName === "search_concepts") {
        const parsed = searchSchema.parse(args);
        return respond(search.search(parsed.query, parsed));
      }
      if (toolName === "read_concept") {
        const parsed = readSchema.parse(args);
        const concept = search.getConcept(parsed.id);
        if (!concept) return fail("unknown_concept", `No concept found for ${parsed.id}`);
        const max = parsed.max_chars ?? maxResultChars;
        return respond({
          frontmatter: concept.frontmatter,
          markdown_body: concept.body.slice(0, max),
          outbound_links: search.graph.outbound.get(concept.id) ?? [],
          backlinks: search.graph.backlinks.get(concept.id) ?? [],
          source_resource: concept.resource
        });
      }
      if (toolName === "get_neighbors") {
        const parsed = neighborsSchema.parse(args);
        const root = search.getConcept(parsed.id);
        if (!root) return fail("unknown_concept", `No concept found for ${parsed.id}`);
        const depth = parsed.depth ?? 1;
        // Breadth-first walk over links in both directions, one level per iteration.
        const seen = /* @__PURE__ */ new Set([root.id]);
        let frontier = [root.id];
        const edges = [];
        for (let level = 0; level < depth; level += 1) {
          const next = [];
          for (const id of frontier) {
            for (const to of search.graph.outbound.get(id) ?? []) {
              edges.push({ from: id, to, direction: "outbound", relationship_text: "Markdown link" });
              if (!seen.has(to)) next.push(to);
              seen.add(to);
            }
            for (const from of search.graph.backlinks.get(id) ?? []) {
              edges.push({ from, to: id, direction: "backlink", relationship_text: "Backlink" });
              if (!seen.has(from)) next.push(from);
              seen.add(from);
            }
          }
          frontier = next;
        }
        return respond({
          root: root.id,
          concepts: [...seen].map((id) => {
            // A link target may have no concept entry; its fields are then undefined.
            const concept = search.graph.concepts.get(id);
            return { id, title: concept?.title, type: concept?.type, resource: concept?.resource };
          }),
          edges
        });
      }
      if (toolName === "list_types") {
        const stats = await inspectBundle(options.bundleDir);
        return respond(stats.typeDistribution);
      }
      if (toolName === "list_tags") {
        const stats = await inspectBundle(options.bundleDir);
        return respond(stats.tagDistribution);
      }
      if (toolName === "bundle_summary") {
        const [stats, validation] = await Promise.all([inspectBundle(options.bundleDir), validateBundle(options.bundleDir)]);
        return respond({ ...stats, validationStatus: validation.valid ? "valid" : "invalid", validationIssues: validation.issues });
      }
      return fail("unknown_tool", `Unknown tool: ${toolName}`);
    } catch (error) {
      // Argument validation failures and tool exceptions are returned as a result, not thrown.
      return fail("tool_error", error?.message ?? "Tool failed.");
    }
  });
  return server;
}
|
|
988
|
+
/**
 * Creates the bundle's MCP server and connects it to a stdio transport.
 *
 * @param {object} options - Passed unchanged to `createMcpServer`.
 * @returns {Promise<void>} Resolves once `server.connect` has completed.
 */
async function serveMcpStdio(options) {
  const mcpServer = await createMcpServer(options);
  await mcpServer.connect(new StdioServerTransport());
}
|
|
993
|
+
|
|
994
|
+
export {
|
|
995
|
+
extractHeadings,
|
|
996
|
+
extractMarkdownLinks,
|
|
997
|
+
inferType,
|
|
998
|
+
inferTags,
|
|
999
|
+
normalizeDocument,
|
|
1000
|
+
descriptionFromMarkdown,
|
|
1001
|
+
writeOkfBundle,
|
|
1002
|
+
crawlWebsite,
|
|
1003
|
+
importLocal,
|
|
1004
|
+
extractInternalLinks,
|
|
1005
|
+
buildGraph,
|
|
1006
|
+
readConceptFile,
|
|
1007
|
+
readBundle,
|
|
1008
|
+
BundleSearch,
|
|
1009
|
+
validateBundle,
|
|
1010
|
+
inspectBundle,
|
|
1011
|
+
createMcpServer,
|
|
1012
|
+
serveMcpStdio
|
|
1013
|
+
};
|