pi-soly 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/docs.ts ADDED
@@ -0,0 +1,235 @@
1
+ // =============================================================================
2
+ // docs.ts — Doc search + snippet tools support
3
+ // =============================================================================
4
+ //
5
+ // Lazy context helpers:
6
+ // - buildDocIndex(cwd, limit) — index of all .md / .html docs (title + short preview)
7
+ // - searchDocs(index, query, limit) — ranks index entries by simple substring scoring
8
+ // - readSnippet(path, offset, limit) — bounded file read for soly_snippet
9
+ // =============================================================================
10
+
11
+ import * as fs from "node:fs";
12
+ import * as path from "node:path";
13
+ import { estimateTokens, findMarkdownFiles, readIfExists } from "./core.js";
14
+ import { extractTitleAndPreview, stripHtml } from "./html.js";
15
+
16
+ // Re-export the stripHtml helper so existing imports of `stripHtml from
17
+ // "./docs.js"` (used by tools.ts) continue to work without churn.
18
+ export { stripHtml };
19
+
20
/** One indexed document: where it lives, how big it is, and a short summary. */
export interface DocIndexEntry {
  /** Path of the document relative to the indexed working directory. */
  relPath: string;
  /** Absolute path used to read the document from disk. */
  absPath: string;
  /** Estimated token count of the full file content. */
  tokens: number;
  /** Title extracted from the document. */
  title: string;
  /** Short text preview extracted from the document body. */
  preview: string;
  /**
   * Where the document came from. Search ranking favours
   * "intent" over "phase-intent" over "project".
   */
  sourceKind: "intent" | "phase-intent" | "project";
}
29
+
30
/**
 * Directory names that are never indexed as project docs. A file is skipped
 * when ANY segment of its relative path equals one of these names.
 */
const DOC_GLOBS_IGNORE = [
  // dependencies and build output
  "node_modules",
  "dist",
  "build",
  "coverage",
  // version control
  ".git",
  // framework output directories
  "out",
  ".next",
  ".nuxt",
  // rust
  "target",
  // python
  "__pycache__",
  ".venv",
  "venv",
];

// Note: .soly/ itself IS indexed. Intent docs under .soly/docs/ and
// .soly/phases/<N>/docs/ are tagged with a higher priority in search results
// (see buildDocIndex).

/** File extensions (lowercase, with the dot) accepted as intent docs. */
const INTENT_DOC_EXTS = [".md", ".html", ".htm"];
49
+
50
/**
 * Build an index of documentation files under `cwd`.
 *
 * Files are added in priority order, and a path is indexed at most once
 * (the first, highest-priority occurrence wins):
 *   1. intent docs        — `.soly/docs/**`
 *   2. phase intent docs  — `.soly/phases/<phase>/docs/**`
 *   3. project docs       — whatever `findMarkdownFiles(cwd)` reports, minus
 *                           paths containing a `DOC_GLOBS_IGNORE` segment
 *
 * Only `.md`, `.html` and `.htm` files with non-empty content are indexed.
 *
 * @param cwd   Root directory to index.
 * @param limit Maximum number of entries to return. A value <= 0 yields an
 *              empty index.
 * @returns Index entries in priority order, at most `limit` of them.
 */
export function buildDocIndex(cwd: string, limit = 5000): DocIndexEntry[] {
  const out: DocIndexEntry[] = [];
  // Without this guard the first intent doc was added before the limit was
  // ever compared, so `limit = 0` returned one entry.
  if (limit <= 0) return out;

  // `path.relative` yields backslashes on Windows. The separator convention
  // of `findMarkdownFiles` is not visible from here, so both the dedupe key
  // and the ignore check work on a "/"-normalised form. No-op on POSIX.
  const toPosix = (p: string): string => (path.sep === "\\" ? p.replace(/\\/g, "/") : p);

  const seen = new Set<string>();
  const isFull = (): boolean => out.length >= limit;

  const addFile = (relPath: string, absPath: string, sourceKind: DocIndexEntry["sourceKind"]): void => {
    const key = toPosix(relPath);
    if (seen.has(key)) return;
    seen.add(key);
    const raw = readIfExists(absPath);
    if (!raw) return; // unreadable or empty file
    const ext = path.extname(relPath).toLowerCase();
    if (ext !== ".md" && ext !== ".html" && ext !== ".htm") return;
    const { title, preview } = extractTitleAndPreview(raw, ext);
    out.push({
      relPath,
      absPath,
      tokens: estimateTokens(raw),
      title,
      preview,
      sourceKind,
    });
  };

  // Index every intent doc below `dir`, stopping once the limit is reached.
  const addIntentDir = (dir: string, sourceKind: DocIndexEntry["sourceKind"]): void => {
    if (!fs.existsSync(dir)) return;
    for (const abs of findIntentFiles(dir)) {
      if (isFull()) return;
      addFile(path.relative(cwd, abs), abs, sourceKind);
    }
  };

  // 1. Intent docs (highest priority)
  addIntentDir(path.join(cwd, ".soly", "docs"), "intent");

  // 2. Phase intent docs — only if the phases dir exists
  const phasesRoot = path.join(cwd, ".soly", "phases");
  if (!isFull() && fs.existsSync(phasesRoot)) {
    try {
      for (const phase of fs.readdirSync(phasesRoot, { withFileTypes: true })) {
        if (isFull()) break;
        if (!phase.isDirectory()) continue;
        addIntentDir(path.join(phasesRoot, phase.name, "docs"), "phase-intent");
      }
    } catch {
      // best effort: an unreadable phases tree must not abort indexing
    }
  }

  // 3. Rest of the project (lowest priority). The project walk is only
  //    started when there is still room in the index.
  if (!isFull()) {
    for (const relPath of findMarkdownFiles(cwd)) {
      if (isFull()) break;
      const segments = toPosix(relPath).split("/");
      if (segments.some((s) => DOC_GLOBS_IGNORE.includes(s))) continue;
      addFile(relPath, path.join(cwd, relPath), "project");
    }
  }

  return out;
}
121
+
122
/**
 * Recursively collect intent doc files below `dir`.
 *
 * Entries whose name starts with "." are skipped (files and directories).
 * Only regular files with an extension listed in `INTENT_DOC_EXTS` are
 * returned. A missing or unreadable directory contributes nothing.
 *
 * @returns Full paths (each is `dir` joined with the relative location), in
 *          directory-listing order with subdirectories expanded in place.
 */
function findIntentFiles(dir: string): string[] {
  const found: string[] = [];

  const walk = (current: string): void => {
    if (!fs.existsSync(current)) return;

    let children: fs.Dirent[];
    try {
      children = fs.readdirSync(current, { withFileTypes: true });
    } catch {
      return;
    }

    for (const child of children) {
      if (child.name.startsWith(".")) continue;
      const childPath = path.join(current, child.name);
      if (child.isDirectory()) {
        walk(childPath);
        continue;
      }
      if (!child.isFile()) continue;
      const extension = path.extname(child.name).toLowerCase();
      if (INTENT_DOC_EXTS.includes(extension)) found.push(childPath);
    }
  };

  walk(dir);
  return found;
}
145
+
146
/** A single search result produced by `searchDocs`. */
export interface DocSearchHit {
  /** The index entry that matched. */
  entry: DocIndexEntry;
  /** Final ranking score, with `priorityBonus` already included. */
  score: number;
  /** Preview excerpts around the places the query matched (at most 3). */
  excerpts: string[];
  /** Source-priority bonus that was applied (intent > phase-intent > project). */
  priorityBonus: number;
}
154
+
155
/** Score bonus added to every hit, keyed by where the document came from. */
const SOURCE_PRIORITY_BONUS: Record<DocIndexEntry["sourceKind"], number> = {
  intent: 5,
  "phase-intent": 3,
  project: 0,
};

/** Display tag for each document source (used in search output). */
const SOURCE_TAG: Record<DocIndexEntry["sourceKind"], string> = {
  intent: "[intent]",
  "phase-intent": "[phase-intent]",
  project: "[project]",
};

/**
 * Search the doc index by case-insensitive substring scoring.
 *
 * The query is split on whitespace into distinct tokens. For each token:
 *   - a title match adds 3,
 *   - a relative-path match adds 2,
 *   - every occurrence in the preview adds 1.
 * Entries with a non-zero score additionally receive their source-priority
 * bonus, so intent docs outrank project docs on otherwise equal matches.
 *
 * @param index Entries to search (typically from `buildDocIndex`).
 * @param query Free-text query; blank queries return no hits.
 * @param limit Maximum number of hits; values <= 0 return no hits.
 * @returns Hits sorted by descending score. Ties keep index order.
 */
export function searchDocs(index: DocIndexEntry[], query: string, limit = 10): DocSearchHit[] {
  const normalized = query.trim().toLowerCase();
  if (!normalized) return [];
  // Distinct tokens only: repeating a word in the query must not multiply
  // its contribution to the score.
  const tokens = Array.from(new Set(normalized.split(/\s+/).filter(Boolean)));
  const hits: DocSearchHit[] = [];

  for (const entry of index) {
    const titleLower = entry.title.toLowerCase();
    const previewLower = entry.preview.toLowerCase();
    const relPathLower = entry.relPath.toLowerCase();

    let score = 0;
    const excerpts: string[] = [];

    for (const token of tokens) {
      if (titleLower.includes(token)) score += 3;
      if (relPathLower.includes(token)) score += 2;

      const first = previewLower.indexOf(token);
      if (first < 0) continue;
      // Number of non-overlapping occurrences of the token in the preview.
      score += previewLower.split(token).length - 1;

      if (excerpts.length >= 3) continue;
      // Window of ~40 chars before and ~60 chars after the first match.
      const start = Math.max(0, first - 40);
      const end = Math.min(entry.preview.length, first + token.length + 60);
      const excerpt =
        (start > 0 ? "…" : "") + entry.preview.slice(start, end) + (end < entry.preview.length ? "…" : "");
      // Different tokens often land in the same window; report it once.
      if (!excerpts.includes(excerpt)) excerpts.push(excerpt);
    }

    if (score > 0) {
      const priorityBonus = SOURCE_PRIORITY_BONUS[entry.sourceKind];
      hits.push({ entry, score: score + priorityBonus, excerpts, priorityBonus });
    }
  }

  hits.sort((a, b) => b.score - a.score);
  // Clamp so a negative limit means "no hits" instead of "all but the last n".
  return hits.slice(0, Math.max(0, limit));
}
212
+
213
/**
 * Look up the display tag for an entry's source, e.g. "[intent]".
 * Used when rendering search output.
 */
export function sourceTag(entry: DocIndexEntry): string {
  const { sourceKind } = entry;
  return SOURCE_TAG[sourceKind];
}
217
+
218
/**
 * Bounded file read for soly_snippet: returns a window of lines from a file.
 *
 * @param absPath Path of the file to read.
 * @param offset  Zero-based index of the first line to return; negative
 *                values are treated as 0.
 * @param limit   Maximum number of lines to return.
 * @returns `null` when `readIfExists` yields `null`. Otherwise the selected
 *          lines, the total line count (the content split on LF / CRLF), and
 *          `outOfRange`, which is `true` when further lines exist after the
 *          returned window.
 *          NOTE(review): despite its name, `outOfRange` stays `false` when
 *          `offset` lies beyond the end of the file — confirm callers expect
 *          this.
 */
export function readSnippet(
  absPath: string,
  offset = 0,
  limit = 100,
): { lines: string[]; totalLines: number; outOfRange: boolean } | null {
  const content = readIfExists(absPath);
  if (content === null) return null;

  const everyLine = content.split(/\r?\n/);
  const totalLines = everyLine.length;
  const first = Math.max(0, offset);
  const pastLast = Math.min(totalLines, first + limit);
  const lines = everyLine.slice(first, pastLast);

  return { lines, totalLines, outOfRange: pastLast < totalLines };
}
235
+
package/env.ts ADDED
@@ -0,0 +1,196 @@
1
+ // =============================================================================
2
+ // env.ts — Project environment summary for the soly extension
3
+ // =============================================================================
4
+ //
5
+ // Detects the project's runtime environment: package manager, node/bun
6
+ // version, key dependencies, and common services (postgres, redis, etc.).
7
+ // Used by the `soly_env` tool and injected into the system prompt as a
8
+ // short "## project env" section.
9
+ //
10
+ // All detection is best-effort. Missing files just skip their block.
11
+ // =============================================================================
12
+
13
+ import * as fs from "node:fs";
14
+ import * as path from "node:path";
15
+
16
/**
 * The subset of `package.json` fields this module declares. Every field is
 * optional because the manifest is parsed without validation.
 */
interface PackageJson {
  /** Package name. */
  name?: string;
  /** Package version. */
  version?: string;
  private?: boolean;
  type?: string;
  /** Script name -> command line. */
  scripts?: Record<string, string>;
  /** Dependency name -> version range. */
  dependencies?: Record<string, string>;
  /** Dev-dependency name -> version range. */
  devDependencies?: Record<string, string>;
  /** Engine name -> version range, e.g. `{ node: ">=20" }`. */
  engines?: Record<string, string>;
  /** Package manager spec as written in the manifest. */
  packageManager?: string;
  /** Workspace globs, either as a plain list or under a `packages` key. */
  workspaces?: string[] | { packages?: string[] };
}
28
+
29
/** Best-effort description of a project's environment, as produced by `detectEnv`. */
export interface EnvSummary {
  /** `name` from package.json, or null when unavailable. */
  projectName: string | null;
  /** `version` from package.json, or null when unavailable. */
  projectVersion: string | null;
  /** Human-readable runtime requirements, e.g. "node >=20". */
  runtimes: string[];
  /** `packageManager` from package.json, or null when unavailable. */
  packageManager: string | null;
  /** Up to 8 of the most relevant dependencies. */
  mainDependencies: string[];
  /** Up to 6 of the most common scripts that the project defines. */
  scripts: string[];
  /** Known service names found in the compose file. */
  services: string[];
  hasTypeScript: boolean;
  hasTests: boolean;
  hasDocker: boolean;
  hasCI: boolean;
}
42
+
43
/**
 * Heuristic: dependency names that look "main" rather than "peripheral".
 * Matching is by exact package name against the merged dependency maps.
 * NOTE(review): "@radix-ui/react" is not an installable package name (the
 * published packages are "@radix-ui/react-<component>"), so this hint
 * cannot match with exact-name lookup.
 */
const MAIN_DEP_HINTS = [
  "react", "vue", "svelte", "next", "nuxt", "remix", "astro",
  // NestJS is published as scoped packages; "@nestjs/core" is the framework core.
  "express", "fastify", "koa", "hapi", "nestjs", "@nestjs/core",
  "prisma", "drizzle-orm", "typeorm", "sequelize", "mongoose",
  "@earendil-works/pi-coding-agent", "@earendil-works/pi-ai",
  "typescript", "zod", "typebox",
  "tailwindcss", "@radix-ui/react",
];

/** Script names worth surfacing, in display order. */
const COMMON_SCRIPTS = [
  "dev", "build", "start", "test", "lint", "typecheck", "format", "check",
];

/**
 * Read and parse a JSON file.
 * @returns The parsed value cast to `T` (its shape is NOT validated), or
 *          `null` when the file cannot be read or is not valid JSON.
 */
function readJsonSafe<T>(p: string): T | null {
  try {
    return JSON.parse(fs.readFileSync(p, "utf-8")) as T;
  } catch {
    return null;
  }
}

/**
 * Read the first line of a text file, trimmed.
 * @returns `null` when the file cannot be read or its first line is blank.
 */
function readFirstLine(p: string): string | null {
  try {
    const content = fs.readFileSync(p, "utf-8");
    const first = content.split(/\r?\n/)[0]?.trim() ?? "";
    return first || null;
  } catch {
    return null;
  }
}

/**
 * Detect the project environment rooted at `cwd`.
 *
 * All detection is best-effort: missing or unreadable files simply leave the
 * corresponding fields at their defaults. Filesystem-based signals
 * (tsconfig.json, tests directories, Docker, CI) are evaluated whether or
 * not a package.json exists.
 *
 * @param cwd Project root directory.
 * @returns A fully populated `EnvSummary`; never throws for missing files.
 */
export function detectEnv(cwd: string): EnvSummary {
  const at = (...parts: string[]): string => path.join(cwd, ...parts);
  const exists = (...parts: string[]): boolean => fs.existsSync(at(...parts));

  const out: EnvSummary = {
    projectName: null,
    projectVersion: null,
    runtimes: [],
    packageManager: null,
    mainDependencies: [],
    scripts: [],
    services: [],
    hasTypeScript: false,
    hasTests: false,
    hasDocker: false,
    hasCI: false,
  };

  // package.json — only a JSON object is a usable manifest (`"text"`, `[]`
  // and similar valid-but-wrong JSON are treated as "no manifest").
  const parsed = readJsonSafe<PackageJson>(at("package.json"));
  const pkg = parsed !== null && typeof parsed === "object" && !Array.isArray(parsed) ? parsed : null;
  const allDeps: Record<string, string> = {
    ...(pkg?.dependencies ?? {}),
    ...(pkg?.devDependencies ?? {}),
  };

  if (pkg) {
    out.projectName = pkg.name ?? null;
    out.projectVersion = pkg.version ?? null;
    if (pkg.packageManager) out.packageManager = pkg.packageManager;

    // Engines
    if (pkg.engines) {
      for (const [k, v] of Object.entries(pkg.engines)) {
        out.runtimes.push(`${k} ${v}`);
      }
    }

    // Main dependencies — hinted names only, capped at 8
    out.mainDependencies = MAIN_DEP_HINTS.filter((h) => Boolean(allDeps[h])).slice(0, 8);

    // Scripts — only those in COMMON_SCRIPTS, capped at 6
    if (pkg.scripts) {
      out.scripts = COMMON_SCRIPTS.filter((s) => Boolean(pkg.scripts?.[s])).slice(0, 6);
    }
  }

  // Has TypeScript? (a tsconfig.json counts even without a package.json)
  out.hasTypeScript = "typescript" in allDeps || exists("tsconfig.json");

  // Has tests? (test directories count even without a package.json)
  out.hasTests =
    "vitest" in allDeps ||
    "jest" in allDeps ||
    "mocha" in allDeps ||
    "@playwright/test" in allDeps ||
    exists("tests") ||
    exists("__tests__");

  // Compose file: legacy names first (unchanged precedence for existing
  // projects), then the current default names `compose.yaml` / `compose.yml`.
  const composeName =
    ["docker-compose.yml", "docker-compose.yaml", "compose.yaml", "compose.yml"].find((name) =>
      exists(name),
    ) ?? null;

  // Has Docker?
  out.hasDocker = exists("Dockerfile") || composeName !== null;

  // Has CI?
  const ciMarkers = [".github/workflows", ".gitlab-ci.yml", ".circleci", ".buildkite"];
  out.hasCI = ciMarkers.some((marker) => exists(marker));

  // Services — scan the compose file for known service names (whole words)
  if (composeName !== null) {
    try {
      const text = fs.readFileSync(at(composeName), "utf-8");
      const serviceHints = ["postgres", "redis", "mysql", "mongo", "rabbitmq", "kafka", "nginx", "traefik"];
      out.services = serviceHints.filter((s) => new RegExp(`\\b${s}\\b`, "i").test(text));
    } catch {
      // ignore: an unreadable compose file just means no detected services
    }
  }

  // .nvmrc / .node-version — both pin the Node version; .nvmrc wins when
  // both exist so the runtime is not listed twice.
  const nodeVersion = readFirstLine(at(".nvmrc")) ?? readFirstLine(at(".node-version"));
  if (nodeVersion) out.runtimes.push(`node ${nodeVersion}`);

  // .tool-versions — only the first line is reported
  const toolVersions = readFirstLine(at(".tool-versions"));
  if (toolVersions) out.runtimes.push(`asdf ${toolVersions.replace(/\s+/g, " ")}`);

  return out;
}
161
+
162
/**
 * Render the short "## project env" section injected into the system prompt.
 *
 * One bullet is emitted per non-empty piece of information (name, package
 * manager, runtimes, key deps, scripts, tooling flags, services).
 *
 * @param env Summary produced by `detectEnv`.
 * @returns The section text, starting with a blank line, or "" when there is
 *          nothing at all to report.
 */
export function buildEnvSection(env: EnvSummary): string {
  const bullets: string[] = [];

  if (env.projectName) {
    const version = env.projectVersion ? ` @ ${env.projectVersion}` : "";
    bullets.push(`- **name**: ${env.projectName}${version}`);
  }
  if (env.packageManager) {
    bullets.push(`- **package manager**: ${env.packageManager}`);
  }
  if (env.runtimes.length > 0) {
    bullets.push(`- **runtimes**: ${env.runtimes.join(", ")}`);
  }
  if (env.mainDependencies.length > 0) {
    bullets.push(`- **key deps**: ${env.mainDependencies.join(", ")}`);
  }
  if (env.scripts.length > 0) {
    // Each script name is wrapped in backticks.
    bullets.push(`- **scripts**: \`${env.scripts.join("`, `")}\``);
  }

  const flags: string[] = [];
  if (env.hasTypeScript) flags.push("ts");
  if (env.hasTests) flags.push("tests");
  if (env.hasDocker) flags.push("docker");
  if (env.hasCI) flags.push("ci");
  if (flags.length > 0) {
    bullets.push(`- **tooling**: ${flags.join(", ")}`);
  }
  if (env.services.length > 0) {
    bullets.push(`- **services**: ${env.services.join(", ")}`);
  }

  // Decide emptiness from what was actually rendered. Checking only name /
  // runtimes / package manager dropped detected deps, scripts, tooling and
  // services for projects that lack those three fields.
  if (bullets.length === 0) return "";
  return ["", "## project env", "", ...bullets].join("\n");
}
package/git.ts ADDED
@@ -0,0 +1,95 @@
1
+ // =============================================================================
2
+ // git.ts — Git context provider for the soly extension
3
+ // =============================================================================
4
+ //
5
+ // Reads current git state (branch, status, last 5 commits) and renders a
6
+ // short section to inject into the system prompt. The model gets immediate
7
+ // awareness of what's changed recently and what's uncommitted.
8
+ //
9
+ // All git calls go through `git ...` with a 2s timeout. Failures are silent
10
+ // (no git, not a repo, no network) — the section just doesn't render.
11
+ // =============================================================================
12
+
13
+ import { execFile } from "node:child_process";
14
+ import { promisify } from "node:util";
15
+
16
const execFileAsync = promisify(execFile);

/** Upper bound for a single git invocation, in milliseconds. */
const TIMEOUT_MS = 2000;

/** Outcome of one git invocation. */
interface GitResult {
  /** False when git could not be run, failed, timed out, or overflowed the buffer. */
  ok: boolean;
  /** Command output without trailing whitespace; "" on failure. */
  stdout: string;
}

/**
 * Remove trailing whitespace (including the final newline) from git output.
 * Leading whitespace is preserved on purpose: in `git status --short` the
 * first column of every line is a status code, and a blank there is
 * significant (" M file" = modified but not staged).
 */
function trimGitOutput(raw: string): string {
  return raw.replace(/\s+$/, "");
}

/**
 * Run `git <args>` in `cwd` without ever throwing.
 *
 * @param args Arguments passed to git (no shell is involved).
 * @param cwd  Working directory for the command.
 * @returns `{ ok: true, stdout }` on success; `{ ok: false, stdout: "" }` on
 *          any failure (git missing, non-zero exit, timeout, output larger
 *          than 64 KiB).
 */
async function safeGit(args: string[], cwd: string): Promise<GitResult> {
  try {
    const { stdout } = await execFileAsync("git", args, {
      cwd,
      timeout: TIMEOUT_MS,
      maxBuffer: 64 * 1024,
      encoding: "utf-8",
    });
    return { ok: true, stdout: trimGitOutput(stdout) };
  } catch {
    return { ok: false, stdout: "" };
  }
}
38
+
39
/** Snapshot of the repository state, as returned by `readGitContext`. */
export interface GitContext {
  /** False when git is missing or the directory is not a repository. */
  available: boolean;
  /** Branch name reported by git, or null when unavailable. */
  branch: string | null;
  /** Output of `git status --short`, "" for a clean tree, null if it failed. */
  statusShort: string | null;
  /** One-line summaries of the most recent commits, newest first. */
  lastCommits: string[];
}
45
+
46
/**
 * Read the current git state of `cwd`.
 *
 * @param cwd Directory to inspect.
 * @returns `available: false` (with empty fields) when the initial
 *          `git rev-parse` fails, i.e. git is missing or `cwd` is not a
 *          usable repository. Otherwise the branch name (null when HEAD is
 *          detached), the short status (null if that command failed) and up
 *          to five recent commit summaries.
 */
export async function readGitContext(cwd: string): Promise<GitContext> {
  const head = await safeGit(["rev-parse", "--abbrev-ref", "HEAD"], cwd);
  if (!head.ok) {
    return { available: false, branch: null, statusShort: null, lastCommits: [] };
  }

  // Status and log are independent of each other, so run them concurrently.
  const [status, log] = await Promise.all([
    safeGit(["status", "--short"], cwd),
    safeGit(["log", "--oneline", "-5", "--no-decorate"], cwd),
  ]);

  // `rev-parse --abbrev-ref HEAD` prints the literal "HEAD" when no branch is
  // checked out. Report that as null so consumers can render a detached
  // state instead of a branch that appears to be called "HEAD".
  const branch = head.stdout && head.stdout !== "HEAD" ? head.stdout : null;

  return {
    available: true,
    branch,
    statusShort: status.ok ? status.stdout : null,
    lastCommits: log.ok && log.stdout ? log.stdout.split(/\r?\n/).filter(Boolean) : [],
  };
}
63
+
64
/**
 * Render the short "## current git state" section for the system prompt.
 *
 * The section lists the branch ("(detached)" when `branch` is null), the
 * working-tree state with at most 10 changed files inlined, and at most 5
 * recent commits.
 *
 * @param ctx Git state from `readGitContext`.
 * @returns The section text, starting with a blank line, or "" when git is
 *          not available.
 */
export function buildGitSection(ctx: GitContext): string {
  if (!ctx.available) return "";

  const { statusShort, lastCommits } = ctx;
  const section: string[] = [
    "",
    "## current git state",
    "",
    `- **branch**: ${ctx.branch ?? "(detached)"}`,
  ];

  if (statusShort === "") {
    section.push("- **working tree**: clean");
  } else if (statusShort !== null) {
    const changed = statusShort.split(/\r?\n/).filter((line) => line.length > 0);
    section.push(`- **working tree**: ${changed.length} changed file(s)`);
    // Inline the first 10 for visibility — the full list is available via `soly diff`
    section.push(...changed.slice(0, 10).map((entry) => ` - ${entry}`));
    const hidden = changed.length - 10;
    if (hidden > 0) {
      section.push(` - ... and ${hidden} more (run \`soly diff\` for full)`);
    }
  }

  if (lastCommits.length > 0) {
    section.push("- **recent commits**:");
    section.push(...lastCommits.slice(0, 5).map((commit) => ` - ${commit}`));
  }

  return section.join("\n");
}
package/html.ts ADDED
@@ -0,0 +1,157 @@
1
+ // =============================================================================
2
+ // html.ts — Shared HTML utilities (shared between intent and docs loaders)
3
+ // =============================================================================
4
+ //
5
+ // Single source of truth for parsing `.html`/`.htm` intent docs and for
6
+ // stripping HTML tags from arbitrary text. Previously duplicated in
7
+ // intent.ts and docs.ts — extracting here lets the test suite cover
8
+ // one parser instead of two, and prevents drift.
9
+ //
10
+ // Public API:
11
+ // - stripHtml(html) — strip tags + decode common entities
12
+ // - extractHtmlMeta(html) — pull <title> / <h1> / <meta description>
13
+ // - extractTitleAndPreview(raw, ext, opts?) — unified .md / .html title + preview extraction
14
+ //
15
+ // All functions are pure (no I/O) — they accept a string and return
16
+ // a string (or a small object). They never read the filesystem.
17
+ // =============================================================================
18
+
19
const HTML_TAG_RE = /<[^>]+>/g;
const HTML_STYLE_RE = /<style[\s\S]*?<\/style>/gi;
const HTML_SCRIPT_RE = /<script[\s\S]*?<\/script>/gi;
const HTML_COMMENT_RE = /<!--[\s\S]*?-->/g;
const HTML_TITLE_RE = /<title[^>]*>([\s\S]*?)<\/title>/i;
const HTML_H1_RE = /<h1[^>]*>([\s\S]*?)<\/h1>/i;

// <meta name="description" content="...">. The closing quote must be the same
// character as the opening one (backreference), so an apostrophe inside a
// double-quoted value no longer truncates the description.
// Captures: [3] content.
const HTML_META_DESC_RE = /<meta\s+name=(["'])description\1\s+content=(["'])([\s\S]*?)\2/i;
// Same tag with the attributes in the opposite order. Captures: [2] content.
const HTML_META_DESC_REVERSED_RE = /<meta\s+content=(["'])([\s\S]*?)\1\s+name=(["'])description\3/i;

// Markdown frontmatter (YAML-ish). Captures: [1] body after the closing `---`.
const MD_FRONTMATTER_RE = /^---\r?\n[\s\S]*?\r?\n---\r?\n?([\s\S]*)$/;

// Entities decoded by stripHtml. They are decoded in ONE pass so the output
// of one replacement is never re-interpreted as another entity.
const HTML_ENTITY_RE = /&(?:nbsp|amp|lt|gt|quot|apos|#39|#x27);/g;
const HTML_ENTITIES: Readonly<Record<string, string>> = {
  "&nbsp;": " ",
  "&amp;": "&",
  "&lt;": "<",
  "&gt;": ">",
  "&quot;": '"',
  "&apos;": "'",
  "&#39;": "'",
  "&#x27;": "'",
};

/** Strip just the tags (used internally for title/description extraction). */
function stripTags(html: string): string {
  return html
    .replace(HTML_STYLE_RE, " ")
    .replace(HTML_SCRIPT_RE, " ")
    .replace(HTML_COMMENT_RE, " ")
    .replace(HTML_TAG_RE, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Strip HTML tags and decode common entities.
 *
 * Whitespace is collapsed before entities are decoded. Entities are decoded
 * exactly once: `&amp;lt;` becomes the text `&lt;`, not `<`.
 */
export function stripHtml(html: string): string {
  return stripTags(html).replace(HTML_ENTITY_RE, (entity) => HTML_ENTITIES[entity] ?? entity);
}

export interface HtmlMeta {
  title: string;
  description: string;
}

/**
 * Extract `<title>` / `<h1>` (fallback) / `<meta name="description">` from a
 * raw HTML document. Entities in the extracted text are decoded. Title is
 * capped at 200 chars, description at 300. Missing pieces come back as "".
 */
export function extractHtmlMeta(html: string): HtmlMeta {
  const titleInner = html.match(HTML_TITLE_RE)?.[1];
  const h1Inner = html.match(HTML_H1_RE)?.[1];
  const descriptionRaw =
    html.match(HTML_META_DESC_RE)?.[3] ?? html.match(HTML_META_DESC_REVERSED_RE)?.[2] ?? "";

  // Prefer <title>; fall back to the first <h1> when the title is missing or
  // empty after stripping.
  const title = (titleInner ? stripHtml(titleInner) : "") || (h1Inner ? stripHtml(h1Inner) : "");
  const description = descriptionRaw ? stripHtml(descriptionRaw) : "";

  return {
    title: title.slice(0, 200),
    description: description.slice(0, 300),
  };
}

export interface ExtractedDoc {
  title: string;
  preview: string;
}

/**
 * Drop fenced code blocks (``` ... ```), including the fence lines, so code
 * never ends up as a title or in a preview. A fence toggles the state: once
 * opened, everything up to the next fence line is skipped.
 */
function dropFencedCode(lines: string[]): string[] {
  const kept: string[] = [];
  let inFence = false;
  for (const line of lines) {
    if (line.trim().startsWith("```")) {
      inFence = !inFence;
      continue; // skip the fence line itself
    }
    if (!inFence) kept.push(line);
  }
  return kept;
}

/**
 * Unified title + preview extractor for both `.md` and `.html`/`.htm` files.
 *
 * Markdown path: strips YAML frontmatter and fenced code, picks the first H1
 * as title (or the first non-blank line, without heading markers, as
 * fallback) and joins the non-heading lines into the preview.
 * HTML path: delegates to `extractHtmlMeta`; the preview is the meta
 * description, or the stripped document text when there is none.
 *
 * The title is capped at 120 chars and the preview at `maxPreview` on both
 * paths.
 *
 * @param raw  Full file content
 * @param ext  Lowercase file extension, e.g. ".md", ".html", ".htm"
 * @param opts `maxPreview` caps the preview string (default 200)
 */
export function extractTitleAndPreview(
  raw: string,
  ext: ".md" | ".html" | ".htm",
  opts: { maxPreview?: number } = {},
): ExtractedDoc {
  const maxPreview = opts.maxPreview ?? 200;

  if (ext === ".html" || ext === ".htm") {
    const { title, description } = extractHtmlMeta(raw);
    return {
      title: title.slice(0, 120),
      // The cap applies to the description as well (it can be up to 300
      // chars); previously only the stripped-body fallback was capped.
      preview: (description || stripHtml(raw)).slice(0, maxPreview),
    };
  }

  // Markdown path
  const body = raw.match(MD_FRONTMATTER_RE)?.[1] ?? raw;
  const bodyLines = dropFencedCode(body.split(/\r?\n/));

  let title = "";
  for (const line of bodyLines) {
    const h1 = line.match(/^#\s+(.+)$/);
    if (h1?.[1]) {
      title = h1[1].trim();
      break;
    }
  }
  if (!title) {
    // Fallback: first non-blank line. If that line is a lower-level heading
    // ("## Overview"), drop the markers so the title is plain text.
    const firstText = bodyLines.map((line) => line.trim()).find((line) => line.length > 0) ?? "";
    title = firstText.replace(/^#{1,6}\s+/, "");
  }

  const meaningful = bodyLines
    .filter((line) => line.trim() && !line.startsWith("#"))
    .join(" ")
    .replace(/\s+/g, " ")
    .trim();

  return {
    title: title.slice(0, 120),
    preview: meaningful.slice(0, maxPreview),
  };
}