tgo-wiki 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/CHANGELOG.md +32 -0
  2. package/README.md +255 -0
  3. package/docs/mcp-usage.md +631 -0
  4. package/docs/v0-acceptance.md +105 -0
  5. package/docs/v0-delivery-checklist.md +57 -0
  6. package/docs/v1-acceptance.md +39 -0
  7. package/docs/v2-acceptance.md +165 -0
  8. package/package.json +69 -0
  9. package/packages/core/src/config/config-loader.ts +109 -0
  10. package/packages/core/src/config/defaults.ts +74 -0
  11. package/packages/core/src/config/workspace-resolver.ts +40 -0
  12. package/packages/core/src/documents/command-document-parser.ts +206 -0
  13. package/packages/core/src/documents/document-id.ts +26 -0
  14. package/packages/core/src/documents/document-parser-registry.ts +126 -0
  15. package/packages/core/src/documents/document-service.ts +656 -0
  16. package/packages/core/src/documents/document-store.ts +132 -0
  17. package/packages/core/src/documents/document-types.ts +33 -0
  18. package/packages/core/src/documents/pdf-text-parser.ts +35 -0
  19. package/packages/core/src/documents/text-markdown-parser.ts +50 -0
  20. package/packages/core/src/errors.ts +46 -0
  21. package/packages/core/src/git/git-service.ts +68 -0
  22. package/packages/core/src/index.ts +38 -0
  23. package/packages/core/src/markdown/markdown-scanner.ts +90 -0
  24. package/packages/core/src/permissions/permission-service.ts +50 -0
  25. package/packages/core/src/publish/publish-service.ts +142 -0
  26. package/packages/core/src/result.ts +13 -0
  27. package/packages/core/src/services/session-workflow-service.ts +493 -0
  28. package/packages/core/src/services/wiki-service.ts +119 -0
  29. package/packages/core/src/services/workspace-service.ts +223 -0
  30. package/packages/core/src/session/session-id.ts +14 -0
  31. package/packages/core/src/session/session-service.ts +77 -0
  32. package/packages/core/src/session/session-store.ts +91 -0
  33. package/packages/core/src/session/session-types.ts +17 -0
  34. package/packages/core/src/sources/source-id.ts +19 -0
  35. package/packages/core/src/sources/source-paths.ts +15 -0
  36. package/packages/core/src/sources/source-service.ts +416 -0
  37. package/packages/core/src/sources/source-types.ts +77 -0
  38. package/packages/core/src/sources/source-validator.ts +132 -0
  39. package/packages/core/src/sources/source-writer.ts +419 -0
  40. package/packages/core/src/validation/frontmatter-validator.ts +128 -0
  41. package/packages/core/src/validation/link-validator.ts +55 -0
  42. package/packages/core/src/validation/path-validator.ts +65 -0
  43. package/packages/core/src/validation/source-reference-validator.ts +191 -0
  44. package/packages/core/src/validation/validation-service.ts +106 -0
  45. package/packages/core/src/vfs/vfs-command-parser.ts +69 -0
  46. package/packages/core/src/vfs/vfs-service.ts +498 -0
  47. package/packages/core/src/web/html-to-markdown.ts +144 -0
  48. package/packages/core/src/web/static-web-fetcher.ts +537 -0
  49. package/packages/core/src/web/web-id.ts +26 -0
  50. package/packages/core/src/web/web-ingestion-service.ts +335 -0
  51. package/packages/core/src/web/web-paths.ts +6 -0
  52. package/packages/core/src/web/web-types.ts +33 -0
  53. package/packages/server/src/cli.ts +56 -0
  54. package/packages/server/src/context.ts +7 -0
  55. package/packages/server/src/index.ts +2 -0
  56. package/packages/server/src/mcp-server.ts +111 -0
  57. package/packages/server/src/schemas/documents.ts +17 -0
  58. package/packages/server/src/schemas/read.ts +16 -0
  59. package/packages/server/src/schemas/session.ts +31 -0
  60. package/packages/server/src/schemas/sources.ts +12 -0
  61. package/packages/server/src/schemas/web.ts +23 -0
  62. package/packages/server/src/tools/document-tools.ts +46 -0
  63. package/packages/server/src/tools/publish-tools.ts +33 -0
  64. package/packages/server/src/tools/read-tools.ts +52 -0
  65. package/packages/server/src/tools/response.ts +24 -0
  66. package/packages/server/src/tools/session-tools.ts +100 -0
  67. package/packages/server/src/tools/source-tools.ts +32 -0
  68. package/packages/server/src/tools/web-tools.ts +26 -0
@@ -0,0 +1,498 @@
1
+ import { lstat, readdir, readFile, realpath } from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { resolveWorkspacePaths, type WorkspacePaths } from "../config/workspace-resolver.js";
4
+ import { WikiError } from "../errors.js";
5
+ import { err, ok, type Result } from "../result.js";
6
+ import { parseVfsCommand, type VfsCommand } from "./vfs-command-parser.js";
7
+
8
/** Input accepted by `VfsService.exec`. */
export type VfsExecInput = {
  /** Raw command line; it is handed to `parseVfsCommand` unchanged. */
  command: string;
  /** Ref to read from. Omitted means "stable"; `exec` rejects every other value. */
  ref?: "stable" | string;
};

/** Shell-like result returned by a successful `VfsService.exec` call. */
export type VfsExecResult = {
  /** Text produced by the command. */
  stdout: string;
  /** Error text; `exec` sets this to the empty string on success. */
  stderr: string;
  /** Exit status; `exec` sets this to 0 on success. */
  exit_code: number;
  /** Working directory reported to the caller. */
  cwd: "wiki";
  /** Ref the command was executed against. */
  ref: "stable";
};
20
+
21
/**
 * Read-only, shell-like access to the files under `wiki/` in the stable worktree.
 *
 * A command line is parsed by `parseVfsCommand` and dispatched to one of the private
 * handlers below. Every path is normalised into the `wiki/` namespace and verified
 * against the real wiki root before anything is listed or read.
 */
export class VfsService {
  private readonly paths: WorkspacePaths;

  /** @param workspaceRoot Workspace directory handed to `resolveWorkspacePaths`. */
  constructor(workspaceRoot: string) {
    this.paths = resolveWorkspacePaths(workspaceRoot);
  }

  /**
   * Parses and runs a single VFS command.
   *
   * @returns `ok` with the command output, or `err` with a `WikiError`. Errors that are
   * not already `WikiError`s (for example file-system errors) are wrapped as `invalid_path`.
   */
  async exec(input: VfsExecInput): Promise<Result<VfsExecResult>> {
    if ((input.ref ?? "stable") !== "stable") {
      return err(new WikiError("unsupported_channel", `Unsupported VFS ref: ${input.ref}`));
    }

    try {
      const command = parseVfsCommand(input.command);
      const stdout = await this.run(command);

      return ok({
        stdout,
        stderr: "",
        exit_code: 0,
        cwd: "wiki",
        ref: "stable"
      });
    } catch (error) {
      if (error instanceof WikiError) {
        return err(error);
      }

      return err(new WikiError("invalid_path", error instanceof Error ? error.message : String(error)));
    }
  }

  /** Dispatches a parsed command to its handler and returns the handler's stdout. */
  private async run(command: VfsCommand): Promise<string> {
    switch (command.name) {
      case "pwd":
        return "wiki\n";
      case "ls":
        return this.ls(command.args);
      case "find":
        return this.find(command.args);
      case "tree":
        return this.tree(command.args);
      case "cat":
        return this.cat(command.args);
      case "head":
        return this.head(command.args);
      case "tail":
        return this.tail(command.args);
      case "wc":
        return this.wc(command.args);
      case "grep":
        return this.grep(command.args);
    }
  }

  /** Lists one directory, sorted, with a trailing `/` on sub-directories. */
  private async ls(args: string[]): Promise<string> {
    const parsed = parseLsArgs(args);
    const target = await this.resolveDirectory(parsed.path);
    const entries = await readdir(target, { withFileTypes: true });

    return entries
      .map(entry => `${entry.name}${entry.isDirectory() ? "/" : ""}`)
      .sort()
      .join("\n")
      .concat(entries.length > 0 ? "\n" : "");
  }

  /** Lists files below a directory whose names end with the requested extension. */
  private async find(args: string[]): Promise<string> {
    const parsed = parseFindArgs(args);
    const root = await this.resolveDirectory(parsed.path);
    const files = await this.collectMarkdownFiles(root, parsed.extensions, parsed.maxDepth);

    return formatLines(files);
  }

  /** Lists all `.md` files below a directory, with the leading `wiki/` removed. */
  private async tree(args: string[]): Promise<string> {
    const parsed = parseTreeArgs(args);
    const target = await this.resolveDirectory(parsed.path);
    const files = await this.collectMarkdownFiles(target);

    return formatLines(files.map(file => file.replace(/^wiki\//, "")));
  }

  /** Returns the full content of one file. */
  private async cat(args: string[]): Promise<string> {
    const file = await this.readCommandFile(parseSingleFilePath(args, "cat"));
    return file.content;
  }

  /** Returns the first lines of one file (`-n <count>`, default 10). */
  private async head(args: string[]): Promise<string> {
    const { count, paths } = parseLineCount(args);
    const file = await this.readCommandFile(singlePathOrRequired(paths, "head"));
    return firstLines(file.content, count);
  }

  /** Returns the last lines of one file (`-n <count>`, default 10). */
  private async tail(args: string[]): Promise<string> {
    const { count, paths } = parseLineCount(args);
    const file = await this.readCommandFile(singlePathOrRequired(paths, "tail"));
    return lastLines(file.content, count);
  }

  /**
   * Reports `<lines> <words> <bytes> <path>` for one file. A final line without a
   * trailing newline is still counted as a line; bytes are measured in UTF-8.
   */
  private async wc(args: string[]): Promise<string> {
    const file = await this.readCommandFile(parseSingleFilePath(args, "wc"));
    const lines = file.content.length === 0 ? 0 : file.content.split("\n").length - (file.content.endsWith("\n") ? 1 : 0);
    const words = file.content.trim().length === 0 ? 0 : file.content.trim().split(/\s+/).length;
    const bytes = Buffer.byteLength(file.content, "utf8");

    return `${lines} ${words} ${bytes} ${file.vfsPath}\n`;
  }

  /**
   * Prints every line containing the pattern as a plain substring (no regular
   * expressions). Lines are prefixed with the file path when more than one file is
   * searched or `-R`/`-r` was given, and with the line number when `-n` was given.
   */
  private async grep(args: string[]): Promise<string> {
    const parsed = parseGrepArgs(args);

    // Without an explicit path the whole wiki is the search target.
    if (parsed.paths.length === 0) {
      parsed.paths.push("wiki");
    }

    const matches: string[] = [];
    const files = await this.expandGrepFiles(parsed.paths, parsed.recursive);

    for (const filePath of files) {
      const file = await this.readCommandFile(filePath);
      const lines = file.content.split("\n");

      for (const [index, line] of lines.entries()) {
        if (line.includes(parsed.pattern)) {
          const prefix = files.length > 1 || parsed.recursive ? `${file.vfsPath}:` : "";
          const lineNumber = parsed.lineNumbers ? `${index + 1}:` : "";
          matches.push(`${prefix}${lineNumber}${line}`);
        }
      }
    }

    return formatLines(matches);
  }

  /** Normalises a command path, validates it and reads the file as UTF-8. */
  private async readCommandFile(input: string | undefined): Promise<{ content: string; vfsPath: string }> {
    const vfsPath = normalizeVfsPath(input);
    const filePath = await this.resolveFile(vfsPath);
    const content = await readFile(filePath, "utf8");

    return { content, vfsPath };
  }

  /** Resolves a VFS path to an absolute path that is verified to be inside the wiki root. */
  private async resolveFile(input: string): Promise<string> {
    const filePath = this.resolvePath(input);
    await ensureReadableTargetInsideWikiRoot(filePath, this.wikiRootPath());
    return filePath;
  }

  /** Like `resolveFile`, but additionally requires the target to be a directory. */
  private async resolveDirectory(input: string): Promise<string> {
    const directoryPath = this.resolvePath(input);
    await ensureReadableTargetInsideWikiRoot(directoryPath, this.wikiRootPath());
    const stat = await lstat(directoryPath);

    if (!stat.isDirectory()) {
      throw new WikiError("invalid_path", `Path must be a directory: ${input}`);
    }

    return directoryPath;
  }

  /** Maps a VFS path onto the stable worktree without touching the file system. */
  private resolvePath(input: string): string {
    const vfsPath = normalizeVfsPath(input);
    return path.resolve(this.paths.stableWorktreePath, vfsPath);
  }

  /** Absolute path of the wiki root inside the stable worktree. */
  private wikiRootPath(): string {
    return path.join(this.paths.stableWorktreePath, this.paths.wikiRootName);
  }

  /**
   * Turns grep path arguments into a sorted list of VFS file paths. Directories are
   * only accepted in recursive mode and contribute their `.md` files.
   */
  private async expandGrepFiles(inputs: string[], recursive: boolean): Promise<string[]> {
    const files: string[] = [];

    for (const input of inputs) {
      const absolute = this.resolvePath(input);
      await ensureReadableTargetInsideWikiRoot(absolute, this.wikiRootPath());
      const stat = await lstat(absolute);

      if (stat.isDirectory()) {
        if (!recursive) {
          throw new WikiError("vfs_command_denied", `grep requires -R for directories: ${input}`);
        }

        files.push(...(await this.collectMarkdownFiles(absolute)));
        continue;
      }

      if (stat.isFile()) {
        files.push(toVfsPath(absolute, this.paths.stableWorktreePath));
        continue;
      }

      throw new WikiError("invalid_path", `Path must be a file or directory: ${input}`);
    }

    return files.sort();
  }

  /**
   * Recursively collects files whose names end with one of `extensions`.
   *
   * @param root Absolute directory to scan.
   * @param extensions Accepted file-name suffixes.
   * @param maxDepth Depth limit in `find -maxdepth` terms: 0 is the starting directory
   * itself, 1 its direct entries, and so on.
   * @param depth Depth of `root` relative to the starting directory.
   * @returns Sorted VFS paths relative to the stable worktree.
   */
  private async collectMarkdownFiles(
    root: string,
    extensions = [".md"],
    maxDepth = Number.POSITIVE_INFINITY,
    depth = 0
  ): Promise<string[]> {
    // Entries of `root` live at depth + 1. With `-maxdepth 0` only the starting
    // directory itself is in range, and a directory is never a matching file.
    if (depth >= maxDepth) {
      return [];
    }

    const entries = await readdir(root, { withFileTypes: true });
    const files = await Promise.all(
      entries.map(async entry => {
        const absolute = path.join(root, entry.name);
        await ensureReadableTargetInsideWikiRoot(absolute, this.wikiRootPath());

        if (entry.isDirectory()) {
          return depth + 1 < maxDepth ? this.collectMarkdownFiles(absolute, extensions, maxDepth, depth + 1) : [];
        }

        if (entry.isFile() && extensions.some(extension => entry.name.endsWith(extension))) {
          return [toVfsPath(absolute, this.paths.stableWorktreePath)];
        }

        return [];
      })
    );

    return files.flat().sort();
  }
}
246
+
247
/**
 * Converts a user-supplied path into a VFS path rooted at `wiki`.
 *
 * Backslashes are treated as separators, and relative paths that do not already start
 * with `wiki` are placed below it.
 *
 * @throws WikiError `invalid_path` when the path is missing, absolute, or contains `..`.
 */
function normalizeVfsPath(input: string | undefined): string {
  if (!input) {
    throw new WikiError("invalid_path", "Path is required");
  }

  const slashed = input.replace(/\\/g, "/");
  const looksAbsolute = path.posix.isAbsolute(slashed) || /^[A-Za-z]:\//.test(slashed);

  if (looksAbsolute) {
    throw new WikiError("invalid_path", `Path must be relative: ${input}`);
  }

  if (slashed.split("/").includes("..")) {
    throw new WikiError("invalid_path", `Path escapes wiki root: ${input}`);
  }

  const normalized = path.posix.normalize(slashed);
  let vfsPath: string;

  if (normalized === ".") {
    vfsPath = "wiki";
  } else if (normalized === "wiki" || normalized.startsWith("wiki/")) {
    vfsPath = normalized;
  } else {
    vfsPath = `wiki/${normalized}`;
  }

  // Defensive re-check of the invariant established above.
  const insideWiki = vfsPath === "wiki" || vfsPath.startsWith("wiki/");

  if (!insideWiki) {
    throw new WikiError("invalid_path", `Path must be inside wiki/: ${input}`);
  }

  return vfsPath;
}
273
+
274
/**
 * Parses `ls` arguments: an optional `--color never` pair and at most one path.
 *
 * @throws WikiError `vfs_command_denied` for any other option or a second path.
 */
function parseLsArgs(args: string[]): { path: string } {
  const positional: string[] = [];
  let cursor = 0;

  while (cursor < args.length) {
    const token = args[cursor];
    cursor += 1;

    if (token === "--color") {
      // The only accepted form is `--color never`; consume its value as well.
      if (args[cursor] !== "never") {
        throw new WikiError("vfs_command_denied", "--color requires never");
      }

      cursor += 1;
    } else if (token.startsWith("-")) {
      throw new WikiError("vfs_command_denied", `Unsupported ls option: ${token}`);
    } else {
      positional.push(token);
    }
  }

  return { path: singlePathOrDefault(positional, "ls") };
}
300
+
301
/**
 * Parses `find` arguments: `-name "*.md"|"*.markdown"`, `-maxdepth <n>` and at most one path.
 *
 * @returns The search path (default `wiki`), the file-name suffixes to match
 * (default `.md`) and the depth limit (default unlimited).
 * @throws WikiError `vfs_command_denied` for malformed or unsupported options.
 */
function parseFindArgs(args: string[]): { path: string; extensions: string[]; maxDepth: number } {
  const positional: string[] = [];
  let extensions = [".md"];
  let maxDepth = Number.POSITIVE_INFINITY;
  let cursor = 0;

  while (cursor < args.length) {
    const token = args[cursor];
    const operand = args[cursor + 1];

    switch (token) {
      case "-name": {
        if (!operand || operand.startsWith("-")) {
          throw new WikiError("vfs_command_denied", "-name requires a pattern");
        }

        if (operand !== "*.md" && operand !== "*.markdown") {
          throw new WikiError("vfs_command_denied", `Unsupported find -name pattern: ${operand}`);
        }

        // Drop the leading `*` so the remainder can be used as a suffix.
        extensions = [operand.slice(1)];
        cursor += 2;
        break;
      }
      case "-maxdepth": {
        const depth = Number.parseInt(operand ?? "", 10);

        // The round-trip comparison rejects inputs such as "2x" or "1.5".
        if (!Number.isInteger(depth) || depth < 0 || String(depth) !== operand) {
          throw new WikiError("vfs_command_denied", "-maxdepth requires a non-negative integer");
        }

        maxDepth = depth;
        cursor += 2;
        break;
      }
      default: {
        if (token.startsWith("-")) {
          throw new WikiError("vfs_command_denied", `Unsupported find option: ${token}`);
        }

        positional.push(token);
        cursor += 1;
      }
    }
  }

  return { path: singlePathOrDefault(positional, "find"), extensions, maxDepth };
}
347
+
348
/**
 * Parses `tree` arguments: no options are supported, only an optional single path.
 *
 * @throws WikiError `vfs_command_denied` for any option or a second path.
 */
function parseTreeArgs(args: string[]): { path: string } {
  const option = args.find(candidate => candidate.startsWith("-"));

  if (option !== undefined) {
    throw new WikiError("vfs_command_denied", `Unsupported tree option: ${option}`);
  }

  return { path: singlePathOrDefault([...args], "tree") };
}
361
+
362
/**
 * Parses `grep` arguments of the form `[-n] [-R|-r] <pattern> [path...]`.
 *
 * Flags are only recognised before the pattern; anything starting with `-` after
 * the pattern is rejected.
 *
 * @throws WikiError `vfs_command_denied` for unsupported options or a missing pattern.
 */
function parseGrepArgs(args: string[]): { pattern: string; paths: string[]; lineNumbers: boolean; recursive: boolean } {
  let lineNumbers = false;
  let recursive = false;
  let patternIndex = 0;

  // Consume leading flags; the first non-flag argument is the pattern.
  for (; patternIndex < args.length; patternIndex += 1) {
    const flag = args[patternIndex];

    if (flag === "-n") {
      lineNumbers = true;
    } else if (flag === "-R" || flag === "-r") {
      recursive = true;
    } else if (flag.startsWith("-")) {
      throw new WikiError("vfs_command_denied", `Unsupported grep option: ${flag}`);
    } else {
      break;
    }
  }

  const [pattern, ...paths] = args.slice(patternIndex);

  if (!pattern) {
    throw new WikiError("vfs_command_denied", "grep requires a pattern");
  }

  const trailingOption = paths.find(candidate => candidate.startsWith("-"));

  if (trailingOption !== undefined) {
    throw new WikiError("vfs_command_denied", `Unsupported grep option: ${trailingOption}`);
  }

  return { pattern, paths, lineNumbers, recursive };
}
410
+
411
/**
 * Returns the only path in `paths`, or `wiki` when none was given.
 *
 * @throws WikiError `vfs_command_denied` when more than one path is supplied.
 */
function singlePathOrDefault(paths: string[], command: string): string {
  const [first, ...extra] = paths;

  if (extra.length > 0) {
    throw new WikiError("vfs_command_denied", `${command} accepts at most one path`);
  }

  return first ?? "wiki";
}
418
+
419
/**
 * Parses the arguments of a command that takes exactly one file path and no options.
 *
 * @throws WikiError `vfs_command_denied` for any option or a wrong number of paths.
 */
function parseSingleFilePath(args: string[], command: string): string {
  const option = args.find(candidate => candidate.startsWith("-"));

  if (option !== undefined) {
    throw new WikiError("vfs_command_denied", `Unsupported ${command} option: ${option}`);
  }

  return singlePathOrRequired(args, command);
}
428
+
429
/**
 * Returns the single path in `paths`.
 *
 * @throws WikiError `vfs_command_denied` when an entry looks like an option or when
 * the number of paths is not exactly one.
 */
function singlePathOrRequired(paths: string[], command: string): string {
  const option = paths.find(candidate => candidate.startsWith("-"));

  if (option !== undefined) {
    throw new WikiError("vfs_command_denied", `Unsupported ${command} option: ${option}`);
  }

  if (paths.length === 1) {
    return paths[0];
  }

  throw new WikiError("vfs_command_denied", `${command} requires exactly one path`);
}
442
+
443
/**
 * Verifies that `filePath` is not itself a symlink and that its real location is the
 * wiki root or somewhere below it.
 *
 * @throws WikiError `invalid_path` when either check fails. Errors from `lstat` and
 * `realpath` (for example a missing file) propagate unchanged.
 */
async function ensureReadableTargetInsideWikiRoot(filePath: string, wikiRootPath: string): Promise<void> {
  const info = await lstat(filePath);

  if (info.isSymbolicLink()) {
    throw new WikiError("invalid_path", `Path is a symlink: ${filePath}`);
  }

  // Compare fully resolved paths so symlinked parent directories cannot escape the root.
  const resolvedRoot = await realpath(wikiRootPath);
  const resolvedTarget = await realpath(filePath);
  const isRoot = resolvedTarget === resolvedRoot;
  const isBelowRoot = resolvedTarget.startsWith(`${resolvedRoot}${path.sep}`);

  if (isRoot || isBelowRoot) {
    return;
  }

  throw new WikiError("invalid_path", `Path escapes wiki root: ${filePath}`);
}
457
+
458
/**
 * Parses the arguments shared by `head` and `tail`: an optional leading
 * `-n <count>` followed by the path arguments.
 *
 * The count must consist of decimal digits only. `Number.parseInt` alone would
 * silently accept values such as `5abc`, `3.7` or `1e3`.
 *
 * @returns The requested line count (10 when `-n` is absent) and the remaining arguments.
 * @throws WikiError `vfs_command_denied` for an invalid count or any other option.
 */
function parseLineCount(args: string[]): { count: number; paths: string[] } {
  if (args[0] === "-n") {
    const raw = args[1] ?? "";
    const count = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;

    if (!Number.isSafeInteger(count)) {
      throw new WikiError("vfs_command_denied", "Line count must be a non-negative integer");
    }

    return { count, paths: args.slice(2) };
  }

  const option = args.find(arg => arg.startsWith("-"));

  if (option !== undefined) {
    throw new WikiError("vfs_command_denied", `Unsupported line option: ${option}`);
  }

  return { count: 10, paths: args };
}
476
+
477
/**
 * Returns the first `count` lines of `content`, each terminated by a newline.
 *
 * Blank lines are kept, so the first two lines of `"a\n\nb"` are `"a\n\n"`.
 */
function firstLines(content: string, count: number): string {
  return logicalLines(content)
    .slice(0, count)
    .map(line => `${line}\n`)
    .join("");
}

/**
 * Returns the last `count` lines of `content`, each terminated by a newline.
 *
 * Blank lines are kept, so the last line of `"a\n\n"` is `"\n"`.
 */
function lastLines(content: string, count: number): string {
  const lines = logicalLines(content);

  return lines
    .slice(Math.max(0, lines.length - count))
    .map(line => `${line}\n`)
    .join("");
}

/** Appends a newline to non-empty content that does not already end with one. */
function ensureTrailingNewline(content: string): string {
  return content.length > 0 && !content.endsWith("\n") ? `${content}\n` : content;
}

/**
 * Splits content into its lines without newline characters. Empty content has no
 * lines, and a final line is counted whether or not it ends with a newline.
 */
function logicalLines(content: string): string[] {
  // Once every line is newline-terminated, the split yields exactly one trailing
  // empty element, which is dropped.
  return ensureTrailingNewline(content).split("\n").slice(0, -1);
}
491
+
492
/** Joins lines into newline-terminated output; an empty list produces an empty string. */
function formatLines(lines: string[]): string {
  if (lines.length === 0) {
    return "";
  }

  return lines.map(line => `${line}\n`).join("");
}
495
+
496
/** Expresses an absolute path relative to the stable worktree, using `/` as separator. */
function toVfsPath(filePath: string, stableWorktreePath: string): string {
  const relativePath = path.relative(stableWorktreePath, filePath);
  const segments = relativePath.split(path.sep);

  return segments.join("/");
}
@@ -0,0 +1,144 @@
1
/** Everything `htmlToRawMarkdown` needs to render a fetched page. */
export type HtmlToRawMarkdownInput = {
  /** Raw HTML of the page. */
  html: string;
  /** URL printed on the "Source web page" line. */
  requestedUrl: string;
  /** URL printed on the "Final URL" line; also the heading fallback when no title is found. */
  finalUrl: string;
  /** Value printed on the "Fetched at" line. */
  fetchedAt: string;
  /** Value printed on the "Fetcher" line. */
  fetcherName: string;
};

/** Output of `htmlToRawMarkdown`. */
export type HtmlToRawMarkdownResult = {
  /** Text of `<title>`, else of the first `<h1>`; absent when neither yields text. */
  title?: string;
  /** Rendered Markdown document. */
  markdown: string;
};

/** HTML tags that are rendered as separate Markdown blocks. */
type BlockTag = "h1" | "h2" | "h3" | "p" | "li";

/** One rendered block plus the flag that decides how it is separated from its neighbour. */
type MarkdownBlock = {
  text: string;
  isListItem: boolean;
};
20
+
21
/**
 * Renders a fetched HTML page as a Markdown document.
 *
 * The document starts with a level-1 heading (page title, else first `<h1>`, else the
 * final URL), followed by a quoted provenance block and the converted body.
 *
 * @returns The detected title, if any, and the normalised Markdown.
 */
export function htmlToRawMarkdown(input: HtmlToRawMarkdownInput): HtmlToRawMarkdownResult {
  const cleanedHtml = removeIgnoredElements(input.html);
  const pageTitle = extractFirstTagText(cleanedHtml, "title") ?? extractFirstTagText(cleanedHtml, "h1");
  const headerText = pageTitle ?? input.finalUrl;

  const documentLines = [
    `# ${headerText}`,
    "",
    `> Source web page: ${input.requestedUrl}`,
    `> Final URL: ${input.finalUrl}`,
    `> Fetched at: ${input.fetchedAt}`,
    `> Fetcher: ${input.fetcherName}`,
    "",
    renderBodyMarkdown(cleanedHtml, headerText)
  ];

  return {
    title: pageTitle,
    markdown: normalizeMarkdown(documentLines.join("\n"))
  };
}
43
+
44
/**
 * Removes markup whose content must never reach the Markdown output:
 * `<script>`, `<style>` and `<noscript>` elements, and HTML comments.
 *
 * Comments are removed here because the generic tag stripping used later stops at the
 * first `>`, so a comment such as `<!-- a > b -->` would leak ` b -->` into the text.
 */
function removeIgnoredElements(html: string): string {
  return html
    .replace(/<(script|style|noscript)\b[^>]*>[\s\S]*?<\/\1>/gi, "")
    .replace(/<!--[\s\S]*?-->/g, "");
}
47
+
48
/**
 * Returns the inline text of the first `<tag>...</tag>` element, or `undefined` when
 * the element is missing or contains no text.
 */
function extractFirstTagText(html: string, tag: "title" | "h1"): string | undefined {
  const pattern = new RegExp(`<${tag}\\b[^>]*>([\\s\\S]*?)<\\/${tag}>`, "i");
  const inner = pattern.exec(html)?.[1];

  if (inner === undefined) {
    return undefined;
  }

  const text = inlineText(inner);

  if (text.length === 0) {
    return undefined;
  }

  return text;
}
57
+
58
/**
 * Converts the page body into Markdown blocks.
 *
 * `<h1>`-`<h3>`, `<p>` and `<li>` elements become headings, paragraphs and list items.
 * Text between those elements becomes plain paragraphs. An `<h1>` whose text equals
 * `headerText` is skipped.
 */
function renderBodyMarkdown(html: string, headerText: string): string {
  // Document-level markup that never belongs in the body output.
  const removals = [/<!doctype\b[^>]*>/gi, /<head\b[^>]*>[\s\S]*?<\/head>/gi, /<title\b[^>]*>[\s\S]*?<\/title>/gi];
  const bodyHtml = removals.reduce((remaining, pattern) => remaining.replace(pattern, ""), html);

  const prefixByTag: Record<BlockTag, string> = { h1: "# ", h2: "## ", h3: "### ", p: "", li: "- " };
  const blockPattern = /<(h1|h2|h3|p|li)\b[^>]*>([\s\S]*?)<\/\1>/gi;
  const blocks: MarkdownBlock[] = [];
  let cursor = 0;
  let match: RegExpExecArray | null;

  while ((match = blockPattern.exec(bodyHtml)) !== null) {
    // Anything between the previous block and this one is loose text.
    addPlainTextBlock(blocks, bodyHtml.slice(cursor, match.index));
    cursor = match.index + match[0].length;

    const tag = match[1].toLowerCase() as BlockTag;
    const text = inlineText(match[2]);
    const repeatsHeader = tag === "h1" && text === headerText;

    if (text.length > 0 && !repeatsHeader) {
      blocks.push({ text: `${prefixByTag[tag]}${text}`, isListItem: tag === "li" });
    }
  }

  addPlainTextBlock(blocks, bodyHtml.slice(cursor));

  return joinMarkdownBlocks(blocks);
}
95
+
96
/** Appends the inline text of `html` as a plain paragraph block, unless it is empty. */
function addPlainTextBlock(blocks: MarkdownBlock[], html: string): void {
  const text = inlineText(html);

  if (text.length === 0) {
    return;
  }

  blocks.push({ text, isListItem: false });
}
102
+
103
/**
 * Concatenates blocks into Markdown. Consecutive list items are separated by a single
 * newline, every other pair of blocks by a blank line.
 */
function joinMarkdownBlocks(blocks: MarkdownBlock[]): string {
  let markdown = "";

  blocks.forEach((block, position) => {
    if (position === 0) {
      markdown = block.text;
      return;
    }

    const continuesList = blocks[position - 1].isListItem && block.isListItem;
    markdown += `${continuesList ? "\n" : "\n\n"}${block.text}`;
  });

  return markdown;
}
114
+
115
/**
 * Reduces an HTML fragment to a single line of text, rendering `<a href>` elements as
 * Markdown links.
 *
 * Link and non-link segments are converted separately, and each piece of text is
 * tag-stripped and entity-decoded exactly once. Decoding a link first and then
 * re-processing the combined string would delete escaped markup such as `&lt;div&gt;`
 * from link text and decode entities twice.
 *
 * @returns The text with whitespace runs collapsed to single spaces and trimmed.
 */
function inlineText(html: string): string {
  const linkPattern = /<a\b([^>]*)>([\s\S]*?)<\/a>/gi;
  const toText = (fragment: string): string => decodeHtmlEntities(fragment.replace(/<[^>]+>/g, " "));
  const collapse = (text: string): string => text.replace(/\s+/g, " ").trim();

  let rendered = "";
  let cursor = 0;

  for (const match of html.matchAll(linkPattern)) {
    const start = match.index ?? 0;
    rendered += toText(html.slice(cursor, start));

    const label = collapse(toText(match[2]));
    const href = extractHref(match[1]);
    rendered += href ? `[${label}](${decodeHtmlEntities(href)})` : label;

    cursor = start + match[0].length;
  }

  rendered += toText(html.slice(cursor));

  return collapse(rendered);
}
126
+
127
/**
 * Extracts the value of the `href` attribute from the attribute text of a tag.
 *
 * The attribute name must start the string or follow whitespace or a quote, so
 * attributes that merely end in `href` (for example `data-href`) are not mistaken for it.
 *
 * @returns The double-quoted, single-quoted or unquoted value, or `undefined` when
 * there is no `href` attribute.
 */
function extractHref(attrs: string): string | undefined {
  const match = /(?:^|[\s"'])href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i.exec(attrs);
  return match?.[1] ?? match?.[2] ?? match?.[3];
}
131
+
132
/**
 * Decodes the character references `&nbsp;`, `&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;` and numeric references (`&#39;`, `&#x27;`) in a single pass.
 *
 * One pass matters: decoding `&amp;` before the other names would turn the escaped
 * text `&amp;lt;` into `<` instead of `&lt;`. Unknown or invalid references are left
 * untouched. `&nbsp;` becomes a regular space.
 */
function decodeHtmlEntities(text: string): string {
  const named = new Map<string, string>([
    ["nbsp", " "],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", "\""],
    ["apos", "'"]
  ]);

  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (entity: string, body: string) => {
    if (!body.startsWith("#")) {
      return named.get(body) ?? entity;
    }

    const isHex = body[1] === "x" || body[1] === "X";
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    const isValid = codePoint > 0 && codePoint <= 0x10ffff;

    return isValid ? String.fromCodePoint(codePoint) : entity;
  });
}
141
+
142
/**
 * Tidies generated Markdown: removes trailing spaces and tabs before newlines,
 * collapses three or more consecutive newlines into two, trims the document and ends
 * it with exactly one newline.
 */
function normalizeMarkdown(markdown: string): string {
  const withoutTrailingBlanks = markdown.replace(/[ \t]+\n/g, "\n");
  const withCollapsedGaps = withoutTrailingBlanks.replace(/\n{3,}/g, "\n\n");

  return withCollapsedGaps.trim() + "\n";
}