browzy 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. package/README.md +324 -0
  2. package/dist/cli/app.d.ts +16 -0
  3. package/dist/cli/app.js +615 -0
  4. package/dist/cli/banner.d.ts +1 -0
  5. package/dist/cli/banner.js +60 -0
  6. package/dist/cli/commands/compile.d.ts +2 -0
  7. package/dist/cli/commands/compile.js +42 -0
  8. package/dist/cli/commands/ingest.d.ts +2 -0
  9. package/dist/cli/commands/ingest.js +32 -0
  10. package/dist/cli/commands/init.d.ts +2 -0
  11. package/dist/cli/commands/init.js +48 -0
  12. package/dist/cli/commands/lint.d.ts +2 -0
  13. package/dist/cli/commands/lint.js +40 -0
  14. package/dist/cli/commands/query.d.ts +2 -0
  15. package/dist/cli/commands/query.js +36 -0
  16. package/dist/cli/commands/search.d.ts +2 -0
  17. package/dist/cli/commands/search.js +34 -0
  18. package/dist/cli/commands/status.d.ts +2 -0
  19. package/dist/cli/commands/status.js +27 -0
  20. package/dist/cli/components/Banner.d.ts +13 -0
  21. package/dist/cli/components/Banner.js +20 -0
  22. package/dist/cli/components/Markdown.d.ts +14 -0
  23. package/dist/cli/components/Markdown.js +324 -0
  24. package/dist/cli/components/Message.d.ts +14 -0
  25. package/dist/cli/components/Message.js +17 -0
  26. package/dist/cli/components/Spinner.d.ts +7 -0
  27. package/dist/cli/components/Spinner.js +19 -0
  28. package/dist/cli/components/StatusBar.d.ts +14 -0
  29. package/dist/cli/components/StatusBar.js +19 -0
  30. package/dist/cli/components/Suggestions.d.ts +13 -0
  31. package/dist/cli/components/Suggestions.js +14 -0
  32. package/dist/cli/entry.d.ts +2 -0
  33. package/dist/cli/entry.js +61 -0
  34. package/dist/cli/helpers.d.ts +14 -0
  35. package/dist/cli/helpers.js +32 -0
  36. package/dist/cli/hooks/useAutocomplete.d.ts +11 -0
  37. package/dist/cli/hooks/useAutocomplete.js +71 -0
  38. package/dist/cli/hooks/useHistory.d.ts +13 -0
  39. package/dist/cli/hooks/useHistory.js +106 -0
  40. package/dist/cli/hooks/useSession.d.ts +16 -0
  41. package/dist/cli/hooks/useSession.js +133 -0
  42. package/dist/cli/index.d.ts +2 -0
  43. package/dist/cli/index.js +41 -0
  44. package/dist/cli/keystore.d.ts +28 -0
  45. package/dist/cli/keystore.js +59 -0
  46. package/dist/cli/onboarding.d.ts +18 -0
  47. package/dist/cli/onboarding.js +306 -0
  48. package/dist/cli/personality.d.ts +34 -0
  49. package/dist/cli/personality.js +196 -0
  50. package/dist/cli/repl.d.ts +20 -0
  51. package/dist/cli/repl.js +338 -0
  52. package/dist/cli/theme.d.ts +25 -0
  53. package/dist/cli/theme.js +64 -0
  54. package/dist/core/compile/compiler.d.ts +25 -0
  55. package/dist/core/compile/compiler.js +229 -0
  56. package/dist/core/compile/index.d.ts +2 -0
  57. package/dist/core/compile/index.js +1 -0
  58. package/dist/core/config.d.ts +10 -0
  59. package/dist/core/config.js +92 -0
  60. package/dist/core/index.d.ts +12 -0
  61. package/dist/core/index.js +11 -0
  62. package/dist/core/ingest/image.d.ts +3 -0
  63. package/dist/core/ingest/image.js +61 -0
  64. package/dist/core/ingest/index.d.ts +18 -0
  65. package/dist/core/ingest/index.js +79 -0
  66. package/dist/core/ingest/pdf.d.ts +2 -0
  67. package/dist/core/ingest/pdf.js +36 -0
  68. package/dist/core/ingest/text.d.ts +2 -0
  69. package/dist/core/ingest/text.js +38 -0
  70. package/dist/core/ingest/web.d.ts +2 -0
  71. package/dist/core/ingest/web.js +202 -0
  72. package/dist/core/lint/index.d.ts +1 -0
  73. package/dist/core/lint/index.js +1 -0
  74. package/dist/core/lint/linter.d.ts +27 -0
  75. package/dist/core/lint/linter.js +147 -0
  76. package/dist/core/llm/index.d.ts +2 -0
  77. package/dist/core/llm/index.js +1 -0
  78. package/dist/core/llm/provider.d.ts +15 -0
  79. package/dist/core/llm/provider.js +241 -0
  80. package/dist/core/prompts.d.ts +28 -0
  81. package/dist/core/prompts.js +374 -0
  82. package/dist/core/query/engine.d.ts +29 -0
  83. package/dist/core/query/engine.js +131 -0
  84. package/dist/core/query/index.d.ts +2 -0
  85. package/dist/core/query/index.js +1 -0
  86. package/dist/core/sanitization.d.ts +11 -0
  87. package/dist/core/sanitization.js +50 -0
  88. package/dist/core/storage/filesystem.d.ts +23 -0
  89. package/dist/core/storage/filesystem.js +106 -0
  90. package/dist/core/storage/index.d.ts +2 -0
  91. package/dist/core/storage/index.js +2 -0
  92. package/dist/core/storage/sqlite.d.ts +30 -0
  93. package/dist/core/storage/sqlite.js +104 -0
  94. package/dist/core/types.d.ts +95 -0
  95. package/dist/core/types.js +4 -0
  96. package/dist/core/utils.d.ts +8 -0
  97. package/dist/core/utils.js +94 -0
  98. package/dist/core/wiki/index.d.ts +1 -0
  99. package/dist/core/wiki/index.js +1 -0
  100. package/dist/core/wiki/wiki.d.ts +19 -0
  101. package/dist/core/wiki/wiki.js +37 -0
  102. package/dist/index.d.ts +2 -0
  103. package/dist/index.js +3 -0
  104. package/package.json +54 -0
@@ -0,0 +1 @@
1
+ export { WikiCompiler } from './compiler.js';
@@ -0,0 +1,10 @@
1
+ import type { BrowzyConfig } from './types.js';
2
+ /**
3
+ * Load config from browzy.config.json in the current directory,
4
+ * or from ~/.browzy/config.json, or use defaults.
5
+ */
6
+ export declare function loadConfig(configPath?: string): BrowzyConfig;
7
+ /**
8
+ * Ensure the data directory structure exists.
9
+ */
10
+ export declare function ensureDataDirs(config: BrowzyConfig): void;
@@ -0,0 +1,92 @@
1
+ import { readFileSync, existsSync, mkdirSync } from 'fs';
2
+ import { join } from 'path';
3
+ import { homedir } from 'os';
const DEFAULT_DATA_DIR = join(homedir(), '.browzy', 'default');
// Template for a fresh configuration. Never hand this object out directly:
// env-var overrides are applied to a deep copy so shared state stays pristine.
const DEFAULT_CONFIG = {
    dataDir: DEFAULT_DATA_DIR,
    llm: {
        provider: 'claude',
        apiKey: '',
        model: 'claude-sonnet-4-20250514',
    },
    compile: {
        batchSize: 20,
        extractConcepts: true,
    },
};
/** Deep-copy the defaults so later mutation never leaks into DEFAULT_CONFIG. */
function cloneDefaults() {
    return {
        ...DEFAULT_CONFIG,
        llm: { ...DEFAULT_CONFIG.llm },
        compile: { ...DEFAULT_CONFIG.compile },
    };
}
/**
 * Load config from browzy.config.json in the current directory,
 * or from ~/.browzy/config.json, or use defaults.
 *
 * Environment variables are applied on top of whatever was loaded:
 * the API key matching the configured provider (ANTHROPIC_API_KEY /
 * OPENAI_API_KEY / OPENROUTER_API_KEY) and BROWZY_DATA_DIR.
 *
 * @param configPath explicit config file to use instead of the search path
 * @throws Error when a candidate file exists but contains invalid JSON,
 *         a non-object value, or wrongly-typed dataDir/llm fields
 */
export function loadConfig(configPath) {
    const candidates = configPath
        ? [configPath]
        : [
            join(process.cwd(), 'browzy.config.json'),
            join(homedir(), '.browzy', 'config.json'),
        ];
    for (const path of candidates) {
        if (!existsSync(path))
            continue;
        let parsed;
        try {
            parsed = JSON.parse(readFileSync(path, 'utf-8'));
        }
        catch {
            throw new Error(`Invalid JSON in config file: ${path}`);
        }
        if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
            throw new Error(`Invalid config: expected an object in ${path}`);
        }
        const obj = parsed;
        if (obj.dataDir !== undefined && typeof obj.dataDir !== 'string') {
            throw new Error(`Invalid config: dataDir must be a string`);
        }
        if (obj.llm !== undefined && (typeof obj.llm !== 'object' || obj.llm === null)) {
            throw new Error(`Invalid config: llm must be an object`);
        }
        return mergeConfig(DEFAULT_CONFIG, obj);
    }
    // No config file found: apply env overrides to a *copy* of the defaults.
    // (The previous revision passed DEFAULT_CONFIG itself, so env-derived
    // API keys were written into the shared default object and survived
    // into later loadConfig calls even after the env var was unset.)
    return applyEnvOverrides(cloneDefaults());
}
/** Shallow-merge overrides onto defaults; llm/compile are merged one level deep. */
function mergeConfig(defaults, overrides) {
    const merged = {
        ...defaults,
        ...overrides,
        llm: { ...defaults.llm, ...overrides.llm },
        compile: { ...defaults.compile, ...overrides.compile },
    };
    return applyEnvOverrides(merged);
}
/**
 * Apply environment-variable overrides in place and return the config.
 * Only the API key matching the configured provider is consulted.
 */
function applyEnvOverrides(config) {
    const envKeyByProvider = {
        claude: process.env.ANTHROPIC_API_KEY,
        openai: process.env.OPENAI_API_KEY,
        openrouter: process.env.OPENROUTER_API_KEY,
    };
    const envKey = envKeyByProvider[config.llm.provider];
    if (envKey) {
        config.llm.apiKey = envKey;
    }
    if (process.env.BROWZY_DATA_DIR) {
        config.dataDir = process.env.BROWZY_DATA_DIR;
    }
    return config;
}
/**
 * Ensure the data directory structure exists.
 * Creates the data root plus raw/, raw/images/, wiki/, output/ and
 * .browzy/ subdirectories; existing directories are left untouched.
 */
export function ensureDataDirs(config) {
    const root = config.dataDir;
    const subdirs = ['raw', join('raw', 'images'), 'wiki', 'output', '.browzy'];
    mkdirSync(root, { recursive: true });
    for (const sub of subdirs) {
        mkdirSync(join(root, sub), { recursive: true });
    }
}
@@ -0,0 +1,12 @@
1
+ export { loadConfig, ensureDataDirs } from './config.js';
2
+ export { createProvider } from './llm/index.js';
3
+ export { FilesystemStorage } from './storage/filesystem.js';
4
+ export { SQLiteStorage } from './storage/sqlite.js';
5
+ export { ingest, detectSourceType } from './ingest/index.js';
6
+ export { WikiCompiler } from './compile/index.js';
7
+ export { QueryEngine } from './query/index.js';
8
+ export { WikiLinter } from './lint/index.js';
9
+ export { Wiki } from './wiki/index.js';
10
+ export { sanitizeUnicode, sanitizeDeep } from './sanitization.js';
11
+ export { slugify, safePath, safeParseJSON, checkFileSize, fetchWithTimeout, clampInt } from './utils.js';
12
+ export type * from './types.js';
@@ -0,0 +1,11 @@
1
+ export { loadConfig, ensureDataDirs } from './config.js';
2
+ export { createProvider } from './llm/index.js';
3
+ export { FilesystemStorage } from './storage/filesystem.js';
4
+ export { SQLiteStorage } from './storage/sqlite.js';
5
+ export { ingest, detectSourceType } from './ingest/index.js';
6
+ export { WikiCompiler } from './compile/index.js';
7
+ export { QueryEngine } from './query/index.js';
8
+ export { WikiLinter } from './lint/index.js';
9
+ export { Wiki } from './wiki/index.js';
10
+ export { sanitizeUnicode, sanitizeDeep } from './sanitization.js';
11
+ export { slugify, safePath, safeParseJSON, checkFileSize, fetchWithTimeout, clampInt } from './utils.js';
@@ -0,0 +1,3 @@
1
+ import type { RawSource } from '../types.js';
2
+ import type { LLMProvider } from '../llm/provider.js';
3
+ export declare function ingestImage(filePath: string, dataDir: string, llm?: LLMProvider): Promise<RawSource>;
@@ -0,0 +1,61 @@
1
+ import { readFileSync, writeFileSync, existsSync, copyFileSync } from 'fs';
2
+ import { join, basename, extname } from 'path';
3
+ import { createHash } from 'crypto';
4
+ import { lookup } from 'mime-types';
5
+ import { IMAGE_DESCRIPTION_PROMPT } from '../prompts.js';
6
+ import { slugify, checkFileSize } from '../utils.js';
7
+ import { sanitizeUnicode } from '../sanitization.js';
const MAX_IMAGE_SIZE = 50 * 1024 * 1024; // 50 MB
/**
 * Ingest a local image file: copy it into <dataDir>/raw/images, write a
 * markdown stub in <dataDir>/raw embedding it, and (when an LLM provider
 * is supplied) append a generated description.
 *
 * @param filePath path to the image on disk
 * @param dataDir  knowledge-base root (raw/images must exist — see ensureDataDirs)
 * @param llm      optional provider used to describe the image
 * @returns the RawSource record for the ingested image
 * @throws Error when the file is missing or exceeds MAX_IMAGE_SIZE
 */
export async function ingestImage(filePath, dataDir, llm) {
    if (!existsSync(filePath)) {
        throw new Error(`File not found: ${filePath}`);
    }
    checkFileSize(filePath, MAX_IMAGE_SIZE);
    const ext = extname(filePath).toLowerCase();
    const title = sanitizeUnicode(basename(filePath, ext));
    // Stable id derived from the path, so re-ingesting overwrites in place.
    const id = createHash('sha256').update(filePath).digest('hex').slice(0, 12);
    // Copy image to images directory
    const imgFilename = `${id}${ext}`;
    const imgDest = join(dataDir, 'raw', 'images', imgFilename);
    copyFileSync(filePath, imgDest);
    // Generate description via LLM if available
    let description = `![${title}](images/${imgFilename})`;
    if (llm) {
        // NOTE(review): the previous revision read the file to base64 and
        // looked up its MIME type but never passed either to the provider —
        // the model only ever received the text prompt below. The dead reads
        // were removed; actually sending image content through llm.chat()
        // still needs provider-specific support. TODO confirm intent.
        try {
            const response = await llm.chat([
                {
                    role: 'user',
                    content: `Describe this image in detail. The image is: ${title}`,
                },
            ], { system: IMAGE_DESCRIPTION_PROMPT });
            description = `![${title}](images/${imgFilename})\n\n## Description\n\n${sanitizeUnicode(response.content)}`;
        }
        catch {
            description = `![${title}](images/${imgFilename})\n\n*Image description pending — LLM unavailable during ingest.*`;
        }
    }
    const filename = `${slugify(title)}-${id}.md`;
    const frontmatter = [
        '---',
        `title: ${JSON.stringify(title)}`,
        `source: ${JSON.stringify(filePath)}`,
        `type: image`,
        `image: ${JSON.stringify('images/' + imgFilename)}`,
        `ingested: ${JSON.stringify(new Date().toISOString())}`,
        '---',
        '',
    ].join('\n');
    const outputPath = join(dataDir, 'raw', filename);
    writeFileSync(outputPath, frontmatter + description, 'utf-8');
    return {
        id,
        type: 'image',
        title,
        origin: filePath,
        path: outputPath,
        images: [imgDest],
        ingestedAt: new Date().toISOString(),
    };
}
@@ -0,0 +1,18 @@
1
+ import type { RawSource, SourceType } from '../types.js';
2
+ import type { LLMProvider } from '../llm/provider.js';
3
+ import { ingestWeb } from './web.js';
4
+ import { ingestPdf } from './pdf.js';
5
+ import { ingestText } from './text.js';
6
+ import { ingestImage } from './image.js';
7
+ /**
8
+ * Detect source type from input string (URL or file path).
9
+ */
10
+ export declare function detectSourceType(input: string): SourceType;
11
+ /**
12
+ * Ingest a source into the knowledge base.
13
+ */
14
+ export declare function ingest(input: string, dataDir: string, options?: {
15
+ llm?: LLMProvider;
16
+ type?: SourceType;
17
+ }): Promise<RawSource>;
18
+ export { ingestWeb, ingestPdf, ingestText, ingestImage };
@@ -0,0 +1,79 @@
1
+ import { extname } from 'path';
2
+ import { FilesystemStorage } from '../storage/filesystem.js';
3
+ import { SQLiteStorage } from '../storage/sqlite.js';
4
+ import { ingestWeb } from './web.js';
5
+ import { ingestPdf } from './pdf.js';
6
+ import { ingestText } from './text.js';
7
+ import { ingestImage } from './image.js';
const IMAGE_EXTS = new Set(['.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.bmp']);
/**
 * Detect source type from input string (URL or file path).
 * URL schemes are matched case-insensitively (HTTP:// is a valid scheme
 * spelling that the previous startsWith() checks misclassified as text).
 * File extensions are likewise compared case-insensitively.
 */
export function detectSourceType(input) {
    if (/^https?:\/\//i.test(input)) {
        return 'web';
    }
    const ext = extname(input).toLowerCase();
    if (ext === '.pdf')
        return 'pdf';
    if (IMAGE_EXTS.has(ext))
        return 'image';
    if (ext === '.md')
        return 'markdown';
    return 'text';
}
/**
 * Ingest a source into the knowledge base: dispatch to the type-specific
 * ingester, record the result in the raw manifest (replacing any entry
 * with the same origin), then index it in SQLite.
 */
export async function ingest(input, dataDir, options) {
    const type = options?.type ?? detectSourceType(input);
    // Dispatch table instead of a switch; unknown types still throw.
    const ingesters = {
        web: () => ingestWeb(input, dataDir),
        pdf: () => ingestPdf(input, dataDir),
        image: () => ingestImage(input, dataDir, options?.llm),
        markdown: () => ingestText(input, dataDir),
        text: () => ingestText(input, dataDir),
    };
    const run = ingesters[type];
    if (!run) {
        throw new Error(`Unsupported source type: ${type}`);
    }
    const source = await run();
    // Update manifest: replace an entry sharing the origin, else append.
    const fs = new FilesystemStorage(dataDir);
    const manifest = fs.getRawManifest();
    const idx = manifest.findIndex(entry => entry.origin === source.origin);
    if (idx === -1) {
        manifest.push(source);
    }
    else {
        manifest[idx] = source;
    }
    fs.writeRawManifest(manifest);
    // Index in SQLite; always release the handle even if the upsert throws.
    const db = new SQLiteStorage(dataDir);
    try {
        db.upsertSource({
            id: source.id,
            type: source.type,
            title: source.title,
            origin: source.origin,
            path: source.path,
            summary: source.summary,
            tags: source.tags,
            ingestedAt: source.ingestedAt,
        });
    }
    finally {
        db.close();
    }
    return source;
}
79
+ export { ingestWeb, ingestPdf, ingestText, ingestImage };
@@ -0,0 +1,2 @@
1
+ import type { RawSource } from '../types.js';
2
+ export declare function ingestPdf(filePath: string, dataDir: string): Promise<RawSource>;
@@ -0,0 +1,36 @@
import { existsSync, readFileSync, writeFileSync } from 'fs';
import { createHash } from 'crypto';
import { join, basename } from 'path';
import { sanitizeUnicode } from '../sanitization.js';
import { slugify, checkFileSize } from '../utils.js';
const MAX_PDF_SIZE = 50 * 1024 * 1024; // 50 MB
/**
 * Ingest a local PDF: extract its text with pdf-parse and write a
 * frontmattered markdown copy into <dataDir>/raw.
 *
 * @param filePath path to the PDF on disk
 * @param dataDir  knowledge-base root
 * @returns the RawSource record for the ingested document
 * @throws Error when the file is missing or exceeds MAX_PDF_SIZE
 */
export async function ingestPdf(filePath, dataDir) {
    // Raise the same friendly error the text/image ingesters use, instead
    // of letting the size check / readFileSync surface a raw ENOENT.
    if (!existsSync(filePath)) {
        throw new Error(`File not found: ${filePath}`);
    }
    checkFileSize(filePath, MAX_PDF_SIZE);
    // Lazy import: the pdf-parse dependency is only loaded when needed.
    const pdfParse = (await import('pdf-parse')).default;
    const buffer = readFileSync(filePath);
    const data = await pdfParse(buffer);
    // Prefer the embedded document title; fall back to the file name.
    const title = sanitizeUnicode(data.info?.Title || basename(filePath, '.pdf'));
    // Stable id derived from the path, so re-ingesting overwrites in place.
    const id = createHash('sha256').update(filePath).digest('hex').slice(0, 12);
    const filename = `${slugify(title)}-${id}.md`;
    const frontmatter = [
        '---',
        `title: ${JSON.stringify(title)}`,
        `source: ${JSON.stringify(filePath)}`,
        `type: pdf`,
        `pages: ${data.numpages}`,
        `ingested: ${JSON.stringify(new Date().toISOString())}`,
        '---',
        '',
    ].join('\n');
    const outputPath = join(dataDir, 'raw', filename);
    writeFileSync(outputPath, frontmatter + sanitizeUnicode(data.text), 'utf-8');
    return {
        id,
        type: 'pdf',
        title,
        origin: filePath,
        path: outputPath,
        images: [],
        ingestedAt: new Date().toISOString(),
    };
}
@@ -0,0 +1,2 @@
1
+ import type { RawSource } from '../types.js';
2
+ export declare function ingestText(filePath: string, dataDir: string): Promise<RawSource>;
@@ -0,0 +1,38 @@
1
+ import { readFileSync, writeFileSync, existsSync } from 'fs';
2
+ import { join, basename, extname } from 'path';
3
+ import { createHash } from 'crypto';
4
+ import { slugify, checkFileSize } from '../utils.js';
5
+ import { sanitizeUnicode } from '../sanitization.js';
const MAX_TEXT_SIZE = 50 * 1024 * 1024; // 50 MB
/**
 * Ingest a local text or markdown file: sanitize its contents and write a
 * frontmattered copy into <dataDir>/raw.
 */
export async function ingestText(filePath, dataDir) {
    if (!existsSync(filePath)) {
        throw new Error(`File not found: ${filePath}`);
    }
    checkFileSize(filePath, MAX_TEXT_SIZE);
    const body = sanitizeUnicode(readFileSync(filePath, 'utf-8'));
    const ext = extname(filePath).toLowerCase();
    // Markdown keeps its own type; everything else is plain text.
    const type = ext === '.md' ? 'markdown' : 'text';
    const title = sanitizeUnicode(basename(filePath, ext));
    // Stable id derived from the path, so re-ingesting overwrites in place.
    const id = createHash('sha256').update(filePath).digest('hex').slice(0, 12);
    const headerLines = [
        '---',
        `title: ${JSON.stringify(title)}`,
        `source: ${JSON.stringify(filePath)}`,
        `type: ${type}`,
        `ingested: ${JSON.stringify(new Date().toISOString())}`,
        '---',
        '',
    ];
    const outputPath = join(dataDir, 'raw', `${slugify(title)}-${id}.md`);
    writeFileSync(outputPath, headerLines.join('\n') + body, 'utf-8');
    return {
        id,
        type,
        title,
        origin: filePath,
        path: outputPath,
        images: [],
        ingestedAt: new Date().toISOString(),
    };
}
@@ -0,0 +1,2 @@
1
+ import type { RawSource } from '../types.js';
2
+ export declare function ingestWeb(url: string, dataDir: string): Promise<RawSource>;
@@ -0,0 +1,202 @@
1
+ import TurndownService from 'turndown';
2
+ import { createHash } from 'node:crypto';
3
+ import { writeFileSync } from 'fs';
4
+ import { join, extname as pathExtname } from 'path';
5
+ import { slugify, fetchWithTimeout } from '../utils.js';
6
+ import { sanitizeUnicode } from '../sanitization.js';
// Hard limits applied during web ingestion.
const MAX_IMAGES = 50;                    // max images downloaded per page
const MAX_IMAGE_BYTES = 10 * 1024 * 1024; // 10 MB per image
const MAX_HTML_BYTES = 10 * 1024 * 1024;  // 10 MB for page HTML
const MAX_REDIRECTS = 5;                  // redirect-chain depth cap
// Request headers sent with every page fetch.
const HEADERS = {
    'User-Agent': 'browzy/0.1.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
};
// ── SSRF Protection ──────────────────────────────────────────────
/**
 * Return true when a URL must not be fetched: unparseable input,
 * loopback, link-local, RFC1918/unique-local private ranges, obfuscated
 * IP notations, or well-known internal hostnames. Conservative by
 * design — anything suspicious is treated as private.
 */
function isPrivateUrl(urlStr) {
    let parsed;
    try {
        parsed = new URL(urlStr);
    }
    catch {
        return true; // block anything we cannot parse
    }
    const host = parsed.hostname.toLowerCase();
    // Well-known internal hostnames.
    const internalName = host === 'localhost'
        || host === '0.0.0.0'
        || host.endsWith('.local')
        || host.endsWith('.internal');
    if (internalName) {
        return true;
    }
    // Strip the brackets an IPv6 literal carries inside a URL.
    const bare = host.replace(/^\[|\]$/g, '');
    // Octal/hex IP notation (bypass attempts like 0x7f000001, 0177.0.0.1).
    if (/^[0-9a-fx.:]+$/i.test(bare) && (/0x/i.test(bare) || /^0\d/.test(bare))) {
        return true;
    }
    // Dotted-quad IPv4: loopback, private, link-local and zero ranges.
    const quad = bare.split('.').map(Number);
    if (quad.length === 4 && quad.every(part => !isNaN(part))) {
        const [a, b] = quad;
        if (a === 127)
            return true; // 127.0.0.0/8
        if (a === 10)
            return true; // 10.0.0.0/8
        if (a === 172 && b >= 16 && b <= 31)
            return true; // 172.16.0.0/12
        if (a === 192 && b === 168)
            return true; // 192.168.0.0/16
        if (a === 169 && b === 254)
            return true; // 169.254.0.0/16 (link-local / cloud metadata)
        if (a === 0)
            return true; // 0.0.0.0/8
    }
    // Bare decimal IP (e.g. 2130706433 === 127.0.0.1).
    if (/^\d+$/.test(bare) && parseInt(bare, 10) > 0) {
        return true;
    }
    // IPv6: loopback, unique-local, link-local, IPv4-mapped.
    if (bare === '::1')
        return true; // loopback
    if (bare.startsWith('fc') || bare.startsWith('fd'))
        return true; // fc00::/7 unique local
    if (bare.startsWith('fe80'))
        return true; // fe80::/10 link-local
    if (bare.startsWith('::ffff:'))
        return true; // IPv4-mapped IPv6
    return false;
}
// ── Redirect Validation ──────────────────────────────────────────
/**
 * Decide whether a redirect may be followed: the target must be public,
 * carry no embedded credentials, and stay on the same host (a leading
 * "www." may be added or dropped). Any parse failure rejects the hop.
 */
function isPermittedRedirect(originalUrl, redirectUrl) {
    try {
        const from = new URL(originalUrl);
        const to = new URL(redirectUrl);
        // Never follow into private address space.
        if (isPrivateUrl(redirectUrl)) {
            return false;
        }
        // Reject credentials smuggled into the redirect target.
        if (to.username || to.password) {
            return false;
        }
        const canonical = (hostname) => hostname.replace(/^www\./, '');
        return canonical(from.hostname) === canonical(to.hostname);
    }
    catch {
        return false;
    }
}
/**
 * Fetch a URL, manually vetting each redirect hop with
 * isPermittedRedirect and capping the chain at MAX_REDIRECTS.
 *
 * NOTE(review): this loop only observes 3xx statuses if fetchWithTimeout
 * does NOT auto-follow redirects (i.e. it must use redirect: 'manual');
 * confirm against the implementation in ../utils.js.
 */
async function fetchFollowingSafeRedirects(url, options = {}, depth = 0) {
    if (depth > MAX_REDIRECTS) {
        throw new Error(`Too many redirects (exceeded ${MAX_REDIRECTS})`);
    }
    const response = await fetchWithTimeout(url, { ...options, timeoutMs: 30_000 });
    const isRedirect = [301, 302, 307, 308].includes(response.status);
    if (!isRedirect) {
        return response;
    }
    const location = response.headers.get('location');
    if (!location) {
        throw new Error('Redirect missing Location header');
    }
    // Resolve relative Location values against the current URL.
    const target = new URL(location, url).href;
    if (!isPermittedRedirect(url, target)) {
        throw new Error(`Blocked redirect from ${new URL(url).hostname} to ${new URL(target).hostname}`);
    }
    return fetchFollowingSafeRedirects(target, options, depth + 1);
}
// ── Image Extension Extraction ───────────────────────────────────
/**
 * Derive a safe file extension for a downloaded image from its URL path.
 * Anything outside the allow-list (or an unparseable URL) becomes "png".
 */
function safeImageExt(imgUrl) {
    const allowed = new Set(['png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'bmp', 'ico']);
    try {
        const { pathname } = new URL(imgUrl);
        const candidate = pathExtname(pathname).toLowerCase().replace(/^\./, '');
        return allowed.has(candidate) ? candidate : 'png';
    }
    catch {
        return 'png';
    }
}
// ── Main ─────────────────────────────────────────────────────────
const turndown = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
});
/**
 * Download up to MAX_IMAGES <img> sources referenced by the page,
 * skipping private, failing, or oversized targets. Downloads run
 * sequentially; any individual failure is silently skipped.
 * Returns the local paths of the saved files.
 */
async function downloadPageImages(html, pageUrl, dataDir) {
    const saved = [];
    const imgRegex = /<img[^>]+src="([^"]+)"/g;
    let match;
    while ((match = imgRegex.exec(html)) !== null && saved.length < MAX_IMAGES) {
        try {
            const imgUrl = new URL(match[1], pageUrl).href;
            if (isPrivateUrl(imgUrl))
                continue;
            const res = await fetchWithTimeout(imgUrl, {
                headers: { 'User-Agent': 'browzy/0.1.0' },
                timeoutMs: 15_000,
            });
            if (!res.ok)
                continue;
            // Size guard on the declared length, then on the actual bytes.
            const declared = parseInt(res.headers.get('content-length') || '0', 10);
            if (declared > MAX_IMAGE_BYTES)
                continue;
            const buffer = Buffer.from(await res.arrayBuffer());
            if (buffer.length > MAX_IMAGE_BYTES)
                continue;
            const name = `${createHash('sha256').update(imgUrl).digest('hex').slice(0, 12)}.${safeImageExt(imgUrl)}`;
            const dest = join(dataDir, 'raw', 'images', name);
            writeFileSync(dest, buffer);
            saved.push(dest);
        }
        catch {
            // Best-effort: a failed image download never fails the ingest.
        }
    }
    return saved;
}
/**
 * Ingest a web page: fetch it (with SSRF and redirect vetting), convert
 * the HTML to markdown, download referenced images, and write a
 * frontmattered markdown file into <dataDir>/raw.
 *
 * @throws Error on private URLs, over-long URLs, failed fetches,
 *         unexpected content types, or oversized pages
 */
export async function ingestWeb(url, dataDir) {
    if (url.length > 2000) {
        throw new Error('URL exceeds maximum length of 2000 characters');
    }
    if (isPrivateUrl(url)) {
        throw new Error('Cannot fetch private or internal URLs');
    }
    const response = await fetchFollowingSafeRedirects(url, { headers: HEADERS });
    if (!response.ok) {
        throw new Error(`Fetch failed: ${response.status} ${response.statusText}`);
    }
    // Only accept textual / HTML / XML payloads.
    const contentType = response.headers.get('content-type') || '';
    const looksTextual = contentType.includes('text/')
        || contentType.includes('html')
        || contentType.includes('xml');
    if (!looksTextual) {
        throw new Error(`Unexpected content type: ${contentType}. Expected HTML or text.`);
    }
    // Size guard from the declared length, before reading the body.
    const declaredBytes = parseInt(response.headers.get('content-length') || '0', 10);
    if (declaredBytes > MAX_HTML_BYTES) {
        throw new Error(`Page too large: ${(declaredBytes / 1024 / 1024).toFixed(1)}MB exceeds ${(MAX_HTML_BYTES / 1024 / 1024).toFixed(0)}MB limit`);
    }
    const html = await response.text();
    // Re-check after reading: content-length may be absent or wrong.
    if (Buffer.byteLength(html) > MAX_HTML_BYTES) {
        throw new Error('Page content exceeds size limit');
    }
    const markdown = sanitizeUnicode(turndown.turndown(html));
    // Page title from <title>, falling back to the URL itself.
    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    const title = sanitizeUnicode(titleMatch?.[1]?.trim() || url);
    const images = await downloadPageImages(html, url, dataDir);
    const id = createHash('sha256').update(url).digest('hex').slice(0, 12);
    const filename = `${slugify(title)}-${id}.md`;
    const frontmatter = [
        '---',
        `title: ${JSON.stringify(title)}`,
        `source: ${JSON.stringify(url)}`,
        `type: web`,
        `ingested: ${JSON.stringify(new Date().toISOString())}`,
        '---',
        '',
    ].join('\n');
    const path = join(dataDir, 'raw', filename);
    writeFileSync(path, frontmatter + markdown, 'utf-8');
    return {
        id,
        type: 'web',
        title,
        origin: url,
        path,
        images,
        ingestedAt: new Date().toISOString(),
    };
}
@@ -0,0 +1 @@
1
+ export { WikiLinter } from './linter.js';
@@ -0,0 +1 @@
1
+ export { WikiLinter } from './linter.js';
@@ -0,0 +1,27 @@
1
+ import type { LLMProvider } from '../llm/provider.js';
2
+ import type { LintIssue } from '../types.js';
3
+ export declare class WikiLinter {
4
+ private fs;
5
+ private llm;
6
+ constructor(dataDir: string, llm: LLMProvider);
7
+ /**
8
+ * Run all lint checks on the wiki.
9
+ */
10
+ lint(): Promise<LintIssue[]>;
11
+ /**
12
+ * Check for [[wiki-links]] that point to non-existent articles.
13
+ */
14
+ private checkBrokenLinks;
15
+ /**
16
+ * Find articles with no incoming links (orphans).
17
+ */
18
+ private checkOrphanArticles;
19
+ /**
20
+ * Check for missing frontmatter fields.
21
+ */
22
+ private checkMissingFields;
23
+ /**
24
+ * Use LLM to check for inconsistencies, duplicates, and gaps.
25
+ */
26
+ private checkConsistency;
27
+ }