@gmickel/gno 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +256 -0
  2. package/assets/skill/SKILL.md +112 -0
  3. package/assets/skill/cli-reference.md +327 -0
  4. package/assets/skill/examples.md +234 -0
  5. package/assets/skill/mcp-reference.md +159 -0
  6. package/package.json +90 -0
  7. package/src/app/constants.ts +313 -0
  8. package/src/cli/colors.ts +65 -0
  9. package/src/cli/commands/ask.ts +545 -0
  10. package/src/cli/commands/cleanup.ts +105 -0
  11. package/src/cli/commands/collection/add.ts +120 -0
  12. package/src/cli/commands/collection/index.ts +10 -0
  13. package/src/cli/commands/collection/list.ts +108 -0
  14. package/src/cli/commands/collection/remove.ts +64 -0
  15. package/src/cli/commands/collection/rename.ts +95 -0
  16. package/src/cli/commands/context/add.ts +67 -0
  17. package/src/cli/commands/context/check.ts +153 -0
  18. package/src/cli/commands/context/index.ts +10 -0
  19. package/src/cli/commands/context/list.ts +109 -0
  20. package/src/cli/commands/context/rm.ts +52 -0
  21. package/src/cli/commands/doctor.ts +393 -0
  22. package/src/cli/commands/embed.ts +462 -0
  23. package/src/cli/commands/get.ts +356 -0
  24. package/src/cli/commands/index-cmd.ts +119 -0
  25. package/src/cli/commands/index.ts +102 -0
  26. package/src/cli/commands/init.ts +328 -0
  27. package/src/cli/commands/ls.ts +217 -0
  28. package/src/cli/commands/mcp/config.ts +300 -0
  29. package/src/cli/commands/mcp/index.ts +24 -0
  30. package/src/cli/commands/mcp/install.ts +203 -0
  31. package/src/cli/commands/mcp/paths.ts +470 -0
  32. package/src/cli/commands/mcp/status.ts +222 -0
  33. package/src/cli/commands/mcp/uninstall.ts +158 -0
  34. package/src/cli/commands/mcp.ts +20 -0
  35. package/src/cli/commands/models/clear.ts +103 -0
  36. package/src/cli/commands/models/index.ts +32 -0
  37. package/src/cli/commands/models/list.ts +214 -0
  38. package/src/cli/commands/models/path.ts +51 -0
  39. package/src/cli/commands/models/pull.ts +199 -0
  40. package/src/cli/commands/models/use.ts +85 -0
  41. package/src/cli/commands/multi-get.ts +400 -0
  42. package/src/cli/commands/query.ts +220 -0
  43. package/src/cli/commands/ref-parser.ts +108 -0
  44. package/src/cli/commands/reset.ts +191 -0
  45. package/src/cli/commands/search.ts +136 -0
  46. package/src/cli/commands/shared.ts +156 -0
  47. package/src/cli/commands/skill/index.ts +19 -0
  48. package/src/cli/commands/skill/install.ts +197 -0
  49. package/src/cli/commands/skill/paths-cmd.ts +81 -0
  50. package/src/cli/commands/skill/paths.ts +191 -0
  51. package/src/cli/commands/skill/show.ts +73 -0
  52. package/src/cli/commands/skill/uninstall.ts +141 -0
  53. package/src/cli/commands/status.ts +205 -0
  54. package/src/cli/commands/update.ts +68 -0
  55. package/src/cli/commands/vsearch.ts +188 -0
  56. package/src/cli/context.ts +64 -0
  57. package/src/cli/errors.ts +64 -0
  58. package/src/cli/format/search-results.ts +211 -0
  59. package/src/cli/options.ts +183 -0
  60. package/src/cli/program.ts +1330 -0
  61. package/src/cli/run.ts +213 -0
  62. package/src/cli/ui.ts +92 -0
  63. package/src/config/defaults.ts +20 -0
  64. package/src/config/index.ts +55 -0
  65. package/src/config/loader.ts +161 -0
  66. package/src/config/paths.ts +87 -0
  67. package/src/config/saver.ts +153 -0
  68. package/src/config/types.ts +280 -0
  69. package/src/converters/adapters/markitdownTs/adapter.ts +140 -0
  70. package/src/converters/adapters/officeparser/adapter.ts +126 -0
  71. package/src/converters/canonicalize.ts +89 -0
  72. package/src/converters/errors.ts +218 -0
  73. package/src/converters/index.ts +51 -0
  74. package/src/converters/mime.ts +163 -0
  75. package/src/converters/native/markdown.ts +115 -0
  76. package/src/converters/native/plaintext.ts +56 -0
  77. package/src/converters/path.ts +48 -0
  78. package/src/converters/pipeline.ts +159 -0
  79. package/src/converters/registry.ts +74 -0
  80. package/src/converters/types.ts +123 -0
  81. package/src/converters/versions.ts +24 -0
  82. package/src/index.ts +27 -0
  83. package/src/ingestion/chunker.ts +238 -0
  84. package/src/ingestion/index.ts +32 -0
  85. package/src/ingestion/language.ts +276 -0
  86. package/src/ingestion/sync.ts +671 -0
  87. package/src/ingestion/types.ts +219 -0
  88. package/src/ingestion/walker.ts +235 -0
  89. package/src/llm/cache.ts +467 -0
  90. package/src/llm/errors.ts +191 -0
  91. package/src/llm/index.ts +58 -0
  92. package/src/llm/nodeLlamaCpp/adapter.ts +133 -0
  93. package/src/llm/nodeLlamaCpp/embedding.ts +165 -0
  94. package/src/llm/nodeLlamaCpp/generation.ts +88 -0
  95. package/src/llm/nodeLlamaCpp/lifecycle.ts +317 -0
  96. package/src/llm/nodeLlamaCpp/rerank.ts +94 -0
  97. package/src/llm/registry.ts +86 -0
  98. package/src/llm/types.ts +129 -0
  99. package/src/mcp/resources/index.ts +151 -0
  100. package/src/mcp/server.ts +229 -0
  101. package/src/mcp/tools/get.ts +220 -0
  102. package/src/mcp/tools/index.ts +160 -0
  103. package/src/mcp/tools/multi-get.ts +263 -0
  104. package/src/mcp/tools/query.ts +226 -0
  105. package/src/mcp/tools/search.ts +119 -0
  106. package/src/mcp/tools/status.ts +81 -0
  107. package/src/mcp/tools/vsearch.ts +198 -0
  108. package/src/pipeline/chunk-lookup.ts +44 -0
  109. package/src/pipeline/expansion.ts +256 -0
  110. package/src/pipeline/explain.ts +115 -0
  111. package/src/pipeline/fusion.ts +185 -0
  112. package/src/pipeline/hybrid.ts +535 -0
  113. package/src/pipeline/index.ts +64 -0
  114. package/src/pipeline/query-language.ts +118 -0
  115. package/src/pipeline/rerank.ts +223 -0
  116. package/src/pipeline/search.ts +261 -0
  117. package/src/pipeline/types.ts +328 -0
  118. package/src/pipeline/vsearch.ts +348 -0
  119. package/src/store/index.ts +41 -0
  120. package/src/store/migrations/001-initial.ts +196 -0
  121. package/src/store/migrations/index.ts +20 -0
  122. package/src/store/migrations/runner.ts +187 -0
  123. package/src/store/sqlite/adapter.ts +1242 -0
  124. package/src/store/sqlite/index.ts +7 -0
  125. package/src/store/sqlite/setup.ts +129 -0
  126. package/src/store/sqlite/types.ts +28 -0
  127. package/src/store/types.ts +506 -0
  128. package/src/store/vector/index.ts +13 -0
  129. package/src/store/vector/sqlite-vec.ts +373 -0
  130. package/src/store/vector/stats.ts +152 -0
  131. package/src/store/vector/types.ts +115 -0
@@ -0,0 +1,89 @@
1
+ /**
2
+ * Markdown canonicalization for deterministic output.
3
+ * PRD §8.4 - Canonical Markdown conventions
4
+ *
5
+ * CRITICAL: These rules are a compatibility contract.
6
+ * Changing them invalidates all existing mirrorHash values.
7
+ */
8
+
9
/**
 * Matches the control characters that canonicalization removes:
 * U+0000-U+0008, U+000B-U+000C, U+000E-U+001F and U+007F.
 *
 * Tab (U+0009), line feed (U+000A) and carriage return (U+000D) are outside
 * the class. The character class is assembled from code points at runtime so
 * this source file never contains literal control characters (lint-friendly).
 */
const CONTROL_CHAR_PATTERN = (() => {
  const ch = (code: number): string => String.fromCharCode(code);
  const charClass = `${ch(0)}-${ch(8)}${ch(11)}${ch(12)}${ch(14)}-${ch(31)}${ch(127)}`;
  return new RegExp(`[${charClass}]`, 'g');
})();

/**
 * Produce the canonical form of a markdown document so that identical content
 * always yields identical text.
 *
 * Steps, applied in this order (PRD §8.4):
 * 0. Drop one leading BOM (U+FEFF)
 * 1. Convert \r\n and lone \r to \n
 * 2. Normalize to Unicode NFC
 * 3. Remove control characters (see CONTROL_CHAR_PATTERN); \n and \t survive
 * 4. Remove trailing whitespace from every line
 * 5. Whitespace-only lines therefore become blank lines
 * 6. Runs of blank lines shrink to a single blank line
 * 7. Trailing blank lines are dropped and exactly one final \n is appended
 *
 * @param markdown - Raw markdown text.
 * @returns Canonical markdown; a falsy/empty input yields "\n".
 */
export function canonicalize(markdown: string): string {
  if (!markdown) {
    return '\n';
  }

  // Step 0: a BOM is only meaningful as the very first code unit.
  const withoutBom =
    markdown.charCodeAt(0) === 0xfeff ? markdown.slice(1) : markdown;

  // Steps 1-3: line endings, NFC, control characters.
  // `\r\n?` covers both CRLF pairs and lone CRs in a single pass.
  const cleaned = withoutBom
    .replace(/\r\n?/g, '\n')
    .normalize('NFC')
    .replace(CONTROL_CHAR_PATTERN, '');

  // Steps 4-6: trim each line, then skip a blank line that directly follows
  // another blank line.
  const kept: string[] = [];
  let previousWasBlank = false;
  for (const rawLine of cleaned.split('\n')) {
    const line = rawLine.trimEnd();
    const isBlank = line === '';
    if (!(isBlank && previousWasBlank)) {
      kept.push(line);
    }
    previousWasBlank = isBlank;
  }

  // Step 7: ignore trailing blank lines, then terminate with a single newline.
  let end = kept.length;
  while (end > 0 && kept[end - 1] === '') {
    end -= 1;
  }

  return `${kept.slice(0, end).join('\n')}\n`;
}
80
+
81
/**
 * Hash canonical markdown with SHA-256 via Bun's CryptoHasher.
 *
 * @param canonical - Text to hash; callers are expected to pass the output of
 *   canonicalize() so hashes stay comparable.
 * @returns The digest as a hex string.
 */
export function mirrorHash(canonical: string): string {
  return new Bun.CryptoHasher('sha256').update(canonical).digest('hex');
}
@@ -0,0 +1,218 @@
1
+ /**
2
+ * Converter error types and helpers.
3
+ * PRD §8.3 - Error model
4
+ */
5
+
6
+ import type { ConvertError, ConvertErrorCode, ConvertInput } from './types';
7
+
8
+ type ConvertErrorOpts = Omit<ConvertError, 'code'>;
9
+
10
/** Max length for error messages/causes to prevent bloat */
const MAX_CAUSE_LENGTH = 1000;

/** Cap text at MAX_CAUSE_LENGTH characters, marking any cut with "...". */
function truncateCauseText(text: string): string {
  return text.length > MAX_CAUSE_LENGTH
    ? `${text.slice(0, MAX_CAUSE_LENGTH)}...`
    : text;
}

/**
 * Normalize a cause to a safe, serializable format.
 *
 * - `undefined` / `null` → `undefined`
 * - `Error` → `{ name, message }` with the message length-limited
 * - `string` → the string, length-limited
 * - anything else → `String(cause)`, length-limited. When that yields the
 *   uninformative "[object Object]", the JSON form of the object is used
 *   instead if it can be produced, so plain-object causes keep their details.
 *
 * @param cause - Arbitrary value that was thrown or reported as a cause.
 * @returns The normalized cause, or "[unserializable cause]" when the value
 *   cannot even be converted to a string.
 */
function normalizeCause(
  cause: unknown
): { name: string; message: string } | string | undefined {
  if (cause === undefined || cause === null) {
    return undefined;
  }

  if (cause instanceof Error) {
    return { name: cause.name, message: truncateCauseText(cause.message) };
  }

  if (typeof cause === 'string') {
    return truncateCauseText(cause);
  }

  try {
    // String() can throw (e.g. objects without a usable toString).
    let text = String(cause);

    if (text === '[object Object]') {
      try {
        // JSON.stringify may return undefined at runtime despite its typing.
        const json: unknown = JSON.stringify(cause);
        if (typeof json === 'string') {
          text = json;
        }
      } catch {
        // Circular structures, BigInt fields, throwing getters: keep the
        // generic text rather than failing the whole normalization.
      }
    }

    return truncateCauseText(text);
  } catch {
    return '[unserializable cause]';
  }
}
48
+
49
/**
 * Build a ConvertError from an error code plus the remaining fields.
 *
 * The supplied cause is passed through normalizeCause() so the stored value is
 * bounded in size and safe to serialize; every other option is copied as-is.
 *
 * @param code - Error code for the result.
 * @param opts - All other ConvertError fields.
 * @returns The assembled error object.
 */
export function convertError(
  code: ConvertErrorCode,
  opts: ConvertErrorOpts
): ConvertError {
  const cause = normalizeCause(opts.cause);
  return { code, ...opts, cause };
}
63
+
64
/**
 * Report whether a failure with the given code may be retried.
 *
 * @param code - Error code to classify.
 * @returns true for TIMEOUT, IO and ADAPTER_FAILURE; false for anything else.
 */
export function isRetryable(code: ConvertErrorCode): boolean {
  switch (code) {
    case 'TIMEOUT':
    case 'IO':
    case 'ADAPTER_FAILURE':
      return true;
    default:
      return false;
  }
}
70
+
71
/**
 * Build the UNSUPPORTED error returned when no converter accepts a file.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Reporter of the error; defaults to 'registry'.
 * @returns A non-retryable, non-fatal ConvertError.
 */
export function unsupportedError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId = 'registry'
): ConvertError {
  const { sourcePath, mime, ext } = input;

  return convertError('UNSUPPORTED', {
    message: `No converter for ${mime} (${ext})`,
    retryable: false,
    fatal: false,
    converterId,
    sourcePath,
    mime,
    ext,
  });
}
88
+
89
/**
 * Build the TOO_LARGE error for an input file whose byte length exceeds
 * `limits.maxBytes`.
 *
 * @param input - File identity plus its bytes and the limits in force.
 * @param converterId - Converter reporting the error.
 * @returns A non-retryable, non-fatal ConvertError whose details carry the
 *   actual size and the limit.
 */
export function tooLargeError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext' | 'bytes' | 'limits'>,
  converterId: string
): ConvertError {
  const size = input.bytes.length;
  const limit = input.limits.maxBytes;
  const { sourcePath, mime, ext } = input;

  return convertError('TOO_LARGE', {
    message: `File size ${size} exceeds limit ${limit}`,
    retryable: false,
    fatal: false,
    converterId,
    sourcePath,
    mime,
    ext,
    details: { size, limit },
  });
}
110
+
111
/**
 * Build the TOO_LARGE error for conversion *output* that exceeds the character
 * limit (zip-bomb protection). tooLargeError() is the input-side counterpart.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Converter reporting the error.
 * @param opts - Measured output size, the limit, and the stage ('raw' or
 *   'canonical') at which the size was measured.
 * @returns A non-retryable, non-fatal ConvertError with those values in
 *   `details`.
 */
export function outputTooLargeError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId: string,
  opts: { outputChars: number; limitChars: number; stage: 'raw' | 'canonical' }
): ConvertError {
  const { outputChars, limitChars, stage } = opts;
  const { sourcePath, mime, ext } = input;

  return convertError('TOO_LARGE', {
    message: `Conversion output (${outputChars} chars at ${stage}) exceeds limit ${limitChars}`,
    retryable: false,
    fatal: false,
    converterId,
    sourcePath,
    mime,
    ext,
    details: { outputChars, limitChars, stage },
  });
}
135
+
136
/**
 * Build the TIMEOUT error for a conversion that ran past `limits.timeoutMs`.
 *
 * @param input - File identity plus the limits in force.
 * @param converterId - Converter reporting the error.
 * @returns A retryable, non-fatal ConvertError with the timeout in `details`.
 */
export function timeoutError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext' | 'limits'>,
  converterId: string
): ConvertError {
  const { timeoutMs } = input.limits;
  const { sourcePath, mime, ext } = input;

  return convertError('TIMEOUT', {
    message: `Conversion timed out after ${timeoutMs}ms`,
    retryable: true,
    fatal: false,
    converterId,
    sourcePath,
    mime,
    ext,
    details: { timeoutMs },
  });
}
156
+
157
/**
 * Build the CORRUPT error for a file that is damaged or not valid for its
 * format.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Converter reporting the error.
 * @param message - Human-readable description of the problem.
 * @param cause - Optional underlying error or value.
 * @returns A non-retryable, non-fatal ConvertError.
 */
export function corruptError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId: string,
  message: string,
  cause?: unknown
): ConvertError {
  const { sourcePath, mime, ext } = input;

  return convertError('CORRUPT', {
    message,
    retryable: false,
    fatal: false,
    converterId,
    sourcePath,
    mime,
    ext,
    cause,
  });
}
177
+
178
/**
 * Build the ADAPTER_FAILURE error for a failure inside a converter adapter.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Adapter reporting the error.
 * @param message - Human-readable description of the problem.
 * @param cause - Optional underlying error or value.
 * @returns A retryable, non-fatal ConvertError.
 */
export function adapterError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId: string,
  message: string,
  cause?: unknown
): ConvertError {
  const { sourcePath, mime, ext } = input;

  return convertError('ADAPTER_FAILURE', {
    message,
    retryable: true,
    fatal: false,
    converterId,
    sourcePath,
    mime,
    ext,
    cause,
  });
}
198
+
199
/**
 * Build the INTERNAL error for a failure in the conversion pipeline itself.
 * Of the factories in this file, this is the only one that sets `fatal: true`.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Component reporting the error.
 * @param message - Human-readable description of the problem.
 * @param cause - Optional underlying error or value.
 * @returns A non-retryable, fatal ConvertError.
 */
export function internalError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId: string,
  message: string,
  cause?: unknown
): ConvertError {
  const { sourcePath, mime, ext } = input;

  return convertError('INTERNAL', {
    message,
    retryable: false,
    fatal: true,
    converterId,
    sourcePath,
    mime,
    ext,
    cause,
  });
}
@@ -0,0 +1,51 @@
1
+ /**
2
+ * Converter subsystem public API.
3
+ *
4
+ * Usage:
5
+ * import { getDefaultPipeline } from './converters';
6
+ * const pipeline = getDefaultPipeline();
7
+ * const result = await pipeline.convert(input);
8
+ */
9
+
10
+ // Canonicalization
11
+ export { canonicalize, mirrorHash } from './canonicalize';
12
+ // Errors
13
+ export {
14
+ adapterError,
15
+ convertError,
16
+ corruptError,
17
+ isRetryable,
18
+ timeoutError,
19
+ tooLargeError,
20
+ unsupportedError,
21
+ } from './errors';
22
+ // MIME detection
23
+ export type { MimeDetection, MimeDetector } from './mime';
24
+ export {
25
+ DefaultMimeDetector,
26
+ getDefaultMimeDetector,
27
+ isSupportedExtension,
28
+ SUPPORTED_EXTENSIONS,
29
+ } from './mime';
30
+ // Pipeline (main entry point)
31
+ export {
32
+ ConversionPipeline,
33
+ getDefaultPipeline,
34
+ resetDefaultPipeline,
35
+ } from './pipeline';
36
+ // Registry
37
+ export { ConverterRegistry, createDefaultRegistry } from './registry';
38
+ // Types
39
+ export type {
40
+ ConversionArtifact,
41
+ ConvertError,
42
+ ConvertErrorCode,
43
+ Converter,
44
+ ConverterId,
45
+ ConvertInput,
46
+ ConvertOutput,
47
+ ConvertResult,
48
+ ConvertWarning,
49
+ PipelineResult,
50
+ } from './types';
51
+ export { DEFAULT_LIMITS } from './types';
@@ -0,0 +1,163 @@
1
+ /**
2
+ * MIME type detection with magic byte sniffing and extension mapping.
3
+ * PRD §8.5 - MIME detection strategy
4
+ */
5
+
6
+ import { extname } from './path';
7
+
8
/** How much trust to place in a detection result. */
type MimeConfidence = 'high' | 'medium' | 'low';

/** Which detection step produced the result. */
type MimeDetectionSource = 'sniff' | 'sniff+ext' | 'ext' | 'fallback';

/** Outcome of detecting the MIME type of one file. */
export interface MimeDetection {
  /** Detected MIME type. */
  mime: string;
  /** File extension associated with the detection. */
  ext: string;
  /** Confidence level of the detection. */
  confidence: MimeConfidence;
  /** Detection step that produced `mime`. */
  via: MimeDetectionSource;
}

/** Strategy interface for MIME detection. */
export interface MimeDetector {
  /**
   * Detect the MIME type of a file.
   *
   * @param path - File path.
   * @param bytes - File content.
   */
  detect(path: string, bytes: Uint8Array): MimeDetection;
}
18
+
19
/** OOXML extension to MIME mapping */
const OOXML_MAP: Record<string, string> = {
  '.docx':
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  '.pptx':
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
};

/**
 * Extension to MIME type mapping (PRD §8.5).
 * The OOXML entries are spread in last so the key order stays
 * .md, .txt, .pdf, .docx, .pptx, .xlsx.
 */
const EXTENSION_MAP: Record<string, string> = {
  '.md': 'text/markdown',
  '.txt': 'text/plain',
  '.pdf': 'application/pdf',
  ...OOXML_MAP,
};
39
+
40
/** Leading bytes of a PDF file: the ASCII text "%PDF-". */
const PDF_MAGIC = Uint8Array.of(0x25, 0x50, 0x44, 0x46, 0x2d);

/** Leading bytes of a ZIP (and therefore OOXML) file: "PK" 0x03 0x04. */
const ZIP_MAGIC = Uint8Array.of(0x50, 0x4b, 0x03, 0x04);
45
+
46
/**
 * Byte-wise prefix test.
 *
 * @param bytes - Buffer to inspect.
 * @param prefix - Expected leading bytes.
 * @returns true when `bytes` is at least as long as `prefix` and its first
 *   `prefix.length` bytes equal `prefix` (an empty prefix always matches).
 */
function startsWith(bytes: Uint8Array, prefix: Uint8Array): boolean {
  return (
    bytes.length >= prefix.length &&
    prefix.every((expected, index) => bytes[index] === expected)
  );
}
60
+
61
/** Result of inspecting a file's leading bytes. */
interface SniffResult {
  /** MIME type implied by the magic bytes. */
  mime: string;
  /** True if sniff alone is sufficient (e.g., PDF); false if ext-assisted (OOXML) */
  pureSniff: boolean;
}

/**
 * Identify a MIME type from a file's magic bytes.
 *
 * - "%PDF-" prefix → application/pdf (pure sniff).
 * - ZIP prefix with an extension listed in OOXML_MAP → the mapped OOXML type
 *   (extension-assisted, because the bytes alone only prove "ZIP").
 * - ZIP prefix with any other extension → application/zip (pure sniff).
 *
 * @param bytes - Leading bytes of the file.
 * @param ext - File extension used to refine a ZIP match.
 * @returns The sniff result, or undefined when no known signature matches.
 */
function sniffMagicBytes(
  bytes: Uint8Array,
  ext: string
): SniffResult | undefined {
  if (startsWith(bytes, PDF_MAGIC)) {
    return { mime: 'application/pdf', pureSniff: true };
  }

  if (!startsWith(bytes, ZIP_MAGIC)) {
    return undefined;
  }

  // Own-property check keeps keys such as "constructor" from matching.
  const ooxmlMime = Object.hasOwn(OOXML_MAP, ext) ? OOXML_MAP[ext] : undefined;

  return ooxmlMime
    ? { mime: ooxmlMime, pureSniff: false }
    : { mime: 'application/zip', pureSniff: true };
}
95
+
96
/**
 * Default MimeDetector.
 *
 * Resolution order:
 * 1. Magic bytes alone (PDF, generic ZIP) → high confidence, via 'sniff'
 * 2. Magic bytes refined by extension (OOXML) → medium confidence, via 'sniff+ext'
 * 3. Extension lookup in EXTENSION_MAP → medium confidence, via 'ext'
 * 4. application/octet-stream → low confidence, via 'fallback'
 */
export class DefaultMimeDetector implements MimeDetector {
  /**
   * Detect the MIME type of a file.
   *
   * @param path - File path; only its extension (via extname) is used.
   * @param bytes - File content; at most the first 512 bytes are inspected.
   * @returns The detection, always carrying the extension derived from `path`.
   */
  detect(path: string, bytes: Uint8Array): MimeDetection {
    // NOTE(review): map lookups below are case-sensitive on `ext`; this
    // presumably relies on extname() returning a lowercase extension — confirm.
    const ext = extname(path);

    // subarray() is a view over the same buffer, so nothing is copied.
    const sniffed = sniffMagicBytes(bytes.subarray(0, 512), ext);
    if (sniffed) {
      const { mime, pureSniff } = sniffed;
      return pureSniff
        ? { mime, ext, confidence: 'high', via: 'sniff' }
        : { mime, ext, confidence: 'medium', via: 'sniff+ext' };
    }

    if (Object.hasOwn(EXTENSION_MAP, ext)) {
      const mime = EXTENSION_MAP[ext];
      if (mime) {
        return { mime, ext, confidence: 'medium', via: 'ext' };
      }
    }

    return {
      mime: 'application/octet-stream',
      ext,
      confidence: 'low',
      via: 'fallback',
    };
  }
}
145
+
146
/** Lazily created shared detector; stays null until first requested. */
let defaultDetector: MimeDetector | null = null;

/**
 * Return the shared MimeDetector, creating a DefaultMimeDetector on the first
 * call and reusing it afterwards.
 */
export function getDefaultMimeDetector(): MimeDetector {
  defaultDetector ??= new DefaultMimeDetector();
  return defaultDetector;
}
155
+
156
/** Extensions that have an entry in EXTENSION_MAP. */
export const SUPPORTED_EXTENSIONS = Object.keys(EXTENSION_MAP);

/**
 * Case-insensitive check for a supported extension.
 *
 * @param ext - Extension to test; it is lowercased before the lookup.
 * @returns true when EXTENSION_MAP has the extension as an own key, so
 *   inherited names such as "constructor" never match.
 */
export function isSupportedExtension(ext: string): boolean {
  return Object.prototype.hasOwnProperty.call(EXTENSION_MAP, ext.toLowerCase());
}
@@ -0,0 +1,115 @@
1
+ /**
2
+ * Native Markdown converter (passthrough).
3
+ * Simply reads .md files and extracts title from first heading.
4
+ */
5
+
6
+ import type { Converter, ConvertInput, ConvertResult } from '../types';
7
+ import { NATIVE_VERSIONS } from '../versions';
8
+
9
+ const CONVERTER_ID = 'native/markdown' as const;
10
+ const CONVERTER_VERSION = NATIVE_VERSIONS.markdown;
11
+
12
+ /** UTF-8 BOM character */
13
+ const BOM = '\uFEFF';
14
+
15
/** Regex to match a level-1 ATX heading ("# text"); captures the heading text */
const HEADING_PATTERN = /^\s*#\s+(.+)/;

/**
 * Regex to detect a code fence opener: up to three spaces of indentation
 * followed by 3+ backticks or 3+ tildes. Only the fence characters are
 * captured; the info string is whatever follows the match on the same line.
 */
const CODE_FENCE_START = /^ {0,3}(`{3,}|~{3,})/;

/**
 * Check if a line closes a code fence.
 * Closing fence must be same char type and at least as long as opening.
 *
 * @param line - Candidate line; surrounding whitespace is ignored.
 * @param fenceChar - Character of the opening fence ("`" or "~").
 * @param fenceLen - Length of the opening fence.
 */
function isClosingFence(
  line: string,
  fenceChar: string,
  fenceLen: number
): boolean {
  const trimmed = line.trim();
  // A closing fence carries no info string, so every remaining character
  // must be the fence character.
  return (
    trimmed.length >= fenceLen &&
    [...trimmed].every((char) => char === fenceChar)
  );
}

/**
 * Extract the title from the first "# " heading in markdown, skipping any
 * heading-like lines inside fenced code blocks.
 *
 * Fence handling:
 * - An opener may be indented by up to three spaces.
 * - A backtick run followed by more backticks on the same line (for example
 *   inline code such as "```x``` text") is not an opener, because the info
 *   string of a backtick fence cannot contain backticks.
 * - An unclosed fence runs to the end of the document.
 *
 * @param markdown - Markdown text; lines are split on "\n".
 * @returns The trimmed heading text, or undefined if no heading is found.
 */
function extractFirstHeading(markdown: string): string | undefined {
  let fenceChar = '';
  let fenceLen = 0;

  for (const line of markdown.split('\n')) {
    // Inside a fence: only look for the matching closer.
    if (fenceLen > 0) {
      if (isClosingFence(line, fenceChar, fenceLen)) {
        fenceChar = '';
        fenceLen = 0;
      }
      continue;
    }

    // Outside a fence: does this line open one?
    const opening = CODE_FENCE_START.exec(line);
    if (opening) {
      const marker = opening[1] ?? '';
      const infoString = line.slice(opening[0].length);
      const isInlineCode = marker.startsWith('`') && infoString.includes('`');
      if (marker !== '' && !isInlineCode) {
        fenceChar = marker.charAt(0);
        fenceLen = marker.length;
        continue;
      }
    }

    // Ordinary line: check for a heading.
    const headingMatch = HEADING_PATTERN.exec(line);
    if (headingMatch?.[1]) {
      return headingMatch[1].trim();
    }
  }

  return undefined;
}
80
+
81
/**
 * Passthrough converter for markdown files: decodes the bytes as UTF-8 and
 * reports the first "# " heading (outside code fences) as the title.
 */
export const markdownConverter: Converter = {
  id: CONVERTER_ID,
  version: CONVERTER_VERSION,

  /** Accepts input whose MIME type is text/markdown or whose extension is .md. */
  canHandle(mime: string, ext: string): boolean {
    return ext === '.md' || mime === 'text/markdown';
  },

  /**
   * Decode the input and wrap it in a successful ConvertResult.
   * The text is deliberately NOT canonicalized here; per the original note,
   * pipeline.ts handles all normalization.
   */
  convert(input: ConvertInput): Promise<ConvertResult> {
    // Non-fatal decoding: invalid byte sequences become U+FFFD instead of
    // throwing. With the default ignoreBOM=false the decoder already drops one
    // leading BOM; the check below removes a further one if it is still there.
    const decoded = new TextDecoder('utf-8', { fatal: false }).decode(
      input.bytes
    );
    const text = decoded.startsWith(BOM) ? decoded.slice(1) : decoded;

    const result: ConvertResult = {
      ok: true,
      value: {
        markdown: text,
        title: extractFirstHeading(text),
        meta: {
          converterId: CONVERTER_ID,
          converterVersion: CONVERTER_VERSION,
          sourceMime: input.mime,
        },
      },
    };

    return Promise.resolve(result);
  },
};
@@ -0,0 +1,56 @@
1
+ /**
2
+ * Native plaintext converter.
3
+ * Converts .txt files to markdown (passthrough as paragraphs).
4
+ */
5
+
6
+ import { basenameWithoutExt } from '../path';
7
+ import type { Converter, ConvertInput, ConvertResult } from '../types';
8
+ import { NATIVE_VERSIONS } from '../versions';
9
+
10
+ const CONVERTER_ID = 'native/plaintext' as const;
11
+ const CONVERTER_VERSION = NATIVE_VERSIONS.plaintext;
12
+
13
+ /** UTF-8 BOM character */
14
+ const BOM = '\uFEFF';
15
+
16
/**
 * Converter for plain-text files: the decoded text is passed through unchanged
 * as markdown and the title is taken from the file name.
 */
export const plaintextConverter: Converter = {
  id: CONVERTER_ID,
  version: CONVERTER_VERSION,

  /** Accepts input whose MIME type is text/plain or whose extension is .txt. */
  canHandle(mime: string, ext: string): boolean {
    return ext === '.txt' || mime === 'text/plain';
  },

  /**
   * Decode the input and wrap it in a successful ConvertResult.
   * The text is not wrapped in a code fence and is deliberately NOT
   * canonicalized here; per the original note, pipeline.ts handles all
   * normalization.
   */
  convert(input: ConvertInput): Promise<ConvertResult> {
    // fatal=false: invalid bytes are replaced rather than throwing.
    // ignoreBOM=false: the decoder itself drops one leading BOM; the explicit
    // check below removes a further one if it is still present.
    const decoded = new TextDecoder('utf-8', {
      fatal: false,
      ignoreBOM: false,
    }).decode(input.bytes);
    const text = decoded.startsWith(BOM) ? decoded.slice(1) : decoded;

    const result: ConvertResult = {
      ok: true,
      value: {
        markdown: text,
        // Title comes from the relative path's base name without extension.
        title: basenameWithoutExt(input.relativePath),
        meta: {
          converterId: CONVERTER_ID,
          converterVersion: CONVERTER_VERSION,
          sourceMime: input.mime,
        },
      },
    };

    return Promise.resolve(result);
  },
};