@gmickel/gno 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +256 -0
- package/assets/skill/SKILL.md +112 -0
- package/assets/skill/cli-reference.md +327 -0
- package/assets/skill/examples.md +234 -0
- package/assets/skill/mcp-reference.md +159 -0
- package/package.json +90 -0
- package/src/app/constants.ts +313 -0
- package/src/cli/colors.ts +65 -0
- package/src/cli/commands/ask.ts +545 -0
- package/src/cli/commands/cleanup.ts +105 -0
- package/src/cli/commands/collection/add.ts +120 -0
- package/src/cli/commands/collection/index.ts +10 -0
- package/src/cli/commands/collection/list.ts +108 -0
- package/src/cli/commands/collection/remove.ts +64 -0
- package/src/cli/commands/collection/rename.ts +95 -0
- package/src/cli/commands/context/add.ts +67 -0
- package/src/cli/commands/context/check.ts +153 -0
- package/src/cli/commands/context/index.ts +10 -0
- package/src/cli/commands/context/list.ts +109 -0
- package/src/cli/commands/context/rm.ts +52 -0
- package/src/cli/commands/doctor.ts +393 -0
- package/src/cli/commands/embed.ts +462 -0
- package/src/cli/commands/get.ts +356 -0
- package/src/cli/commands/index-cmd.ts +119 -0
- package/src/cli/commands/index.ts +102 -0
- package/src/cli/commands/init.ts +328 -0
- package/src/cli/commands/ls.ts +217 -0
- package/src/cli/commands/mcp/config.ts +300 -0
- package/src/cli/commands/mcp/index.ts +24 -0
- package/src/cli/commands/mcp/install.ts +203 -0
- package/src/cli/commands/mcp/paths.ts +470 -0
- package/src/cli/commands/mcp/status.ts +222 -0
- package/src/cli/commands/mcp/uninstall.ts +158 -0
- package/src/cli/commands/mcp.ts +20 -0
- package/src/cli/commands/models/clear.ts +103 -0
- package/src/cli/commands/models/index.ts +32 -0
- package/src/cli/commands/models/list.ts +214 -0
- package/src/cli/commands/models/path.ts +51 -0
- package/src/cli/commands/models/pull.ts +199 -0
- package/src/cli/commands/models/use.ts +85 -0
- package/src/cli/commands/multi-get.ts +400 -0
- package/src/cli/commands/query.ts +220 -0
- package/src/cli/commands/ref-parser.ts +108 -0
- package/src/cli/commands/reset.ts +191 -0
- package/src/cli/commands/search.ts +136 -0
- package/src/cli/commands/shared.ts +156 -0
- package/src/cli/commands/skill/index.ts +19 -0
- package/src/cli/commands/skill/install.ts +197 -0
- package/src/cli/commands/skill/paths-cmd.ts +81 -0
- package/src/cli/commands/skill/paths.ts +191 -0
- package/src/cli/commands/skill/show.ts +73 -0
- package/src/cli/commands/skill/uninstall.ts +141 -0
- package/src/cli/commands/status.ts +205 -0
- package/src/cli/commands/update.ts +68 -0
- package/src/cli/commands/vsearch.ts +188 -0
- package/src/cli/context.ts +64 -0
- package/src/cli/errors.ts +64 -0
- package/src/cli/format/search-results.ts +211 -0
- package/src/cli/options.ts +183 -0
- package/src/cli/program.ts +1330 -0
- package/src/cli/run.ts +213 -0
- package/src/cli/ui.ts +92 -0
- package/src/config/defaults.ts +20 -0
- package/src/config/index.ts +55 -0
- package/src/config/loader.ts +161 -0
- package/src/config/paths.ts +87 -0
- package/src/config/saver.ts +153 -0
- package/src/config/types.ts +280 -0
- package/src/converters/adapters/markitdownTs/adapter.ts +140 -0
- package/src/converters/adapters/officeparser/adapter.ts +126 -0
- package/src/converters/canonicalize.ts +89 -0
- package/src/converters/errors.ts +218 -0
- package/src/converters/index.ts +51 -0
- package/src/converters/mime.ts +163 -0
- package/src/converters/native/markdown.ts +115 -0
- package/src/converters/native/plaintext.ts +56 -0
- package/src/converters/path.ts +48 -0
- package/src/converters/pipeline.ts +159 -0
- package/src/converters/registry.ts +74 -0
- package/src/converters/types.ts +123 -0
- package/src/converters/versions.ts +24 -0
- package/src/index.ts +27 -0
- package/src/ingestion/chunker.ts +238 -0
- package/src/ingestion/index.ts +32 -0
- package/src/ingestion/language.ts +276 -0
- package/src/ingestion/sync.ts +671 -0
- package/src/ingestion/types.ts +219 -0
- package/src/ingestion/walker.ts +235 -0
- package/src/llm/cache.ts +467 -0
- package/src/llm/errors.ts +191 -0
- package/src/llm/index.ts +58 -0
- package/src/llm/nodeLlamaCpp/adapter.ts +133 -0
- package/src/llm/nodeLlamaCpp/embedding.ts +165 -0
- package/src/llm/nodeLlamaCpp/generation.ts +88 -0
- package/src/llm/nodeLlamaCpp/lifecycle.ts +317 -0
- package/src/llm/nodeLlamaCpp/rerank.ts +94 -0
- package/src/llm/registry.ts +86 -0
- package/src/llm/types.ts +129 -0
- package/src/mcp/resources/index.ts +151 -0
- package/src/mcp/server.ts +229 -0
- package/src/mcp/tools/get.ts +220 -0
- package/src/mcp/tools/index.ts +160 -0
- package/src/mcp/tools/multi-get.ts +263 -0
- package/src/mcp/tools/query.ts +226 -0
- package/src/mcp/tools/search.ts +119 -0
- package/src/mcp/tools/status.ts +81 -0
- package/src/mcp/tools/vsearch.ts +198 -0
- package/src/pipeline/chunk-lookup.ts +44 -0
- package/src/pipeline/expansion.ts +256 -0
- package/src/pipeline/explain.ts +115 -0
- package/src/pipeline/fusion.ts +185 -0
- package/src/pipeline/hybrid.ts +535 -0
- package/src/pipeline/index.ts +64 -0
- package/src/pipeline/query-language.ts +118 -0
- package/src/pipeline/rerank.ts +223 -0
- package/src/pipeline/search.ts +261 -0
- package/src/pipeline/types.ts +328 -0
- package/src/pipeline/vsearch.ts +348 -0
- package/src/store/index.ts +41 -0
- package/src/store/migrations/001-initial.ts +196 -0
- package/src/store/migrations/index.ts +20 -0
- package/src/store/migrations/runner.ts +187 -0
- package/src/store/sqlite/adapter.ts +1242 -0
- package/src/store/sqlite/index.ts +7 -0
- package/src/store/sqlite/setup.ts +129 -0
- package/src/store/sqlite/types.ts +28 -0
- package/src/store/types.ts +506 -0
- package/src/store/vector/index.ts +13 -0
- package/src/store/vector/sqlite-vec.ts +373 -0
- package/src/store/vector/stats.ts +152 -0
- package/src/store/vector/types.ts +115 -0
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown canonicalization for deterministic output.
|
|
3
|
+
* PRD §8.4 - Canonical Markdown conventions
|
|
4
|
+
*
|
|
5
|
+
* CRITICAL: These rules are a compatibility contract.
|
|
6
|
+
* Changing them invalidates all existing mirrorHash values.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Character class covering the control characters removed by `canonicalize`:
 * U+0000-U+0008, U+000B-U+000C, U+000E-U+001F and U+007F.
 * Tab (U+0009), line feed (U+000A) and carriage return (U+000D) are not in
 * the class; carriage returns are already rewritten to line feeds before the
 * pattern is applied.
 *
 * The pattern source is assembled from code points at runtime so that no
 * literal control characters appear in this file (avoids lint complaints).
 */
const CONTROL_CHAR_PATTERN = new RegExp(
  [
    '[',
    String.fromCharCode(0),
    '-',
    String.fromCharCode(8),
    String.fromCharCode(11),
    String.fromCharCode(12),
    String.fromCharCode(14),
    '-',
    String.fromCharCode(31),
    String.fromCharCode(127),
    ']',
  ].join(''),
  'g'
);

/**
 * Produce the canonical form of a markdown document so that equal content
 * always hashes to the same value (PRD §8.4).
 *
 * Steps, in order:
 *  0. Drop a single leading BOM (U+FEFF)
 *  1. Rewrite \r\n and lone \r to \n
 *  2. NFC-normalize
 *  3. Remove control characters (see CONTROL_CHAR_PATTERN)
 *  4. Trim trailing whitespace on every line
 *  5. Whitespace-only lines therefore become blank lines
 *  6. Runs of blank lines collapse to a single blank line
 *  7. Trailing blank lines are removed and exactly one final \n is appended
 *
 * These rules are a compatibility contract: any behavioural change
 * invalidates previously stored mirrorHash values.
 *
 * NOTE(review): control characters are stripped after NFC normalization, so
 * removing one that sat between a base character and a combining mark can
 * leave a sequence that is no longer NFC. Left untouched because of the
 * compatibility contract.
 *
 * @param markdown - Raw markdown text; an empty string yields "\n".
 * @returns The canonical text, always terminated by exactly one "\n".
 */
export function canonicalize(markdown: string): string {
  if (!markdown) {
    return '\n';
  }

  // Steps 0-3 operate on the document as a whole.
  const withoutBom = markdown.startsWith('\uFEFF')
    ? markdown.slice(1)
    : markdown;
  const cleaned = withoutBom
    .replace(/\r\n/g, '\n')
    .replace(/\r/g, '\n')
    .normalize('NFC')
    .replace(CONTROL_CHAR_PATTERN, '');

  // Steps 4-6: trim each line and keep only the first of consecutive blanks.
  const kept: string[] = [];
  let previousWasBlank = false;
  for (const rawLine of cleaned.split('\n')) {
    const line = rawLine.trimEnd();
    const isBlank = line === '';
    if (!(isBlank && previousWasBlank)) {
      kept.push(line);
    }
    previousWasBlank = isBlank;
  }

  // Step 7: ignore trailing blank lines, then terminate with one newline.
  let end = kept.length;
  while (end > 0 && kept[end - 1] === '') {
    end -= 1;
  }
  return `${kept.slice(0, end).join('\n')}\n`;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Hash canonical markdown with SHA-256 via Bun's CryptoHasher.
 *
 * @param canonical - Text to hash (expected to be the output of `canonicalize`).
 * @returns The digest encoded as a hex string.
 */
export function mirrorHash(canonical: string): string {
  return new Bun.CryptoHasher('sha256').update(canonical).digest('hex');
}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Converter error types and helpers.
|
|
3
|
+
* PRD §8.3 - Error model
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import type { ConvertError, ConvertErrorCode, ConvertInput } from './types';
|
|
7
|
+
|
|
8
|
+
type ConvertErrorOpts = Omit<ConvertError, 'code'>;
|
|
9
|
+
|
|
10
|
+
/** Upper bound, in UTF-16 code units, for a stored cause message before it is truncated. */
const MAX_CAUSE_LENGTH = 1000;

/**
 * Reduce an arbitrary `cause` to a small, serializable value.
 *
 * - `null` / `undefined` → `undefined`
 * - `Error` → `{ name, message }` with the message truncated
 * - string → the string, truncated
 * - anything else → `String(cause)`, truncated, or a placeholder when the
 *   conversion itself throws
 *
 * Truncated text is cut at MAX_CAUSE_LENGTH and suffixed with "...".
 *
 * @param cause - Value originally supplied as the error cause.
 * @returns The normalized cause, or `undefined` when none was given.
 */
function normalizeCause(
  cause: unknown
): { name: string; message: string } | string | undefined {
  const clip = (text: string): string =>
    text.length > MAX_CAUSE_LENGTH
      ? `${text.slice(0, MAX_CAUSE_LENGTH)}...`
      : text;

  if (cause == null) {
    return undefined;
  }

  if (cause instanceof Error) {
    const message = clip(cause.message);
    return { name: cause.name, message };
  }

  if (typeof cause === 'string') {
    return clip(cause);
  }

  // Remaining types: String() can throw (e.g. objects without a usable
  // toString), so fall back to a fixed placeholder in that case.
  try {
    return clip(String(cause));
  } catch {
    return '[unserializable cause]';
  }
}
|
|
48
|
+
|
|
49
|
+
/**
 * Build a ConvertError from an error code plus the remaining fields.
 *
 * The supplied `cause` is passed through `normalizeCause` so the resulting
 * object stays small and serializable; every other option is copied as-is.
 *
 * @param code - Error code to store on the result.
 * @param opts - All ConvertError fields except `code`.
 * @returns The assembled ConvertError.
 */
export function convertError(
  code: ConvertErrorCode,
  opts: ConvertErrorOpts
): ConvertError {
  const cause = normalizeCause(opts.cause);
  return { code, ...opts, cause };
}
|
|
63
|
+
|
|
64
|
+
/**
 * Tell whether an error code denotes a failure worth retrying.
 *
 * @param code - Error code to classify.
 * @returns `true` for TIMEOUT, IO and ADAPTER_FAILURE; `false` otherwise.
 */
export function isRetryable(code: ConvertErrorCode): boolean {
  const retryableCodes = new Set<string>(['TIMEOUT', 'IO', 'ADAPTER_FAILURE']);
  return retryableCodes.has(code);
}
|
|
70
|
+
|
|
71
|
+
/**
 * Build the UNSUPPORTED error returned when no converter matches a file.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Reporter of the error; defaults to 'registry'.
 * @returns A non-retryable, non-fatal ConvertError.
 */
export function unsupportedError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId = 'registry'
): ConvertError {
  const { sourcePath, mime, ext } = input;
  return convertError('UNSUPPORTED', {
    message: `No converter for ${mime} (${ext})`,
    retryable: false,
    fatal: false,
    converterId,
    sourcePath,
    mime,
    ext,
  });
}
|
|
88
|
+
|
|
89
|
+
/**
 * Build the TOO_LARGE error for an input file whose byte count exceeds
 * `limits.maxBytes`.
 *
 * @param input - File identity plus its bytes and the limits in force.
 * @param converterId - Converter reporting the error.
 * @returns A non-retryable, non-fatal ConvertError whose `details` carry the
 *          actual size and the limit.
 */
export function tooLargeError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext' | 'bytes' | 'limits'>,
  converterId: string
): ConvertError {
  const size = input.bytes.length;
  const limit = input.limits.maxBytes;
  return convertError('TOO_LARGE', {
    message: `File size ${size} exceeds limit ${limit}`,
    retryable: false,
    fatal: false,
    converterId,
    sourcePath: input.sourcePath,
    mime: input.mime,
    ext: input.ext,
    details: { size, limit },
  });
}
|
|
110
|
+
|
|
111
|
+
/**
 * Build the TOO_LARGE error for conversion *output* that exceeds the
 * character limit (zip-bomb protection). The input-size counterpart is
 * `tooLargeError`.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Converter reporting the error.
 * @param opts - Measured output size, the limit, and the stage
 *               ('raw' or 'canonical') at which it was measured.
 * @returns A non-retryable, non-fatal ConvertError with the measurements in
 *          `details`.
 */
export function outputTooLargeError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId: string,
  opts: { outputChars: number; limitChars: number; stage: 'raw' | 'canonical' }
): ConvertError {
  const { outputChars, limitChars, stage } = opts;
  return convertError('TOO_LARGE', {
    message: `Conversion output (${outputChars} chars at ${stage}) exceeds limit ${limitChars}`,
    retryable: false,
    fatal: false,
    converterId,
    sourcePath: input.sourcePath,
    mime: input.mime,
    ext: input.ext,
    details: { outputChars, limitChars, stage },
  });
}
|
|
135
|
+
|
|
136
|
+
/**
 * Build the TIMEOUT error for a conversion that ran past `limits.timeoutMs`.
 *
 * @param input - File identity plus the limits in force.
 * @param converterId - Converter reporting the error.
 * @returns A retryable, non-fatal ConvertError with the timeout in `details`.
 */
export function timeoutError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext' | 'limits'>,
  converterId: string
): ConvertError {
  const { timeoutMs } = input.limits;
  return convertError('TIMEOUT', {
    message: `Conversion timed out after ${timeoutMs}ms`,
    retryable: true,
    fatal: false,
    converterId,
    sourcePath: input.sourcePath,
    mime: input.mime,
    ext: input.ext,
    details: { timeoutMs },
  });
}
|
|
156
|
+
|
|
157
|
+
/**
 * Build the CORRUPT error for a file that is damaged or otherwise invalid.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Converter reporting the error.
 * @param message - Human-readable description of the problem.
 * @param cause - Optional underlying error; normalized by `convertError`.
 * @returns A non-retryable, non-fatal ConvertError.
 */
export function corruptError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId: string,
  message: string,
  cause?: unknown
): ConvertError {
  const { sourcePath, mime, ext } = input;
  return convertError('CORRUPT', {
    message,
    retryable: false,
    fatal: false,
    converterId,
    sourcePath,
    mime,
    ext,
    cause,
  });
}
|
|
177
|
+
|
|
178
|
+
/**
 * Build the ADAPTER_FAILURE error for a failure inside a converter adapter.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Converter reporting the error.
 * @param message - Human-readable description of the problem.
 * @param cause - Optional underlying error; normalized by `convertError`.
 * @returns A retryable, non-fatal ConvertError.
 */
export function adapterError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId: string,
  message: string,
  cause?: unknown
): ConvertError {
  const { sourcePath, mime, ext } = input;
  return convertError('ADAPTER_FAILURE', {
    message,
    retryable: true,
    fatal: false,
    converterId,
    sourcePath,
    mime,
    ext,
    cause,
  });
}
|
|
198
|
+
|
|
199
|
+
/**
 * Build the INTERNAL error for a failure inside the conversion pipeline
 * itself. This is the only factory in this file that marks the error fatal.
 *
 * @param input - Source path, MIME type and extension of the file.
 * @param converterId - Component reporting the error.
 * @param message - Human-readable description of the problem.
 * @param cause - Optional underlying error; normalized by `convertError`.
 * @returns A non-retryable, fatal ConvertError.
 */
export function internalError(
  input: Pick<ConvertInput, 'sourcePath' | 'mime' | 'ext'>,
  converterId: string,
  message: string,
  cause?: unknown
): ConvertError {
  const { sourcePath, mime, ext } = input;
  return convertError('INTERNAL', {
    message,
    retryable: false,
    fatal: true,
    converterId,
    sourcePath,
    mime,
    ext,
    cause,
  });
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Converter subsystem public API.
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { getDefaultPipeline } from './converters';
|
|
6
|
+
* const pipeline = getDefaultPipeline();
|
|
7
|
+
* const result = await pipeline.convert(input);
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
// Canonicalization
|
|
11
|
+
export { canonicalize, mirrorHash } from './canonicalize';
|
|
12
|
+
// Errors
|
|
13
|
+
export {
|
|
14
|
+
adapterError,
|
|
15
|
+
convertError,
|
|
16
|
+
corruptError,
|
|
17
|
+
isRetryable,
|
|
18
|
+
timeoutError,
|
|
19
|
+
tooLargeError,
|
|
20
|
+
unsupportedError,
|
|
21
|
+
} from './errors';
|
|
22
|
+
// MIME detection
|
|
23
|
+
export type { MimeDetection, MimeDetector } from './mime';
|
|
24
|
+
export {
|
|
25
|
+
DefaultMimeDetector,
|
|
26
|
+
getDefaultMimeDetector,
|
|
27
|
+
isSupportedExtension,
|
|
28
|
+
SUPPORTED_EXTENSIONS,
|
|
29
|
+
} from './mime';
|
|
30
|
+
// Pipeline (main entry point)
|
|
31
|
+
export {
|
|
32
|
+
ConversionPipeline,
|
|
33
|
+
getDefaultPipeline,
|
|
34
|
+
resetDefaultPipeline,
|
|
35
|
+
} from './pipeline';
|
|
36
|
+
// Registry
|
|
37
|
+
export { ConverterRegistry, createDefaultRegistry } from './registry';
|
|
38
|
+
// Types
|
|
39
|
+
export type {
|
|
40
|
+
ConversionArtifact,
|
|
41
|
+
ConvertError,
|
|
42
|
+
ConvertErrorCode,
|
|
43
|
+
Converter,
|
|
44
|
+
ConverterId,
|
|
45
|
+
ConvertInput,
|
|
46
|
+
ConvertOutput,
|
|
47
|
+
ConvertResult,
|
|
48
|
+
ConvertWarning,
|
|
49
|
+
PipelineResult,
|
|
50
|
+
} from './types';
|
|
51
|
+
export { DEFAULT_LIMITS } from './types';
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MIME type detection with magic byte sniffing and extension mapping.
|
|
3
|
+
* PRD §8.5 - MIME detection strategy
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { extname } from './path';
|
|
7
|
+
|
|
8
|
+
/** Outcome of a MIME detection attempt. */
export interface MimeDetection {
  /** Detected MIME type. */
  mime: string;
  /** File extension associated with the detection. */
  ext: string;
  /** How much the detection can be trusted. */
  confidence: 'high' | 'medium' | 'low';
  /**
   * Which mechanism produced the result: magic bytes alone, magic bytes
   * combined with the extension, the extension alone, or the fallback.
   */
  via: 'sniff' | 'sniff+ext' | 'ext' | 'fallback';
}

/** Strategy object that determines the MIME type of a file. */
export interface MimeDetector {
  /**
   * @param path - Path of the file.
   * @param bytes - File content.
   * @returns The detection result.
   */
  detect(path: string, bytes: Uint8Array): MimeDetection;
}
|
|
18
|
+
|
|
19
|
+
/**
 * OOXML extension to MIME mapping.
 * These formats share the ZIP container signature, so the extension is what
 * tells them apart once ZIP magic bytes have been seen.
 */
const OOXML_MAP: Record<string, string> = {
  '.docx':
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  '.pptx':
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
  '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
};

/**
 * Extension to MIME type mapping (PRD §8.5).
 * The OOXML entries are spread in from OOXML_MAP rather than repeated, so the
 * two tables cannot drift apart. Key order is unchanged:
 * .md, .txt, .pdf, .docx, .pptx, .xlsx.
 */
const EXTENSION_MAP: Record<string, string> = {
  '.md': 'text/markdown',
  '.txt': 'text/plain',
  '.pdf': 'application/pdf',
  ...OOXML_MAP,
};
|
|
39
|
+
|
|
40
|
+
/** Signature at the start of a PDF file: the ASCII bytes of "%PDF-". */
const PDF_MAGIC = Uint8Array.of(0x25, 0x50, 0x44, 0x46, 0x2d);

/** Local-file-header signature of a ZIP archive (also used by OOXML): "PK" 0x03 0x04. */
const ZIP_MAGIC = Uint8Array.of(0x50, 0x4b, 0x03, 0x04);
|
|
45
|
+
|
|
46
|
+
/**
 * Test whether `bytes` begins with every byte of `prefix`, in order.
 *
 * @param bytes - Buffer to inspect.
 * @param prefix - Expected leading bytes; an empty prefix always matches.
 * @returns `true` when `bytes` is at least as long as `prefix` and their
 *          leading bytes are equal.
 */
function startsWith(bytes: Uint8Array, prefix: Uint8Array): boolean {
  return (
    bytes.length >= prefix.length &&
    prefix.every((expected, index) => bytes[index] === expected)
  );
}
|
|
60
|
+
|
|
61
|
+
/** Result of inspecting a file's leading bytes. */
interface SniffResult {
  /** MIME type implied by the signature. */
  mime: string;
  /** True if sniff alone is sufficient (e.g., PDF); false if ext-assisted (OOXML) */
  pureSniff: boolean;
}

/**
 * Identify a file from its leading magic bytes.
 *
 * - PDF signature → application/pdf (pure sniff)
 * - ZIP signature with an extension listed in OOXML_MAP → that OOXML type
 *   (extension-assisted, so `pureSniff` is false)
 * - ZIP signature otherwise → application/zip (pure sniff)
 *
 * @param bytes - Leading bytes of the file.
 * @param ext - File extension, used only to refine ZIP containers.
 * @returns The sniff result, or `undefined` when no signature matches.
 */
function sniffMagicBytes(
  bytes: Uint8Array,
  ext: string
): SniffResult | undefined {
  if (startsWith(bytes, PDF_MAGIC)) {
    return { mime: 'application/pdf', pureSniff: true };
  }

  if (!startsWith(bytes, ZIP_MAGIC)) {
    return undefined;
  }

  // A ZIP container: only the extension can tell OOXML from a plain archive.
  // Object.hasOwn keeps inherited keys from being treated as extensions.
  const ooxmlMime = Object.hasOwn(OOXML_MAP, ext) ? OOXML_MAP[ext] : undefined;
  return ooxmlMime
    ? { mime: ooxmlMime, pureSniff: false }
    : { mime: 'application/zip', pureSniff: true };
}
|
|
95
|
+
|
|
96
|
+
/**
 * Standard MimeDetector.
 *
 * Resolution order:
 *  1. Magic bytes alone → confidence 'high', via 'sniff'
 *  2. Magic bytes refined by extension (OOXML inside ZIP) → 'medium', 'sniff+ext'
 *  3. Extension table → 'medium', 'ext'
 *  4. application/octet-stream → 'low', 'fallback'
 */
export class DefaultMimeDetector implements MimeDetector {
  /**
   * @param path - Path of the file; only its extension is used.
   * @param bytes - File content; at most the first 512 bytes are inspected.
   * @returns The detection result; its `ext` is the value returned by `extname(path)`.
   */
  detect(path: string, bytes: Uint8Array): MimeDetection {
    const ext = extname(path);

    // subarray creates a view over the same memory rather than a copy.
    const sniffed = sniffMagicBytes(bytes.subarray(0, 512), ext);
    if (sniffed?.pureSniff) {
      return { mime: sniffed.mime, ext, confidence: 'high', via: 'sniff' };
    }
    if (sniffed) {
      return {
        mime: sniffed.mime,
        ext,
        confidence: 'medium',
        via: 'sniff+ext',
      };
    }

    // No signature matched: consult the extension table (own keys only).
    const extMime = Object.hasOwn(EXTENSION_MAP, ext)
      ? EXTENSION_MAP[ext]
      : undefined;
    if (extMime) {
      return { mime: extMime, ext, confidence: 'medium', via: 'ext' };
    }

    return {
      mime: 'application/octet-stream',
      ext,
      confidence: 'low',
      via: 'fallback',
    };
  }
}
|
|
145
|
+
|
|
146
|
+
/** Lazily created shared detector; stays null until first requested. */
let defaultDetector: MimeDetector | null = null;

/**
 * Return the shared MimeDetector, creating a DefaultMimeDetector on the
 * first call and reusing it afterwards.
 */
export function getDefaultMimeDetector(): MimeDetector {
  defaultDetector ??= new DefaultMimeDetector();
  return defaultDetector;
}
|
|
155
|
+
|
|
156
|
+
/** Extensions that can be converted: the keys of EXTENSION_MAP. */
export const SUPPORTED_EXTENSIONS = Object.keys(EXTENSION_MAP);

/**
 * Case-insensitive check that an extension is present in EXTENSION_MAP.
 * Only own keys are considered, so inherited property names such as
 * "constructor" never count as supported.
 *
 * @param ext - Extension to test, compared after lower-casing.
 * @returns `true` when the extension is supported for conversion.
 */
export function isSupportedExtension(ext: string): boolean {
  return Object.hasOwn(EXTENSION_MAP, ext.toLowerCase());
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Native Markdown converter (passthrough).
|
|
3
|
+
* Simply reads .md files and extracts title from first heading.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import type { Converter, ConvertInput, ConvertResult } from '../types';
|
|
7
|
+
import { NATIVE_VERSIONS } from '../versions';
|
|
8
|
+
|
|
9
|
+
/** Identifier reported by this converter in its output metadata. */
const CONVERTER_ID = 'native/markdown' as const;

/** Version reported alongside the identifier, taken from NATIVE_VERSIONS. */
const CONVERTER_VERSION = NATIVE_VERSIONS.markdown;

/** Byte order mark as it appears in decoded text (U+FEFF). */
const BOM = '\uFEFF';
|
|
14
|
+
|
|
15
|
+
/**
 * Matches a level-1 ATX heading ("# Title"), tolerating leading whitespace.
 * Capture group 1 is the heading text.
 */
const HEADING_PATTERN = /^\s*#\s+(.+)/;

/**
 * Matches a code-fence opener at the start of the tested string.
 * Capture group 1 is the run of fence characters only (3+ backticks or 3+
 * tildes); any info string after it is not captured.
 */
const CODE_FENCE_START = /^(`{3,}|~{3,})/;

/**
 * Check if a line closes a code fence.
 * After trimming surrounding whitespace the line must consist solely of the
 * opening fence character and be at least as long as the opening fence.
 *
 * @param line - Candidate line.
 * @param fenceChar - Character that opened the fence ("`" or "~").
 * @param fenceLen - Length of the opening fence.
 */
function isClosingFence(
  line: string,
  fenceChar: string,
  fenceLen: number
): boolean {
  const trimmed = line.trim();
  if (trimmed.length < fenceLen) {
    return false;
  }
  // A closing fence carries no info string: every character must be the fence char.
  for (const char of trimmed) {
    if (char !== fenceChar) {
      return false;
    }
  }
  return true;
}

/**
 * Extract the title from the first "# " heading, ignoring anything inside
 * fenced code blocks.
 *
 * Fence openers are recognised regardless of indentation, matching how
 * headings and fence closers are already treated here. Previously only
 * column-0 openers were detected, so a "# comment" inside an indented fenced
 * block (for example under a list item) was returned as the title.
 *
 * @param markdown - Markdown text, split on "\n".
 * @returns The trimmed heading text, or `undefined` when none is found.
 */
function extractFirstHeading(markdown: string): string | undefined {
  let fenceChar = '';
  let fenceLen = 0; // 0 means "not inside a fenced block"

  for (const line of markdown.split('\n')) {
    // Inside a fence: skip the line, leaving the fence if this line closes it.
    if (fenceLen > 0) {
      if (isClosingFence(line, fenceChar, fenceLen)) {
        fenceChar = '';
        fenceLen = 0;
      }
      continue;
    }

    // Fence opener, at any indentation.
    const fenceMatch = line.trimStart().match(CODE_FENCE_START);
    if (fenceMatch?.[1]) {
      fenceChar = fenceMatch[1].charAt(0);
      fenceLen = fenceMatch[1].length;
      continue;
    }

    // First heading outside any fence wins.
    const headingMatch = line.match(HEADING_PATTERN);
    if (headingMatch?.[1]) {
      return headingMatch[1].trim();
    }
  }

  return undefined;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Passthrough converter for Markdown sources.
 * The decoded text is returned unchanged apart from BOM removal, and the
 * title is taken from the first heading found by `extractFirstHeading`.
 */
export const markdownConverter: Converter = {
  id: CONVERTER_ID,
  version: CONVERTER_VERSION,

  /** Accepts input whose MIME type is text/markdown or whose extension is ".md". */
  canHandle(mime: string, ext: string): boolean {
    return mime === 'text/markdown' || ext === '.md';
  },

  /**
   * Decode the bytes as UTF-8 and return them as markdown.
   * Decoding is non-fatal, so malformed byte sequences do not throw.
   * No canonicalization is done here; per the original note, pipeline.ts
   * owns all normalization.
   */
  convert(input: ConvertInput): Promise<ConvertResult> {
    const decoded = new TextDecoder('utf-8', { fatal: false }).decode(
      input.bytes
    );

    // Drop a leading BOM so it never reaches title extraction or hashing.
    const text = decoded.startsWith(BOM) ? decoded.slice(1) : decoded;

    return Promise.resolve({
      ok: true,
      value: {
        markdown: text,
        title: extractFirstHeading(text),
        meta: {
          converterId: CONVERTER_ID,
          converterVersion: CONVERTER_VERSION,
          sourceMime: input.mime,
        },
      },
    });
  },
};
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Native plaintext converter.
|
|
3
|
+
* Converts .txt files to markdown (passthrough as paragraphs).
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { basenameWithoutExt } from '../path';
|
|
7
|
+
import type { Converter, ConvertInput, ConvertResult } from '../types';
|
|
8
|
+
import { NATIVE_VERSIONS } from '../versions';
|
|
9
|
+
|
|
10
|
+
/** Identifier reported by this converter in its output metadata. */
const CONVERTER_ID = 'native/plaintext' as const;

/** Version reported alongside the identifier, taken from NATIVE_VERSIONS. */
const CONVERTER_VERSION = NATIVE_VERSIONS.plaintext;

/** Byte order mark as it appears in decoded text (U+FEFF). */
const BOM = '\uFEFF';

/**
 * Converter for plain-text sources.
 * The decoded text is passed through as markdown paragraphs (no code-fence
 * wrapping) and the title is derived from the file name.
 */
export const plaintextConverter: Converter = {
  id: CONVERTER_ID,
  version: CONVERTER_VERSION,

  /** Accepts input whose MIME type is text/plain or whose extension is ".txt". */
  canHandle(mime: string, ext: string): boolean {
    return mime === 'text/plain' || ext === '.txt';
  },

  /**
   * Decode the bytes as UTF-8 and return them as markdown.
   * No canonicalization is done here; per the original note, pipeline.ts
   * owns all normalization.
   */
  convert(input: ConvertInput): Promise<ConvertResult> {
    // fatal: false - malformed byte sequences do not throw.
    // ignoreBOM: false - per the Encoding standard the decoder itself drops
    // a leading BOM, so the explicit check below is a defensive second pass.
    const decoded = new TextDecoder('utf-8', {
      fatal: false,
      ignoreBOM: false,
    }).decode(input.bytes);

    const text = decoded.startsWith(BOM) ? decoded.slice(1) : decoded;

    return Promise.resolve({
      ok: true,
      value: {
        markdown: text,
        // Title is the file name without its extension.
        title: basenameWithoutExt(input.relativePath),
        meta: {
          converterId: CONVERTER_ID,
          converterVersion: CONVERTER_VERSION,
          sourceMime: input.mime,
        },
      },
    });
  },
};
|