@gmickel/gno 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +256 -0
  2. package/assets/skill/SKILL.md +112 -0
  3. package/assets/skill/cli-reference.md +327 -0
  4. package/assets/skill/examples.md +234 -0
  5. package/assets/skill/mcp-reference.md +159 -0
  6. package/package.json +90 -0
  7. package/src/app/constants.ts +313 -0
  8. package/src/cli/colors.ts +65 -0
  9. package/src/cli/commands/ask.ts +545 -0
  10. package/src/cli/commands/cleanup.ts +105 -0
  11. package/src/cli/commands/collection/add.ts +120 -0
  12. package/src/cli/commands/collection/index.ts +10 -0
  13. package/src/cli/commands/collection/list.ts +108 -0
  14. package/src/cli/commands/collection/remove.ts +64 -0
  15. package/src/cli/commands/collection/rename.ts +95 -0
  16. package/src/cli/commands/context/add.ts +67 -0
  17. package/src/cli/commands/context/check.ts +153 -0
  18. package/src/cli/commands/context/index.ts +10 -0
  19. package/src/cli/commands/context/list.ts +109 -0
  20. package/src/cli/commands/context/rm.ts +52 -0
  21. package/src/cli/commands/doctor.ts +393 -0
  22. package/src/cli/commands/embed.ts +462 -0
  23. package/src/cli/commands/get.ts +356 -0
  24. package/src/cli/commands/index-cmd.ts +119 -0
  25. package/src/cli/commands/index.ts +102 -0
  26. package/src/cli/commands/init.ts +328 -0
  27. package/src/cli/commands/ls.ts +217 -0
  28. package/src/cli/commands/mcp/config.ts +300 -0
  29. package/src/cli/commands/mcp/index.ts +24 -0
  30. package/src/cli/commands/mcp/install.ts +203 -0
  31. package/src/cli/commands/mcp/paths.ts +470 -0
  32. package/src/cli/commands/mcp/status.ts +222 -0
  33. package/src/cli/commands/mcp/uninstall.ts +158 -0
  34. package/src/cli/commands/mcp.ts +20 -0
  35. package/src/cli/commands/models/clear.ts +103 -0
  36. package/src/cli/commands/models/index.ts +32 -0
  37. package/src/cli/commands/models/list.ts +214 -0
  38. package/src/cli/commands/models/path.ts +51 -0
  39. package/src/cli/commands/models/pull.ts +199 -0
  40. package/src/cli/commands/models/use.ts +85 -0
  41. package/src/cli/commands/multi-get.ts +400 -0
  42. package/src/cli/commands/query.ts +220 -0
  43. package/src/cli/commands/ref-parser.ts +108 -0
  44. package/src/cli/commands/reset.ts +191 -0
  45. package/src/cli/commands/search.ts +136 -0
  46. package/src/cli/commands/shared.ts +156 -0
  47. package/src/cli/commands/skill/index.ts +19 -0
  48. package/src/cli/commands/skill/install.ts +197 -0
  49. package/src/cli/commands/skill/paths-cmd.ts +81 -0
  50. package/src/cli/commands/skill/paths.ts +191 -0
  51. package/src/cli/commands/skill/show.ts +73 -0
  52. package/src/cli/commands/skill/uninstall.ts +141 -0
  53. package/src/cli/commands/status.ts +205 -0
  54. package/src/cli/commands/update.ts +68 -0
  55. package/src/cli/commands/vsearch.ts +188 -0
  56. package/src/cli/context.ts +64 -0
  57. package/src/cli/errors.ts +64 -0
  58. package/src/cli/format/search-results.ts +211 -0
  59. package/src/cli/options.ts +183 -0
  60. package/src/cli/program.ts +1330 -0
  61. package/src/cli/run.ts +213 -0
  62. package/src/cli/ui.ts +92 -0
  63. package/src/config/defaults.ts +20 -0
  64. package/src/config/index.ts +55 -0
  65. package/src/config/loader.ts +161 -0
  66. package/src/config/paths.ts +87 -0
  67. package/src/config/saver.ts +153 -0
  68. package/src/config/types.ts +280 -0
  69. package/src/converters/adapters/markitdownTs/adapter.ts +140 -0
  70. package/src/converters/adapters/officeparser/adapter.ts +126 -0
  71. package/src/converters/canonicalize.ts +89 -0
  72. package/src/converters/errors.ts +218 -0
  73. package/src/converters/index.ts +51 -0
  74. package/src/converters/mime.ts +163 -0
  75. package/src/converters/native/markdown.ts +115 -0
  76. package/src/converters/native/plaintext.ts +56 -0
  77. package/src/converters/path.ts +48 -0
  78. package/src/converters/pipeline.ts +159 -0
  79. package/src/converters/registry.ts +74 -0
  80. package/src/converters/types.ts +123 -0
  81. package/src/converters/versions.ts +24 -0
  82. package/src/index.ts +27 -0
  83. package/src/ingestion/chunker.ts +238 -0
  84. package/src/ingestion/index.ts +32 -0
  85. package/src/ingestion/language.ts +276 -0
  86. package/src/ingestion/sync.ts +671 -0
  87. package/src/ingestion/types.ts +219 -0
  88. package/src/ingestion/walker.ts +235 -0
  89. package/src/llm/cache.ts +467 -0
  90. package/src/llm/errors.ts +191 -0
  91. package/src/llm/index.ts +58 -0
  92. package/src/llm/nodeLlamaCpp/adapter.ts +133 -0
  93. package/src/llm/nodeLlamaCpp/embedding.ts +165 -0
  94. package/src/llm/nodeLlamaCpp/generation.ts +88 -0
  95. package/src/llm/nodeLlamaCpp/lifecycle.ts +317 -0
  96. package/src/llm/nodeLlamaCpp/rerank.ts +94 -0
  97. package/src/llm/registry.ts +86 -0
  98. package/src/llm/types.ts +129 -0
  99. package/src/mcp/resources/index.ts +151 -0
  100. package/src/mcp/server.ts +229 -0
  101. package/src/mcp/tools/get.ts +220 -0
  102. package/src/mcp/tools/index.ts +160 -0
  103. package/src/mcp/tools/multi-get.ts +263 -0
  104. package/src/mcp/tools/query.ts +226 -0
  105. package/src/mcp/tools/search.ts +119 -0
  106. package/src/mcp/tools/status.ts +81 -0
  107. package/src/mcp/tools/vsearch.ts +198 -0
  108. package/src/pipeline/chunk-lookup.ts +44 -0
  109. package/src/pipeline/expansion.ts +256 -0
  110. package/src/pipeline/explain.ts +115 -0
  111. package/src/pipeline/fusion.ts +185 -0
  112. package/src/pipeline/hybrid.ts +535 -0
  113. package/src/pipeline/index.ts +64 -0
  114. package/src/pipeline/query-language.ts +118 -0
  115. package/src/pipeline/rerank.ts +223 -0
  116. package/src/pipeline/search.ts +261 -0
  117. package/src/pipeline/types.ts +328 -0
  118. package/src/pipeline/vsearch.ts +348 -0
  119. package/src/store/index.ts +41 -0
  120. package/src/store/migrations/001-initial.ts +196 -0
  121. package/src/store/migrations/index.ts +20 -0
  122. package/src/store/migrations/runner.ts +187 -0
  123. package/src/store/sqlite/adapter.ts +1242 -0
  124. package/src/store/sqlite/index.ts +7 -0
  125. package/src/store/sqlite/setup.ts +129 -0
  126. package/src/store/sqlite/types.ts +28 -0
  127. package/src/store/types.ts +506 -0
  128. package/src/store/vector/index.ts +13 -0
  129. package/src/store/vector/sqlite-vec.ts +373 -0
  130. package/src/store/vector/stats.ts +152 -0
  131. package/src/store/vector/types.ts +115 -0
@@ -0,0 +1,153 @@
1
+ /**
2
+ * Config saving with atomic writes.
3
+ * Writes config to temp file, then renames to target (atomic on POSIX).
4
+ *
5
+ * @module src/config/saver
6
+ */
7
+
8
+ import { mkdir, rename, unlink } from 'node:fs/promises';
9
+ import { dirname, join } from 'node:path';
10
+ import { expandPath, getConfigPaths } from './paths';
11
+ import { type Config, ConfigSchema } from './types';
12
+
13
+ // ─────────────────────────────────────────────────────────────────────────────
14
+ // Result Types
15
+ // ─────────────────────────────────────────────────────────────────────────────
16
+
17
/**
 * Reason a save operation failed.
 * - `VALIDATION_ERROR`: the config was rejected before anything was written.
 * - `IO_ERROR`: a filesystem step failed; `cause` carries the underlying error.
 */
export type SaveError =
  | { code: 'VALIDATION_ERROR'; message: string }
  | { code: 'IO_ERROR'; message: string; cause: Error };

/**
 * Outcome of a save operation: the affected path on success,
 * or a {@link SaveError} describing the failure.
 */
export type SaveResult =
  | { ok: true; path: string }
  | { ok: false; error: SaveError };
24
+
25
+ // ─────────────────────────────────────────────────────────────────────────────
26
+ // Saving Functions
27
+ // ─────────────────────────────────────────────────────────────────────────────
28
+
29
/**
 * Persist `config` to disk.
 *
 * @param config - Configuration to write.
 * @param configPath - Optional destination. When given it is passed through
 *   `expandPath`; otherwise the `configFile` from `getConfigPaths()` is used.
 * @returns Whatever `saveConfigToPath` reports for the resolved destination.
 */
export function saveConfig(
  config: Config,
  configPath?: string
): Promise<SaveResult> {
  const defaults = getConfigPaths();

  if (configPath) {
    return saveConfigToPath(config, expandPath(configPath));
  }
  return saveConfigToPath(config, defaults.configFile);
}
42
+
43
/**
 * Validate `config` and write it to `filePath` as YAML.
 *
 * Steps:
 * 1. Validate against `ConfigSchema`; on failure return `VALIDATION_ERROR`
 *    carrying the first issue's message.
 * 2. Create the parent directory (recursively).
 * 3. Write the YAML to a uniquely named temp file in that same directory.
 * 4. Rename the temp file over `filePath`.
 *
 * The destination is never deleted up front: `rename` replaces an existing
 * file in a single step, so a failed save leaves the previous config intact.
 * The temp file is removed (best effort) whenever the write or rename fails.
 *
 * @param config - Configuration to persist.
 * @param filePath - Destination file path.
 * @returns `{ ok: true, path }` on success, otherwise a `SaveError`.
 */
export async function saveConfigToPath(
  config: Config,
  filePath: string
): Promise<SaveResult> {
  // Refuse to persist anything the schema rejects.
  const validation = ConfigSchema.safeParse(config);
  if (!validation.success) {
    return {
      ok: false,
      error: {
        code: 'VALIDATION_ERROR',
        message: `Invalid config: ${validation.error.issues[0]?.message ?? 'unknown error'}`,
      },
    };
  }

  const yamlContent = Bun.YAML.stringify(config);

  // Ensure parent directory exists
  const dir = dirname(filePath);
  try {
    await mkdir(dir, { recursive: true });
  } catch (cause) {
    return saveIoFailure(`Failed to create config directory: ${dir}`, cause);
  }

  // Temp file lives next to the target so the final rename stays on one
  // filesystem. Timestamp + random suffix avoids name collisions.
  const tempPath = join(
    dir,
    `.index.yml.${Date.now()}.${Math.random().toString(36).slice(2, 8)}.tmp`
  );

  try {
    await Bun.write(tempPath, yamlContent);
  } catch (cause) {
    // A partially written temp file may exist; do not leave it behind.
    await removeQuietly(tempPath);
    return saveIoFailure(`Failed to write temp config file: ${tempPath}`, cause);
  }

  try {
    await renameOverExisting(tempPath, filePath);
  } catch (cause) {
    await removeQuietly(tempPath);
    return saveIoFailure(`Failed to save config file: ${filePath}`, cause);
  }

  return { ok: true, path: filePath };
}

/** Build an `IO_ERROR` result, normalising a non-Error throwable into an Error. */
function saveIoFailure(message: string, cause: unknown): SaveResult {
  return {
    ok: false,
    error: {
      code: 'IO_ERROR',
      message,
      cause: cause instanceof Error ? cause : new Error(String(cause)),
    },
  };
}

/** Delete `path`, ignoring any error (e.g. the file does not exist). */
async function removeQuietly(path: string): Promise<void> {
  await unlink(path).catch(() => {
    /* best-effort cleanup */
  });
}

/**
 * Move `from` onto `to`, replacing an existing `to`.
 * A plain rename is tried first. Only if that fails on Windows is the
 * destination removed and the rename retried, so the unlink window exists
 * solely on the platform/path that needs it.
 */
async function renameOverExisting(from: string, to: string): Promise<void> {
  try {
    await rename(from, to);
  } catch (err) {
    if (process.platform !== 'win32') {
      throw err;
    }
    await removeQuietly(to);
    await rename(from, to);
  }
}
129
+
130
/**
 * Make sure the config, data, and cache directories exist.
 *
 * The directories come from `getConfigPaths()` and are created one after
 * another with `recursive: true`.
 *
 * @returns `{ ok: true, path: configDir }` when all three calls succeed,
 *   otherwise an `IO_ERROR` wrapping the first failure.
 */
export async function ensureDirectories(): Promise<SaveResult> {
  const { configDir, dataDir, cacheDir } = getConfigPaths();

  try {
    for (const dir of [configDir, dataDir, cacheDir]) {
      await mkdir(dir, { recursive: true });
    }
  } catch (cause) {
    const wrapped = cause instanceof Error ? cause : new Error(String(cause));
    return {
      ok: false,
      error: {
        code: 'IO_ERROR',
        message: 'Failed to create GNO directories',
        cause: wrapped,
      },
    };
  }

  return { ok: true, path: configDir };
}
@@ -0,0 +1,280 @@
1
+ /**
2
+ * Config schema definitions using Zod.
3
+ * Defines Collection, Context, and Config types for GNO.
4
+ *
5
+ * @module src/config/types
6
+ */
7
+
8
+ import { z } from 'zod';
9
+ import { URI_PREFIX } from '../app/constants';
10
+
11
+ // ─────────────────────────────────────────────────────────────────────────────
12
+ // Constants
13
+ // ─────────────────────────────────────────────────────────────────────────────
14
+
15
/** Config schema version; the root config schema only accepts this literal. */
export const CONFIG_VERSION = '1.0';

/** Glob applied to a collection that does not set its own `pattern`. */
export const DEFAULT_PATTERN = '**/*';

/** Exclude entries applied to a collection that does not set its own `exclude`. */
export const DEFAULT_EXCLUDES: ReadonlyArray<string> = [
  '.git',
  'node_modules',
  '.venv',
  '.idea',
  'dist',
  'build',
  '__pycache__',
  '.DS_Store',
  'Thumbs.db',
];

/** FTS tokenizer names accepted in config. */
export const FTS_TOKENIZERS = ['unicode61', 'porter', 'trigram'] as const;

/** Union of the names listed in {@link FTS_TOKENIZERS}. */
export type FtsTokenizer = (typeof FTS_TOKENIZERS)[number];

/** Tokenizer used when the config does not name one. */
export const DEFAULT_FTS_TOKENIZER: FtsTokenizer = 'unicode61';
40
+
41
/**
 * BCP-47 language tag pattern (simplified, case-insensitive).
 *
 * Shape: `language[-script][-region]`
 * - language: 2-3 letters (en, de, und)
 * - script:   4 letters (Hans, Latn)
 * - region:   2 letters or 3 digits (US, CN, 419)
 *
 * Matches: en, de, zh-CN, zh-Hans, zh-Hans-CN, es-419, und, en-US, etc.
 * Variants, extensions, and private-use subtags are intentionally not covered.
 */
const BCP47_PATTERN = /^[a-z]{2,3}(-[a-z]{4})?(-(?:[a-z]{2}|\d{3}))?$/i;

/**
 * Check whether `hint` looks like a supported BCP-47 language tag.
 *
 * @param hint - Candidate language tag.
 * @returns `true` when `hint` matches the simplified pattern above.
 */
export function isValidLanguageHint(hint: string): boolean {
  return BCP47_PATTERN.test(hint);
}
51
+
52
+ // ─────────────────────────────────────────────────────────────────────────────
53
+ // Collection Schema
54
+ // ─────────────────────────────────────────────────────────────────────────────
55
+
56
/**
 * Collection names: lowercase letters, digits, hyphens, underscores.
 * 1-64 characters, and the first one must be a letter or digit.
 */
const COLLECTION_NAME_REGEX = /^[a-z0-9][a-z0-9_-]{0,63}$/;

/** Collection scope keys: a collection name (1-64 chars) directly followed by ":". */
const COLLECTION_SCOPE_REGEX = /^[a-z0-9][a-z0-9_-]{0,63}:$/;

/** Field schema for `Collection.name`. */
const collectionNameField = z
  .string()
  .regex(
    COLLECTION_NAME_REGEX,
    'Collection name must be lowercase alphanumeric with hyphens/underscores, 1-64 chars'
  );

/** Field schema for the optional `Collection.languageHint`. */
const languageHintField = z
  .string()
  .refine((hint) => isValidLanguageHint(hint), {
    message: 'Invalid BCP-47 language code (e.g., en, de, zh-CN, und)',
  })
  .optional();

/** Schema for a single collection entry in the config. */
export const CollectionSchema = z.object({
  /** Unique collection identifier (lowercase) */
  name: collectionNameField,

  /** Absolute path to collection root */
  path: z.string().min(1, 'Path is required'),

  /** Glob pattern for file matching; defaults to DEFAULT_PATTERN */
  pattern: z.string().default(DEFAULT_PATTERN),

  /** Extension allowlist (empty = all) */
  include: z.array(z.string()).default([]),

  /** Path patterns to skip; defaults to a copy of DEFAULT_EXCLUDES */
  exclude: z.array(z.string()).default([...DEFAULT_EXCLUDES]),

  /** Optional shell command to run before indexing */
  updateCmd: z.string().optional(),

  /** Optional BCP-47 language hint */
  languageHint: languageHintField,
});

/** A validated collection entry. */
export type Collection = z.infer<typeof CollectionSchema>;
99
+
100
+ // ─────────────────────────────────────────────────────────────────────────────
101
+ // Context Schema
102
+ // ─────────────────────────────────────────────────────────────────────────────
103
+
104
/**
 * Kinds of context scope:
 * - global: "/" - applies to all documents
 * - collection: "name:" - applies to a specific collection
 * - prefix: "gno://collection/path" - applies to documents under a path
 */
export const ScopeTypeSchema = z.enum(['global', 'collection', 'prefix']);
export type ScopeType = z.infer<typeof ScopeTypeSchema>;

/**
 * Report whether `scopeKey` has the shape required by `scopeType`.
 * - global: exactly "/"
 * - collection: matches COLLECTION_SCOPE_REGEX ("name:")
 * - prefix: starts with URI_PREFIX
 * Any other scope type yields `false`.
 */
function scopeKeyMatchesType(scopeType: ScopeType, scopeKey: string): boolean {
  if (scopeType === 'global') {
    return scopeKey === '/';
  }
  if (scopeType === 'collection') {
    return COLLECTION_SCOPE_REGEX.test(scopeKey);
  }
  if (scopeType === 'prefix') {
    return scopeKey.startsWith(URI_PREFIX);
  }
  return false;
}

/**
 * Schema for a context entry. After the individual fields are checked, the
 * scope key is checked against the format its scope type requires.
 */
export const ContextSchema = z
  .object({
    /** Type of scope */
    scopeType: ScopeTypeSchema,

    /** Scope key (format depends on scopeType) */
    scopeKey: z.string().min(1, 'Scope key is required'),

    /** Context description text */
    text: z.string().min(1, 'Context text is required'),
  })
  .refine((ctx) => scopeKeyMatchesType(ctx.scopeType, ctx.scopeKey), {
    message: 'Scope key format does not match scope type',
  });

/** A validated context entry. */
export type Context = z.infer<typeof ContextSchema>;
149
+
150
+ // ─────────────────────────────────────────────────────────────────────────────
151
+ // Model Preset Schema
152
+ // ─────────────────────────────────────────────────────────────────────────────
153
+
154
/** Fresh schema for a string field that must not be empty. */
const requiredText = () => z.string().min(1);

/** Schema for one model preset: an id, a display name, and three model URIs. */
export const ModelPresetSchema = z.object({
  /** Unique preset identifier */
  id: requiredText(),
  /** Human-readable name */
  name: requiredText(),
  /** Embedding model URI (hf: or file:) */
  embed: requiredText(),
  /** Reranker model URI */
  rerank: requiredText(),
  /** Generation model URI */
  gen: requiredText(),
});

/** A validated model preset. */
export type ModelPreset = z.infer<typeof ModelPresetSchema>;
168
+
169
/** Embedding model shared by every built-in preset. */
const BUILTIN_EMBED_MODEL = 'hf:gpustack/bge-m3-GGUF/bge-m3-Q4_K_M.gguf';

/** Reranker model shared by every built-in preset. */
const BUILTIN_RERANK_MODEL =
  'hf:gpustack/bge-reranker-v2-m3-GGUF/bge-reranker-v2-m3-Q4_K_M.gguf';

/**
 * Built-in model presets. They share the embedding and reranker models and
 * differ only in the generation model.
 */
export const DEFAULT_MODEL_PRESETS: ModelPreset[] = [
  {
    id: 'slim',
    name: 'Slim (Fast, ~1GB)',
    embed: BUILTIN_EMBED_MODEL,
    rerank: BUILTIN_RERANK_MODEL,
    gen: 'hf:unsloth/Qwen3-1.7B-GGUF/Qwen3-1.7B-Q4_K_M.gguf',
  },
  {
    id: 'balanced',
    name: 'Balanced (Default, ~2GB)',
    embed: BUILTIN_EMBED_MODEL,
    rerank: BUILTIN_RERANK_MODEL,
    gen: 'hf:ggml-org/SmolLM3-3B-GGUF/SmolLM3-Q4_K_M.gguf',
  },
  {
    id: 'quality',
    name: 'Quality (Best Answers, ~2.5GB)',
    embed: BUILTIN_EMBED_MODEL,
    rerank: BUILTIN_RERANK_MODEL,
    gen: 'hf:unsloth/Qwen3-4B-Instruct-2507-GGUF/Qwen3-4B-Instruct-2507-Q4_K_M.gguf',
  },
];
196
+
197
/** Default model load timeout: 60 seconds, in ms. */
const DEFAULT_LOAD_TIMEOUT_MS = 60_000;
/** Default inference timeout: 30 seconds, in ms. */
const DEFAULT_INFERENCE_TIMEOUT_MS = 30_000;
/** Default keep-warm TTL: 5 minutes, in ms. */
const DEFAULT_WARM_MODEL_TTL_MS = 300_000;

/** Schema for the `models` section of the config; every field has a default. */
export const ModelConfigSchema = z.object({
  /** Active preset ID */
  activePreset: z.string().default('balanced'),
  /** Model presets; defaults to DEFAULT_MODEL_PRESETS */
  presets: z.array(ModelPresetSchema).default(DEFAULT_MODEL_PRESETS),
  /** Model load timeout in ms */
  loadTimeout: z.number().default(DEFAULT_LOAD_TIMEOUT_MS),
  /** Inference timeout in ms */
  inferenceTimeout: z.number().default(DEFAULT_INFERENCE_TIMEOUT_MS),
  /** Keep warm model TTL in ms */
  warmModelTtl: z.number().default(DEFAULT_WARM_MODEL_TTL_MS),
});

/** A validated `models` config section. */
export type ModelConfig = z.infer<typeof ModelConfigSchema>;
211
+
212
+ // ─────────────────────────────────────────────────────────────────────────────
213
+ // Config Schema (root)
214
+ // ─────────────────────────────────────────────────────────────────────────────
215
+
216
/**
 * Root config schema.
 * `version` must equal CONFIG_VERSION; `models` is optional; the remaining
 * fields fall back to defaults when omitted.
 */
export const ConfigSchema = z.object({
  /** Config schema version */
  version: z.literal(CONFIG_VERSION),

  /** FTS tokenizer (immutable after init); defaults to DEFAULT_FTS_TOKENIZER */
  ftsTokenizer: z.enum(FTS_TOKENIZERS).default(DEFAULT_FTS_TOKENIZER),

  /** Collection definitions; defaults to an empty list */
  collections: z.array(CollectionSchema).default([]),

  /** Context metadata; defaults to an empty list */
  contexts: z.array(ContextSchema).default([]),

  /** Model configuration */
  models: ModelConfigSchema.optional(),
});

/** A validated root config. */
export type Config = z.infer<typeof ConfigSchema>;
234
+
235
+ // ─────────────────────────────────────────────────────────────────────────────
236
+ // Scope Utilities
237
+ // ─────────────────────────────────────────────────────────────────────────────
238
+
239
/**
 * Classify a scope string (as given on the CLI) and return it with its type.
 *
 * Checks run in this order:
 * - "/" -> { type: 'global', key: '/' }
 * - starts with URI_PREFIX, e.g. "gno://notes/projects" -> { type: 'prefix', key: scope }
 * - matches the collection scope pattern, e.g. "notes:" -> { type: 'collection', key: scope }
 *
 * @param scope - Raw scope string.
 * @returns The scope type with the unchanged key, or `null` when nothing matches.
 */
export function parseScope(
  scope: string
): { type: ScopeType; key: string } | null {
  let type: ScopeType | null = null;

  if (scope === '/') {
    type = 'global';
  } else if (scope.startsWith(URI_PREFIX)) {
    type = 'prefix';
  } else if (COLLECTION_SCOPE_REGEX.test(scope)) {
    type = 'collection';
  }

  return type === null ? null : { type, key: scope };
}
260
+
261
/**
 * Extract the collection name from a scope key.
 * - "notes:" -> "notes"
 * - "gno://notes/path" -> "notes"
 * - "gno://notes" -> "notes"
 * - "/" -> null
 *
 * URI-prefixed keys are examined before the trailing-colon form, so a prefix
 * scope whose path happens to end in ":" still resolves to its collection.
 * A key that would produce an empty name ("gno://", ":") yields `null`.
 *
 * @param scopeKey - Scope key in global, collection, or prefix form.
 * @returns The collection name, or `null` when none can be extracted.
 */
export function getCollectionFromScope(scopeKey: string): string | null {
  if (scopeKey === '/') {
    return null;
  }

  let name: string | null = null;
  if (scopeKey.startsWith(URI_PREFIX)) {
    // The collection is the first path segment after the prefix.
    const rest = scopeKey.slice(URI_PREFIX.length);
    const slashIndex = rest.indexOf('/');
    name = slashIndex === -1 ? rest : rest.slice(0, slashIndex);
  } else if (scopeKey.endsWith(':')) {
    name = scopeKey.slice(0, -1);
  }

  // An empty string is not a usable collection name; report "none" instead.
  return name ? name : null;
}
@@ -0,0 +1,140 @@
1
+ /**
2
+ * markitdown-ts adapter for PDF, DOCX, XLSX conversion.
3
+ * Uses convertBuffer() with bytes for determinism.
4
+ */
5
+
6
+ import { MarkItDown } from 'markitdown-ts';
7
+ import {
8
+ adapterError,
9
+ corruptError,
10
+ timeoutError,
11
+ tooLargeError,
12
+ } from '../../errors';
13
+ import type {
14
+ Converter,
15
+ ConvertInput,
16
+ ConvertResult,
17
+ ConvertWarning,
18
+ } from '../../types';
19
+ import { ADAPTER_VERSIONS } from '../../versions';
20
+
21
/** Identifier reported by this converter. */
const CONVERTER_ID = 'adapter/markitdown-ts' as const;

/** Version reported by this converter, taken from ADAPTER_VERSIONS. */
const { 'markitdown-ts': CONVERTER_VERSION } = ADAPTER_VERSIONS;

/** File extensions this adapter accepts. */
const SUPPORTED_EXTENSIONS = ['.pdf', '.docx', '.xlsx'];

/** MIME types this adapter accepts. */
const SUPPORTED_MIMES = [
  'application/pdf',
  'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
];
33
+
34
/**
 * Wrap a Uint8Array in a Buffer that shares the same memory (no copy).
 * Because the memory is shared, the caller must not mutate `bytes` afterwards
 * (input.bytes is immutable by contract).
 */
function toBuffer(bytes: Uint8Array): Buffer {
  const { buffer, byteOffset, byteLength } = bytes;
  return Buffer.from(buffer, byteOffset, byteLength);
}
41
+
42
/**
 * Converter backed by markitdown-ts.
 *
 * `convert` never throws: every failure is returned as `{ ok: false, error }`.
 */
export const markitdownAdapter: Converter = {
  id: CONVERTER_ID,
  version: CONVERTER_VERSION,

  /** True when either the extension or the MIME type is in the supported lists. */
  canHandle(mime: string, ext: string): boolean {
    return SUPPORTED_EXTENSIONS.includes(ext) || SUPPORTED_MIMES.includes(mime);
  },

  /**
   * Convert `input.bytes` to Markdown.
   *
   * @param input - Bytes plus metadata and limits (`maxBytes`, `timeoutMs`).
   * @returns The Markdown and metadata on success. On failure: a too-large
   *   error when the input exceeds `maxBytes`, a timeout error when
   *   `timeoutMs` elapses first, a corrupt error when the result has no
   *   markdown, or an adapter error for anything else that throws.
   */
  async convert(input: ConvertInput): Promise<ConvertResult> {
    // 1. Check size limit (defense in depth; EPIC 5 does stat-based pre-check)
    if (input.bytes.length > input.limits.maxBytes) {
      return { ok: false, error: tooLargeError(input, CONVERTER_ID) };
    }

    // 2. Timeout handling
    // Note: markitdown-ts doesn't support AbortSignal, so underlying
    // work may continue after timeout (known limitation; process isolation future work)
    let timeoutId: ReturnType<typeof setTimeout> | undefined;
    let timedOut = false;

    // Everything that can throw sits inside the try so the timer is always
    // cleared in `finally`. A timer left armed after a synchronous failure
    // would later reject a promise that nothing is listening to.
    try {
      const timeoutPromise = new Promise<never>((_, reject) => {
        timeoutId = setTimeout(() => {
          timedOut = true;
          reject(new Error('TIMEOUT'));
        }, input.limits.timeoutMs);
      });

      const converter = new MarkItDown();

      // IMPORTANT: Use convertBuffer with bytes for determinism
      // Path-based convert() could re-read a modified file
      // Zero-copy Buffer view (input.bytes is immutable by contract)
      const workPromise = converter.convertBuffer(toBuffer(input.bytes), {
        file_extension: input.ext,
      });

      // The work cannot be cancelled, so it may reject after the timeout has
      // already won the race. Register a no-op handler now so that a late
      // rejection is never reported as unhandled.
      workPromise.catch(() => {
        // Intentionally swallowed - the race below observes the real outcome
      });

      const result = await Promise.race([workPromise, timeoutPromise]);

      if (!result?.markdown) {
        return {
          ok: false,
          error: corruptError(input, CONVERTER_ID, 'Empty conversion result'),
        };
      }

      // Emit warnings for suspicious output
      const warnings: ConvertWarning[] = [];
      if (result.markdown.length < 10 && input.bytes.length > 1000) {
        warnings.push({ code: 'LOSSY', message: 'Suspiciously short output' });
      }

      // NOTE: Canonicalization happens in pipeline.ts, not here
      return {
        ok: true,
        value: {
          markdown: result.markdown,
          title: result.title ?? undefined,
          meta: {
            converterId: CONVERTER_ID,
            converterVersion: CONVERTER_VERSION,
            sourceMime: input.mime,
            warnings: warnings.length > 0 ? warnings : undefined,
          },
        },
      };
    } catch (err) {
      if (timedOut) {
        return { ok: false, error: timeoutError(input, CONVERTER_ID) };
      }

      // Map adapter errors
      return {
        ok: false,
        error: adapterError(
          input,
          CONVERTER_ID,
          err instanceof Error ? err.message : 'Unknown error',
          err
        ),
      };
    } finally {
      // Runs on success, failure, and timeout alike.
      if (timeoutId !== undefined) {
        clearTimeout(timeoutId);
      }
    }
  },
};
@@ -0,0 +1,126 @@
1
+ /**
2
+ * officeparser adapter for PPTX conversion.
3
+ * Uses parseOfficeAsync() with Buffer for in-memory extraction.
4
+ */
5
+
6
+ import { parseOfficeAsync } from 'officeparser';
7
+ import { adapterError, corruptError, tooLargeError } from '../../errors';
8
+ import { basenameWithoutExt } from '../../path';
9
+ import type {
10
+ Converter,
11
+ ConvertInput,
12
+ ConvertResult,
13
+ ConvertWarning,
14
+ } from '../../types';
15
+ import { ADAPTER_VERSIONS } from '../../versions';
16
+
17
/** Identifier reported by this converter. */
const CONVERTER_ID = 'adapter/officeparser' as const;

/** Version reported by this converter, taken from ADAPTER_VERSIONS. */
const { officeparser: CONVERTER_VERSION } = ADAPTER_VERSIONS;

/** MIME type this adapter accepts (PPTX). */
const PPTX_MIME =
  'application/vnd.openxmlformats-officedocument.presentationml.presentation';
23
+
24
/**
 * Matches every ASCII control character: U+0000-U+001F and U+007F.
 * Assembled from char codes rather than written as a literal to avoid lint issues.
 */
const CONTROL_CHAR_PATTERN = new RegExp(
  '[' +
    String.fromCharCode(0) +
    '-' +
    String.fromCharCode(31) +
    String.fromCharCode(127) +
    ']',
  'g'
);

/**
 * Make a title safe to place on a single Markdown line.
 *
 * CR/LF become spaces, remaining control characters are dropped, runs of
 * whitespace collapse to one space, and the ends are trimmed.
 */
function sanitizeTitle(title: string): string {
  const singleLine = title.replace(/[\r\n]/g, ' ');
  const printable = singleLine.replace(CONTROL_CHAR_PATTERN, '');
  const collapsed = printable.replace(/\s+/g, ' ');
  return collapsed.trim();
}
44
+
45
/**
 * Derive a title from a relative path: take `basenameWithoutExt` of the path
 * and run it through `sanitizeTitle`.
 */
function getTitleFromPath(relativePath: string): string {
  const stem = basenameWithoutExt(relativePath);
  return sanitizeTitle(stem);
}
51
+
52
/**
 * Expose a Uint8Array as a Buffer over the same memory, without copying.
 * The bytes are shared, so `bytes` must not change afterwards
 * (input.bytes is immutable by contract).
 */
function toBuffer(bytes: Uint8Array): Buffer {
  const start = bytes.byteOffset;
  const length = bytes.byteLength;
  return Buffer.from(bytes.buffer, start, length);
}
59
+
60
/**
 * Converter backed by officeparser, used for PPTX.
 *
 * `convert` never throws: every failure is returned as `{ ok: false, error }`.
 */
export const officeparserAdapter: Converter = {
  id: CONVERTER_ID,
  version: CONVERTER_VERSION,

  /** True for the ".pptx" extension or the PPTX MIME type. */
  canHandle(mime: string, ext: string): boolean {
    return ext === '.pptx' || mime === PPTX_MIME;
  },

  /**
   * Extract text from `input.bytes` and wrap it in a minimal Markdown document
   * whose heading is derived from `input.relativePath`.
   *
   * @param input - Bytes plus metadata and limits.
   * @returns The Markdown and metadata on success. On failure: a too-large
   *   error when the input exceeds `maxBytes`, a corrupt error when no text
   *   is extracted, or an adapter error for anything the parser throws.
   */
  async convert(input: ConvertInput): Promise<ConvertResult> {
    // Size check (defense in depth; EPIC 5 does stat-based pre-check)
    if (input.bytes.length > input.limits.maxBytes) {
      return { ok: false, error: tooLargeError(input, CONVERTER_ID) };
    }

    // NOTE(review): input.limits.timeoutMs is not enforced in this adapter.
    try {
      // Zero-copy Buffer view (input.bytes is immutable by contract)
      const buffer = toBuffer(input.bytes);
      const text = await parseOfficeAsync(buffer, {
        newlineDelimiter: '\n',
        ignoreNotes: false, // Include speaker notes
      });

      const extractedLength = text ? text.trim().length : 0;
      if (extractedLength === 0) {
        return {
          ok: false,
          error: corruptError(input, CONVERTER_ID, 'Empty extraction result'),
        };
      }

      // Get sanitized title
      const title = getTitleFromPath(input.relativePath);

      // Convert plain text to Markdown structure
      const markdown = `# ${title}\n\n${text}`;

      // NOTE: Do NOT canonicalize here - pipeline.ts handles all normalization
      // The heuristic looks at the extracted text only. Measuring `markdown`
      // would count the synthesized "# title" heading and hide a near-empty
      // extraction from a large file.
      const warnings: ConvertWarning[] = [];
      if (extractedLength < 10 && input.bytes.length > 1000) {
        warnings.push({ code: 'LOSSY', message: 'Suspiciously short output' });
      }

      return {
        ok: true,
        value: {
          markdown,
          title,
          meta: {
            converterId: CONVERTER_ID,
            converterVersion: CONVERTER_VERSION,
            sourceMime: input.mime,
            warnings: warnings.length > 0 ? warnings : undefined,
          },
        },
      };
    } catch (err) {
      return {
        ok: false,
        error: adapterError(
          input,
          CONVERTER_ID,
          err instanceof Error ? err.message : 'Unknown error',
          err
        ),
      };
    }
  },
};