@joycodetech/qmd-ja 2.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/CHANGELOG.md +819 -0
  2. package/LICENSE +21 -0
  3. package/README.md +1143 -0
  4. package/bin/qmd +162 -0
  5. package/dist/ast.d.ts +65 -0
  6. package/dist/ast.js +334 -0
  7. package/dist/bench/bench.d.ts +23 -0
  8. package/dist/bench/bench.js +280 -0
  9. package/dist/bench/score.d.ts +33 -0
  10. package/dist/bench/score.js +88 -0
  11. package/dist/bench/types.d.ts +80 -0
  12. package/dist/bench/types.js +8 -0
  13. package/dist/cli/formatter.d.ts +120 -0
  14. package/dist/cli/formatter.js +355 -0
  15. package/dist/cli/qmd.d.ts +43 -0
  16. package/dist/cli/qmd.js +4159 -0
  17. package/dist/collections.d.ts +166 -0
  18. package/dist/collections.js +410 -0
  19. package/dist/db.d.ts +44 -0
  20. package/dist/db.js +75 -0
  21. package/dist/index.d.ts +230 -0
  22. package/dist/index.js +242 -0
  23. package/dist/llm.d.ts +500 -0
  24. package/dist/llm.js +1615 -0
  25. package/dist/maintenance.d.ts +23 -0
  26. package/dist/maintenance.js +37 -0
  27. package/dist/mcp/server.d.ts +24 -0
  28. package/dist/mcp/server.js +702 -0
  29. package/dist/paths.d.ts +1 -0
  30. package/dist/paths.js +4 -0
  31. package/dist/store.d.ts +996 -0
  32. package/dist/store.js +4208 -0
  33. package/models/vaporetto-bccwj.model +0 -0
  34. package/package.json +130 -0
  35. package/scripts/build.mjs +30 -0
  36. package/scripts/check-package-grammars.mjs +29 -0
  37. package/scripts/package-smoke.mjs +65 -0
  38. package/scripts/test-all.mjs +38 -0
  39. package/skills/qmd/SKILL.md +295 -0
  40. package/skills/qmd/references/mcp-setup.md +102 -0
  41. package/skills/release/SKILL.md +139 -0
  42. package/skills/release/scripts/install-hooks.sh +38 -0
  43. package/vendor/vaporetto-node-wasm/package.json +11 -0
  44. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm.d.ts +19 -0
  45. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm.js +202 -0
  46. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm_bg.wasm +0 -0
  47. package/vendor/vaporetto-node-wasm/vaporetto_node_wasm_bg.wasm.d.ts +13 -0
package/dist/llm.d.ts ADDED
@@ -0,0 +1,500 @@
1
+ /**
2
+ * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
3
+ *
4
+ * Provides embeddings, text generation, and reranking using local GGUF models.
5
+ */
6
+ import type { Llama, Token as LlamaToken } from "node-llama-cpp";
7
+ type NodeLlamaCppModule = {
8
+ getLlama: (options: Record<string, unknown>) => Promise<Llama>;
9
+ getLlamaGpuTypes?: (include?: "supported" | "allValid") => Promise<LlamaGpuMode[]>;
10
+ resolveModelFile: (model: string, cacheDir: string) => Promise<string>;
11
+ LlamaChatSession: new (options: {
12
+ contextSequence: unknown;
13
+ }) => {
14
+ prompt: (prompt: string, options?: Record<string, unknown>) => Promise<string>;
15
+ };
16
+ LlamaLogLevel: {
17
+ error: unknown;
18
+ };
19
+ };
20
+ export declare function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void;
21
+ /**
22
+ * Some node-llama-cpp native build/probe paths write library noise to stdout.
23
+ * JSON APIs must reserve stdout for machine-readable payloads, so route that
24
+ * noise to stderr while native llama initialization is in progress.
25
+ */
26
+ export declare function withNativeStdoutRedirectedToStderr<T>(fn: () => Promise<T>): Promise<T>;
27
+ /**
28
+ * Detect if a model URI uses the Qwen3-Embedding format.
29
+ * Qwen3-Embedding uses a different prompting style than nomic/embeddinggemma.
30
+ */
31
+ export declare function isQwen3EmbeddingModel(modelUri: string): boolean;
32
+ /**
33
+ * Format a query for embedding.
34
+ * Uses nomic-style task prefix format for embeddinggemma (default).
35
+ * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
36
+ */
37
+ export declare function formatQueryForEmbedding(query: string, modelUri?: string): string;
38
+ /**
39
+ * Format a document for embedding.
40
+ * Uses nomic-style format with title and text fields (default).
41
+ * Qwen3-Embedding encodes documents as raw text without special prefixes.
42
+ */
43
+ export declare function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string;
44
+ /**
45
+ * Token with log probability
46
+ */
47
+ export type TokenLogProb = {
48
+ token: string;
49
+ logprob: number;
50
+ };
51
+ /**
52
+ * Embedding result
53
+ */
54
+ export type EmbeddingResult = {
55
+ embedding: number[];
56
+ model: string;
57
+ };
58
+ /**
59
+ * Generation result with optional logprobs
60
+ */
61
+ export type GenerateResult = {
62
+ text: string;
63
+ model: string;
64
+ logprobs?: TokenLogProb[];
65
+ done: boolean;
66
+ };
67
+ /**
68
+ * Rerank result for a single document
69
+ */
70
+ export type RerankDocumentResult = {
71
+ file: string;
72
+ score: number;
73
+ index: number;
74
+ };
75
+ /**
76
+ * Batch rerank result
77
+ */
78
+ export type RerankResult = {
79
+ results: RerankDocumentResult[];
80
+ model: string;
81
+ };
82
+ /**
83
+ * Model info
84
+ */
85
+ export type ModelInfo = {
86
+ name: string;
87
+ exists: boolean;
88
+ path?: string;
89
+ };
90
+ /**
91
+ * Options for embedding
92
+ */
93
+ export type EmbedOptions = {
94
+ model?: string;
95
+ isQuery?: boolean;
96
+ title?: string;
97
+ };
98
+ /**
99
+ * Options for text generation
100
+ */
101
+ export type GenerateOptions = {
102
+ model?: string;
103
+ maxTokens?: number;
104
+ temperature?: number;
105
+ };
106
+ /**
107
+ * Options for reranking
108
+ */
109
+ export type RerankOptions = {
110
+ model?: string;
111
+ };
112
+ /**
113
+ * Options for LLM sessions
114
+ */
115
+ export type LLMSessionOptions = {
116
+ /** Max session duration in ms (default: 10 minutes) */
117
+ maxDuration?: number;
118
+ /** External abort signal */
119
+ signal?: AbortSignal;
120
+ /** Debug name for logging */
121
+ name?: string;
122
+ };
123
+ /**
124
+ * Session interface for scoped LLM access with lifecycle guarantees
125
+ */
126
+ export interface ILLMSession {
127
+ embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
128
+ embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]>;
129
+ expandQuery(query: string, options?: {
130
+ context?: string;
131
+ includeLexical?: boolean;
132
+ }): Promise<Queryable[]>;
133
+ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
134
+ /** Whether this session is still valid (not released or aborted) */
135
+ readonly isValid: boolean;
136
+ /** Abort signal for this session (aborts on release or maxDuration) */
137
+ readonly signal: AbortSignal;
138
+ }
139
+ /**
140
+ * Supported query types for different search backends
141
+ */
142
+ export type QueryType = 'lex' | 'vec' | 'hyde';
143
+ /**
144
+ * A single query and its target backend type
145
+ */
146
+ export type Queryable = {
147
+ type: QueryType;
148
+ text: string;
149
+ };
150
+ /**
151
+ * Document to rerank
152
+ */
153
+ export type RerankDocument = {
154
+ file: string;
155
+ text: string;
156
+ title?: string;
157
+ };
158
+ export declare const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
159
+ export declare const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
160
+ export declare const DEFAULT_EMBED_MODEL_URI = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
161
+ export declare const DEFAULT_RERANK_MODEL_URI = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
162
+ export declare const DEFAULT_GENERATE_MODEL_URI = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
163
+ export type ModelResolutionConfig = {
164
+ embed?: string;
165
+ generate?: string;
166
+ rerank?: string;
167
+ };
168
+ export declare function resolveEmbedModel(config?: ModelResolutionConfig): string;
169
+ export declare function resolveGenerateModel(config?: ModelResolutionConfig): string;
170
+ export declare function resolveRerankModel(config?: ModelResolutionConfig): string;
171
+ export declare function resolveModels(config?: ModelResolutionConfig): Required<ModelResolutionConfig>;
172
+ export declare const DEFAULT_MODEL_CACHE_DIR: string;
173
+ export type PullResult = {
174
+ model: string;
175
+ path: string;
176
+ sizeBytes: number;
177
+ refreshed: boolean;
178
+ };
179
+ export type GgufFileInspection = {
180
+ exists: boolean;
181
+ valid: boolean;
182
+ kind: "missing" | "gguf" | "html" | "invalid";
183
+ sizeBytes?: number;
184
+ magic?: string;
185
+ details: string;
186
+ };
187
+ /**
188
+ * Inspect a potential GGUF model file without mutating it.
189
+ * Used by doctor for early diagnostics and by runtime validation before load.
190
+ */
191
+ export declare function inspectGgufFile(filePath: string): GgufFileInspection;
192
+ export declare function pullModels(models: string[], options?: {
193
+ refresh?: boolean;
194
+ cacheDir?: string;
195
+ }): Promise<PullResult[]>;
196
+ /**
197
+ * Abstract LLM interface - implement this for different backends
198
+ */
199
+ export interface LLM {
200
+ /**
201
+ * Get embeddings for text
202
+ */
203
+ embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
204
+ /**
205
+ * Generate text completion
206
+ */
207
+ generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
208
+ /**
209
+ * Check if a model exists/is available
210
+ */
211
+ modelExists(model: string): Promise<ModelInfo>;
212
+ /**
213
+ * Expand a search query into multiple variations for different backends.
214
+ * Returns a list of Queryable objects. NOTE(review): LlamaCpp.expandQuery additionally accepts an intent option that is not declared on this interface.
215
+ */
216
+ expandQuery(query: string, options?: {
217
+ context?: string;
218
+ includeLexical?: boolean;
219
+ }): Promise<Queryable[]>;
220
+ /**
221
+ * Rerank documents by relevance to a query
222
+ * Returns list of documents with relevance scores (higher = more relevant)
223
+ */
224
+ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
225
+ /**
226
+ * Dispose of resources
227
+ */
228
+ dispose(): Promise<void>;
229
+ }
230
+ export type LlamaCppConfig = {
231
+ embedModel?: string;
232
+ generateModel?: string;
233
+ rerankModel?: string;
234
+ modelCacheDir?: string;
235
+ /**
236
+ * Context size used for query expansion generation contexts.
237
+ * Default: 2048. Can also be set via QMD_EXPAND_CONTEXT_SIZE.
238
+ */
239
+ expandContextSize?: number;
240
+ /**
241
+ * Inactivity timeout in ms before unloading contexts (default: 2 minutes, 0 to disable).
242
+ *
243
+ * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
244
+ * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
245
+ * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
246
+ */
247
+ inactivityTimeoutMs?: number;
248
+ /**
249
+ * Whether to dispose models on inactivity (default: false).
250
+ *
251
+ * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
252
+ * memory reclaim.
253
+ */
254
+ disposeModelsOnInactivity?: boolean;
255
+ };
256
+ export type LlamaGpuMode = "auto" | "metal" | "vulkan" | "cuda" | false;
257
+ type ParallelismOptions = {
258
+ gpu: string | false;
259
+ platform?: NodeJS.Platform;
260
+ computed: number;
261
+ envValue?: string;
262
+ };
263
+ export declare function resolveParallelismOverride(envValue?: string | undefined): number | undefined;
264
+ export declare function resolveSafeParallelism(options: ParallelismOptions): number;
265
+ export declare function resolveLlamaGpuMode(envValue?: string | undefined, forceCpuValue?: string | undefined): LlamaGpuMode;
266
+ /** LLM implementation backed by node-llama-cpp; the members below load models and contexts lazily and can unload them when idle. */ export declare class LlamaCpp implements LLM {
267
+ private readonly _ciMode;
268
+ private llama;
269
+ private embedModel;
270
+ private embedContexts;
271
+ private generateModel;
272
+ private rerankModel;
273
+ private rerankContexts;
274
+ private embedModelUri;
275
+ private generateModelUri;
276
+ private rerankModelUri;
277
+ private modelCacheDir;
278
+ private expandContextSize;
279
+ private embedModelLoadPromise;
280
+ private generateModelLoadPromise;
281
+ private rerankModelLoadPromise;
282
+ private llamaLoadPromise;
283
+ private inactivityTimer;
284
+ private inactivityTimeoutMs;
285
+ private disposeModelsOnInactivity;
286
+ private disposed;
287
+ constructor(config?: LlamaCppConfig);
288
+ get embedModelName(): string;
289
+ get generateModelName(): string;
290
+ get rerankModelName(): string;
291
+ /**
292
+ * Reset the inactivity timer. Called after each model operation.
293
+ * When the timer fires, idle resources are unloaded to free memory (if no active sessions): contexts by default, models only when disposeModelsOnInactivity is enabled.
294
+ */
295
+ private touchActivity;
296
+ /**
297
+ * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
298
+ */
299
+ private hasLoadedContexts;
300
+ /**
301
+ * Unload idle resources but keep the instance alive for future use.
302
+ *
303
+ * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
304
+ * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
305
+ */
306
+ unloadIdleResources(): Promise<void>;
307
+ /**
308
+ * Ensure model cache directory exists
309
+ */
310
+ private ensureModelCacheDir;
311
+ /**
312
+ * Initialize the llama instance (lazy)
313
+ */
314
+ private ensureLlama;
315
+ private loadLlamaRuntime;
316
+ private isCpuOffloadForced;
317
+ private modelLoadOptions;
318
+ /**
319
+ * Resolve a model URI to a local path, downloading if needed.
320
+ * Validates the downloaded file is actually a GGUF model (not an HTML error page
321
+ * from a proxy or firewall).
322
+ */
323
+ private resolveModel;
324
+ /**
325
+ * Load embedding model (lazy)
326
+ */
327
+ private ensureEmbedModel;
328
+ /**
329
+ * Compute how many parallel contexts to create.
330
+ *
331
+ * GPU: constrained by VRAM (25% of free, capped at 8).
332
+ * CPU: constrained by cores. Splitting threads across contexts enables
333
+ * true parallelism (each context runs on its own cores). Use at most
334
+ * half the math cores, with at least 4 threads per context.
335
+ */
336
+ private computeParallelism;
337
+ /**
338
+ * Get the number of threads each context should use, given N parallel contexts.
339
+ * Splits available math cores evenly across contexts.
340
+ */
341
+ private threadsPerContext;
342
+ /**
343
+ * Load embedding contexts (lazy). Creates multiple for parallel embedding.
344
+ * Uses promise guard to prevent concurrent context creation race condition.
345
+ */
346
+ private embedContextsCreatePromise;
347
+ private ensureEmbedContexts;
348
+ /**
349
+ * Get a single embed context (for single-embed calls). Uses first from pool.
350
+ */
351
+ private ensureEmbedContext;
352
+ /**
353
+ * Load generation model (lazy) - context is created fresh per call
354
+ */
355
+ private ensureGenerateModel;
356
+ /**
357
+ * Load rerank model (lazy)
358
+ */
359
+ private ensureRerankModel;
360
+ /**
361
+ * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
362
+ * Each context has its own sequence, so they can evaluate independently.
363
+ * NOTE(review): this text appears to describe ensureRerankContexts, declared after the two context-size constants below.
364
+ * Tuning choices:
365
+ * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
366
+ * - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
367
+ * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
368
+ */
369
+ private static readonly RERANK_CONTEXT_SIZE;
370
+ private static readonly EMBED_CONTEXT_SIZE;
371
+ private ensureRerankContexts;
372
+ /**
373
+ * Tokenize text using the embedding model's tokenizer
374
+ * Returns tokenizer tokens (opaque type from node-llama-cpp)
375
+ */
376
+ tokenize(text: string): Promise<readonly LlamaToken[]>;
377
+ /**
378
+ * Count tokens in text using the embedding model's tokenizer
379
+ */
380
+ countTokens(text: string): Promise<number>;
381
+ /**
382
+ * Detokenize token IDs back to text
383
+ */
384
+ detokenize(tokens: readonly LlamaToken[]): Promise<string>;
385
+ /**
386
+ * Truncate text to fit within the embedding model's context window.
387
+ * Uses the model's own tokenizer for accurate token counting, then
388
+ * detokenizes back to text if truncation is needed.
389
+ * Returns the (possibly truncated) text and whether truncation occurred. NOTE(review): this reads as the contract of truncateToContextSize rather than resolveEmbedTokenLimit directly below - confirm against the source.
390
+ */
391
+ private resolveEmbedTokenLimit;
392
+ private truncateToContextSize;
393
+ embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;
394
+ /**
395
+ * Batch embed multiple texts efficiently
396
+ * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
397
+ */
398
+ embedBatch(texts: string[], options?: EmbedOptions): Promise<(EmbeddingResult | null)[]>;
399
+ generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;
400
+ modelExists(modelUri: string): Promise<ModelInfo>;
401
+ expandQuery(query: string, options?: {
402
+ context?: string;
403
+ includeLexical?: boolean;
404
+ intent?: string; // NOTE(review): extra option, not declared on LLM.expandQuery or ILLMSession.expandQuery
405
+ }): Promise<Queryable[]>;
406
+ private static readonly RERANK_TEMPLATE_OVERHEAD;
407
+ private static readonly RERANK_TARGET_DOCS_PER_CONTEXT;
408
+ rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;
409
+ /**
410
+ * Get device/GPU info for status display.
411
+ * Initializes llama if not already done.
412
+ */
413
+ getDeviceInfo(options?: {
414
+ allowBuild?: boolean;
415
+ }): Promise<{
416
+ gpu: string | false;
417
+ gpuOffloading: boolean;
418
+ gpuDevices: string[];
419
+ vram?: {
420
+ total: number;
421
+ used: number;
422
+ free: number;
423
+ };
424
+ cpuCores: number;
425
+ }>;
426
+ dispose(): Promise<void>;
427
+ }
428
+ /**
429
+ * Error thrown when an operation is attempted on a released or aborted session.
430
+ */
431
+ export declare class SessionReleasedError extends Error {
432
+ constructor(message?: string);
433
+ }
434
+ /**
435
+ * Execute a function with a scoped LLM session.
436
+ * The session provides lifecycle guarantees - resources won't be disposed mid-operation.
437
+ *
438
+ * @example
439
+ * ```typescript
440
+ * await withLLMSession(async (session) => {
441
+ * const expanded = await session.expandQuery(query);
442
+ * const embeddings = await session.embedBatch(texts);
443
+ * const reranked = await session.rerank(query, docs);
444
+ * return reranked;
445
+ * }, { maxDuration: 10 * 60 * 1000, name: 'querySearch' });
446
+ * ```
447
+ */
448
+ export declare function withLLMSession<T>(fn: (session: ILLMSession) => Promise<T>, options?: LLMSessionOptions): Promise<T>;
449
+ /**
450
+ * Execute a function with a scoped LLM session using a specific LlamaCpp instance.
451
+ * Unlike withLLMSession, this does not use the global singleton.
452
+ */
453
+ export declare function withLLMSessionForLlm<T>(llm: LlamaCpp, fn: (session: ILLMSession) => Promise<T>, options?: LLMSessionOptions): Promise<T>;
454
+ /**
455
+ * Check if idle unload is safe (no active sessions or operations).
456
+ * Used internally by LlamaCpp idle timer.
457
+ */
458
+ export declare function canUnloadLLM(): boolean;
459
+ /**
460
+ * Whether QMD's darwin Metal exit-crash mitigation is active in this process:
461
+ * true → residency sets disabled, process exit completes silently
462
+ * false → either non-darwin, or `QMD_METAL_KEEP_RESIDENCY=1` overrode it,
463
+ * in which case the libggml-metal teardown assertion may fire
464
+ */
465
+ export declare function isDarwinMetalMitigationActive(): boolean;
466
+ /**
467
+ * Compatibility shim: previous releases installed a `process.on('exit')` hook
468
+ * that tried to skip the C++ static destructor by calling `process.reallyExit`.
469
+ * That mechanism didn't work on Node (Environment::Exit still calls libc
470
+ * `exit()`), so it was replaced by `GGML_METAL_NO_RESIDENCY=1` from bin/qmd.
471
+ * Kept as a no-op for code paths that still call it; safe to remove once no
472
+ * production launcher predates the residency-set fix.
473
+ */
474
+ export declare function installDarwinExitGuard(): void;
475
+ /** @deprecated Replaced by isDarwinMetalMitigationActive. */
476
+ export declare function isDarwinExitGuardInstalled(): boolean;
477
+ /**
478
+ * Get the default LlamaCpp instance (creates one if needed). The LlamaCpp
479
+ * constructor was documented as installing the darwin exit guard; that guard is now a no-op
480
+ * compatibility shim (see installDarwinExitGuard), and the mitigation comes from GGML_METAL_NO_RESIDENCY=1 set by bin/qmd.
481
+ */
482
+ export declare function getDefaultLlamaCpp(): LlamaCpp;
483
+ /**
484
+ * Set a custom default LlamaCpp instance (useful for testing). Setting a
485
+ * non-null instance was documented as also installing the darwin exit guard, to keep
486
+ * that invariant for test doubles that did not go through the real constructor.
487
+ * The guard itself is now a no-op compatibility shim (see installDarwinExitGuard).
488
+ */
489
+ export declare function setDefaultLlamaCpp(llm: LlamaCpp | null): void;
490
+ /**
491
+ * Peek at the default LlamaCpp instance without instantiating one. Used by
492
+ * doctor and lifecycle diagnostics.
493
+ */
494
+ export declare function hasDefaultLlamaCpp(): boolean;
495
+ /**
496
+ * Dispose the default LlamaCpp instance if it exists.
497
+ * Call this before process exit to prevent NAPI crashes.
498
+ */
499
+ export declare function disposeDefaultLlamaCpp(): Promise<void>;
500
+ export {};