@joycodetech/qmd-ja 2.5.4 → 2.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,8 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [2.5.4] - 2026-06-22
6
+
5
7
  ### Documentation
6
8
 
7
9
  - README: documented collection filtering (`-c` semantics), the `collection
package/dist/cli/qmd.js CHANGED
@@ -1,4 +1,3 @@
1
- #!/usr/bin/env node
2
1
  import { isBun, openDatabase } from "../db.js";
3
2
  import fastGlob from "fast-glob";
4
3
  import { execSync, spawn as nodeSpawn } from "child_process";
@@ -7,7 +6,7 @@ import { basename, dirname, join as pathJoin, relative as relativePath, resolve
7
6
  import { parseArgs } from "util";
8
7
  import { readFileSync, readdirSync, realpathSync, statSync, existsSync, unlinkSync, writeFileSync, openSync, closeSync, mkdirSync, lstatSync, rmSync, symlinkSync, readlinkSync, copyFileSync } from "fs";
9
8
  import { createInterface } from "readline/promises";
10
- import { getPwd, getRealPath, homedir, resolve, enableProductionMode, searchFTS, extractSnippet, getContextForFile, getContextForPath, listCollections, removeCollection, renameCollection, findSimilarFiles, findDocumentByDocid, isDocid, matchFilesByGlob, getHashesNeedingEmbedding, clearAllEmbeddings, insertEmbedding, getStatus, hashContent, extractTitle, formatDocForEmbedding, getEmbeddingFingerprint, chunkDocumentByTokens, clearCache, getCacheKey, getCachedResult, setCachedResult, getIndexHealth, parseVirtualPath, buildVirtualPath, isVirtualPath, resolveVirtualPath, toVirtualPath, insertContent, insertDocument, findActiveDocument, findOrMigrateLegacyDocument, updateDocumentTitle, updateDocument, deactivateDocument, getActiveDocumentPaths, cleanupOrphanedContent, deleteLLMCache, deleteInactiveDocuments, cleanupOrphanedVectors, vacuumDatabase, getCollectionsWithoutContext, getTopLevelPathsWithoutContext, handelize, hybridQuery, vectorSearchQuery, structuredSearch, addLineNumbers, DEFAULT_EMBED_MODEL, DEFAULT_EMBED_MAX_BATCH_BYTES, DEFAULT_EMBED_MAX_DOCS_PER_BATCH, DEFAULT_RERANK_MODEL, DEFAULT_QUERY_MODEL, DEFAULT_GLOB, DEFAULT_MULTI_GET_MAX_BYTES, createStore, getDefaultDbPath, reindexCollection, initializeKuromojiTokenizer, generateEmbeddings, maybeAdoptLegacyEmbeddingFingerprint, syncConfigToDb, } from "../store.js";
9
+ import { getPwd, getRealPath, homedir, resolve, enableProductionMode, searchFTS, extractSnippet, getContextForFile, getContextForPath, listCollections, removeCollection, renameCollection, findSimilarFiles, findDocumentByDocid, isDocid, matchFilesByGlob, getHashesNeedingEmbedding, clearAllEmbeddings, insertEmbedding, getStatus, hashContent, extractTitle, formatDocForEmbedding, getEmbeddingFingerprint, chunkDocumentByTokens, clearCache, getCacheKey, getCachedResult, setCachedResult, getIndexHealth, parseVirtualPath, buildVirtualPath, isVirtualPath, resolveVirtualPath, toVirtualPath, insertContent, insertDocument, findActiveDocument, findOrMigrateLegacyDocument, updateDocumentTitle, updateDocument, deactivateDocument, getActiveDocumentPaths, cleanupOrphanedContent, deleteLLMCache, deleteInactiveDocuments, cleanupOrphanedVectors, vacuumDatabase, getCollectionsWithoutContext, getTopLevelPathsWithoutContext, handelize, hybridQuery, vectorSearchQuery, structuredSearch, addLineNumbers, DEFAULT_EMBED_MODEL, DEFAULT_EMBED_MAX_BATCH_BYTES, DEFAULT_EMBED_MAX_DOCS_PER_BATCH, DEFAULT_RERANK_MODEL, DEFAULT_QUERY_MODEL, DEFAULT_GLOB, DEFAULT_MULTI_GET_MAX_BYTES, createStore, getDefaultDbPath, reindexCollection, initializeKuromojiTokenizer, initializeVaporettoTokenizer, FTS_CJK_NORMALIZED_VERSION, resolveVaporettoModelPath, generateEmbeddings, maybeAdoptLegacyEmbeddingFingerprint, syncConfigToDb, } from "../store.js";
11
10
  import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_MODEL_CACHE_DIR, resolveEmbedModel, resolveGenerateModel, resolveRerankModel, resolveModels, inspectGgufFile, isDarwinMetalMitigationActive } from "../llm.js";
12
11
  import { formatSearchResults, formatDocuments, escapeXml, escapeCSV, } from "./formatter.js";
13
12
  import { getCollection as getCollectionFromYaml, listCollections as yamlListCollections, getDefaultCollectionNames, addContext as yamlAddContext, removeContext as yamlRemoveContext, removeCollection as yamlRemoveCollectionFn, renameCollection as yamlRenameCollectionFn, setGlobalContext, listAllContexts, setConfigIndexName, loadConfig, saveConfig, setConfigSource, findLocalConfigPath, getLocalDbPath, getConfigPath, configExists, } from "../collections.js";
@@ -3490,6 +3489,27 @@ async function showDoctor() {
3490
3489
  catch (error) {
3491
3490
  doctorCheck("sqlite-vec", false, error instanceof Error ? error.message : String(error));
3492
3491
  }
3492
+ // CJK tokenizer check (qmd-ja: Vaporetto WASM)
3493
+ try {
3494
+ await initializeVaporettoTokenizer();
3495
+ const modelPath = resolveVaporettoModelPath();
3496
+ const modelName = modelPath.split("/").pop() ?? modelPath;
3497
+ let ftsLabel = "not indexed yet";
3498
+ try {
3499
+ const row = db.prepare(`SELECT value FROM store_config WHERE key = 'fts_cjk_normalized_version'`).get();
3500
+ if (row?.value === FTS_CJK_NORMALIZED_VERSION) {
3501
+ ftsLabel = `v${FTS_CJK_NORMALIZED_VERSION} (current)`;
3502
+ }
3503
+ else if (row?.value) {
3504
+ ftsLabel = `v${row.value} -> v${FTS_CJK_NORMALIZED_VERSION} (stale, run qmd-ja update)`;
3505
+ }
3506
+ }
3507
+ catch { /* ignore */ }
3508
+ doctorCheck("CJK tokenizer", true, `Vaporetto WASM — model: ${modelName}, FTS index: ${ftsLabel}`);
3509
+ }
3510
+ catch (error) {
3511
+ doctorCheck("CJK tokenizer", false, `Vaporetto WASM failed: ${error instanceof Error ? error.message : String(error)}`);
3512
+ }
3493
3513
  const configCheck = checkDoctorIndexConfig(nextSteps);
3494
3514
  const configModels = configCheck.config?.models ?? {};
3495
3515
  checkEnvironmentOverrides(activeModels, configModels);
@@ -3600,7 +3620,7 @@ async function showVersion() {
3600
3620
  // Not a git repo or git not available
3601
3621
  }
3602
3622
  const versionStr = commit ? `${pkg.version} (${commit})` : pkg.version;
3603
- console.log(`qmd ${versionStr}`);
3623
+ console.log(`qmd-ja ${versionStr}`);
3604
3624
  }
3605
3625
  // Main CLI - only run if this is the main module
3606
3626
  const __filename = fileURLToPath(import.meta.url);
package/dist/store.d.ts CHANGED
@@ -187,6 +187,12 @@ export declare function resolveVirtualPath(db: Database, virtualPath: string): s
187
187
  */
188
188
  export declare function toVirtualPath(db: Database, absolutePath: string): string | null;
189
189
  export declare function verifySqliteVecLoaded(db: Database): void;
190
+ export declare const FTS_CJK_NORMALIZED_VERSION = "3";
191
+ /**
192
+ * Resolve the Vaporetto model file path relative to this module.
193
+ * The model is bundled under vendor/vaporetto-node-wasm/../models/ relative to the project root.
194
+ */
195
+ export declare function resolveVaporettoModelPath(): string;
190
196
  /**
191
197
  * Pre-initialize the Vaporetto WASM Japanese morphological analyzer.
192
198
  * Call this before indexing or search operations involving CJK text.
package/dist/store.js CHANGED
@@ -595,7 +595,7 @@ const CJK_CHAR_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\
595
595
  // parts of katakana words (e.g. "ナレッジベース"). Without these, the regex splits
596
596
  // on "ー" and vaporetto receives broken sub-strings like "ナレッジベ" and "ス".
597
597
  const CJK_RUN_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}・ー]+/gu;
598
- const FTS_CJK_NORMALIZED_VERSION = "3"; // bumped: vaporetto WASM morphological tokenization
598
+ export const FTS_CJK_NORMALIZED_VERSION = "3"; // bumped: vaporetto WASM morphological tokenization
599
599
  // --- Vaporetto WASM Japanese morphological analyzer (lazy singleton) ---
600
600
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
601
601
  let _vaporettoTokenizer = null;
@@ -604,7 +604,7 @@ let _vaporettoInitPromise = null;
604
604
  * Resolve the Vaporetto model file path relative to this module.
605
605
  * The model is bundled under vendor/vaporetto-node-wasm/../models/ relative to the project root.
606
606
  */
607
- function _resolveVaporettoModelPath() {
607
+ export function resolveVaporettoModelPath() {
608
608
  // __dirname equivalent for ESM
609
609
  const thisDir = dirname(fileURLToPath(import.meta.url));
610
610
  // src/ → project root → models/
@@ -627,7 +627,7 @@ export async function initializeVaporettoTokenizer() {
627
627
  const req = createRequire(import.meta.url);
628
628
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
629
629
  const { VaporettoTokenizer } = req(pathJoin(dirname(fileURLToPath(import.meta.url)), "..", "vendor", "vaporetto-node-wasm", "vaporetto_node_wasm.js"));
630
- const modelPath = _resolveVaporettoModelPath();
630
+ const modelPath = resolveVaporettoModelPath();
631
631
  const modelData = readFileSync(modelPath);
632
632
  return new VaporettoTokenizer(modelData);
633
633
  })();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@joycodetech/qmd-ja",
3
- "version": "2.5.4",
3
+ "version": "2.5.5",
4
4
  "description": "Japanese-enhanced fork of qmd — On-device hybrid search with Vaporetto WASM morphological tokenizer for accurate Japanese BM25 full-text search",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",