@joycodetech/qmd-ja 2.5.3 → 2.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +2 -0
- package/dist/cli/qmd.js +23 -3
- package/dist/store.d.ts +6 -0
- package/dist/store.js +4 -4
- package/package.json +2 -2
- package/vendor/vaporetto-node-wasm/LICENSE +22 -0
- /package/bin/{qmd → qmd-ja} +0 -0
package/CHANGELOG.md
CHANGED
package/dist/cli/qmd.js
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
1
|
import { isBun, openDatabase } from "../db.js";
|
|
3
2
|
import fastGlob from "fast-glob";
|
|
4
3
|
import { execSync, spawn as nodeSpawn } from "child_process";
|
|
@@ -7,7 +6,7 @@ import { basename, dirname, join as pathJoin, relative as relativePath, resolve
|
|
|
7
6
|
import { parseArgs } from "util";
|
|
8
7
|
import { readFileSync, readdirSync, realpathSync, statSync, existsSync, unlinkSync, writeFileSync, openSync, closeSync, mkdirSync, lstatSync, rmSync, symlinkSync, readlinkSync, copyFileSync } from "fs";
|
|
9
8
|
import { createInterface } from "readline/promises";
|
|
10
|
-
import { getPwd, getRealPath, homedir, resolve, enableProductionMode, searchFTS, extractSnippet, getContextForFile, getContextForPath, listCollections, removeCollection, renameCollection, findSimilarFiles, findDocumentByDocid, isDocid, matchFilesByGlob, getHashesNeedingEmbedding, clearAllEmbeddings, insertEmbedding, getStatus, hashContent, extractTitle, formatDocForEmbedding, getEmbeddingFingerprint, chunkDocumentByTokens, clearCache, getCacheKey, getCachedResult, setCachedResult, getIndexHealth, parseVirtualPath, buildVirtualPath, isVirtualPath, resolveVirtualPath, toVirtualPath, insertContent, insertDocument, findActiveDocument, findOrMigrateLegacyDocument, updateDocumentTitle, updateDocument, deactivateDocument, getActiveDocumentPaths, cleanupOrphanedContent, deleteLLMCache, deleteInactiveDocuments, cleanupOrphanedVectors, vacuumDatabase, getCollectionsWithoutContext, getTopLevelPathsWithoutContext, handelize, hybridQuery, vectorSearchQuery, structuredSearch, addLineNumbers, DEFAULT_EMBED_MODEL, DEFAULT_EMBED_MAX_BATCH_BYTES, DEFAULT_EMBED_MAX_DOCS_PER_BATCH, DEFAULT_RERANK_MODEL, DEFAULT_QUERY_MODEL, DEFAULT_GLOB, DEFAULT_MULTI_GET_MAX_BYTES, createStore, getDefaultDbPath, reindexCollection, initializeKuromojiTokenizer, generateEmbeddings, maybeAdoptLegacyEmbeddingFingerprint, syncConfigToDb, } from "../store.js";
|
|
9
|
+
import { getPwd, getRealPath, homedir, resolve, enableProductionMode, searchFTS, extractSnippet, getContextForFile, getContextForPath, listCollections, removeCollection, renameCollection, findSimilarFiles, findDocumentByDocid, isDocid, matchFilesByGlob, getHashesNeedingEmbedding, clearAllEmbeddings, insertEmbedding, getStatus, hashContent, extractTitle, formatDocForEmbedding, getEmbeddingFingerprint, chunkDocumentByTokens, clearCache, getCacheKey, getCachedResult, setCachedResult, getIndexHealth, parseVirtualPath, buildVirtualPath, isVirtualPath, resolveVirtualPath, toVirtualPath, insertContent, insertDocument, findActiveDocument, findOrMigrateLegacyDocument, updateDocumentTitle, updateDocument, deactivateDocument, getActiveDocumentPaths, cleanupOrphanedContent, deleteLLMCache, deleteInactiveDocuments, cleanupOrphanedVectors, vacuumDatabase, getCollectionsWithoutContext, getTopLevelPathsWithoutContext, handelize, hybridQuery, vectorSearchQuery, structuredSearch, addLineNumbers, DEFAULT_EMBED_MODEL, DEFAULT_EMBED_MAX_BATCH_BYTES, DEFAULT_EMBED_MAX_DOCS_PER_BATCH, DEFAULT_RERANK_MODEL, DEFAULT_QUERY_MODEL, DEFAULT_GLOB, DEFAULT_MULTI_GET_MAX_BYTES, createStore, getDefaultDbPath, reindexCollection, initializeKuromojiTokenizer, initializeVaporettoTokenizer, FTS_CJK_NORMALIZED_VERSION, resolveVaporettoModelPath, generateEmbeddings, maybeAdoptLegacyEmbeddingFingerprint, syncConfigToDb, } from "../store.js";
|
|
11
10
|
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, setDefaultLlamaCpp, LlamaCpp, withLLMSession, pullModels, DEFAULT_MODEL_CACHE_DIR, resolveEmbedModel, resolveGenerateModel, resolveRerankModel, resolveModels, inspectGgufFile, isDarwinMetalMitigationActive } from "../llm.js";
|
|
12
11
|
import { formatSearchResults, formatDocuments, escapeXml, escapeCSV, } from "./formatter.js";
|
|
13
12
|
import { getCollection as getCollectionFromYaml, listCollections as yamlListCollections, getDefaultCollectionNames, addContext as yamlAddContext, removeContext as yamlRemoveContext, removeCollection as yamlRemoveCollectionFn, renameCollection as yamlRenameCollectionFn, setGlobalContext, listAllContexts, setConfigIndexName, loadConfig, saveConfig, setConfigSource, findLocalConfigPath, getLocalDbPath, getConfigPath, configExists, } from "../collections.js";
|
|
@@ -3490,6 +3489,27 @@ async function showDoctor() {
|
|
|
3490
3489
|
catch (error) {
|
|
3491
3490
|
doctorCheck("sqlite-vec", false, error instanceof Error ? error.message : String(error));
|
|
3492
3491
|
}
|
|
3492
|
+
// CJK tokenizer check (qmd-ja: Vaporetto WASM)
|
|
3493
|
+
try {
|
|
3494
|
+
await initializeVaporettoTokenizer();
|
|
3495
|
+
const modelPath = resolveVaporettoModelPath();
|
|
3496
|
+
const modelName = modelPath.split("/").pop() ?? modelPath;
|
|
3497
|
+
let ftsLabel = "not indexed yet";
|
|
3498
|
+
try {
|
|
3499
|
+
const row = db.prepare(`SELECT value FROM store_config WHERE key = 'fts_cjk_normalized_version'`).get();
|
|
3500
|
+
if (row?.value === FTS_CJK_NORMALIZED_VERSION) {
|
|
3501
|
+
ftsLabel = `v${FTS_CJK_NORMALIZED_VERSION} (current)`;
|
|
3502
|
+
}
|
|
3503
|
+
else if (row?.value) {
|
|
3504
|
+
ftsLabel = `v${row.value} -> v${FTS_CJK_NORMALIZED_VERSION} (stale, run qmd-ja update)`;
|
|
3505
|
+
}
|
|
3506
|
+
}
|
|
3507
|
+
catch { /* ignore */ }
|
|
3508
|
+
doctorCheck("CJK tokenizer", true, `Vaporetto WASM — model: ${modelName}, FTS index: ${ftsLabel}`);
|
|
3509
|
+
}
|
|
3510
|
+
catch (error) {
|
|
3511
|
+
doctorCheck("CJK tokenizer", false, `Vaporetto WASM failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
3512
|
+
}
|
|
3493
3513
|
const configCheck = checkDoctorIndexConfig(nextSteps);
|
|
3494
3514
|
const configModels = configCheck.config?.models ?? {};
|
|
3495
3515
|
checkEnvironmentOverrides(activeModels, configModels);
|
|
@@ -3600,7 +3620,7 @@ async function showVersion() {
|
|
|
3600
3620
|
// Not a git repo or git not available
|
|
3601
3621
|
}
|
|
3602
3622
|
const versionStr = commit ? `${pkg.version} (${commit})` : pkg.version;
|
|
3603
|
-
console.log(`qmd ${versionStr}`);
|
|
3623
|
+
console.log(`qmd-ja ${versionStr}`);
|
|
3604
3624
|
}
|
|
3605
3625
|
// Main CLI - only run if this is the main module
|
|
3606
3626
|
const __filename = fileURLToPath(import.meta.url);
|
package/dist/store.d.ts
CHANGED
|
@@ -187,6 +187,12 @@ export declare function resolveVirtualPath(db: Database, virtualPath: string): s
|
|
|
187
187
|
*/
|
|
188
188
|
export declare function toVirtualPath(db: Database, absolutePath: string): string | null;
|
|
189
189
|
export declare function verifySqliteVecLoaded(db: Database): void;
|
|
190
|
+
export declare const FTS_CJK_NORMALIZED_VERSION = "3";
|
|
191
|
+
/**
|
|
192
|
+
* Resolve the Vaporetto model file path relative to this module.
|
|
193
|
+
* The model is bundled under vendor/vaporetto-node-wasm/../models/ relative to the project root.
|
|
194
|
+
*/
|
|
195
|
+
export declare function resolveVaporettoModelPath(): string;
|
|
190
196
|
/**
|
|
191
197
|
* Pre-initialize the Vaporetto WASM Japanese morphological analyzer.
|
|
192
198
|
* Call this before indexing or search operations involving CJK text.
|
package/dist/store.js
CHANGED
|
@@ -595,7 +595,7 @@ const CJK_CHAR_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\
|
|
|
595
595
|
// parts of katakana words (e.g. "ナレッジベース"). Without these, the regex splits
|
|
596
596
|
// on "ー" and vaporetto receives broken sub-strings like "ナレッジベ" and "ス".
|
|
597
597
|
const CJK_RUN_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}・ー]+/gu;
|
|
598
|
-
const FTS_CJK_NORMALIZED_VERSION = "3"; // bumped: vaporetto WASM morphological tokenization
|
|
598
|
+
export const FTS_CJK_NORMALIZED_VERSION = "3"; // bumped: vaporetto WASM morphological tokenization
|
|
599
599
|
// --- Vaporetto WASM Japanese morphological analyzer (lazy singleton) ---
|
|
600
600
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
601
601
|
let _vaporettoTokenizer = null;
|
|
@@ -604,7 +604,7 @@ let _vaporettoInitPromise = null;
|
|
|
604
604
|
* Resolve the Vaporetto model file path relative to this module.
|
|
605
605
|
* The model is bundled under vendor/vaporetto-node-wasm/../models/ relative to the project root.
|
|
606
606
|
*/
|
|
607
|
-
function
|
|
607
|
+
export function resolveVaporettoModelPath() {
|
|
608
608
|
// __dirname equivalent for ESM
|
|
609
609
|
const thisDir = dirname(fileURLToPath(import.meta.url));
|
|
610
610
|
// src/ → project root → models/
|
|
@@ -626,8 +626,8 @@ export async function initializeVaporettoTokenizer() {
|
|
|
626
626
|
_vaporettoInitPromise = (async () => {
|
|
627
627
|
const req = createRequire(import.meta.url);
|
|
628
628
|
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
629
|
-
const { VaporettoTokenizer } = req(pathJoin(dirname(fileURLToPath(import.meta.url)), "
|
|
630
|
-
const modelPath =
|
|
629
|
+
const { VaporettoTokenizer } = req(pathJoin(dirname(fileURLToPath(import.meta.url)), "..", "vendor", "vaporetto-node-wasm", "vaporetto_node_wasm.js"));
|
|
630
|
+
const modelPath = resolveVaporettoModelPath();
|
|
631
631
|
const modelData = readFileSync(modelPath);
|
|
632
632
|
return new VaporettoTokenizer(modelData);
|
|
633
633
|
})();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@joycodetech/qmd-ja",
|
|
3
|
-
"version": "2.5.3",
|
|
3
|
+
"version": "2.5.5",
|
|
4
4
|
"description": "Japanese-enhanced fork of qmd — On-device hybrid search with Vaporetto WASM morphological tokenizer for accurate Japanese BM25 full-text search",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
}
|
|
13
13
|
},
|
|
14
14
|
"bin": {
|
|
15
|
-
"qmd": "bin/qmd"
|
|
15
|
+
"qmd-ja": "bin/qmd-ja"
|
|
16
16
|
},
|
|
17
17
|
"files": [
|
|
18
18
|
"bin/",
|
package/vendor/vaporetto-node-wasm/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) Koichi Akabe, Shunsuke Kanda, Yusuke Oda, Shinsuke Mori
|
|
4
|
+
(daac-tools/vaporetto — https://github.com/daac-tools/vaporetto)
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
/package/bin/{qmd → qmd-ja}
RENAMED
|
File without changes
|