npm - @hesed/search - Versions diffs - 0.1.0 → 0.2.0 - Mend

@hesed/search 0.1.0 → 0.2.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md +2 -2
package/dist/commands/search.js +68 -2
package/dist/embedders/minilm.d.ts +22 -0
package/dist/embedders/minilm.js +49 -0
package/dist/search-logic.d.ts +16 -1
package/dist/search-logic.js +81 -2
package/oclif.manifest.json +1 -1
package/package.json +2 -1

package/README.md CHANGED Viewed

@@ -20,7 +20,7 @@ $ npm install -g @hesed/search
 $ search COMMAND
 running command...
 $ search (--version)
-@hesed/search/0.1.0 darwin-arm64 node-v22.14.0
+@hesed/search/0.2.0 linux-x64 node-v24.16.0
 $ search --help [COMMAND]
 USAGE
   $ search COMMAND
@@ -62,5 +62,5 @@ EXAMPLES
   $ search search "update jira" --details
 ```
-_See code: [src/commands/search.ts](https://github.com/hesedcasa/search/blob/v0.1.0/src/commands/search.ts)_
+_See code: [src/commands/search.ts](https://github.com/hesedcasa/search/blob/v0.2.0/src/commands/search.ts)_
 <!-- commandsstop -->

package/dist/commands/search.js CHANGED Viewed

@@ -1,4 +1,5 @@
-import { Args, Command, CommandHelp, Flags, toConfiguredId } from '@oclif/core';
+import { Args, Command, CommandHelp, Flags, toConfiguredId, ux } from '@oclif/core';
+import { isMiniLMModelCached, MiniLMCommandEmbedder } from '../embedders/minilm.js';
 import { searchCommands } from '../search-logic.js';
 export default class Search extends Command {
     static args = {
@@ -18,7 +19,14 @@ export default class Search extends Command {
     async run() {
         const { args, flags } = await this.parse(Search);
         const allCommands = this.config.commands.filter((c) => !c.hidden && c.pluginName !== '@oclif/plugin-plugins');
-        const scored = searchCommands(args.query, allCommands).slice(0, flags.limit);
+        const loader = createModelLoader(this.jsonEnabled() || isMiniLMModelCached());
+        let scored;
+        try {
+            scored = (await searchCommands(args.query, allCommands, getSearchOptions(this.config, loader.onProgress))).slice(0, flags.limit);
+        }
+        finally {
+            loader.stop();
+        }
         const results = scored.map((entry) => {
             const { cmd } = entry;
             const configuredId = toConfiguredId(cmd.id, this.config);
@@ -77,3 +85,61 @@ export default class Search extends Command {
         }
     }
 }
+function getSearchOptions(config, onLoadProgress) {
+    const testConfig = config;
+    if (!testConfig.searchEmbedder)
+        return { embedder: new MiniLMCommandEmbedder({ onLoadProgress }) };
+    return { embedder: testConfig.searchEmbedder };
+}
+function createModelLoader(silent) {
+    let started = false;
+    let stopped = false;
+    const start = (status) => {
+        if (silent || stopped)
+            return;
+        if (started) {
+            ux.action.status = status;
+            return;
+        }
+        ux.action.start('Loading semantic search model', status);
+        started = true;
+    };
+    const stop = () => {
+        if (silent || stopped)
+            return;
+        stopped = true;
+        if (started)
+            ux.action.stop();
+    };
+    return {
+        onProgress(progress) {
+            if (progress.status === 'ready') {
+                stop();
+                return;
+            }
+            if (progress.status === 'progress_total' && typeof progress.progress === 'number') {
+                start(`${Math.round(progress.progress)}% ${formatBytes(progress.loaded)} / ${formatBytes(progress.total)}`);
+                return;
+            }
+            if (progress.status === 'download') {
+                start(formatProgressFile(progress.file));
+            }
+        },
+        stop,
+    };
+}
+function formatProgressFile(file) {
+    return file?.split('/').at(-1);
+}
+function formatBytes(value) {
+    if (!value || value <= 0)
+        return '?';
+    const units = ['B', 'KB', 'MB', 'GB'];
+    let size = value;
+    let unit = 0;
+    while (size >= 1024 && unit < units.length - 1) {
+        size /= 1024;
+        unit++;
+    }
+    return `${size.toFixed(size >= 10 || unit === 0 ? 0 : 1)} ${units[unit]}`;
+}

package/dist/embedders/minilm.d.ts ADDED Viewed

@@ -0,0 +1,22 @@
+import { type CommandEmbedder } from '../search-logic.js';
+export type ModelLoadProgress = {
+    file?: string;
+    loaded?: number;
+    model?: string;
+    name?: string;
+    progress?: number;
+    status: string;
+    task?: string;
+    total?: number;
+};
+export declare class MiniLMCommandEmbedder implements CommandEmbedder {
+    private readonly options;
+    private extractorPromise;
+    constructor(options?: {
+        onLoadProgress?: (progress: ModelLoadProgress) => void;
+    });
+    embed(texts: string[]): Promise<number[][]>;
+    private getExtractor;
+}
+export declare function isMiniLMModelCached(): boolean;
+export declare function getMiniLMModelCachePath(): string;

package/dist/embedders/minilm.js ADDED Viewed

@@ -0,0 +1,49 @@
+import { existsSync } from 'node:fs';
+import { createRequire } from 'node:module';
+import path from 'node:path';
+const MINILM_MODEL = 'Xenova/paraphrase-MiniLM-L3-v2';
+const MODEL_CACHE_FILES = ['config.json', 'tokenizer.json', 'tokenizer_config.json', 'onnx/model.onnx'];
+// eslint-disable-next-line no-new-func
+const importTransformers = new Function('specifier', 'return import(specifier)');
+const require = createRequire(import.meta.url);
+export class MiniLMCommandEmbedder {
+    options;
+    extractorPromise;
+    constructor(options = {}) {
+        this.options = options;
+    }
+    async embed(texts) {
+        if (texts.length === 0)
+            return [];
+        const extractor = await this.getExtractor();
+        const output = await extractor(texts, { normalize: true, pooling: 'mean' });
+        return tensorToRows(output.data, output.dims);
+    }
+    async getExtractor() {
+        this.extractorPromise ??= importTransformers('@huggingface/transformers').then(async ({ pipeline }) => {
+            const pipelineOptions = {};
+            // eslint-disable-next-line camelcase
+            pipelineOptions.progress_callback = this.options.onLoadProgress;
+            const extractor = await pipeline('feature-extraction', MINILM_MODEL, pipelineOptions);
+            return extractor;
+        });
+        return this.extractorPromise;
+    }
+}
+export function isMiniLMModelCached() {
+    const cachePath = getMiniLMModelCachePath();
+    return MODEL_CACHE_FILES.every((file) => existsSync(path.join(cachePath, file)));
+}
+export function getMiniLMModelCachePath() {
+    const entrypointPath = require.resolve('@huggingface/transformers');
+    const packagePath = path.dirname(path.dirname(entrypointPath));
+    return path.join(packagePath, '.cache', ...MINILM_MODEL.split('/'));
+}
+function tensorToRows(data, dims) {
+    const rows = dims[0] ?? 0;
+    const columns = dims[1] ?? data.length;
+    return Array.from({ length: rows }, (_, row) => {
+        const rowData = data.slice(row * columns, (row + 1) * columns);
+        return Array.isArray(rowData) ? rowData : [...rowData];
+    });
+}

package/dist/search-logic.d.ts CHANGED Viewed

@@ -8,4 +8,19 @@ export type ScoredCommand<T extends SearchableCommand = SearchableCommand> = {
     cmd: T;
     score: number;
 };
-export declare function searchCommands<T extends SearchableCommand>(query: string, commands: T[]): Array<ScoredCommand<T>>;
+export type CommandEmbedder = {
+    embed(texts: string[]): Promise<number[][]>;
+};
+export type CommandReranker<T extends SearchableCommand = SearchableCommand> = {
+    rerank(query: string, commands: Array<ScoredCommand<T>>): Promise<Array<ScoredCommand<T>>>;
+};
+export type SearchCommandsOptions<T extends SearchableCommand = SearchableCommand> = {
+    embedder?: CommandEmbedder;
+    lexicalWeight?: number;
+    minSemanticScore?: number;
+    reranker?: CommandReranker<T>;
+    semantic?: boolean;
+    semanticWeight?: number;
+};
+export declare function searchCommands<T extends SearchableCommand>(query: string, commands: T[], options?: SearchCommandsOptions<T>): Promise<Array<ScoredCommand<T>>>;
+export declare function searchCommandsLexically<T extends SearchableCommand>(query: string, commands: T[], haystack?: string[]): Array<ScoredCommand<T>>;

package/dist/search-logic.js CHANGED Viewed

@@ -1,7 +1,35 @@
 import UFuzzy from '@leeoniya/ufuzzy';
-export function searchCommands(query, commands) {
+import { MiniLMCommandEmbedder } from './embedders/minilm.js';
+const DEFAULT_MIN_SEMANTIC_SCORE = 0.22;
+const DEFAULT_SEMANTIC_WEIGHT = 0.75;
+const DEFAULT_LEXICAL_WEIGHT = 0.25;
+let defaultEmbedderPromise;
+export async function searchCommands(query, commands, options = {}) {
+    const normalizedQuery = query.trim();
+    if (normalizedQuery.length === 0 || commands.length === 0)
+        return [];
+    const haystack = commands.map((command) => commandSearchText(command));
+    const lexical = searchCommandsLexically(normalizedQuery, commands, haystack);
+    if (options.semantic === false)
+        return lexical;
+    let semantic;
+    try {
+        semantic = await searchCommandsSemantically(normalizedQuery, commands, haystack, options);
+    }
+    catch {
+        return lexical;
+    }
+    if (semantic.length === 0)
+        return lexical;
+    const merged = mergeScores(semantic, lexical, {
+        lexicalWeight: options.lexicalWeight ?? DEFAULT_LEXICAL_WEIGHT,
+        minSemanticScore: options.minSemanticScore ?? DEFAULT_MIN_SEMANTIC_SCORE,
+        semanticWeight: options.semanticWeight ?? DEFAULT_SEMANTIC_WEIGHT,
+    });
+    return options.reranker ? options.reranker.rerank(normalizedQuery, merged) : merged;
+}
+export function searchCommandsLexically(query, commands, haystack = commands.map((command) => commandSearchText(command))) {
     const uf = new UFuzzy({ intraIns: Infinity });
-    const haystack = commands.map((c) => [c.id, c.summary ?? c.description ?? '', c.pluginName ?? ''].filter(Boolean).join(' '));
     const [idxs, , order] = uf.search(haystack, query, 0, Infinity);
     if (idxs && idxs.length > 0) {
         const ranked = order ?? idxs.map((_, i) => i);
@@ -25,3 +53,54 @@ export function searchCommands(query, commands) {
         .sort((a, b) => b[1] - a[1] || a[0] - b[0])
         .map(([idx, hits]) => ({ cmd: commands[idx], score: tokens.length - hits }));
 }
+function commandSearchText(command) {
+    return [command.id, command.summary ?? command.description ?? '', command.pluginName ?? '']
+        .filter(Boolean)
+        .join(' ')
+        .replaceAll(/\s+/g, ' ')
+        .trim();
+}
+async function searchCommandsSemantically(query, commands, haystack, options) {
+    const embedder = options.embedder ?? (await getDefaultEmbedder());
+    const [queryEmbedding, ...commandEmbeddings] = await embedder.embed([query, ...haystack]);
+    return commandEmbeddings
+        .map((embedding, idx) => ({
+        cmd: commands[idx],
+        score: cosineSimilarity(queryEmbedding, embedding),
+    }))
+        .filter(({ score }) => Number.isFinite(score))
+        .sort((a, b) => b.score - a.score);
+}
+async function getDefaultEmbedder() {
+    defaultEmbedderPromise ??= Promise.resolve(new MiniLMCommandEmbedder());
+    return defaultEmbedderPromise;
+}
+function mergeScores(semantic, lexical, options) {
+    const lexicalById = new Map(lexical.map((entry, idx) => [entry.cmd.id, lexicalScore(idx, lexical.length)]));
+    const lexicalIds = new Set(lexical.map((entry) => entry.cmd.id));
+    return semantic
+        .filter((entry) => entry.score >= options.minSemanticScore || lexicalIds.has(entry.cmd.id))
+        .map((entry) => {
+        const finalScore = entry.score * options.semanticWeight + (lexicalById.get(entry.cmd.id) ?? 0) * options.lexicalWeight;
+        return { cmd: entry.cmd, score: 1 - finalScore };
+    })
+        .sort((a, b) => a.score - b.score);
+}
+function lexicalScore(index, length) {
+    if (length <= 1)
+        return 1;
+    return 1 - index / (length - 1);
+}
+function cosineSimilarity(a, b) {
+    let dot = 0;
+    let aMagnitude = 0;
+    let bMagnitude = 0;
+    for (let i = 0; i < Math.min(a.length, b.length); i++) {
+        dot += a[i] * b[i];
+        aMagnitude += a[i] * a[i];
+        bMagnitude += b[i] * b[i];
+    }
+    if (aMagnitude === 0 || bMagnitude === 0)
+        return 0;
+    return dot / (Math.sqrt(aMagnitude) * Math.sqrt(bMagnitude));
+}

package/oclif.manifest.json CHANGED Viewed

@@ -58,5 +58,5 @@
       ]
     }
   },
-  "version": "0.1.0"
+  "version": "0.2.0"
 }

package/package.json CHANGED Viewed

@@ -1,13 +1,14 @@
 {
   "name": "@hesed/search",
   "description": "Intelligence search plugin",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "author": "Hesed",
   "bin": {
     "permission": "./bin/run.js"
   },
   "bugs": "https://github.com/hesedcasa/search/issues",
   "dependencies": {
+    "@huggingface/transformers": "^4.2.0",
     "@leeoniya/ufuzzy": "^1.0.19",
     "@oclif/core": "^4"
   },