@hesed/search 0.1.0 → 0.2.0

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
package/README.md CHANGED
@@ -20,7 +20,7 @@ $ npm install -g @hesed/search
20
20
  $ search COMMAND
21
21
  running command...
22
22
  $ search (--version)
23
- @hesed/search/0.1.0 darwin-arm64 node-v22.14.0
23
+ @hesed/search/0.2.0 linux-x64 node-v24.16.0
24
24
  $ search --help [COMMAND]
25
25
  USAGE
26
26
  $ search COMMAND
@@ -62,5 +62,5 @@ EXAMPLES
62
62
  $ search search "update jira" --details
63
63
  ```
64
64
 
65
- _See code: [src/commands/search.ts](https://github.com/hesedcasa/search/blob/v0.1.0/src/commands/search.ts)_
65
+ _See code: [src/commands/search.ts](https://github.com/hesedcasa/search/blob/v0.2.0/src/commands/search.ts)_
66
66
  <!-- commandsstop -->
@@ -1,4 +1,5 @@
1
- import { Args, Command, CommandHelp, Flags, toConfiguredId } from '@oclif/core';
1
+ import { Args, Command, CommandHelp, Flags, toConfiguredId, ux } from '@oclif/core';
2
+ import { isMiniLMModelCached, MiniLMCommandEmbedder } from '../embedders/minilm.js';
2
3
  import { searchCommands } from '../search-logic.js';
3
4
  export default class Search extends Command {
4
5
  static args = {
@@ -18,7 +19,14 @@ export default class Search extends Command {
18
19
  async run() {
19
20
  const { args, flags } = await this.parse(Search);
20
21
  const allCommands = this.config.commands.filter((c) => !c.hidden && c.pluginName !== '@oclif/plugin-plugins');
21
- const scored = searchCommands(args.query, allCommands).slice(0, flags.limit);
22
+ const loader = createModelLoader(this.jsonEnabled() || isMiniLMModelCached());
23
+ let scored;
24
+ try {
25
+ scored = (await searchCommands(args.query, allCommands, getSearchOptions(this.config, loader.onProgress))).slice(0, flags.limit);
26
+ }
27
+ finally {
28
+ loader.stop();
29
+ }
22
30
  const results = scored.map((entry) => {
23
31
  const { cmd } = entry;
24
32
  const configuredId = toConfiguredId(cmd.id, this.config);
@@ -77,3 +85,61 @@ export default class Search extends Command {
77
85
  }
78
86
  }
79
87
  }
88
/**
 * Builds the options object passed to `searchCommands`.
 *
 * If the config object exposes a truthy `searchEmbedder` property, that embedder is used
 * unchanged (presumably an injection point for tests — confirm against the test suite).
 * Otherwise a fresh `MiniLMCommandEmbedder` is created and wired to `onLoadProgress`.
 *
 * @param config - the command's config object; only `searchEmbedder` is read here.
 * @param onLoadProgress - callback forwarded to the MiniLM embedder as `onLoadProgress`.
 * @returns an object of the form `{ embedder }`.
 */
function getSearchOptions(config, onLoadProgress) {
    const injectedEmbedder = config.searchEmbedder;
    const embedder = injectedEmbedder
        ? injectedEmbedder
        : new MiniLMCommandEmbedder({ onLoadProgress });
    return { embedder };
}
94
/**
 * Creates a controller that mirrors model-load progress events onto `ux.action`.
 *
 * The action is started lazily on the first reportable event and stopped at most once.
 * When `silent` is true every operation is a no-op.
 *
 * @param silent - suppress all `ux.action` output.
 * @returns `{ onProgress, stop }`: `onProgress` consumes progress events, `stop` ends the action.
 */
function createModelLoader(silent) {
    const state = { started: false, stopped: false };

    // Starts the action on first use; afterwards only the status text is refreshed.
    function report(status) {
        if (silent || state.stopped) {
            return;
        }
        if (!state.started) {
            ux.action.start('Loading semantic search model', status);
            state.started = true;
            return;
        }
        ux.action.status = status;
    }

    // Ends the action exactly once; events arriving after this are ignored by `report`.
    function stop() {
        if (silent || state.stopped) {
            return;
        }
        state.stopped = true;
        if (state.started) {
            ux.action.stop();
        }
    }

    function onProgress(progress) {
        switch (progress.status) {
            case 'ready':
                stop();
                break;
            case 'progress_total':
                // Events without a numeric percentage are dropped.
                if (typeof progress.progress === 'number') {
                    report(`${Math.round(progress.progress)}% ${formatBytes(progress.loaded)} / ${formatBytes(progress.total)}`);
                }
                break;
            case 'download':
                report(formatProgressFile(progress.file));
                break;
            default:
                break;
        }
    }

    return { onProgress, stop };
}
131
/**
 * Returns the last `/`-separated segment of a file path.
 *
 * @param file - path string, or `undefined`/`null`.
 * @returns the final segment, or `undefined` when `file` is nullish.
 */
function formatProgressFile(file) {
    if (file === undefined || file === null) {
        return undefined;
    }
    const segments = file.split('/');
    return segments[segments.length - 1];
}
134
/**
 * Formats a byte count as a short human-readable string using B, KB, MB or GB
 * (1 KB = 1024 B). GB is the largest unit, so bigger values stay in GB.
 *
 * Bytes are shown without decimals. Larger units show one decimal until the
 * displayed value reaches 10, then none.
 *
 * @param value - number of bytes.
 * @returns the formatted size, or `'?'` when `value` is missing, not a finite
 *          number, zero or negative.
 */
function formatBytes(value) {
    if (typeof value !== 'number' || !Number.isFinite(value) || value <= 0)
        return '?';
    const units = ['B', 'KB', 'MB', 'GB'];
    const lastUnit = units.length - 1;
    // Decide the precision from the value as it will be displayed, so 9.96 renders as "10", not "10.0".
    const render = (amount, unitIndex) => amount.toFixed(unitIndex === 0 || Math.round(amount * 10) >= 100 ? 0 : 1);
    let size = value;
    let unit = 0;
    while (size >= 1024 && unit < lastUnit) {
        size /= 1024;
        unit++;
    }
    let text = render(size, unit);
    // Rounding can lift the displayed number to 1024 (e.g. 1048575 B -> "1024 KB");
    // promote to the next unit so the output reads "1.0 MB" instead.
    if (Number(text) >= 1024 && unit < lastUnit) {
        size /= 1024;
        unit++;
        text = render(size, unit);
    }
    return `${text} ${units[unit]}`;
}
@@ -0,0 +1,22 @@
1
+ import { type CommandEmbedder } from '../search-logic.js';
2
+ export type ModelLoadProgress = {
3
+ file?: string;
4
+ loaded?: number;
5
+ model?: string;
6
+ name?: string;
7
+ progress?: number;
8
+ status: string;
9
+ task?: string;
10
+ total?: number;
11
+ };
12
+ export declare class MiniLMCommandEmbedder implements CommandEmbedder {
13
+ private readonly options;
14
+ private extractorPromise;
15
+ constructor(options?: {
16
+ onLoadProgress?: (progress: ModelLoadProgress) => void;
17
+ });
18
+ embed(texts: string[]): Promise<number[][]>;
19
+ private getExtractor;
20
+ }
21
+ export declare function isMiniLMModelCached(): boolean;
22
+ export declare function getMiniLMModelCachePath(): string;
@@ -0,0 +1,49 @@
1
+ import { existsSync } from 'node:fs';
2
+ import { createRequire } from 'node:module';
3
+ import path from 'node:path';
4
+ const MINILM_MODEL = 'Xenova/paraphrase-MiniLM-L3-v2';
5
+ const MODEL_CACHE_FILES = ['config.json', 'tokenizer.json', 'tokenizer_config.json', 'onnx/model.onnx'];
6
+ // eslint-disable-next-line no-new-func
7
+ const importTransformers = new Function('specifier', 'return import(specifier)');
8
+ const require = createRequire(import.meta.url);
9
+ export class MiniLMCommandEmbedder {
10
+ options;
11
+ extractorPromise;
12
+ constructor(options = {}) {
13
+ this.options = options;
14
+ }
15
+ async embed(texts) {
16
+ if (texts.length === 0)
17
+ return [];
18
+ const extractor = await this.getExtractor();
19
+ const output = await extractor(texts, { normalize: true, pooling: 'mean' });
20
+ return tensorToRows(output.data, output.dims);
21
+ }
22
+ async getExtractor() {
23
+ this.extractorPromise ??= importTransformers('@huggingface/transformers').then(async ({ pipeline }) => {
24
+ const pipelineOptions = {};
25
+ // eslint-disable-next-line camelcase
26
+ pipelineOptions.progress_callback = this.options.onLoadProgress;
27
+ const extractor = await pipeline('feature-extraction', MINILM_MODEL, pipelineOptions);
28
+ return extractor;
29
+ });
30
+ return this.extractorPromise;
31
+ }
32
+ }
33
+ export function isMiniLMModelCached() {
34
+ const cachePath = getMiniLMModelCachePath();
35
+ return MODEL_CACHE_FILES.every((file) => existsSync(path.join(cachePath, file)));
36
+ }
37
/**
 * Computes the directory where the MiniLM model files are expected on disk.
 *
 * The path is `<package root>/.cache/<MINILM_MODEL split on "/">`, where the package root
 * is taken to be two directory levels above the resolved entry point of
 * `@huggingface/transformers`.
 * NOTE(review): presumably this mirrors the library's default cache location — confirm
 * against the installed transformers version.
 *
 * @returns the cache directory path for the model.
 */
export function getMiniLMModelCachePath() {
    const entryFile = require.resolve('@huggingface/transformers');
    const entryDir = path.dirname(entryFile);
    const packageRoot = path.dirname(entryDir);
    const modelSegments = MINILM_MODEL.split('/');
    return path.join(packageRoot, '.cache', ...modelSegments);
}
42
/**
 * Splits flat tensor data into plain-array rows.
 *
 * @param data - flat sequence supporting `slice` (plain array or typed array).
 * @param dims - `[rows, columns]`; a missing row count means 0 rows, a missing column
 *               count means each row spans `data.length` items.
 * @returns `rows` plain arrays, each holding up to `columns` consecutive values.
 */
function tensorToRows(data, dims) {
    const rowCount = dims[0] ?? 0;
    const width = dims[1] ?? data.length;
    const result = [];
    for (let rowIndex = 0; rowIndex < rowCount; rowIndex++) {
        const chunk = data.slice(rowIndex * width, (rowIndex + 1) * width);
        // Typed-array slices are copied into plain arrays; plain arrays are used as-is.
        if (Array.isArray(chunk)) {
            result.push(chunk);
        }
        else {
            result.push([...chunk]);
        }
    }
    return result;
}
@@ -8,4 +8,19 @@ export type ScoredCommand<T extends SearchableCommand = SearchableCommand> = {
8
8
  cmd: T;
9
9
  score: number;
10
10
  };
11
- export declare function searchCommands<T extends SearchableCommand>(query: string, commands: T[]): Array<ScoredCommand<T>>;
11
+ export type CommandEmbedder = {
12
+ embed(texts: string[]): Promise<number[][]>;
13
+ };
14
+ export type CommandReranker<T extends SearchableCommand = SearchableCommand> = {
15
+ rerank(query: string, commands: Array<ScoredCommand<T>>): Promise<Array<ScoredCommand<T>>>;
16
+ };
17
+ export type SearchCommandsOptions<T extends SearchableCommand = SearchableCommand> = {
18
+ embedder?: CommandEmbedder;
19
+ lexicalWeight?: number;
20
+ minSemanticScore?: number;
21
+ reranker?: CommandReranker<T>;
22
+ semantic?: boolean;
23
+ semanticWeight?: number;
24
+ };
25
+ export declare function searchCommands<T extends SearchableCommand>(query: string, commands: T[], options?: SearchCommandsOptions<T>): Promise<Array<ScoredCommand<T>>>;
26
+ export declare function searchCommandsLexically<T extends SearchableCommand>(query: string, commands: T[], haystack?: string[]): Array<ScoredCommand<T>>;
@@ -1,7 +1,35 @@
1
1
  import UFuzzy from '@leeoniya/ufuzzy';
2
- export function searchCommands(query, commands) {
2
+ import { MiniLMCommandEmbedder } from './embedders/minilm.js';
3
+ const DEFAULT_MIN_SEMANTIC_SCORE = 0.22;
4
+ const DEFAULT_SEMANTIC_WEIGHT = 0.75;
5
+ const DEFAULT_LEXICAL_WEIGHT = 0.25;
6
+ let defaultEmbedderPromise;
7
/**
 * Searches commands for a query, combining lexical and semantic ranking.
 *
 * Lexical results are always computed. They are returned unchanged when semantic search
 * is disabled (`options.semantic === false`), when it throws, or when it yields nothing.
 * Otherwise the two rankings are merged and, if a reranker is supplied, passed through it.
 *
 * @param query - user query; surrounding whitespace is ignored.
 * @param commands - commands to search.
 * @param options - optional embedder, reranker, weights and thresholds.
 * @returns scored commands; `[]` for a blank query or an empty command list.
 */
export async function searchCommands(query, commands, options = {}) {
    const trimmedQuery = query.trim();
    if (!trimmedQuery.length || !commands.length) {
        return [];
    }
    const searchTexts = commands.map((command) => commandSearchText(command));
    const lexicalHits = searchCommandsLexically(trimmedQuery, commands, searchTexts);
    if (options.semantic === false) {
        return lexicalHits;
    }
    // Semantic search is best-effort: any failure degrades to the lexical ranking.
    let semanticHits = [];
    try {
        semanticHits = await searchCommandsSemantically(trimmedQuery, commands, searchTexts, options);
    }
    catch {
        semanticHits = [];
    }
    if (semanticHits.length === 0) {
        return lexicalHits;
    }
    const weights = {
        lexicalWeight: options.lexicalWeight ?? DEFAULT_LEXICAL_WEIGHT,
        minSemanticScore: options.minSemanticScore ?? DEFAULT_MIN_SEMANTIC_SCORE,
        semanticWeight: options.semanticWeight ?? DEFAULT_SEMANTIC_WEIGHT,
    };
    const merged = mergeScores(semanticHits, lexicalHits, weights);
    if (!options.reranker) {
        return merged;
    }
    return options.reranker.rerank(trimmedQuery, merged);
}
31
+ export function searchCommandsLexically(query, commands, haystack = commands.map((command) => commandSearchText(command))) {
3
32
  const uf = new UFuzzy({ intraIns: Infinity });
4
- const haystack = commands.map((c) => [c.id, c.summary ?? c.description ?? '', c.pluginName ?? ''].filter(Boolean).join(' '));
5
33
  const [idxs, , order] = uf.search(haystack, query, 0, Infinity);
6
34
  if (idxs && idxs.length > 0) {
7
35
  const ranked = order ?? idxs.map((_, i) => i);
@@ -25,3 +53,54 @@ export function searchCommands(query, commands) {
25
53
  .sort((a, b) => b[1] - a[1] || a[0] - b[0])
26
54
  .map(([idx, hits]) => ({ cmd: commands[idx], score: tokens.length - hits }));
27
55
  }
56
/**
 * Builds the single-line text a command is searched by.
 *
 * Joins the id, the summary (or the description when the summary is nullish) and the
 * plugin name with spaces, skipping empty parts, then collapses whitespace runs and trims.
 *
 * @param command - object with `id` and optional `summary`, `description`, `pluginName`.
 * @returns the normalized search text.
 */
function commandSearchText(command) {
    const blurb = command.summary ?? command.description ?? '';
    const plugin = command.pluginName ?? '';
    const parts = [command.id, blurb, plugin].filter((part) => Boolean(part));
    const joined = parts.join(' ');
    return joined.replace(/\s+/g, ' ').trim();
}
63
/**
 * Ranks commands by cosine similarity between the query embedding and each command's
 * text embedding.
 *
 * The query and all haystack texts are embedded in a single `embed` call; the first
 * returned vector is treated as the query's.
 *
 * @param query - normalized query text.
 * @param commands - commands, index-aligned with `haystack`.
 * @param haystack - search text for each command.
 * @param options - may carry `embedder`; otherwise the default embedder is used.
 * @returns entries with finite scores, highest similarity first.
 */
async function searchCommandsSemantically(query, commands, haystack, options) {
    const embedder = options.embedder ?? (await getDefaultEmbedder());
    const vectors = await embedder.embed([query, ...haystack]);
    const [queryEmbedding, ...commandEmbeddings] = vectors;
    const scored = commandEmbeddings.map((embedding, idx) => {
        const score = cosineSimilarity(queryEmbedding, embedding);
        return { cmd: commands[idx], score };
    });
    // Drop NaN/Infinity scores before ordering.
    const finite = scored.filter((entry) => Number.isFinite(entry.score));
    finite.sort((left, right) => right.score - left.score);
    return finite;
}
74
/**
 * Returns the module-wide default embedder, creating a `MiniLMCommandEmbedder` on first use.
 *
 * @returns a promise for the shared default embedder.
 */
async function getDefaultEmbedder() {
    if (defaultEmbedderPromise === undefined || defaultEmbedderPromise === null) {
        defaultEmbedderPromise = Promise.resolve(new MiniLMCommandEmbedder());
    }
    return defaultEmbedderPromise;
}
78
/**
 * Blends semantic similarity with lexical rank into one ordering.
 *
 * A semantic entry is kept when its score reaches `minSemanticScore` or when the command
 * also appears in the lexical results. The blended value is
 * `semantic * semanticWeight + lexicalRankScore * lexicalWeight`, and the returned score
 * is `1 - blended`, so lower scores are better matches.
 *
 * @param semantic - semantic results with similarity scores.
 * @param lexical - lexical results, best first; only their order and ids are used.
 * @param options - `{ lexicalWeight, minSemanticScore, semanticWeight }`.
 * @returns merged entries sorted by ascending score.
 */
function mergeScores(semantic, lexical, options) {
    // Map each lexically matched id to its rank-derived score.
    const lexicalRankScores = new Map();
    lexical.forEach((entry, rank) => {
        lexicalRankScores.set(entry.cmd.id, lexicalScore(rank, lexical.length));
    });
    const merged = [];
    for (const entry of semantic) {
        const id = entry.cmd.id;
        const passesThreshold = entry.score >= options.minSemanticScore;
        if (!passesThreshold && !lexicalRankScores.has(id)) {
            continue;
        }
        const lexicalPart = (lexicalRankScores.get(id) ?? 0) * options.lexicalWeight;
        const blended = entry.score * options.semanticWeight + lexicalPart;
        merged.push({ cmd: entry.cmd, score: 1 - blended });
    }
    merged.sort((left, right) => left.score - right.score);
    return merged;
}
89
/**
 * Converts a position in a ranked list into a score from 1 (first) down to 0 (last).
 *
 * @param index - zero-based position in the list.
 * @param length - number of items in the list.
 * @returns `1` when the list has at most one item, otherwise `1 - index / (length - 1)`.
 */
function lexicalScore(index, length) {
    return length <= 1 ? 1 : 1 - index / (length - 1);
}
94
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * Only the leading elements both vectors share are considered when their lengths differ.
 *
 * @param a - first vector.
 * @param b - second vector.
 * @returns the cosine of the angle between the vectors, or `0` when either has zero magnitude.
 */
function cosineSimilarity(a, b) {
    const sharedLength = Math.min(a.length, b.length);
    let dot = 0;
    let normA = 0;
    let normB = 0;
    let i = 0;
    while (i < sharedLength) {
        const x = a[i];
        const y = b[i];
        dot += x * y;
        normA += x * x;
        normB += y * y;
        i += 1;
    }
    // A zero vector has no direction; report no similarity instead of dividing by zero.
    if (normA === 0 || normB === 0) {
        return 0;
    }
    return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
@@ -58,5 +58,5 @@
58
58
  ]
59
59
  }
60
60
  },
61
- "version": "0.1.0"
61
+ "version": "0.2.0"
62
62
  }
package/package.json CHANGED
@@ -1,13 +1,14 @@
1
1
  {
2
2
  "name": "@hesed/search",
3
3
  "description": "Intelligence search plugin",
4
- "version": "0.1.0",
4
+ "version": "0.2.0",
5
5
  "author": "Hesed",
6
6
  "bin": {
7
7
  "permission": "./bin/run.js"
8
8
  },
9
9
  "bugs": "https://github.com/hesedcasa/search/issues",
10
10
  "dependencies": {
11
+ "@huggingface/transformers": "^4.2.0",
11
12
  "@leeoniya/ufuzzy": "^1.0.19",
12
13
  "@oclif/core": "^4"
13
14
  },