@hesed/search 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/commands/search.js +68 -2
- package/dist/embedders/minilm.d.ts +22 -0
- package/dist/embedders/minilm.js +49 -0
- package/dist/search-logic.d.ts +16 -1
- package/dist/search-logic.js +81 -2
- package/oclif.manifest.json +1 -1
- package/package.json +2 -1
package/README.md
CHANGED
|
@@ -20,7 +20,7 @@ $ npm install -g @hesed/search
|
|
|
20
20
|
$ search COMMAND
|
|
21
21
|
running command...
|
|
22
22
|
$ search (--version)
|
|
23
|
-
@hesed/search/0.
|
|
23
|
+
@hesed/search/0.2.0 linux-x64 node-v24.16.0
|
|
24
24
|
$ search --help [COMMAND]
|
|
25
25
|
USAGE
|
|
26
26
|
$ search COMMAND
|
|
@@ -62,5 +62,5 @@ EXAMPLES
|
|
|
62
62
|
$ search search "update jira" --details
|
|
63
63
|
```
|
|
64
64
|
|
|
65
|
-
_See code: [src/commands/search.ts](https://github.com/hesedcasa/search/blob/v0.1.0/src/commands/search.ts)_
|
|
65
|
+
_See code: [src/commands/search.ts](https://github.com/hesedcasa/search/blob/v0.2.0/src/commands/search.ts)_
|
|
66
66
|
<!-- commandsstop -->
|
package/dist/commands/search.js
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Args, Command, CommandHelp, Flags, toConfiguredId } from '@oclif/core';
|
|
1
|
+
import { Args, Command, CommandHelp, Flags, toConfiguredId, ux } from '@oclif/core';
|
|
2
|
+
import { isMiniLMModelCached, MiniLMCommandEmbedder } from '../embedders/minilm.js';
|
|
2
3
|
import { searchCommands } from '../search-logic.js';
|
|
3
4
|
export default class Search extends Command {
|
|
4
5
|
static args = {
|
|
@@ -18,7 +19,14 @@ export default class Search extends Command {
|
|
|
18
19
|
async run() {
|
|
19
20
|
const { args, flags } = await this.parse(Search);
|
|
20
21
|
const allCommands = this.config.commands.filter((c) => !c.hidden && c.pluginName !== '@oclif/plugin-plugins');
|
|
21
|
-
const
|
|
22
|
+
const loader = createModelLoader(this.jsonEnabled() || isMiniLMModelCached());
|
|
23
|
+
let scored;
|
|
24
|
+
try {
|
|
25
|
+
scored = (await searchCommands(args.query, allCommands, getSearchOptions(this.config, loader.onProgress))).slice(0, flags.limit);
|
|
26
|
+
}
|
|
27
|
+
finally {
|
|
28
|
+
loader.stop();
|
|
29
|
+
}
|
|
22
30
|
const results = scored.map((entry) => {
|
|
23
31
|
const { cmd } = entry;
|
|
24
32
|
const configuredId = toConfiguredId(cmd.id, this.config);
|
|
@@ -77,3 +85,61 @@ export default class Search extends Command {
|
|
|
77
85
|
}
|
|
78
86
|
}
|
|
79
87
|
}
|
|
88
|
+
function getSearchOptions(config, onLoadProgress) {
|
|
89
|
+
const testConfig = config;
|
|
90
|
+
if (!testConfig.searchEmbedder)
|
|
91
|
+
return { embedder: new MiniLMCommandEmbedder({ onLoadProgress }) };
|
|
92
|
+
return { embedder: testConfig.searchEmbedder };
|
|
93
|
+
}
|
|
94
|
+
function createModelLoader(silent) {
|
|
95
|
+
let started = false;
|
|
96
|
+
let stopped = false;
|
|
97
|
+
const start = (status) => {
|
|
98
|
+
if (silent || stopped)
|
|
99
|
+
return;
|
|
100
|
+
if (started) {
|
|
101
|
+
ux.action.status = status;
|
|
102
|
+
return;
|
|
103
|
+
}
|
|
104
|
+
ux.action.start('Loading semantic search model', status);
|
|
105
|
+
started = true;
|
|
106
|
+
};
|
|
107
|
+
const stop = () => {
|
|
108
|
+
if (silent || stopped)
|
|
109
|
+
return;
|
|
110
|
+
stopped = true;
|
|
111
|
+
if (started)
|
|
112
|
+
ux.action.stop();
|
|
113
|
+
};
|
|
114
|
+
return {
|
|
115
|
+
onProgress(progress) {
|
|
116
|
+
if (progress.status === 'ready') {
|
|
117
|
+
stop();
|
|
118
|
+
return;
|
|
119
|
+
}
|
|
120
|
+
if (progress.status === 'progress_total' && typeof progress.progress === 'number') {
|
|
121
|
+
start(`${Math.round(progress.progress)}% ${formatBytes(progress.loaded)} / ${formatBytes(progress.total)}`);
|
|
122
|
+
return;
|
|
123
|
+
}
|
|
124
|
+
if (progress.status === 'download') {
|
|
125
|
+
start(formatProgressFile(progress.file));
|
|
126
|
+
}
|
|
127
|
+
},
|
|
128
|
+
stop,
|
|
129
|
+
};
|
|
130
|
+
}
|
|
131
|
+
function formatProgressFile(file) {
|
|
132
|
+
return file?.split('/').at(-1);
|
|
133
|
+
}
|
|
134
|
+
function formatBytes(value) {
|
|
135
|
+
if (!value || value <= 0)
|
|
136
|
+
return '?';
|
|
137
|
+
const units = ['B', 'KB', 'MB', 'GB'];
|
|
138
|
+
let size = value;
|
|
139
|
+
let unit = 0;
|
|
140
|
+
while (size >= 1024 && unit < units.length - 1) {
|
|
141
|
+
size /= 1024;
|
|
142
|
+
unit++;
|
|
143
|
+
}
|
|
144
|
+
return `${size.toFixed(size >= 10 || unit === 0 ? 0 : 1)} ${units[unit]}`;
|
|
145
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { type CommandEmbedder } from '../search-logic.js';
|
|
2
|
+
export type ModelLoadProgress = {
|
|
3
|
+
file?: string;
|
|
4
|
+
loaded?: number;
|
|
5
|
+
model?: string;
|
|
6
|
+
name?: string;
|
|
7
|
+
progress?: number;
|
|
8
|
+
status: string;
|
|
9
|
+
task?: string;
|
|
10
|
+
total?: number;
|
|
11
|
+
};
|
|
12
|
+
export declare class MiniLMCommandEmbedder implements CommandEmbedder {
|
|
13
|
+
private readonly options;
|
|
14
|
+
private extractorPromise;
|
|
15
|
+
constructor(options?: {
|
|
16
|
+
onLoadProgress?: (progress: ModelLoadProgress) => void;
|
|
17
|
+
});
|
|
18
|
+
embed(texts: string[]): Promise<number[][]>;
|
|
19
|
+
private getExtractor;
|
|
20
|
+
}
|
|
21
|
+
export declare function isMiniLMModelCached(): boolean;
|
|
22
|
+
export declare function getMiniLMModelCachePath(): string;
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import { existsSync } from 'node:fs';
|
|
2
|
+
import { createRequire } from 'node:module';
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
const MINILM_MODEL = 'Xenova/paraphrase-MiniLM-L3-v2';
|
|
5
|
+
const MODEL_CACHE_FILES = ['config.json', 'tokenizer.json', 'tokenizer_config.json', 'onnx/model.onnx'];
|
|
6
|
+
// eslint-disable-next-line no-new-func
|
|
7
|
+
const importTransformers = new Function('specifier', 'return import(specifier)');
|
|
8
|
+
const require = createRequire(import.meta.url);
|
|
9
|
+
export class MiniLMCommandEmbedder {
|
|
10
|
+
options;
|
|
11
|
+
extractorPromise;
|
|
12
|
+
constructor(options = {}) {
|
|
13
|
+
this.options = options;
|
|
14
|
+
}
|
|
15
|
+
async embed(texts) {
|
|
16
|
+
if (texts.length === 0)
|
|
17
|
+
return [];
|
|
18
|
+
const extractor = await this.getExtractor();
|
|
19
|
+
const output = await extractor(texts, { normalize: true, pooling: 'mean' });
|
|
20
|
+
return tensorToRows(output.data, output.dims);
|
|
21
|
+
}
|
|
22
|
+
async getExtractor() {
|
|
23
|
+
this.extractorPromise ??= importTransformers('@huggingface/transformers').then(async ({ pipeline }) => {
|
|
24
|
+
const pipelineOptions = {};
|
|
25
|
+
// eslint-disable-next-line camelcase
|
|
26
|
+
pipelineOptions.progress_callback = this.options.onLoadProgress;
|
|
27
|
+
const extractor = await pipeline('feature-extraction', MINILM_MODEL, pipelineOptions);
|
|
28
|
+
return extractor;
|
|
29
|
+
});
|
|
30
|
+
return this.extractorPromise;
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
export function isMiniLMModelCached() {
|
|
34
|
+
const cachePath = getMiniLMModelCachePath();
|
|
35
|
+
return MODEL_CACHE_FILES.every((file) => existsSync(path.join(cachePath, file)));
|
|
36
|
+
}
|
|
37
|
+
export function getMiniLMModelCachePath() {
|
|
38
|
+
const entrypointPath = require.resolve('@huggingface/transformers');
|
|
39
|
+
const packagePath = path.dirname(path.dirname(entrypointPath));
|
|
40
|
+
return path.join(packagePath, '.cache', ...MINILM_MODEL.split('/'));
|
|
41
|
+
}
|
|
42
|
+
function tensorToRows(data, dims) {
|
|
43
|
+
const rows = dims[0] ?? 0;
|
|
44
|
+
const columns = dims[1] ?? data.length;
|
|
45
|
+
return Array.from({ length: rows }, (_, row) => {
|
|
46
|
+
const rowData = data.slice(row * columns, (row + 1) * columns);
|
|
47
|
+
return Array.isArray(rowData) ? rowData : [...rowData];
|
|
48
|
+
});
|
|
49
|
+
}
|
package/dist/search-logic.d.ts
CHANGED
|
@@ -8,4 +8,19 @@ export type ScoredCommand<T extends SearchableCommand = SearchableCommand> = {
|
|
|
8
8
|
cmd: T;
|
|
9
9
|
score: number;
|
|
10
10
|
};
|
|
11
|
-
export
|
|
11
|
+
export type CommandEmbedder = {
|
|
12
|
+
embed(texts: string[]): Promise<number[][]>;
|
|
13
|
+
};
|
|
14
|
+
export type CommandReranker<T extends SearchableCommand = SearchableCommand> = {
|
|
15
|
+
rerank(query: string, commands: Array<ScoredCommand<T>>): Promise<Array<ScoredCommand<T>>>;
|
|
16
|
+
};
|
|
17
|
+
export type SearchCommandsOptions<T extends SearchableCommand = SearchableCommand> = {
|
|
18
|
+
embedder?: CommandEmbedder;
|
|
19
|
+
lexicalWeight?: number;
|
|
20
|
+
minSemanticScore?: number;
|
|
21
|
+
reranker?: CommandReranker<T>;
|
|
22
|
+
semantic?: boolean;
|
|
23
|
+
semanticWeight?: number;
|
|
24
|
+
};
|
|
25
|
+
export declare function searchCommands<T extends SearchableCommand>(query: string, commands: T[], options?: SearchCommandsOptions<T>): Promise<Array<ScoredCommand<T>>>;
|
|
26
|
+
export declare function searchCommandsLexically<T extends SearchableCommand>(query: string, commands: T[], haystack?: string[]): Array<ScoredCommand<T>>;
|
package/dist/search-logic.js
CHANGED
|
@@ -1,7 +1,35 @@
|
|
|
1
1
|
import UFuzzy from '@leeoniya/ufuzzy';
|
|
2
|
-
|
|
2
|
+
import { MiniLMCommandEmbedder } from './embedders/minilm.js';
|
|
3
|
+
const DEFAULT_MIN_SEMANTIC_SCORE = 0.22;
|
|
4
|
+
const DEFAULT_SEMANTIC_WEIGHT = 0.75;
|
|
5
|
+
const DEFAULT_LEXICAL_WEIGHT = 0.25;
|
|
6
|
+
let defaultEmbedderPromise;
|
|
7
|
+
export async function searchCommands(query, commands, options = {}) {
|
|
8
|
+
const normalizedQuery = query.trim();
|
|
9
|
+
if (normalizedQuery.length === 0 || commands.length === 0)
|
|
10
|
+
return [];
|
|
11
|
+
const haystack = commands.map((command) => commandSearchText(command));
|
|
12
|
+
const lexical = searchCommandsLexically(normalizedQuery, commands, haystack);
|
|
13
|
+
if (options.semantic === false)
|
|
14
|
+
return lexical;
|
|
15
|
+
let semantic;
|
|
16
|
+
try {
|
|
17
|
+
semantic = await searchCommandsSemantically(normalizedQuery, commands, haystack, options);
|
|
18
|
+
}
|
|
19
|
+
catch {
|
|
20
|
+
return lexical;
|
|
21
|
+
}
|
|
22
|
+
if (semantic.length === 0)
|
|
23
|
+
return lexical;
|
|
24
|
+
const merged = mergeScores(semantic, lexical, {
|
|
25
|
+
lexicalWeight: options.lexicalWeight ?? DEFAULT_LEXICAL_WEIGHT,
|
|
26
|
+
minSemanticScore: options.minSemanticScore ?? DEFAULT_MIN_SEMANTIC_SCORE,
|
|
27
|
+
semanticWeight: options.semanticWeight ?? DEFAULT_SEMANTIC_WEIGHT,
|
|
28
|
+
});
|
|
29
|
+
return options.reranker ? options.reranker.rerank(normalizedQuery, merged) : merged;
|
|
30
|
+
}
|
|
31
|
+
export function searchCommandsLexically(query, commands, haystack = commands.map((command) => commandSearchText(command))) {
|
|
3
32
|
const uf = new UFuzzy({ intraIns: Infinity });
|
|
4
|
-
const haystack = commands.map((c) => [c.id, c.summary ?? c.description ?? '', c.pluginName ?? ''].filter(Boolean).join(' '));
|
|
5
33
|
const [idxs, , order] = uf.search(haystack, query, 0, Infinity);
|
|
6
34
|
if (idxs && idxs.length > 0) {
|
|
7
35
|
const ranked = order ?? idxs.map((_, i) => i);
|
|
@@ -25,3 +53,54 @@ export function searchCommands(query, commands) {
|
|
|
25
53
|
.sort((a, b) => b[1] - a[1] || a[0] - b[0])
|
|
26
54
|
.map(([idx, hits]) => ({ cmd: commands[idx], score: tokens.length - hits }));
|
|
27
55
|
}
|
|
56
|
+
function commandSearchText(command) {
|
|
57
|
+
return [command.id, command.summary ?? command.description ?? '', command.pluginName ?? '']
|
|
58
|
+
.filter(Boolean)
|
|
59
|
+
.join(' ')
|
|
60
|
+
.replaceAll(/\s+/g, ' ')
|
|
61
|
+
.trim();
|
|
62
|
+
}
|
|
63
|
+
async function searchCommandsSemantically(query, commands, haystack, options) {
|
|
64
|
+
const embedder = options.embedder ?? (await getDefaultEmbedder());
|
|
65
|
+
const [queryEmbedding, ...commandEmbeddings] = await embedder.embed([query, ...haystack]);
|
|
66
|
+
return commandEmbeddings
|
|
67
|
+
.map((embedding, idx) => ({
|
|
68
|
+
cmd: commands[idx],
|
|
69
|
+
score: cosineSimilarity(queryEmbedding, embedding),
|
|
70
|
+
}))
|
|
71
|
+
.filter(({ score }) => Number.isFinite(score))
|
|
72
|
+
.sort((a, b) => b.score - a.score);
|
|
73
|
+
}
|
|
74
|
+
async function getDefaultEmbedder() {
|
|
75
|
+
defaultEmbedderPromise ??= Promise.resolve(new MiniLMCommandEmbedder());
|
|
76
|
+
return defaultEmbedderPromise;
|
|
77
|
+
}
|
|
78
|
+
function mergeScores(semantic, lexical, options) {
|
|
79
|
+
const lexicalById = new Map(lexical.map((entry, idx) => [entry.cmd.id, lexicalScore(idx, lexical.length)]));
|
|
80
|
+
const lexicalIds = new Set(lexical.map((entry) => entry.cmd.id));
|
|
81
|
+
return semantic
|
|
82
|
+
.filter((entry) => entry.score >= options.minSemanticScore || lexicalIds.has(entry.cmd.id))
|
|
83
|
+
.map((entry) => {
|
|
84
|
+
const finalScore = entry.score * options.semanticWeight + (lexicalById.get(entry.cmd.id) ?? 0) * options.lexicalWeight;
|
|
85
|
+
return { cmd: entry.cmd, score: 1 - finalScore };
|
|
86
|
+
})
|
|
87
|
+
.sort((a, b) => a.score - b.score);
|
|
88
|
+
}
|
|
89
|
+
function lexicalScore(index, length) {
|
|
90
|
+
if (length <= 1)
|
|
91
|
+
return 1;
|
|
92
|
+
return 1 - index / (length - 1);
|
|
93
|
+
}
|
|
94
|
+
function cosineSimilarity(a, b) {
|
|
95
|
+
let dot = 0;
|
|
96
|
+
let aMagnitude = 0;
|
|
97
|
+
let bMagnitude = 0;
|
|
98
|
+
for (let i = 0; i < Math.min(a.length, b.length); i++) {
|
|
99
|
+
dot += a[i] * b[i];
|
|
100
|
+
aMagnitude += a[i] * a[i];
|
|
101
|
+
bMagnitude += b[i] * b[i];
|
|
102
|
+
}
|
|
103
|
+
if (aMagnitude === 0 || bMagnitude === 0)
|
|
104
|
+
return 0;
|
|
105
|
+
return dot / (Math.sqrt(aMagnitude) * Math.sqrt(bMagnitude));
|
|
106
|
+
}
|
package/oclif.manifest.json
CHANGED
package/package.json
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@hesed/search",
|
|
3
3
|
"description": "Intelligence search plugin",
|
|
4
|
-
"version": "0.1.0",
|
|
4
|
+
"version": "0.2.0",
|
|
5
5
|
"author": "Hesed",
|
|
6
6
|
"bin": {
|
|
7
7
|
"permission": "./bin/run.js"
|
|
8
8
|
},
|
|
9
9
|
"bugs": "https://github.com/hesedcasa/search/issues",
|
|
10
10
|
"dependencies": {
|
|
11
|
+
"@huggingface/transformers": "^4.2.0",
|
|
11
12
|
"@leeoniya/ufuzzy": "^1.0.19",
|
|
12
13
|
"@oclif/core": "^4"
|
|
13
14
|
},
|