@futdevpro/nts-dynamo 1.15.24 → 1.15.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/_specifications/BACKLOG.md +28 -0
- package/build/_models/interfaces/compare-data-options.interface.d.ts +27 -0
- package/build/_models/interfaces/compare-data-options.interface.d.ts.map +1 -0
- package/build/_models/interfaces/compare-data-options.interface.js +3 -0
- package/build/_models/interfaces/compare-data-options.interface.js.map +1 -0
- package/build/_models/interfaces/compare-data-result.interface.d.ts +13 -0
- package/build/_models/interfaces/compare-data-result.interface.d.ts.map +1 -0
- package/build/_models/interfaces/compare-data-result.interface.js +3 -0
- package/build/_models/interfaces/compare-data-result.interface.js.map +1 -0
- package/build/_modules/ai/_models/interfaces/dynts-ai-cost-event-callback.interface.d.ts +14 -0
- package/build/_modules/ai/_models/interfaces/dynts-ai-cost-event-callback.interface.d.ts.map +1 -0
- package/build/_modules/ai/_models/interfaces/dynts-ai-cost-event-callback.interface.js +3 -0
- package/build/_modules/ai/_models/interfaces/dynts-ai-cost-event-callback.interface.js.map +1 -0
- package/build/_modules/ai/_models/interfaces/dynts-ai-cost-event.interface.d.ts +50 -0
- package/build/_modules/ai/_models/interfaces/dynts-ai-cost-event.interface.d.ts.map +1 -0
- package/build/_modules/ai/_models/interfaces/dynts-ai-cost-event.interface.js +3 -0
- package/build/_modules/ai/_models/interfaces/dynts-ai-cost-event.interface.js.map +1 -0
- package/build/_modules/ai/_modules/open-ai/_services/oai-embedding.control-service.d.ts.map +1 -1
- package/build/_modules/ai/_modules/open-ai/_services/oai-embedding.control-service.js +32 -0
- package/build/_modules/ai/_modules/open-ai/_services/oai-embedding.control-service.js.map +1 -1
- package/build/_modules/ai/_modules/open-ai/_services/oai-llm-chat.service-base.d.ts.map +1 -1
- package/build/_modules/ai/_modules/open-ai/_services/oai-llm-chat.service-base.js +20 -2
- package/build/_modules/ai/_modules/open-ai/_services/oai-llm-chat.service-base.js.map +1 -1
- package/build/_modules/ai/_modules/open-ai/_services/oai-llm.service-base.d.ts +4 -1
- package/build/_modules/ai/_modules/open-ai/_services/oai-llm.service-base.d.ts.map +1 -1
- package/build/_modules/ai/_modules/open-ai/_services/oai-llm.service-base.js +28 -1
- package/build/_modules/ai/_modules/open-ai/_services/oai-llm.service-base.js.map +1 -1
- package/build/_modules/ai/_services/ai-provider.service-base.d.ts +21 -0
- package/build/_modules/ai/_services/ai-provider.service-base.d.ts.map +1 -1
- package/build/_modules/ai/_services/ai-provider.service-base.js +32 -0
- package/build/_modules/ai/_services/ai-provider.service-base.js.map +1 -1
- package/build/_modules/local-vector-search/_enums/lvs-search-mode.enum.d.ts +17 -1
- package/build/_modules/local-vector-search/_enums/lvs-search-mode.enum.d.ts.map +1 -1
- package/build/_modules/local-vector-search/_enums/lvs-search-mode.enum.js +16 -0
- package/build/_modules/local-vector-search/_enums/lvs-search-mode.enum.js.map +1 -1
- package/build/_modules/local-vector-search/_services/lvs-bm25.util.d.ts +89 -0
- package/build/_modules/local-vector-search/_services/lvs-bm25.util.d.ts.map +1 -0
- package/build/_modules/local-vector-search/_services/lvs-bm25.util.js +190 -0
- package/build/_modules/local-vector-search/_services/lvs-bm25.util.js.map +1 -0
- package/build/_modules/local-vector-search/_services/lvs-local-vector-search.data-service.d.ts +18 -2
- package/build/_modules/local-vector-search/_services/lvs-local-vector-search.data-service.d.ts.map +1 -1
- package/build/_modules/local-vector-search/_services/lvs-local-vector-search.data-service.js +57 -3
- package/build/_modules/local-vector-search/_services/lvs-local-vector-search.data-service.js.map +1 -1
- package/build/_services/base/data.service.d.ts +63 -0
- package/build/_services/base/data.service.d.ts.map +1 -1
- package/build/_services/base/data.service.js +189 -0
- package/build/_services/base/data.service.js.map +1 -1
- package/package.json +1 -1
- package/src/_models/interfaces/compare-data-options.interface.ts +27 -0
- package/src/_models/interfaces/compare-data-result.interface.ts +12 -0
- package/src/_modules/ai/_models/interfaces/dynts-ai-cost-event-callback.interface.ts +14 -0
- package/src/_modules/ai/_models/interfaces/dynts-ai-cost-event.interface.ts +56 -0
- package/src/_modules/ai/_modules/open-ai/_services/oai-embedding.control-service.spec.ts +92 -0
- package/src/_modules/ai/_modules/open-ai/_services/oai-embedding.control-service.ts +38 -4
- package/src/_modules/ai/_modules/open-ai/_services/oai-llm-chat.service-base.ts +24 -5
- package/src/_modules/ai/_modules/open-ai/_services/oai-llm.service-base.spec.ts +52 -0
- package/src/_modules/ai/_modules/open-ai/_services/oai-llm.service-base.ts +39 -10
- package/src/_modules/ai/_services/ai-provider.service-base.spec.ts +79 -0
- package/src/_modules/ai/_services/ai-provider.service-base.ts +41 -3
- package/src/_modules/local-vector-search/_enums/lvs-search-mode.enum.ts +16 -0
- package/src/_modules/local-vector-search/_services/lvs-bm25.util.spec.ts +159 -0
- package/src/_modules/local-vector-search/_services/lvs-bm25.util.ts +206 -0
- package/src/_modules/local-vector-search/_services/lvs-local-vector-search.data-service.spec.ts +135 -0
- package/src/_modules/local-vector-search/_services/lvs-local-vector-search.data-service.ts +95 -9
- package/src/_services/base/data.service.spec.ts +181 -0
- package/src/_services/base/data.service.ts +196 -2
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
import {
|
|
2
|
+
DyNTS_LVS_BM25_Corpus,
|
|
3
|
+
DyNTS_LVS_BM25_DocScore,
|
|
4
|
+
dyNTS_LVS_BM25_minMaxNormalize,
|
|
5
|
+
} from './lvs-bm25.util';
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
describe('| DyNTS_LVS_BM25_Corpus.tokenize', (): void => {
  // Each row is one spec: the spec title, the raw input and the expected token list.
  const tokenizeCases: { title: string; input: string; expected: string[] }[] = [
    {
      title: '| lowercase + split on \\w+ boundaries',
      input: 'Hello, World!',
      expected: ['hello', 'world'],
    },
    {
      // A PascalCase identifier stays a single (lowercased) token.
      title: '| identifier marad egy tokenkent (PascalCase)',
      input: 'UserController',
      expected: ['usercontroller'],
    },
    {
      // A hyphenated name is split into two tokens.
      title: '| hyphenated nev ket tokenre esik',
      input: 'auth-flow',
      expected: ['auth', 'flow'],
    },
    {
      // Empty string yields an empty array.
      title: '| ures string → ures array',
      input: '',
      expected: [],
    },
    {
      // Whitespace / punctuation only yields an empty array.
      title: '| csak whitespace/punctuation → ures array',
      input: ' ,.! ',
      expected: [],
    },
    {
      // The underscore is kept, so a snake_case name is one token.
      title: '| underscore megmarad (snake_case is egy token)',
      input: 'user_controller',
      expected: ['user_controller'],
    },
  ];

  for (const testCase of tokenizeCases) {
    it(testCase.title, (): void => {
      const tokens: string[] = DyNTS_LVS_BM25_Corpus.tokenize(testCase.input);
      expect(tokens).toEqual(testCase.expected);
    });
  }
});
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
describe('| DyNTS_LVS_BM25_Corpus.score', (): void => {
  /** Builds a corpus from the given id/text pairs. */
  const corpusOf = (entries: { id: string; text: string }[]): DyNTS_LVS_BM25_Corpus => {
    return new DyNTS_LVS_BM25_Corpus(entries);
  };

  /** Returns the score entry of the document with the given id (asserted to exist). */
  const entryOf = (scores: DyNTS_LVS_BM25_DocScore[], id: string): DyNTS_LVS_BM25_DocScore => {
    return scores.find((entry: DyNTS_LVS_BM25_DocScore): boolean => entry.id === id)!;
  };

  /** True when every entry in the list has a score of exactly 0. */
  const allZero = (scores: DyNTS_LVS_BM25_DocScore[]): boolean => {
    return scores.every((entry: DyNTS_LVS_BM25_DocScore): boolean => entry.score === 0);
  };

  // Empty corpus yields an empty result array.
  it('| ures corpus → ures array', (): void => {
    const emptyCorpus: DyNTS_LVS_BM25_Corpus = corpusOf([]);
    expect(emptyCorpus.size()).toBe(0);
    expect(emptyCorpus.score('anything')).toEqual([]);
  });

  // Empty query: every document gets a score of 0.
  it('| ures query → minden doc 0 score-t kap', (): void => {
    const scores: DyNTS_LVS_BM25_DocScore[] = corpusOf([
      { id: 'a', text: 'foo bar' },
      { id: 'b', text: 'baz' },
    ]).score('');
    expect(scores.length).toBe(2);
    expect(allZero(scores)).toBe(true);
  });

  // A query with no matching term scores 0 everywhere.
  it('| query NEM matchelo termmel → minden score 0', (): void => {
    const scores: DyNTS_LVS_BM25_DocScore[] = corpusOf([
      { id: 'a', text: 'foo bar' },
      { id: 'b', text: 'baz' },
    ]).score('xyz');
    expect(allZero(scores)).toBe(true);
  });

  // The relevant document outranks a non-relevant one.
  it('| relevant doc magasabb score-t kap mint nem-relevant', (): void => {
    const scores: DyNTS_LVS_BM25_DocScore[] = corpusOf([
      { id: 'a', text: 'the UserController handles authentication' },
      { id: 'b', text: 'cooking recipes for desserts' },
      { id: 'c', text: 'database setup guide' },
    ]).score('UserController');
    const relevant: DyNTS_LVS_BM25_DocScore = entryOf(scores, 'a');
    const unrelated: DyNTS_LVS_BM25_DocScore = entryOf(scores, 'b');
    expect(relevant.score).toBeGreaterThan(0);
    expect(unrelated.score).toBe(0);
    expect(relevant.score).toBeGreaterThan(unrelated.score);
  });

  it('| rarer term (alacsonyabb df) magasabb IDF-et ad → magasabb score', (): void => {
    // 'common' is in every doc → low IDF; 'rare' is only in doc 'a' → high IDF
    const corpus: DyNTS_LVS_BM25_Corpus = corpusOf([
      { id: 'a', text: 'common rare' },
      { id: 'b', text: 'common' },
      { id: 'c', text: 'common' },
      { id: 'd', text: 'common' },
    ]);
    const scoresForCommon: DyNTS_LVS_BM25_DocScore[] = corpus.score('common');
    const scoresForRare: DyNTS_LVS_BM25_DocScore[] = corpus.score('rare');
    const commonScoreOfA: number = entryOf(scoresForCommon, 'a').score;
    const rareScoreOfA: number = entryOf(scoresForRare, 'a').score;
    expect(rareScoreOfA).toBeGreaterThan(commonScoreOfA);
  });

  it('| query-term ismetles NEM no monotonan a score-on (term saturation k1)', (): void => {
    // 'a' contains a single 'foo', 'b' contains several
    const scores: DyNTS_LVS_BM25_DocScore[] = corpusOf([
      { id: 'a', text: 'foo bar baz qux' },
      { id: 'b', text: 'foo foo foo foo bar baz qux' },
    ]).score('foo');
    const single: number = entryOf(scores, 'a').score;
    const repeated: number = entryOf(scores, 'b').score;
    // repeated > single, but NOT 4x as much (saturation)
    expect(repeated).toBeGreaterThan(single);
    expect(repeated).toBeLessThan(4 * single);
  });

  it('| case-insensitive — uppercase query matchel lowercase doc-ot', (): void => {
    const scores: DyNTS_LVS_BM25_DocScore[] = corpusOf([
      { id: 'a', text: 'usercontroller is great' },
    ]).score('USERCONTROLLER');
    expect(scores[0].score).toBeGreaterThan(0);
  });

  // Two matching query terms contribute additively.
  it('| ket query term aggregalja az IDF-eket additivan', (): void => {
    const corpus: DyNTS_LVS_BM25_Corpus = corpusOf([
      { id: 'a', text: 'auth flow handler' },
      { id: 'b', text: 'auth only' },
      { id: 'c', text: 'flow only' },
    ]);
    const twoTermScore: number = entryOf(corpus.score('auth flow'), 'a').score;
    const oneTermScore: number = entryOf(corpus.score('auth'), 'a').score;
    expect(twoTermScore).toBeGreaterThan(oneTermScore);
  });

  it('| invalid input doc (null/missing id/text) silently skipd', (): void => {
    const corpus: DyNTS_LVS_BM25_Corpus = new DyNTS_LVS_BM25_Corpus([
      { id: 'a', text: 'hello' },
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      null as any,
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      { id: 'b' } as any,
    ]);
    expect(corpus.size()).toBe(1);
  });
});
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
describe('| dyNTS_LVS_BM25_minMaxNormalize', (): void => {
  /** Returns the normalized score of the document with the given id (asserted to exist). */
  const normalizedScoreOf = (scores: DyNTS_LVS_BM25_DocScore[], id: string): number => {
    return scores.find((entry: DyNTS_LVS_BM25_DocScore): boolean => entry.id === id)!.score;
  };

  // Empty input yields an empty array.
  it('| ures input → ures array', (): void => {
    const normalized: DyNTS_LVS_BM25_DocScore[] = dyNTS_LVS_BM25_minMaxNormalize([]);
    expect(normalized).toEqual([]);
  });

  // min 0 and max 10 are mapped onto the [0..1] band.
  it('| min 0, max 10 → normalizalva [0..1]', (): void => {
    const rawScores: DyNTS_LVS_BM25_DocScore[] = [
      { id: 'a', score: 0 },
      { id: 'b', score: 5 },
      { id: 'c', score: 10 },
    ];
    const normalized: DyNTS_LVS_BM25_DocScore[] = dyNTS_LVS_BM25_minMaxNormalize(rawScores);
    expect(normalizedScoreOf(normalized, 'a')).toBeCloseTo(0, 5);
    expect(normalizedScoreOf(normalized, 'b')).toBeCloseTo(0.5, 5);
    expect(normalizedScoreOf(normalized, 'c')).toBeCloseTo(1, 5);
  });

  // Identical scores all become 0 (not 0.5): there is no signal to discriminate on.
  it('| azonos score → minden 0 (NEM 0.5, nincs jel diszkriminaciora)', (): void => {
    const rawScores: DyNTS_LVS_BM25_DocScore[] = [
      { id: 'a', score: 3 },
      { id: 'b', score: 3 },
    ];
    const normalized: DyNTS_LVS_BM25_DocScore[] = dyNTS_LVS_BM25_minMaxNormalize(rawScores);
    expect(normalized.every((entry) => entry.score === 0)).toBe(true);
  });
});
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
/**
 * BM25 text-ranking utility for the LVS hybrid search (FR-004).
 *
 * Pure TypeScript, dependency-free. Scores are computed against an in-memory
 * corpus; no index is persisted. The intended use is to rebuild the corpus on
 * every hybrid search call over the candidate documents, which is cheap for
 * small corpora.
 *
 * Parameters: k1 = 1.2, b = 0.75 (the commonly used defaults). They are
 * module-level constants and are not configurable by callers.
 *
 * Tokenizer: the text is lowercased and split into maximal runs of Unicode
 * letters, combining marks, digits and underscore. For ASCII input this is
 * identical to `/\w+/g`: `UserController` stays one token (good for identifier
 * matching) and `auth-flow` becomes two tokens (`auth`, `flow`). Unlike
 * `/\w+/g`, accented and non-Latin words are kept whole instead of being split
 * at every non-ASCII letter.
 *
 * IDF formula (the non-negative, Lucene-style variant):
 * `ln((N - df + 0.5) / (df + 0.5) + 1)`. Because `df <= N`, the argument of
 * the logarithm is always greater than 1, so even a term that occurs in every
 * document gets a small positive IDF instead of a negative one — a common word
 * never pulls a document's score down in the hybrid merge.
 */

/** BM25 k1 parameter (term-frequency saturation control). */
const BM25_K1: number = 1.2;

/** BM25 b parameter (length-normalization weight; 0 = off, 1 = full). */
const BM25_B: number = 0.75;

/**
 * Token pattern: runs of Unicode letters (`\p{L}`), combining marks (`\p{M}`),
 * numbers (`\p{N}`) and underscore. The `u` flag is required for the property
 * escapes.
 */
const TOKEN_REGEX: RegExp = /[\p{L}\p{M}\p{N}_]+/gu;


/** BM25 score of a single document for one query, within one built corpus. */
export interface DyNTS_LVS_BM25_DocScore {
  /** Document identifier, as passed to the corpus constructor. */
  id: string;
  /** Raw BM25 score (>= 0, unbounded above). NOT normalized. */
  score: number;
}


/**
 * An in-memory BM25 index over a fixed set of documents. `score()` evaluates a
 * query against the index and returns one score per indexed document.
 *
 * Building the index costs O(total token count of the documents).
 */
export class DyNTS_LVS_BM25_Corpus {

  /** Tokenized documents: id -> tokens. */
  private readonly docTokens: Map<string, string[]> = new Map<string, string[]>();
  /** Document length: id -> token count. */
  private readonly docLengths: Map<string, number> = new Map<string, number>();
  /** Term -> document frequency (number of documents containing the term at least once). */
  private readonly termDocFreq: Map<string, number> = new Map<string, number>();
  /** Term -> id -> term frequency inside that document. */
  private readonly termFreqByDoc: Map<string, Map<string, number>> = new Map<string, Map<string, number>>();
  /** Average document length in tokens. */
  private avgDocLength: number = 0;
  /** Number of indexed documents. */
  private docCount: number = 0;

  /**
   * Builds the index from the given id/text pairs.
   *
   * Never throws on bad input: a non-array or empty `docs` produces an empty
   * corpus, and entries that are falsy or whose `id` / `text` is not a string
   * are skipped. When the same `id` occurs more than once, only the last
   * text is indexed (the document keeps the position of its first occurrence),
   * so the corpus statistics always describe exactly the indexed documents.
   *
   * @param docs documents to index
   */
  constructor(docs: { id: string; text: string }[]) {
    if (!Array.isArray(docs) || docs.length === 0) { return; }

    // Collapse duplicate ids up front. Indexing duplicates directly would count
    // their tokens and document frequencies twice and leave stale per-term
    // frequencies from the overwritten text behind.
    const textById: Map<string, string> = new Map<string, string>();
    for (const doc of docs) {
      if (!doc || typeof doc.id !== 'string' || typeof doc.text !== 'string') { continue; }
      textById.set(doc.id, doc.text);
    }

    let totalLength: number = 0;
    for (const [docId, text] of textById) {
      const tokens: string[] = DyNTS_LVS_BM25_Corpus.tokenize(text);
      this.docTokens.set(docId, tokens);
      this.docLengths.set(docId, tokens.length);
      totalLength += tokens.length;

      // Term frequency inside this document
      const localTf: Map<string, number> = new Map<string, number>();
      for (const tok of tokens) {
        localTf.set(tok, (localTf.get(tok) ?? 0) + 1);
      }

      for (const [term, tf] of localTf) {
        // Document frequency: +1 per distinct term, per document
        this.termDocFreq.set(term, (this.termDocFreq.get(term) ?? 0) + 1);
        // Reverse index: term -> (document id -> term frequency)
        let perDoc: Map<string, number> | undefined = this.termFreqByDoc.get(term);
        if (!perDoc) {
          perDoc = new Map<string, number>();
          this.termFreqByDoc.set(term, perDoc);
        }
        perDoc.set(docId, tf);
      }
    }

    this.docCount = this.docTokens.size;
    this.avgDocLength = this.docCount > 0 ? totalLength / this.docCount : 0;
  }


  /**
   * Public tokenizer, exposed so that specs and callers can apply exactly the
   * same normalization as the corpus.
   *
   * @param text text to tokenize
   * @returns lowercased tokens in order of appearance; `[]` for a non-string
   *          or empty input, or when the text contains no token characters
   */
  static tokenize(text: string): string[] {
    if (typeof text !== 'string' || text.length === 0) { return []; }
    return text.toLowerCase().match(TOKEN_REGEX) ?? [];
  }


  /**
   * @returns the number of indexed documents (documents whose text produced no
   *          tokens are still counted)
   */
  size(): number {
    return this.docCount;
  }


  /**
   * Computes the BM25 score of every indexed document for the query.
   *
   * - Empty corpus → `[]`.
   * - Query without any token → every document with score 0 (degenerate case;
   *   the caller decides how to treat it).
   * - Repeating a term in the query does not increase its weight: query terms
   *   are de-duplicated before scoring.
   *
   * @param query free-text query, tokenized with {@link DyNTS_LVS_BM25_Corpus.tokenize}
   * @returns one `{ id, score }` entry per document, in indexing order
   */
  score(query: string): DyNTS_LVS_BM25_DocScore[] {
    if (this.docCount === 0) { return []; }
    const queryTokens: string[] = DyNTS_LVS_BM25_Corpus.tokenize(query);
    if (queryTokens.length === 0) {
      // Every document gets a score of 0
      const results: DyNTS_LVS_BM25_DocScore[] = [];
      for (const id of this.docTokens.keys()) {
        results.push({ id: id, score: 0 });
      }
      return results;
    }

    // Distinct query terms
    const uniqueTerms: string[] = Array.from(new Set<string>(queryTokens));

    // Pre-compute the IDF of every query term
    const idfMap: Map<string, number> = new Map<string, number>();
    for (const term of uniqueTerms) {
      const df: number = this.termDocFreq.get(term) ?? 0;
      // Non-negative IDF: ln((N - df + 0.5) / (df + 0.5) + 1)
      const idf: number = Math.log((this.docCount - df + 0.5) / (df + 0.5) + 1);
      idfMap.set(term, idf);
    }

    // Per-document BM25 score
    const results: DyNTS_LVS_BM25_DocScore[] = [];
    for (const [docId, docLen] of this.docLengths) {
      // Length normalization depends only on the document, so compute it once
      // per document. `|| 1` avoids dividing by zero when every document is empty.
      const norm: number = 1 - BM25_B + BM25_B * (docLen / (this.avgDocLength || 1));
      let score: number = 0;
      for (const term of uniqueTerms) {
        const tf: number = this.termFreqByDoc.get(term)?.get(docId) ?? 0;
        if (tf === 0) { continue; }
        const idf: number = idfMap.get(term) ?? 0;
        score += idf * (tf * (BM25_K1 + 1)) / (tf + BM25_K1 * norm);
      }
      results.push({ id: docId, score: score });
    }

    return results;
  }
}


/**
 * Min-max normalizes scores onto the [0, 1] range, so raw (unbounded) BM25
 * scores can be combined with another bounded score in a weighted merge.
 *
 * Edge cases:
 * - Non-array or empty input → `[]`.
 * - All scores identical (max - min === 0) → every score becomes 0 (not 0.5 or
 *   1): without any discrimination between documents no signal is added.
 * - Negative scores are handled by the same formula.
 *
 * The input array and its entries are not modified.
 *
 * @param scores raw scores to normalize
 * @returns new entries with the same ids, in the same order
 */
export function dyNTS_LVS_BM25_minMaxNormalize(
  scores: DyNTS_LVS_BM25_DocScore[],
): DyNTS_LVS_BM25_DocScore[] {
  if (!Array.isArray(scores) || scores.length === 0) { return []; }
  let min: number = Infinity;
  let max: number = -Infinity;
  for (const s of scores) {
    if (s.score < min) { min = s.score; }
    if (s.score > max) { max = s.score; }
  }
  const range: number = max - min;
  if (range === 0) {
    // No discrimination — everything becomes 0 (neither boosts nor penalizes)
    return scores.map((s: DyNTS_LVS_BM25_DocScore): DyNTS_LVS_BM25_DocScore => ({
      id: s.id, score: 0,
    }));
  }
  return scores.map((s: DyNTS_LVS_BM25_DocScore): DyNTS_LVS_BM25_DocScore => ({
    id: s.id,
    score: (s.score - min) / range,
  }));
}
|
package/src/_modules/local-vector-search/_services/lvs-local-vector-search.data-service.spec.ts
CHANGED
|
@@ -341,5 +341,140 @@ describe('| DyNTS_LVS_VectorDataService', () => {
|
|
|
341
341
|
expect(result[1]._id).toBe('data-1');
|
|
342
342
|
});
|
|
343
343
|
});
|
|
344
|
+
|
|
345
|
+
describe('| vectorSearch hybrid (FR-004)', () => {
  /** Creates one test document with the given id, text content and embedding vector. */
  const makeDoc = (id: string, content: string, vector: number[]): TestDataModel => {
    const doc: TestDataModel = new TestDataModel();
    doc._id = id;
    doc.content = content;
    doc.contentVectorized = vector;
    return doc;
  };

  /** Three documents with near-identical embeddings but clearly different text. */
  const buildHybridCorpus = (): TestDataModel[] => [
    makeDoc('doc-user', 'the UserController handles authentication flow', [0.4, 0.5, 0.6]),
    makeDoc('doc-recipe', 'cooking recipes for desserts and cakes', [0.45, 0.55, 0.65]),
    makeDoc('doc-db', 'database setup guide for MongoDB', [0.42, 0.52, 0.62]),
  ];

  /** Stubs the data source (`getAll`) and the query embedding (`vectorize`) of the service. */
  const stubDataAndQuery = (candidates: TestDataModel[], queryVector: number[]): void => {
    spyOn(service, 'getAll').and.returnValue(Promise.resolve(candidates));
    spyOn(service, 'vectorize').and.returnValue(Promise.resolve(queryVector));
  };

  /** Asserts that `err` is a DyFM_Error whose error code contains `code`. */
  const expectErrorCode = (err: unknown, code: string): void => {
    expect(err).toBeInstanceOf(DyFM_Error);
    expect((err as DyFM_Error)._errorCode).toContain(code);
  };

  // Hybrid mode without textSearchKey must be rejected (VS4).
  it('| throws ha textSearchKey hianyzik hybrid modban (VS4)', async () => {
    stubDataAndQuery(buildHybridCorpus(), [0.4, 0.5, 0.6]);
    try {
      await service.vectorSearch({
        input: 'UserController',
        searchInKey: 'contentVectorized',
        searchMode: LVS_Search_Mode.hybrid,
      });
      fail('Should have thrown an error');
    } catch (err) {
      expectErrorCode(err, 'DyNTS-LVS-VS4');
    }
  });

  // A negative hybrid weight must be rejected (VS5).
  it('| throws ha hybridWeight invalid (negativ) (VS5)', async () => {
    stubDataAndQuery(buildHybridCorpus(), [0.4, 0.5, 0.6]);
    try {
      await service.vectorSearch({
        input: 'UserController',
        searchInKey: 'contentVectorized',
        searchMode: LVS_Search_Mode.hybrid,
        textSearchKey: 'content',
        hybridWeight: { vector: -0.5, text: 1.5 },
      });
      fail('Should have thrown an error');
    } catch (err) {
      expectErrorCode(err, 'DyNTS-LVS-VS5');
    }
  });

  it('| basic hybrid: text-relevant doc top-en', async () => {
    stubDataAndQuery(buildHybridCorpus(), [0.4, 0.5, 0.6]);
    const found: TestDataModel[] = await service.vectorSearch({
      input: 'UserController',
      searchInKey: 'contentVectorized',
      searchMode: LVS_Search_Mode.hybrid,
      textSearchKey: 'content',
      limit: 3,
    });
    expect(found.length).toBe(3);
    expect(found[0]._id).toBe('doc-user');
  });

  // With the text weight at 0 the ranking is effectively pure cosine.
  it('| weight {vector:1, text:0} → effektivan pure cosine', async () => {
    stubDataAndQuery(buildHybridCorpus(), [0.42, 0.52, 0.62]);
    const found: TestDataModel[] = await service.vectorSearch({
      input: 'UserController',
      searchInKey: 'contentVectorized',
      searchMode: LVS_Search_Mode.hybrid,
      textSearchKey: 'content',
      hybridWeight: { vector: 1, text: 0 },
      limit: 3,
    });
    expect(found.length).toBe(3);
    expect(found[0]._id).toBe('doc-db');
  });

  // With the vector weight at 0 the ranking is effectively pure BM25.
  it('| weight {vector:0, text:1} → effektivan pure BM25', async () => {
    stubDataAndQuery(buildHybridCorpus(), [0.45, 0.55, 0.65]);
    const found: TestDataModel[] = await service.vectorSearch({
      input: 'authentication',
      searchInKey: 'contentVectorized',
      searchMode: LVS_Search_Mode.hybrid,
      textSearchKey: 'content',
      hybridWeight: { vector: 0, text: 1 },
      limit: 3,
    });
    expect(found.length).toBe(3);
    expect(found[0]._id).toBe('doc-user');
  });

  // When no document matches the text query, the cosine ordering is kept.
  it('| all-zero BM25 fallback → cosine-rendezes marad', async () => {
    stubDataAndQuery(buildHybridCorpus(), [0.45, 0.55, 0.65]);
    const found: TestDataModel[] = await service.vectorSearch({
      input: 'xyzzy-nonexistent-token',
      searchInKey: 'contentVectorized',
      searchMode: LVS_Search_Mode.hybrid,
      textSearchKey: 'content',
      limit: 3,
    });
    expect(found.length).toBe(3);
    expect(found[0]._id).toBe('doc-recipe');
  });

  it('| limit honored hybrid modban', async () => {
    stubDataAndQuery(buildHybridCorpus(), [0.4, 0.5, 0.6]);
    const found: TestDataModel[] = await service.vectorSearch({
      input: 'UserController',
      searchInKey: 'contentVectorized',
      searchMode: LVS_Search_Mode.hybrid,
      textSearchKey: 'content',
      limit: 1,
    });
    expect(found.length).toBe(1);
  });

  // An empty candidate set yields an empty result.
  it('| ures candidate-szet → ures eredmeny', async () => {
    stubDataAndQuery([], [0.4, 0.5, 0.6]);
    const found: TestDataModel[] = await service.vectorSearch({
      input: 'UserController',
      searchInKey: 'contentVectorized',
      searchMode: LVS_Search_Mode.hybrid,
      textSearchKey: 'content',
    });
    expect(found.length).toBe(0);
  });
});
|
|
344
479
|
});
|
|
345
480
|
|
|
@@ -10,6 +10,11 @@ import { DyFM_OAI_Settings } from '@futdevpro/fsm-dynamo/ai/open-ai';
|
|
|
10
10
|
import { LVS_Search_Mode } from '../_enums/lvs-search-mode.enum';
|
|
11
11
|
import { LVS_SearchResult } from '../_models/lvs-search-result.interface';
|
|
12
12
|
import { LVS_VectorPool_ControlService } from './lvs-vector-pool.control-service';
|
|
13
|
+
import {
|
|
14
|
+
DyNTS_LVS_BM25_Corpus,
|
|
15
|
+
DyNTS_LVS_BM25_DocScore,
|
|
16
|
+
dyNTS_LVS_BM25_minMaxNormalize,
|
|
17
|
+
} from './lvs-bm25.util';
|
|
13
18
|
import { DyNTS_OAI_VectorDataService } from '../../ai/_modules/open-ai/_services/data-services/oai-vector-data.service';
|
|
14
19
|
import { DyNTS_global_settings } from '../../../_collections/global-settings.const';
|
|
15
20
|
|
|
@@ -109,10 +114,23 @@ export class DyNTS_LVS_VectorDataService<T extends DyFM_Metadata> extends DyNTS_
|
|
|
109
114
|
*/
|
|
110
115
|
filterBy?: DyFM_DBFilter<T>;
|
|
111
116
|
/**
|
|
112
|
-
* Search mode (cosine similarity
|
|
113
|
-
* Defaults to this.defaultSearchMode
|
|
117
|
+
* Search mode (cosine similarity, L2 distance, or hybrid).
|
|
118
|
+
* Defaults to this.defaultSearchMode.
|
|
119
|
+
*
|
|
120
|
+
* `hybrid` mode combines cosine similarity (vector half) with BM25
|
|
121
|
+
* text scoring (text half) — `textSearchKey` is REQUIRED in hybrid mode.
|
|
114
122
|
*/
|
|
115
123
|
searchMode?: LVS_Search_Mode;
|
|
124
|
+
/**
|
|
125
|
+
* Hybrid mode only — weights used to merge the cosine score and the BM25 score.
|
|
126
|
+
* Default: { vector: 0.5, text: 0.5 }. A 0..1 range is recommended for both; negative or non-finite values are rejected (DyNTS-LVS-VS5).
|
|
127
|
+
*/
|
|
128
|
+
hybridWeight?: { vector: number; text: number };
|
|
129
|
+
/**
|
|
130
|
+
* Hybrid mode only — REQUIRED; the string property used for the BM25
|
|
131
|
+
* text-search. It does NOT have to be the vectorized property.
|
|
132
|
+
*/
|
|
133
|
+
textSearchKey?: keyof T;
|
|
116
134
|
},
|
|
117
135
|
): Promise<T[]> {
|
|
118
136
|
try {
|
|
@@ -127,7 +145,33 @@ export class DyNTS_LVS_VectorDataService<T extends DyFM_Metadata> extends DyNTS_
|
|
|
127
145
|
}
|
|
128
146
|
|
|
129
147
|
set.limit ??= 3;
|
|
130
|
-
const { input, searchInKey, limit, filterBy, searchMode } = set;
|
|
148
|
+
const { input, searchInKey, limit, filterBy, searchMode, hybridWeight, textSearchKey } = set;
|
|
149
|
+
const effectiveMode: LVS_Search_Mode = searchMode ?? this.defaultSearchMode;
|
|
150
|
+
|
|
151
|
+
// Early validation for hybrid mode
|
|
152
|
+
if (effectiveMode === LVS_Search_Mode.hybrid) {
|
|
153
|
+
if (!textSearchKey) {
|
|
154
|
+
throw new DyFM_Error({
|
|
155
|
+
...this.getDefaultErrorSettings(
|
|
156
|
+
'vectorSearch',
|
|
157
|
+
new Error('textSearchKey is required when searchMode is hybrid'),
|
|
158
|
+
),
|
|
159
|
+
errorCode: `${DyNTS_global_settings.systemShortCodeName}|DyNTS-LVS-VS4`,
|
|
160
|
+
});
|
|
161
|
+
}
|
|
162
|
+
if (hybridWeight) {
|
|
163
|
+
const w: { vector: number; text: number } = hybridWeight;
|
|
164
|
+
if (!Number.isFinite(w.vector) || !Number.isFinite(w.text) || w.vector < 0 || w.text < 0) {
|
|
165
|
+
throw new DyFM_Error({
|
|
166
|
+
...this.getDefaultErrorSettings(
|
|
167
|
+
'vectorSearch',
|
|
168
|
+
new Error('hybridWeight.vector and .text must be non-negative finite numbers'),
|
|
169
|
+
),
|
|
170
|
+
errorCode: `${DyNTS_global_settings.systemShortCodeName}|DyNTS-LVS-VS5`,
|
|
171
|
+
});
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
}
|
|
131
175
|
|
|
132
176
|
// Validáljuk, hogy a searchInKey létezik-e
|
|
133
177
|
const property: DyFM_DataProperty_Params<any, T> =
|
|
@@ -250,7 +294,7 @@ export class DyNTS_LVS_VectorDataService<T extends DyFM_Metadata> extends DyNTS_
|
|
|
250
294
|
);
|
|
251
295
|
|
|
252
296
|
// 4. Végrehajtjuk a local vector search-t
|
|
253
|
-
const mode: LVS_Search_Mode =
|
|
297
|
+
const mode: LVS_Search_Mode = effectiveMode;
|
|
254
298
|
|
|
255
299
|
if (this.debugLog) {
|
|
256
300
|
DyFM_Log.log(
|
|
@@ -258,11 +302,53 @@ export class DyNTS_LVS_VectorDataService<T extends DyFM_Metadata> extends DyNTS_
|
|
|
258
302
|
);
|
|
259
303
|
}
|
|
260
304
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
305
|
+
let searchResults: LVS_SearchResult[];
|
|
306
|
+
|
|
307
|
+
if (mode === LVS_Search_Mode.hybrid) {
|
|
308
|
+
// Hybrid: cosine score for ALL candidates + BM25 score for ALL candidates, min-max normalize the BM25 scores, then weighted sum
|
|
309
|
+
const allCandidatesCosine: LVS_SearchResult[] = this.vectorPool.search(
|
|
310
|
+
queryVector,
|
|
311
|
+
dataMap.size,
|
|
312
|
+
LVS_Search_Mode.cosineSimilarity,
|
|
313
|
+
);
|
|
314
|
+
|
|
315
|
+
// Build the BM25 corpus from the `textSearchKey` property (non-string values are indexed as '')
|
|
316
|
+
const docs: { id: string; text: string }[] = [];
|
|
317
|
+
for (const [docId, dataItem] of dataMap) {
|
|
318
|
+
const textValue: unknown = dataItem[textSearchKey as keyof T];
|
|
319
|
+
docs.push({
|
|
320
|
+
id: docId,
|
|
321
|
+
text: typeof textValue === 'string' ? textValue : '',
|
|
322
|
+
});
|
|
323
|
+
}
|
|
324
|
+
const bm25Corpus: DyNTS_LVS_BM25_Corpus = new DyNTS_LVS_BM25_Corpus(docs);
|
|
325
|
+
const bm25Raw: DyNTS_LVS_BM25_DocScore[] = bm25Corpus.score(input);
|
|
326
|
+
const bm25Normalized: DyNTS_LVS_BM25_DocScore[] = dyNTS_LVS_BM25_minMaxNormalize(bm25Raw);
|
|
327
|
+
|
|
328
|
+
const bm25ScoreById: Map<string, number> = new Map<string, number>();
|
|
329
|
+
for (const s of bm25Normalized) {
|
|
330
|
+
bm25ScoreById.set(s.id, s.score);
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
const wVector: number = hybridWeight?.vector ?? 0.5;
|
|
334
|
+
const wText: number = hybridWeight?.text ?? 0.5;
|
|
335
|
+
|
|
336
|
+
const merged: LVS_SearchResult[] = allCandidatesCosine.map((c: LVS_SearchResult): LVS_SearchResult => {
|
|
337
|
+
const bm25Score: number = bm25ScoreById.get(c.id) ?? 0;
|
|
338
|
+
return {
|
|
339
|
+
id: c.id,
|
|
340
|
+
score: wVector * c.score + wText * bm25Score,
|
|
341
|
+
};
|
|
342
|
+
});
|
|
343
|
+
merged.sort((a: LVS_SearchResult, b: LVS_SearchResult) => b.score - a.score);
|
|
344
|
+
searchResults = merged.slice(0, limit);
|
|
345
|
+
} else {
|
|
346
|
+
searchResults = this.vectorPool.search(
|
|
347
|
+
queryVector,
|
|
348
|
+
limit,
|
|
349
|
+
mode,
|
|
350
|
+
);
|
|
351
|
+
}
|
|
266
352
|
|
|
267
353
|
if (this.debugLog) {
|
|
268
354
|
DyFM_Log.log(
|