rag-lite-ts 1.0.2 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +605 -93
- package/dist/cli/indexer.js +192 -4
- package/dist/cli/search.js +50 -11
- package/dist/cli.js +183 -26
- package/dist/core/abstract-embedder.d.ts +125 -0
- package/dist/core/abstract-embedder.js +264 -0
- package/dist/core/actionable-error-messages.d.ts +60 -0
- package/dist/core/actionable-error-messages.js +397 -0
- package/dist/core/batch-processing-optimizer.d.ts +155 -0
- package/dist/core/batch-processing-optimizer.js +541 -0
- package/dist/core/binary-index-format.d.ts +52 -0
- package/dist/core/binary-index-format.js +122 -0
- package/dist/core/chunker.d.ts +2 -0
- package/dist/core/cli-database-utils.d.ts +53 -0
- package/dist/core/cli-database-utils.js +239 -0
- package/dist/core/config.js +10 -3
- package/dist/core/content-errors.d.ts +111 -0
- package/dist/core/content-errors.js +362 -0
- package/dist/core/content-manager.d.ts +343 -0
- package/dist/core/content-manager.js +1504 -0
- package/dist/core/content-performance-optimizer.d.ts +150 -0
- package/dist/core/content-performance-optimizer.js +516 -0
- package/dist/core/content-resolver.d.ts +104 -0
- package/dist/core/content-resolver.js +285 -0
- package/dist/core/cross-modal-search.d.ts +164 -0
- package/dist/core/cross-modal-search.js +342 -0
- package/dist/core/database-connection-manager.d.ts +109 -0
- package/dist/core/database-connection-manager.js +304 -0
- package/dist/core/db.d.ts +141 -2
- package/dist/core/db.js +631 -89
- package/dist/core/embedder-factory.d.ts +176 -0
- package/dist/core/embedder-factory.js +338 -0
- package/dist/core/index.d.ts +3 -1
- package/dist/core/index.js +4 -1
- package/dist/core/ingestion.d.ts +85 -15
- package/dist/core/ingestion.js +510 -45
- package/dist/core/lazy-dependency-loader.d.ts +152 -0
- package/dist/core/lazy-dependency-loader.js +453 -0
- package/dist/core/mode-detection-service.d.ts +150 -0
- package/dist/core/mode-detection-service.js +565 -0
- package/dist/core/mode-model-validator.d.ts +92 -0
- package/dist/core/mode-model-validator.js +203 -0
- package/dist/core/model-registry.d.ts +120 -0
- package/dist/core/model-registry.js +415 -0
- package/dist/core/model-validator.d.ts +217 -0
- package/dist/core/model-validator.js +782 -0
- package/dist/core/polymorphic-search-factory.d.ts +154 -0
- package/dist/core/polymorphic-search-factory.js +344 -0
- package/dist/core/raglite-paths.d.ts +121 -0
- package/dist/core/raglite-paths.js +145 -0
- package/dist/core/reranking-config.d.ts +42 -0
- package/dist/core/reranking-config.js +156 -0
- package/dist/core/reranking-factory.d.ts +92 -0
- package/dist/core/reranking-factory.js +591 -0
- package/dist/core/reranking-strategies.d.ts +325 -0
- package/dist/core/reranking-strategies.js +720 -0
- package/dist/core/resource-cleanup.d.ts +163 -0
- package/dist/core/resource-cleanup.js +371 -0
- package/dist/core/resource-manager.d.ts +212 -0
- package/dist/core/resource-manager.js +564 -0
- package/dist/core/search.d.ts +28 -1
- package/dist/core/search.js +83 -5
- package/dist/core/streaming-operations.d.ts +145 -0
- package/dist/core/streaming-operations.js +409 -0
- package/dist/core/types.d.ts +3 -0
- package/dist/core/universal-embedder.d.ts +177 -0
- package/dist/core/universal-embedder.js +139 -0
- package/dist/core/validation-messages.d.ts +99 -0
- package/dist/core/validation-messages.js +334 -0
- package/dist/core/vector-index.d.ts +1 -1
- package/dist/core/vector-index.js +37 -39
- package/dist/factories/index.d.ts +3 -1
- package/dist/factories/index.js +2 -0
- package/dist/factories/polymorphic-factory.d.ts +50 -0
- package/dist/factories/polymorphic-factory.js +159 -0
- package/dist/factories/text-factory.d.ts +128 -34
- package/dist/factories/text-factory.js +346 -97
- package/dist/file-processor.d.ts +88 -2
- package/dist/file-processor.js +720 -17
- package/dist/index.d.ts +32 -0
- package/dist/index.js +29 -0
- package/dist/ingestion.d.ts +16 -0
- package/dist/ingestion.js +21 -0
- package/dist/mcp-server.d.ts +35 -3
- package/dist/mcp-server.js +1107 -31
- package/dist/multimodal/clip-embedder.d.ts +327 -0
- package/dist/multimodal/clip-embedder.js +992 -0
- package/dist/multimodal/index.d.ts +6 -0
- package/dist/multimodal/index.js +6 -0
- package/dist/run-error-recovery-tests.d.ts +7 -0
- package/dist/run-error-recovery-tests.js +101 -0
- package/dist/search.d.ts +60 -9
- package/dist/search.js +82 -11
- package/dist/test-utils.d.ts +8 -26
- package/dist/text/chunker.d.ts +1 -0
- package/dist/text/embedder.js +15 -8
- package/dist/text/index.d.ts +1 -0
- package/dist/text/index.js +1 -0
- package/dist/text/reranker.d.ts +1 -2
- package/dist/text/reranker.js +17 -47
- package/dist/text/sentence-transformer-embedder.d.ts +96 -0
- package/dist/text/sentence-transformer-embedder.js +340 -0
- package/dist/types.d.ts +39 -0
- package/dist/utils/vector-math.d.ts +31 -0
- package/dist/utils/vector-math.js +70 -0
- package/package.json +27 -6
- package/dist/api-errors.d.ts.map +0 -1
- package/dist/api-errors.js.map +0 -1
- package/dist/cli/indexer.d.ts.map +0 -1
- package/dist/cli/indexer.js.map +0 -1
- package/dist/cli/search.d.ts.map +0 -1
- package/dist/cli/search.js.map +0 -1
- package/dist/cli.d.ts.map +0 -1
- package/dist/cli.js.map +0 -1
- package/dist/config.d.ts.map +0 -1
- package/dist/config.js.map +0 -1
- package/dist/core/adapters.d.ts.map +0 -1
- package/dist/core/adapters.js.map +0 -1
- package/dist/core/chunker.d.ts.map +0 -1
- package/dist/core/chunker.js.map +0 -1
- package/dist/core/config.d.ts.map +0 -1
- package/dist/core/config.js.map +0 -1
- package/dist/core/db.d.ts.map +0 -1
- package/dist/core/db.js.map +0 -1
- package/dist/core/error-handler.d.ts.map +0 -1
- package/dist/core/error-handler.js.map +0 -1
- package/dist/core/index.d.ts.map +0 -1
- package/dist/core/index.js.map +0 -1
- package/dist/core/ingestion.d.ts.map +0 -1
- package/dist/core/ingestion.js.map +0 -1
- package/dist/core/interfaces.d.ts.map +0 -1
- package/dist/core/interfaces.js.map +0 -1
- package/dist/core/path-manager.d.ts.map +0 -1
- package/dist/core/path-manager.js.map +0 -1
- package/dist/core/search-example.d.ts +0 -25
- package/dist/core/search-example.d.ts.map +0 -1
- package/dist/core/search-example.js +0 -138
- package/dist/core/search-example.js.map +0 -1
- package/dist/core/search-pipeline-example.d.ts +0 -21
- package/dist/core/search-pipeline-example.d.ts.map +0 -1
- package/dist/core/search-pipeline-example.js +0 -188
- package/dist/core/search-pipeline-example.js.map +0 -1
- package/dist/core/search-pipeline.d.ts.map +0 -1
- package/dist/core/search-pipeline.js.map +0 -1
- package/dist/core/search.d.ts.map +0 -1
- package/dist/core/search.js.map +0 -1
- package/dist/core/types.d.ts.map +0 -1
- package/dist/core/types.js.map +0 -1
- package/dist/core/vector-index.d.ts.map +0 -1
- package/dist/core/vector-index.js.map +0 -1
- package/dist/dom-polyfills.d.ts.map +0 -1
- package/dist/dom-polyfills.js.map +0 -1
- package/dist/examples/clean-api-examples.d.ts +0 -44
- package/dist/examples/clean-api-examples.d.ts.map +0 -1
- package/dist/examples/clean-api-examples.js +0 -206
- package/dist/examples/clean-api-examples.js.map +0 -1
- package/dist/factories/index.d.ts.map +0 -1
- package/dist/factories/index.js.map +0 -1
- package/dist/factories/text-factory.d.ts.map +0 -1
- package/dist/factories/text-factory.js.map +0 -1
- package/dist/file-processor.d.ts.map +0 -1
- package/dist/file-processor.js.map +0 -1
- package/dist/index-manager.d.ts.map +0 -1
- package/dist/index-manager.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/indexer.d.ts.map +0 -1
- package/dist/indexer.js.map +0 -1
- package/dist/ingestion.d.ts.map +0 -1
- package/dist/ingestion.js.map +0 -1
- package/dist/mcp-server.d.ts.map +0 -1
- package/dist/mcp-server.js.map +0 -1
- package/dist/preprocess.d.ts.map +0 -1
- package/dist/preprocess.js.map +0 -1
- package/dist/preprocessors/index.d.ts.map +0 -1
- package/dist/preprocessors/index.js.map +0 -1
- package/dist/preprocessors/mdx.d.ts.map +0 -1
- package/dist/preprocessors/mdx.js.map +0 -1
- package/dist/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/preprocessors/mermaid.js.map +0 -1
- package/dist/preprocessors/registry.d.ts.map +0 -1
- package/dist/preprocessors/registry.js.map +0 -1
- package/dist/search-standalone.d.ts.map +0 -1
- package/dist/search-standalone.js.map +0 -1
- package/dist/search.d.ts.map +0 -1
- package/dist/search.js.map +0 -1
- package/dist/test-utils.d.ts.map +0 -1
- package/dist/test-utils.js.map +0 -1
- package/dist/text/chunker.d.ts.map +0 -1
- package/dist/text/chunker.js.map +0 -1
- package/dist/text/embedder.d.ts.map +0 -1
- package/dist/text/embedder.js.map +0 -1
- package/dist/text/index.d.ts.map +0 -1
- package/dist/text/index.js.map +0 -1
- package/dist/text/preprocessors/index.d.ts.map +0 -1
- package/dist/text/preprocessors/index.js.map +0 -1
- package/dist/text/preprocessors/mdx.d.ts.map +0 -1
- package/dist/text/preprocessors/mdx.js.map +0 -1
- package/dist/text/preprocessors/mermaid.d.ts.map +0 -1
- package/dist/text/preprocessors/mermaid.js.map +0 -1
- package/dist/text/preprocessors/registry.d.ts.map +0 -1
- package/dist/text/preprocessors/registry.js.map +0 -1
- package/dist/text/reranker.d.ts.map +0 -1
- package/dist/text/reranker.js.map +0 -1
- package/dist/text/tokenizer.d.ts.map +0 -1
- package/dist/text/tokenizer.js.map +0 -1
- package/dist/types.d.ts.map +0 -1
- package/dist/types.js.map +0 -1
|
@@ -0,0 +1,720 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-Encoder Reranking Strategy for Text Mode
|
|
3
|
+
*
|
|
4
|
+
* Adapts the existing CrossEncoderReranker to work with the new RerankingStrategy
|
|
5
|
+
* interface defined in the Chameleon Multimodal Architecture.
|
|
6
|
+
*/
|
|
7
|
+
import { CrossEncoderReranker } from '../text/reranker.js';
|
|
8
|
+
/**
 * Cross-Encoder Reranking Strategy Implementation
 *
 * Wraps a CrossEncoderReranker from the text module behind the reranking
 * strategy surface used by this file (`rerank`, `configure`, `getMetadata`,
 * `isReady`, `cleanup`).
 *
 * Failure handling is best-effort: when the model cannot be loaded or the
 * reranker throws, a warning is logged and the caller gets its results back
 * unchanged instead of an exception.
 */
export class CrossEncoderRerankingStrategy {
    name = 'cross-encoder';
    supportedContentTypes = ['text'];
    isEnabled = true;
    reranker;
    modelName;
    initialized = false;
    // True while `isEnabled` is false because a model load failed, as opposed
    // to the caller switching the strategy off through configure().
    loadFailed = false;
    /**
     * @param {string} [modelName] - Model name copied onto the underlying
     *   reranker; when omitted the reranker keeps whatever name it starts with.
     */
    constructor(modelName) {
        this.modelName = modelName;
        this.reranker = new CrossEncoderReranker();
        // Set custom model name if provided
        if (modelName) {
            this.reranker.modelName = modelName;
        }
    }
    /**
     * Load the reranker model if that has not been done yet.
     *
     * Never throws: a load error, or a reranker that does not report itself as
     * loaded afterwards, disables the strategy and logs a warning.
     */
    async ensureInitialized() {
        if (this.initialized) {
            return;
        }
        try {
            await this.reranker.loadModel();
            this.initialized = true;
            this.isEnabled = this.reranker.isLoaded();
            this.loadFailed = !this.isEnabled;
            if (!this.isEnabled) {
                console.warn('Cross-encoder reranker failed to load, strategy disabled');
            }
        }
        catch (error) {
            console.warn(`Cross-encoder reranker initialization failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
            this.isEnabled = false;
            this.loadFailed = true;
        }
    }
    /**
     * Rerank search results using the cross-encoder model.
     *
     * Only results whose `contentType` is missing or `'text'` are passed to
     * the reranker; any other results are appended after the reranked ones.
     *
     * @param {string} query - Search query.
     * @param {Array<object>} results - Search results to reorder.
     * @param {string} [contentType] - Content type the caller is asking for.
     * @returns {Promise<Array<object>>} Reranked results, or `results` itself
     *   when the strategy is disabled, nothing is rerankable, or reranking fails.
     * @throws {Error} When `contentType` is given and is not supported.
     */
    rerank = async (query, results, contentType) => {
        // If strategy is disabled, return results unchanged immediately
        if (!this.isEnabled) {
            return results;
        }
        // Validate content type
        if (contentType && !this.supportedContentTypes.includes(contentType)) {
            throw new Error(`Cross-encoder strategy does not support content type '${contentType}'. ` +
                `Supported types: ${this.supportedContentTypes.join(', ')}`);
        }
        await this.ensureInitialized();
        // Initialization may have just disabled the strategy
        if (!this.isEnabled) {
            console.warn('Cross-encoder reranker not enabled, returning results unchanged');
            return results;
        }
        // Filter to only text content if mixed content types are present
        const textResults = results.filter(result => !result.contentType || result.contentType === 'text');
        if (textResults.length === 0) {
            return results; // No text results to rerank
        }
        const hasNonTextResults = textResults.length !== results.length;
        if (hasNonTextResults) {
            console.warn(`Cross-encoder reranker filtering ${results.length - textResults.length} ` +
                `non-text results from reranking`);
        }
        try {
            const rerankedTextResults = await this.reranker.rerank(query, textResults);
            if (hasNonTextResults) {
                // Append non-text results at the end, untouched
                const nonTextResults = results.filter(result => result.contentType && result.contentType !== 'text');
                return [...rerankedTextResults, ...nonTextResults];
            }
            return rerankedTextResults;
        }
        catch (error) {
            console.warn(`Cross-encoder reranking failed: ${error instanceof Error ? error.message : 'Unknown error'}. ` +
                `Returning original results.`);
            return results;
        }
    };
    /**
     * Configure the reranking strategy.
     *
     * @param {object} config
     * @param {string} [config.modelName] - Switch to this model; a fresh
     *   reranker is created and loaded lazily on next use.
     * @param {boolean} [config.enabled] - Explicitly enable or disable the
     *   strategy; applied after `modelName`, so it always has the last word.
     */
    configure(config) {
        if (config.modelName && typeof config.modelName === 'string') {
            this.modelName = config.modelName;
            // Reset initialization to use new model
            this.initialized = false;
            this.reranker = new CrossEncoderReranker();
            this.reranker.modelName = config.modelName;
            // A previous load failure must not keep the new model from ever
            // being tried: rerank() returns early while the strategy is
            // disabled, so lift a failure-induced disable here. A disable that
            // the caller requested is left alone.
            if (this.loadFailed) {
                this.loadFailed = false;
                this.isEnabled = true;
            }
        }
        if (config.enabled !== undefined) {
            this.isEnabled = Boolean(config.enabled);
            // The caller now owns the flag; forget any earlier load failure.
            this.loadFailed = false;
        }
    }
    /**
     * Get metadata about this reranking strategy.
     *
     * @returns {{description: string, requiredModels: string[], configOptions: object}}
     */
    getMetadata() {
        return {
            description: 'Cross-encoder reranking using transformer models for improved text relevance scoring',
            requiredModels: [
                'Xenova/ms-marco-MiniLM-L-6-v2',
                'cross-encoder/ms-marco-MiniLM-L-6-v2',
                'cross-encoder/ms-marco-MiniLM-L-2-v2'
            ],
            configOptions: {
                modelName: {
                    type: 'string',
                    description: 'Cross-encoder model name to use for reranking',
                    default: 'Xenova/ms-marco-MiniLM-L-6-v2'
                },
                enabled: {
                    type: 'boolean',
                    description: 'Enable or disable cross-encoder reranking',
                    default: true
                }
            }
        };
    }
    /**
     * Check if the strategy is ready to use. Triggers model loading when it
     * has not happened yet.
     *
     * @returns {Promise<boolean>} True when the strategy is enabled and the
     *   reranker reports itself as loaded.
     */
    async isReady() {
        await this.ensureInitialized();
        return this.isEnabled && this.reranker.isLoaded();
    }
    /**
     * Get the current model name as reported by the underlying reranker.
     */
    getModelName() {
        return this.reranker.getModelName();
    }
    /**
     * Clean up resources.
     *
     * No cleanup method is invoked on the reranker here; only the
     * initialization flag is reset so the next use goes through
     * ensureInitialized() again.
     */
    async cleanup() {
        this.initialized = false;
    }
}
|
|
156
|
+
/**
 * Build a cross-encoder reranking strategy.
 *
 * A plain function is offered instead of a factory class so callers can
 * obtain a strategy with a single call.
 *
 * @param {string} [modelName] - Passed straight to the
 *   CrossEncoderRerankingStrategy constructor.
 * @returns {CrossEncoderRerankingStrategy} A newly constructed strategy.
 */
export function createCrossEncoderStrategy(modelName) {
    const strategy = new CrossEncoderRerankingStrategy(modelName);
    return strategy;
}
|
|
165
|
+
/**
 * Text-Derived Reranking Strategy Implementation
 *
 * Converts images to text descriptions using an image-to-text pipeline, then
 * hands the descriptions (together with ordinary text results) to a
 * CrossEncoderRerankingStrategy. After reranking, image results get their
 * original content and content type back; the generated description is kept
 * in `metadata.generatedDescription`.
 *
 * Failure handling is best-effort: model-load or reranking errors are logged
 * and the caller receives its results unchanged.
 */
export class TextDerivedRerankingStrategy {
    name = 'text-derived';
    supportedContentTypes = ['text', 'image'];
    isEnabled = true;
    crossEncoderReranker;
    imageToTextModel = null;
    imageToTextModelName = 'Xenova/vit-gpt2-image-captioning';
    initialized = false;
    // True while `isEnabled` is false because the image-to-text model failed
    // to load, as opposed to the caller switching the strategy off.
    loadFailed = false;
    /**
     * @param {string} [imageToTextModelName] - Overrides the default
     *   image-to-text model name when provided.
     * @param {string} [crossEncoderModelName] - Forwarded to the underlying
     *   CrossEncoderRerankingStrategy.
     */
    constructor(imageToTextModelName, crossEncoderModelName) {
        if (imageToTextModelName) {
            this.imageToTextModelName = imageToTextModelName;
        }
        // Create the underlying cross-encoder strategy
        this.crossEncoderReranker = new CrossEncoderRerankingStrategy(crossEncoderModelName);
    }
    /**
     * Load the image-to-text pipeline if that has not been done yet.
     *
     * Never throws: on failure the strategy is disabled and a warning logged.
     */
    async ensureInitialized() {
        if (this.initialized) {
            return;
        }
        try {
            console.log(`Loading image-to-text model: ${this.imageToTextModelName}`);
            // Set up polyfills for transformers.js
            this.ensurePolyfills();
            const { pipeline } = await import('@huggingface/transformers');
            this.imageToTextModel = await pipeline('image-to-text', this.imageToTextModelName);
            this.initialized = true;
            this.loadFailed = false;
            console.log(`Image-to-text model loaded successfully: ${this.imageToTextModelName}`);
        }
        catch (error) {
            console.warn(`Image-to-text model initialization failed: ${error instanceof Error ? error.message : 'Unknown error'}`);
            this.isEnabled = false;
            this.loadFailed = true;
        }
    }
    /**
     * Ensure a `self` global exists before transformers.js is imported.
     *
     * Does nothing when `window` is defined.
     */
    ensurePolyfills() {
        if (typeof window !== 'undefined' || typeof globalThis === 'undefined') {
            return;
        }
        if (typeof globalThis.self === 'undefined') {
            globalThis.self = globalThis;
        }
        // `global` is a Node-specific name; referencing it unguarded throws a
        // ReferenceError in other window-less runtimes, which would abort
        // initialization and disable the strategy.
        if (typeof global !== 'undefined' && typeof global.self === 'undefined') {
            global.self = global;
        }
    }
    /**
     * Generate a text description for an image.
     *
     * @param {string} imagePath - Value handed to the image-to-text pipeline.
     * @returns {Promise<string>} The generated description, or
     *   `Image file: <filename>` when the pipeline call fails.
     * @throws {Error} When the image-to-text model is not loaded.
     */
    async generateImageDescription(imagePath) {
        await this.ensureInitialized();
        if (!this.imageToTextModel) {
            throw new Error('Image-to-text model not loaded');
        }
        try {
            const result = await this.imageToTextModel(imagePath);
            // Handle different response formats from the pipeline
            if (Array.isArray(result) && result.length > 0) {
                return result[0].generated_text || result[0].text || String(result[0]);
            }
            if (result && typeof result === 'object') {
                return result.generated_text || result.text || String(result);
            }
            return String(result);
        }
        catch (error) {
            console.warn(`Failed to generate description for image ${imagePath}: ${error instanceof Error ? error.message : 'Unknown error'}`);
            // Fallback to filename-based description
            const filename = imagePath.split('/').pop() || imagePath;
            return `Image file: ${filename}`;
        }
    }
    /**
     * Rerank search results using the text-derived approach.
     *
     * @param {string} query - Search query.
     * @param {Array<object>} results - Search results; entries with
     *   `contentType === 'image'` are expected to carry the image path in
     *   `content`.
     * @param {string} [contentType] - Content type the caller is asking for.
     * @returns {Promise<Array<object>>} Reranked results, or `results` itself
     *   when the strategy is disabled or reranking fails.
     * @throws {Error} When `contentType` is given and is not supported.
     */
    rerank = async (query, results, contentType) => {
        // If strategy is disabled, return results unchanged
        if (!this.isEnabled) {
            return results;
        }
        // Validate content type
        if (contentType && !this.supportedContentTypes.includes(contentType)) {
            throw new Error(`Text-derived strategy does not support content type '${contentType}'. ` +
                `Supported types: ${this.supportedContentTypes.join(', ')}`);
        }
        await this.ensureInitialized();
        // Initialization may have just disabled the strategy
        if (!this.isEnabled) {
            console.warn('Text-derived reranker not enabled, returning results unchanged');
            return results;
        }
        try {
            // Step 1: Convert images to text descriptions
            const processedResults = await Promise.all(results.map(async (result) => {
                if (result.contentType !== 'image') {
                    return result;
                }
                const description = await this.generateImageDescription(result.content);
                return {
                    ...result,
                    content: description,
                    // The cross-encoder strategy only scores results whose
                    // content type is missing or 'text' and passes everything
                    // else through unscored, so the working copy must be
                    // presented as text for the description to be ranked.
                    contentType: 'text',
                    originalContent: result.content,
                    originalContentType: result.contentType,
                    metadata: {
                        ...result.metadata,
                        originalImagePath: result.content,
                        generatedDescription: description
                    }
                };
            }));
            // Step 2: Use cross-encoder reranking on the text descriptions
            const rerankedResults = await this.crossEncoderReranker.rerank(query, processedResults);
            // Step 3: Restore original content for images
            return rerankedResults.map(result => {
                // Keyed on the saved type alone so the relabelled copy is
                // always switched back, whatever the original content was.
                if (!result.originalContentType) {
                    return result;
                }
                return {
                    ...result,
                    content: result.originalContent,
                    contentType: result.originalContentType,
                    // Keep the generated description in metadata for reference
                    metadata: {
                        ...result.metadata,
                        generatedDescription: result.content
                    }
                };
            });
        }
        catch (error) {
            console.warn(`Text-derived reranking failed: ${error instanceof Error ? error.message : 'Unknown error'}. ` +
                `Returning original results.`);
            return results;
        }
    };
    /**
     * Configure the reranking strategy.
     *
     * @param {object} config
     * @param {string} [config.imageToTextModel] - Switch the image-to-text
     *   model; it is loaded lazily on next use.
     * @param {string} [config.crossEncoderModel] - Forwarded to the underlying
     *   cross-encoder strategy as its `modelName`.
     * @param {boolean} [config.enabled] - Explicitly enable or disable the
     *   strategy; applied last, so it always has the last word.
     */
    configure(config) {
        if (config.imageToTextModel && typeof config.imageToTextModel === 'string') {
            this.imageToTextModelName = config.imageToTextModel;
            // Reset initialization to use new model
            this.initialized = false;
            this.imageToTextModel = null;
            // A previous load failure must not keep the new model from ever
            // being tried: rerank() returns early while the strategy is
            // disabled. A disable requested by the caller is left alone.
            if (this.loadFailed) {
                this.loadFailed = false;
                this.isEnabled = true;
            }
        }
        if (config.crossEncoderModel && typeof config.crossEncoderModel === 'string') {
            this.crossEncoderReranker.configure({ modelName: config.crossEncoderModel });
        }
        if (config.enabled !== undefined) {
            this.isEnabled = Boolean(config.enabled);
            // The caller now owns the flag; forget any earlier load failure.
            this.loadFailed = false;
        }
    }
    /**
     * Get metadata about this reranking strategy.
     *
     * @returns {{description: string, requiredModels: string[], configOptions: object}}
     */
    getMetadata() {
        return {
            description: 'Text-derived reranking that converts images to text descriptions then applies cross-encoder reranking',
            requiredModels: [
                'Xenova/vit-gpt2-image-captioning', // Image-to-text model
                'Xenova/ms-marco-MiniLM-L-6-v2' // Cross-encoder model
            ],
            configOptions: {
                imageToTextModel: {
                    type: 'string',
                    description: 'Image-to-text model name for generating descriptions',
                    default: 'Xenova/vit-gpt2-image-captioning'
                },
                crossEncoderModel: {
                    type: 'string',
                    description: 'Cross-encoder model name for text reranking',
                    default: 'Xenova/ms-marco-MiniLM-L-6-v2'
                },
                enabled: {
                    type: 'boolean',
                    description: 'Enable or disable text-derived reranking',
                    default: true
                }
            }
        };
    }
    /**
     * Check if the strategy is ready to use. Triggers loading of the
     * image-to-text model and asks the cross-encoder strategy whether it is
     * ready as well.
     *
     * @returns {Promise<boolean>} True when the strategy is enabled, the
     *   image-to-text model is present and the cross-encoder strategy is ready.
     */
    async isReady() {
        await this.ensureInitialized();
        const crossEncoderReady = await this.crossEncoderReranker.isReady();
        return this.isEnabled && this.imageToTextModel !== null && crossEncoderReady;
    }
    /**
     * Get the current model names being used.
     *
     * @returns {{imageToText: string, crossEncoder: *}}
     */
    getModelNames() {
        return {
            imageToText: this.imageToTextModelName,
            crossEncoder: this.crossEncoderReranker.getModelName()
        };
    }
    /**
     * Clean up resources: drop the image-to-text model reference, reset the
     * initialization flag and clean up the cross-encoder strategy.
     */
    async cleanup() {
        this.initialized = false;
        this.imageToTextModel = null;
        await this.crossEncoderReranker.cleanup();
    }
}
|
|
385
|
+
/**
 * Build a text-derived reranking strategy.
 *
 * @param {string} [imageToTextModelName] - Passed to the
 *   TextDerivedRerankingStrategy constructor as its first argument.
 * @param {string} [crossEncoderModelName] - Passed to the constructor as its
 *   second argument.
 * @returns {TextDerivedRerankingStrategy} A newly constructed strategy.
 */
export function createTextDerivedStrategy(imageToTextModelName, crossEncoderModelName) {
    const strategy = new TextDerivedRerankingStrategy(imageToTextModelName, crossEncoderModelName);
    return strategy;
}
|
|
391
|
+
/**
 * Build a standalone rerank function backed by a new text-derived strategy.
 *
 * @param {string} [imageToTextModelName] - Forwarded to createTextDerivedStrategy.
 * @param {string} [crossEncoderModelName] - Forwarded to createTextDerivedStrategy.
 * @returns {Function} The `rerank` property of the newly created strategy.
 */
export function createTextDerivedRerankFunction(imageToTextModelName, crossEncoderModelName) {
    const { rerank } = createTextDerivedStrategy(imageToTextModelName, crossEncoderModelName);
    return rerank;
}
|
|
398
|
+
/**
|
|
399
|
+
* Metadata-Based Reranking Strategy Implementation
|
|
400
|
+
*
|
|
401
|
+
* Reranks search results based on filename patterns, metadata, and content type
|
|
402
|
+
* information. This strategy is particularly useful for multimodal content where
|
|
403
|
+
* semantic similarity might not capture all relevant aspects.
|
|
404
|
+
*/
|
|
405
|
+
export class MetadataRerankingStrategy {
|
|
406
|
+
name = 'metadata';
|
|
407
|
+
supportedContentTypes = ['text', 'image', 'pdf', 'docx'];
|
|
408
|
+
isEnabled = true;
|
|
409
|
+
config;
|
|
410
|
+
constructor(config) {
|
|
411
|
+
// Default configuration with reasonable weights and boost factors
|
|
412
|
+
const defaultConfig = {
|
|
413
|
+
weights: {
|
|
414
|
+
filename: 0.4,
|
|
415
|
+
contentType: 0.3,
|
|
416
|
+
metadata: 0.3
|
|
417
|
+
},
|
|
418
|
+
boostFactors: {
|
|
419
|
+
diagram: 1.5,
|
|
420
|
+
chart: 1.4,
|
|
421
|
+
graph: 1.4,
|
|
422
|
+
image: 1.2,
|
|
423
|
+
screenshot: 1.3,
|
|
424
|
+
figure: 1.3
|
|
425
|
+
},
|
|
426
|
+
keywordBoosts: {
|
|
427
|
+
// Technical terms
|
|
428
|
+
'api': 1.2,
|
|
429
|
+
'architecture': 1.3,
|
|
430
|
+
'design': 1.2,
|
|
431
|
+
'implementation': 1.2,
|
|
432
|
+
'configuration': 1.2,
|
|
433
|
+
'setup': 1.2,
|
|
434
|
+
'guide': 1.2,
|
|
435
|
+
'tutorial': 1.2,
|
|
436
|
+
'example': 1.1,
|
|
437
|
+
'demo': 1.1,
|
|
438
|
+
// Visual content indicators
|
|
439
|
+
'visual': 1.3,
|
|
440
|
+
'overview': 1.2,
|
|
441
|
+
'flow': 1.3,
|
|
442
|
+
'process': 1.2,
|
|
443
|
+
'workflow': 1.3
|
|
444
|
+
}
|
|
445
|
+
};
|
|
446
|
+
this.config = {
|
|
447
|
+
weights: { ...defaultConfig.weights, ...config?.weights },
|
|
448
|
+
boostFactors: { ...defaultConfig.boostFactors, ...config?.boostFactors },
|
|
449
|
+
keywordBoosts: { ...defaultConfig.keywordBoosts, ...config?.keywordBoosts }
|
|
450
|
+
};
|
|
451
|
+
}
|
|
452
|
+
/**
|
|
453
|
+
* Calculate filename-based score
|
|
454
|
+
*/
|
|
455
|
+
calculateFilenameScore(query, filename) {
|
|
456
|
+
const queryLower = query.toLowerCase();
|
|
457
|
+
const filenameLower = filename.toLowerCase();
|
|
458
|
+
let score = 0;
|
|
459
|
+
// Exact filename match gets highest score
|
|
460
|
+
if (filenameLower.includes(queryLower)) {
|
|
461
|
+
score += 1.0;
|
|
462
|
+
}
|
|
463
|
+
// Word-level matching
|
|
464
|
+
const queryWords = queryLower.split(/\s+/).filter(word => word.length > 2);
|
|
465
|
+
const filenameWords = filenameLower.split(/[_\-\s\.]+/).filter(word => word.length > 2);
|
|
466
|
+
for (const queryWord of queryWords) {
|
|
467
|
+
for (const filenameWord of filenameWords) {
|
|
468
|
+
if (filenameWord.includes(queryWord) || queryWord.includes(filenameWord)) {
|
|
469
|
+
score += 0.3;
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
}
|
|
473
|
+
// Apply keyword boosts
|
|
474
|
+
for (const [keyword, boost] of Object.entries(this.config.keywordBoosts)) {
|
|
475
|
+
if (filenameLower.includes(keyword)) {
|
|
476
|
+
score *= boost;
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
// Apply pattern-based boosts
|
|
480
|
+
for (const [pattern, boost] of Object.entries(this.config.boostFactors)) {
|
|
481
|
+
if (filenameLower.includes(pattern)) {
|
|
482
|
+
score *= boost;
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
return Math.min(score, 2.0); // Cap at 2.0 to prevent extreme scores
|
|
486
|
+
}
|
|
487
|
+
/**
|
|
488
|
+
* Calculate content type-based score
|
|
489
|
+
*/
|
|
490
|
+
calculateContentTypeScore(query, contentType) {
|
|
491
|
+
const queryLower = query.toLowerCase();
|
|
492
|
+
// Base scores for different content types
|
|
493
|
+
const contentTypeScores = {
|
|
494
|
+
'image': 0.8,
|
|
495
|
+
'text': 1.0,
|
|
496
|
+
'pdf': 0.9,
|
|
497
|
+
'docx': 0.9
|
|
498
|
+
};
|
|
499
|
+
let score = contentTypeScores[contentType] || 0.5;
|
|
500
|
+
// Boost image content for visual-related queries
|
|
501
|
+
if (contentType === 'image') {
|
|
502
|
+
const visualKeywords = ['diagram', 'chart', 'graph', 'image', 'visual', 'screenshot', 'figure', 'illustration', 'visualization'];
|
|
503
|
+
for (const keyword of visualKeywords) {
|
|
504
|
+
if (queryLower.includes(keyword)) {
|
|
505
|
+
score *= 2.0; // More aggressive boost
|
|
506
|
+
break;
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
// Boost document content for text-heavy queries
|
|
511
|
+
if (contentType === 'text' || contentType === 'pdf' || contentType === 'docx') {
|
|
512
|
+
const textKeywords = ['documentation', 'guide', 'tutorial', 'explanation', 'description', 'details'];
|
|
513
|
+
for (const keyword of textKeywords) {
|
|
514
|
+
if (queryLower.includes(keyword)) {
|
|
515
|
+
score *= 1.8; // More aggressive boost
|
|
516
|
+
break;
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
return score;
|
|
521
|
+
}
|
|
522
|
+
/**
|
|
523
|
+
* Calculate metadata-based score
|
|
524
|
+
*/
|
|
525
|
+
calculateMetadataScore(query, metadata = {}) {
|
|
526
|
+
let score = 0;
|
|
527
|
+
const queryLower = query.toLowerCase();
|
|
528
|
+
// Check various metadata fields
|
|
529
|
+
const metadataFields = ['title', 'description', 'tags', 'category', 'type'];
|
|
530
|
+
for (const field of metadataFields) {
|
|
531
|
+
const value = metadata[field];
|
|
532
|
+
if (typeof value === 'string') {
|
|
533
|
+
const valueLower = value.toLowerCase();
|
|
534
|
+
if (valueLower.includes(queryLower)) {
|
|
535
|
+
score += 0.5;
|
|
536
|
+
}
|
|
537
|
+
// Word-level matching
|
|
538
|
+
const queryWords = queryLower.split(/\s+/).filter(word => word.length > 2);
|
|
539
|
+
for (const word of queryWords) {
|
|
540
|
+
if (valueLower.includes(word)) {
|
|
541
|
+
score += 0.2;
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
}
|
|
545
|
+
else if (Array.isArray(value)) {
|
|
546
|
+
// Handle tag arrays
|
|
547
|
+
for (const item of value) {
|
|
548
|
+
if (typeof item === 'string' && item.toLowerCase().includes(queryLower)) {
|
|
549
|
+
score += 0.3;
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
// Special handling for image metadata
|
|
555
|
+
if (metadata.dimensions) {
|
|
556
|
+
// Larger images might be more important
|
|
557
|
+
const { width, height } = metadata.dimensions;
|
|
558
|
+
if (width && height && width * height > 500000) { // > 500k pixels
|
|
559
|
+
score += 0.5; // More significant boost for large images
|
|
560
|
+
}
|
|
561
|
+
}
|
|
562
|
+
// File size considerations
|
|
563
|
+
if (metadata.fileSize) {
|
|
564
|
+
// Very small files might be less important
|
|
565
|
+
if (metadata.fileSize < 1000) {
|
|
566
|
+
score -= 0.1;
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
return Math.max(score, 0); // Ensure non-negative
|
|
570
|
+
}
|
|
571
|
+
/**
 * Rerank search results using metadata-based scoring.
 *
 * Each result's new score is `0.4 * original score + 0.6 * metadata boost`,
 * where the boost is the weighted sum of the filename, content-type and
 * metadata scores (weights come from `this.config.weights`). The individual
 * scores are recorded under `metadata.rerankingScores` on each returned result.
 *
 * @param {string} query - Search query.
 * @param {Array<object>} results - Results to rerank; each needs `document.source` and `score`.
 * @param {string} [contentType] - Only checked against `supportedContentTypes` to emit a
 *   warning; scoring itself uses each result's own `contentType`.
 * @returns {Promise<Array<object>>} New array sorted by combined score (descending), or the
 *   original `results` when the strategy is disabled or scoring throws.
 */
rerank = async (query, results, contentType) => {
    // If strategy is disabled, return results unchanged
    if (!this.isEnabled) {
        return results;
    }
    // An unsupported content type is reported but does not stop reranking.
    if (contentType && !this.supportedContentTypes.includes(contentType)) {
        console.warn(`Metadata strategy does not support content type '${contentType}'. ` +
            `Supported types: ${this.supportedContentTypes.join(', ')}. Proceeding anyway.`);
    }
    try {
        const { weights } = this.config;
        const scoredResults = results.map((result) => {
            const { source } = result.document;
            // Split on both separators so Windows-style paths ("dir\\file.md")
            // yield the bare filename instead of the whole path.
            const filename = source.split(/[\\/]/).pop() || source;
            // Calculate individual scores
            const filenameScore = this.calculateFilenameScore(query, filename);
            const contentTypeScore = this.calculateContentTypeScore(query, result.contentType);
            const metadataScore = this.calculateMetadataScore(query, result.metadata);
            // Combine scores using configured weights
            const metadataBoost = filenameScore * weights.filename +
                contentTypeScore * weights.contentType +
                metadataScore * weights.metadata;
            // Blend with the original similarity score; the metadata boost
            // deliberately carries the larger share.
            const combinedScore = result.score * 0.4 + metadataBoost * 0.6;
            return {
                ...result,
                score: combinedScore,
                metadata: {
                    ...result.metadata,
                    rerankingScores: {
                        original: result.score,
                        filename: filenameScore,
                        contentType: contentTypeScore,
                        metadata: metadataScore,
                        combined: combinedScore,
                    },
                },
            };
        });
        // Sort by combined score (descending); `scoredResults` is a fresh
        // array, so the caller's `results` is left untouched.
        scoredResults.sort((a, b) => b.score - a.score);
        return scoredResults;
    }
    catch (error) {
        // Best effort: a scoring failure degrades to the unreranked results.
        console.warn(`Metadata reranking failed: ${error instanceof Error ? error.message : 'Unknown error'}. ` +
            `Returning original results.`);
        return results;
    }
};
|
|
624
|
+
/**
|
|
625
|
+
* Configure the reranking strategy
|
|
626
|
+
*/
|
|
627
|
+
configure(config) {
|
|
628
|
+
if (config.weights && typeof config.weights === 'object') {
|
|
629
|
+
this.config.weights = { ...this.config.weights, ...config.weights };
|
|
630
|
+
}
|
|
631
|
+
if (config.boostFactors && typeof config.boostFactors === 'object') {
|
|
632
|
+
this.config.boostFactors = { ...this.config.boostFactors, ...config.boostFactors };
|
|
633
|
+
}
|
|
634
|
+
if (config.keywordBoosts && typeof config.keywordBoosts === 'object') {
|
|
635
|
+
this.config.keywordBoosts = { ...this.config.keywordBoosts, ...config.keywordBoosts };
|
|
636
|
+
}
|
|
637
|
+
if (config.enabled !== undefined) {
|
|
638
|
+
this.isEnabled = Boolean(config.enabled);
|
|
639
|
+
}
|
|
640
|
+
}
|
|
641
|
+
/**
|
|
642
|
+
* Get metadata about this reranking strategy
|
|
643
|
+
*/
|
|
644
|
+
getMetadata() {
|
|
645
|
+
return {
|
|
646
|
+
description: 'Metadata-based reranking using filename patterns, content types, and file metadata',
|
|
647
|
+
requiredModels: [], // No models required
|
|
648
|
+
configOptions: {
|
|
649
|
+
weights: {
|
|
650
|
+
type: 'object',
|
|
651
|
+
description: 'Weights for different scoring components',
|
|
652
|
+
default: this.config.weights,
|
|
653
|
+
properties: {
|
|
654
|
+
filename: { type: 'number', min: 0, max: 1 },
|
|
655
|
+
contentType: { type: 'number', min: 0, max: 1 },
|
|
656
|
+
metadata: { type: 'number', min: 0, max: 1 }
|
|
657
|
+
}
|
|
658
|
+
},
|
|
659
|
+
boostFactors: {
|
|
660
|
+
type: 'object',
|
|
661
|
+
description: 'Boost factors for specific file patterns',
|
|
662
|
+
default: this.config.boostFactors
|
|
663
|
+
},
|
|
664
|
+
keywordBoosts: {
|
|
665
|
+
type: 'object',
|
|
666
|
+
description: 'Boost factors for specific keywords in filenames',
|
|
667
|
+
default: this.config.keywordBoosts
|
|
668
|
+
},
|
|
669
|
+
enabled: {
|
|
670
|
+
type: 'boolean',
|
|
671
|
+
description: 'Enable or disable metadata-based reranking',
|
|
672
|
+
default: true
|
|
673
|
+
}
|
|
674
|
+
}
|
|
675
|
+
};
|
|
676
|
+
}
|
|
677
|
+
/**
|
|
678
|
+
* Check if the strategy is ready to use
|
|
679
|
+
*/
|
|
680
|
+
async isReady() {
|
|
681
|
+
// Metadata strategy doesn't require model loading, so it's always ready if enabled
|
|
682
|
+
return this.isEnabled;
|
|
683
|
+
}
|
|
684
|
+
/**
|
|
685
|
+
* Get current configuration
|
|
686
|
+
*/
|
|
687
|
+
getConfig() {
|
|
688
|
+
return { ...this.config };
|
|
689
|
+
}
|
|
690
|
+
/**
|
|
691
|
+
* Clean up resources (no-op for metadata strategy)
|
|
692
|
+
*/
|
|
693
|
+
async cleanup() {
|
|
694
|
+
// No resources to clean up for metadata-based reranking
|
|
695
|
+
}
|
|
696
|
+
}
|
|
697
|
+
/**
 * Factory function to create a metadata reranking strategy.
 *
 * @param {object} [config] - Passed unchanged to the `MetadataRerankingStrategy` constructor.
 * @returns {MetadataRerankingStrategy} A new strategy instance.
 */
export function createMetadataStrategy(config) {
    const strategy = new MetadataRerankingStrategy(config);
    return strategy;
}
|
|
703
|
+
/**
 * Create a RerankFunction using the metadata strategy.
 *
 * Builds a strategy with `createMetadataStrategy` and hands back its
 * `rerank` member on its own.
 *
 * @param {object} [config] - Forwarded to `createMetadataStrategy`.
 * @returns {Function} The new strategy's `rerank` function.
 */
export function createMetadataRerankFunction(config) {
    const { rerank } = createMetadataStrategy(config);
    return rerank;
}
|
|
710
|
+
/**
 * Create a RerankFunction using the cross-encoder strategy.
 *
 * Keeps the existing RerankFunction interface available to callers while
 * the work is delegated to a strategy object created internally.
 *
 * NOTE(review): `rerank` is returned detached from its strategy; this
 * presumably relies on the cross-encoder strategy binding `this` itself
 * (e.g. via an arrow-function field) - confirm against that class.
 *
 * @param {string} [modelName] - Forwarded to `createCrossEncoderStrategy`.
 * @returns {Function} The new strategy's `rerank` function.
 */
export function createCrossEncoderRerankFunction(modelName) {
    const { rerank } = createCrossEncoderStrategy(modelName);
    return rerank;
}
|
|
720
|
+
//# sourceMappingURL=reranking-strategies.js.map
|