smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
@@ -0,0 +1,768 @@
|
|
|
1
|
+
require_relative '../core/embedding'
|
|
2
|
+
require_relative '../core/fulltext_manager'
|
|
3
|
+
require_relative '../errors'
|
|
4
|
+
|
|
5
|
+
require 'concurrent'
|
|
6
|
+
require 'logger'
|
|
7
|
+
require 'json'
|
|
8
|
+
|
|
9
|
+
module SmartRAG
|
|
10
|
+
module Services
|
|
11
|
+
# HybridSearchService provides unified interface for hybrid search combining vector and full-text search
|
|
12
|
+
# Uses RRF (Reciprocal Rank Fusion) algorithm to combine results
|
|
13
|
+
class HybridSearchService
|
|
14
|
+
attr_reader :embedding_manager, :fulltext_manager, :config, :logger
|
|
15
|
+
|
|
16
|
+
# Default configuration for hybrid search
|
|
17
|
+
DEFAULT_CONFIG = {
  # RRF parameters
  rrf_k: 60, # RRF constant (higher = more weight to lower ranks)
  default_alpha: 0.95, # Weight for vector search results (0.0-1.0)
  vector_similarity_weight: 0.3, # Fallback vector weight used by rerank_results when no alpha/option is given
  rerank_limit: 64, # Candidate window fed to the reranker (rounded up to a multiple of 64 by normalize_rerank_limit)
  fusion_method: :weighted_sum, # :rrf uses rank-based fusion; any other value falls back to weighted-score fusion

  # Search parameters
  default_limit: 20,  # Results returned when the caller passes no :limit
  max_limit: 100,     # Hard cap applied by validate_limit
  min_limit: 1,       # Floor applied by validate_limit

  # Query parameters (enforced by validate_query on the stripped query)
  min_query_length: 2,
  max_query_length: 1000,

  # Result parameters
  deduplicate_results: true,   # Default for the :enable_deduplication option
  include_explanations: false, # Default for the :include_explanations option

  # Vector search weight adjustments (multipliers applied in weighted-score fusion)
  vector_weight_boost: 1.0,
  fulltext_weight_boost: 1.0
}.freeze
|
|
42
|
+
|
|
43
|
+
# Initialize HybridSearchService
|
|
44
|
+
# @param embedding_manager [Core::Embedding] Vector embedding manager
|
|
45
|
+
# @param fulltext_manager [Core::FulltextManager] Full-text search manager
|
|
46
|
+
# @param config [Hash] Configuration options
|
|
47
|
+
def initialize(embedding_manager, fulltext_manager, config = {})
  @embedding_manager = embedding_manager
  @fulltext_manager = fulltext_manager
  # Caller-supplied options win over DEFAULT_CONFIG (shallow merge).
  @config = DEFAULT_CONFIG.merge(config)
  # NOTE(review): the logger is taken from the raw `config` argument, not the
  # merged @config; a :logger key also ends up inside @config — harmless, but
  # confirm this is intentional.
  @logger = config[:logger] || Logger.new(STDOUT)
end
|
|
53
|
+
|
|
54
|
+
# Perform hybrid search combining vector and full-text results
|
|
55
|
+
# @param query [String] Search query text
|
|
56
|
+
# @param options [Hash] Search options
|
|
57
|
+
# @option options [String] :language Language code (auto-detect if nil)
|
|
58
|
+
# @option options [Integer] :limit Maximum results (default: 20)
|
|
59
|
+
# @option options [Float] :alpha Vector search weight (0.0-1.0, default: 0.7)
|
|
60
|
+
# @option options [Integer] :rrf_k RRF constant (default: 60)
|
|
61
|
+
# @option options [Hash] :filters Search filters
|
|
62
|
+
# @option options [Array<Integer>] :document_ids Filter by document IDs
|
|
63
|
+
# @option options [Array<Integer>] :tag_ids Filter by tag IDs
|
|
64
|
+
# @option options [Array<Tag>] :tags Tags to filter by
|
|
65
|
+
# @option options [Array<Float>] :query_embedding Pre-computed query embedding (optional)
|
|
66
|
+
# @option options [Boolean] :include_content Include full content
|
|
67
|
+
# @option options [Boolean] :include_metadata Include metadata
|
|
68
|
+
# @option options [Boolean] :enable_deduplication Deduplicate results
|
|
69
|
+
# @option options [Boolean] :include_explanations Include score explanations
|
|
70
|
+
# @return [Hash] Search results with combined rankings
|
|
71
|
+
def search(query, options = {})
  # Initialize variables for error handling: the rescue clauses below read
  # these, so they must exist even if validation/option extraction raises.
  final_results = []
  start_time = Time.now
  language = options[:language]
  alpha = config[:default_alpha]
  rrf_k = config[:rrf_k]
  @last_vector_search_failed = false
  @last_text_search_failed = false

  begin
    # Validate query (nil/empty/length bounds); raises ArgumentError which is
    # converted into an empty-result response by the rescue below.
    validation_error = validate_query(query)
    if validation_error
      @logger.error "Hybrid search validation failed: #{validation_error}"
      raise ArgumentError, validation_error
    end

    # Extract options, applying defaults and clamping.
    language = options[:language] || detect_language(query)
    limit = validate_limit(options[:limit] || config[:default_limit])
    alpha = validate_alpha(options[:alpha] || config[:default_alpha])
    alpha = adjust_alpha_for_query(alpha, query, language)
    rrf_k = options[:rrf_k] || config[:rrf_k]
    filters = options[:filters] || {}
    deduplicate = options.fetch(:enable_deduplication, config[:deduplicate_results])
    include_content = options.fetch(:include_content, false)
    include_metadata = options.fetch(:include_metadata, true)
    include_explanations = options.fetch(:include_explanations, config[:include_explanations])
    query_embedding = options[:query_embedding]

    # Recall more candidates than the final page so the reranker has a window.
    rerank_limit = normalize_rerank_limit(options[:rerank_limit] || config[:rerank_limit], limit)
    recall_limit = [rerank_limit, limit].max

    @logger.info "Hybrid search: '#{query}', language: #{language}, limit: #{limit}, alpha: #{alpha}, recall_limit: #{recall_limit}"

    # Execute both search methods (sequentially; each degrades to [] on failure
    # and sets its @last_*_search_failed flag).
    start_time = Time.now
    @logger.debug 'Starting vector search...'
    vector_results = perform_vector_search(query, query_embedding, recall_limit, filters, options)
    @logger.debug "Vector search completed: #{vector_results.length} results"

    @logger.debug 'Starting text search...'
    text_results = perform_text_search(query, language, recall_limit, filters)
    @logger.debug "Text search completed: #{text_results.length} results"

    combined_results = combine_results(
      text_results,
      vector_results,
      alpha: alpha,
      k: rrf_k,
      deduplicate: deduplicate
    )

    # Fallback pass: only when both legs ran cleanly yet produced nothing —
    # relax the query (strip quotes/booleans) and lower the vector threshold.
    if combined_results.empty? && !@last_vector_search_failed && !@last_text_search_failed
      @logger.info "Hybrid search fallback: relaxing query and thresholds"
      relaxed_query = relax_query(query)
      text_results = perform_text_search(relaxed_query, language, recall_limit, filters)
      vector_results = perform_vector_search(relaxed_query, query_embedding, recall_limit, filters, options.merge(fallback_threshold: 0.05))

      combined_results = combine_results(
        text_results,
        vector_results,
        alpha: alpha,
        k: rrf_k,
        deduplicate: deduplicate
      )
    end

    # Lexical rerank of the top candidates.
    reranked = rerank_results(
      combined_results.first(rerank_limit),
      query,
      text_results: text_results,
      vector_results: vector_results,
      vector_similarity_weight: options[:vector_similarity_weight] || alpha || config[:vector_similarity_weight]
    )

    final_results = reranked.first(limit)

    @logger.debug "Before enrichment: final_results count=#{final_results.length}"

    # Enrich results if requested (content, document metadata, score explanations).
    if include_content || include_metadata || include_explanations
      @logger.debug "Calling enrich_results with include_content=#{include_content}, include_metadata=#{include_metadata}"
      final_results = enrich_results(final_results, include_content, include_metadata, include_explanations)
      @logger.debug "After enrichment: enriched results count=#{final_results.length}"
    end

    execution_time = ((Time.now - start_time) * 1000).round

    # Build response
    response = {
      query: query,
      results: final_results,
      metadata: {
        total_count: final_results.length,
        execution_time_ms: execution_time,
        language: language,
        alpha: alpha,
        rrf_k: rrf_k,
        rerank_limit: rerank_limit,
        text_result_count: text_results.length,
        vector_result_count: vector_results.length,
        combined_score_stats: calculate_score_stats(final_results)
      }
    }

    # Log search (best-effort; log_search swallows its own errors).
    log_search(query, 'hybrid', response[:results].length, execution_time)

    @logger.info "Hybrid search completed: #{final_results.length} results in #{execution_time}ms"

    response
  rescue ::SmartRAG::Errors::HybridSearchServiceError
    # Service-level failures (e.g. dead vector DB connection) propagate.
    raise
  rescue ArgumentError => e
    # Return empty results on validation error
    execution_time = ((Time.now - start_time) * 1000).round
    {
      query: query,
      results: [],
      metadata: {
        total_count: 0,
        execution_time_ms: execution_time,
        # NOTE(review): :default_language has no entry in DEFAULT_CONFIG, so
        # this is nil unless supplied by the caller — confirm.
        language: language || config[:default_language],
        alpha: alpha || config[:default_alpha],
        rrf_k: rrf_k || config[:rrf_k],
        text_result_count: 0,
        vector_result_count: 0,
        combined_score_stats: {},
        error: e.message
      }
    }
  rescue StandardError => e
    @logger.error "Hybrid search failed: #{e.message}"
    @logger.error e.backtrace.join("\n")
    log_search(query, 'hybrid', 0, 0, e.message)
    # Return empty results on error instead of crashing
    # This allows tests and callers to handle errors gracefully
    execution_time = ((Time.now - start_time) * 1000).round
    {
      query: query,
      results: [],
      metadata: {
        total_count: 0,
        execution_time_ms: execution_time,
        language: language || config[:default_language],
        alpha: alpha || config[:default_alpha],
        rrf_k: rrf_k || config[:rrf_k],
        text_result_count: 0,
        vector_result_count: 0,
        combined_score_stats: {},
        error: e.message
      }
    }
  end
end
|
|
228
|
+
|
|
229
|
+
# Validate the incoming query string.
# Returns an English error message describing the first violated rule, or
# nil when the query is acceptable. Length rules apply to the stripped text.
def validate_query(query)
  return 'Query cannot be nil' if query.nil?

  trimmed = query.strip
  return 'Query cannot be empty' if trimmed.empty?
  return "Query too short (minimum #{config[:min_query_length]} characters)" if trimmed.length < config[:min_query_length]
  return "Query too long (maximum #{config[:max_query_length]} characters)" if trimmed.length > config[:max_query_length]

  nil
end
|
|
240
|
+
|
|
241
|
+
# Best-effort persistence of a search event into the :search_logs table.
# Never raises: any failure is logged and swallowed so logging can never
# break the search path.
def log_search(query, search_type, result_count, execution_time, error = nil)
  # Skip logging validation errors (nil/empty queries)
  return if query.nil? || query.to_s.strip.empty?

  begin
    # Skip logging if database or fulltext_manager is not available
    return unless @fulltext_manager && @fulltext_manager.respond_to?(:db) && @fulltext_manager.db

    # Build insert hash without error_message column (not in migration)
    log_data = {
      query: query.to_s,
      search_type: search_type,
      execution_time_ms: execution_time,
      results_count: result_count,
      created_at: Sequel::CURRENT_TIMESTAMP
    }

    # Only add filters if we have error (but format differently for existing columns)
    log_data[:filters] = { error: error }.to_json if error

    # NOTE(review): in Sequel, `db[:search_logs]` returns a dataset object,
    # which is always truthy, so this trailing guard likely never skips; it
    # also mixes `@fulltext_manager` with the `fulltext_manager` reader —
    # confirm and simplify.
    @fulltext_manager.db[:search_logs].insert(log_data) if fulltext_manager.db[:search_logs]
  rescue StandardError => e
    @logger.error "Failed to log search: #{e.message}"
  end
end
|
|
266
|
+
|
|
267
|
+
private
|
|
268
|
+
|
|
269
|
+
# Delegate language detection for the query text to the fulltext manager.
# @param query [String]
# @return [Object] whatever FulltextManager#detect_language produces
#   (used as a language code elsewhere in this class — confirm format there)
def detect_language(query)
  fulltext_manager.detect_language(query)
end
|
|
272
|
+
|
|
273
|
+
# Coerce the requested result count to an Integer and clamp it into the
# configured [min_limit, max_limit] range.
def validate_limit(limit)
  limit.to_i.clamp(config[:min_limit], config[:max_limit])
end
|
|
277
|
+
|
|
278
|
+
# Coerce alpha to a Float and clamp it into [0.0, 1.0].
def validate_alpha(alpha)
  alpha.to_f.clamp(0.0, 1.0)
end
|
|
281
|
+
|
|
282
|
+
# Heuristically adjust the vector weight for the given query.
# Very short Chinese queries are strong exact-keyword matches, so the vector
# weight is capped at 0.3 for them; all other queries keep alpha unchanged.
# NOTE(review): only 'zh' and 'zh_*' codes are recognized — hyphenated codes
# like 'zh-CN' would not match; confirm the code format used upstream.
def adjust_alpha_for_query(alpha, query, language)
  return alpha unless query

  trimmed_length = query.strip.length
  return alpha if trimmed_length.zero?

  lang = language.to_s
  chinese = lang == 'zh' || lang.start_with?('zh_')
  return [alpha, 0.3].min if chinese && trimmed_length <= 4

  alpha
end
|
|
296
|
+
|
|
297
|
+
# Pull the owning document id out of a section, which may be a Hash
# (symbol or string keys) or a model object responding to #document_id.
def extract_document_id(section)
  return section[:document_id] || section['document_id'] if section.is_a?(Hash)

  section&.document_id
end
|
|
304
|
+
|
|
305
|
+
# Shallow-convert a Hash's keys to Symbols; non-Hash inputs pass through.
def symbolize_keys(hash)
  return hash unless hash.is_a?(Hash)

  hash.to_h { |key, value| [key.to_sym, value] }
end
|
|
312
|
+
|
|
313
|
+
# Run the full-text leg of the hybrid search.
# Converts any :tags filter (Tag objects, names, or ids) into :tag_ids, then
# delegates to FulltextManager#search_by_text. On any failure it records the
# failure flag and degrades to [] so the vector leg can still serve results.
# FIX: reset @last_text_search_failed at entry — previously only the rescue
# set it, unlike perform_vector_search which resets its flag per call, so a
# stale true could suppress the fallback pass in #search on a later query.
def perform_text_search(query, language, limit, filters)
  @last_text_search_failed = false
  if filters && !filters.empty?
    # Convert tags to tag_ids for fulltext manager
    search_filters = filters.dup
    if filters[:tags] && !filters[:tags].empty?
      # Convert Tag objects / names / raw ids to integer ids
      search_filters[:tag_ids] = filters[:tags].map do |tag|
        case tag
        when ::SmartRAG::Models::Tag
          tag.id
        when Integer
          tag
        when String
          tag_obj = ::SmartRAG::Models::Tag.find(name: tag)
          tag_obj&.id
        end
      end.compact
      # Remove the original :tags key as fulltext manager expects :tag_ids
      search_filters.delete(:tags)
    end

    fulltext_manager.search_by_text(query, language, limit, filters: search_filters)
  else
    fulltext_manager.search_by_text(query, language, limit)
  end
rescue StandardError => e
  @last_text_search_failed = true
  @logger.warn "Text search failed, continuing with vector-only results: #{e.message}"
  []
end
|
|
343
|
+
|
|
344
|
+
# Run the vector (embedding) leg of the hybrid search.
# A dead DB connection is fatal (raised as HybridSearchServiceError); any
# other failure degrades to [] so the text leg can still serve results.
def perform_vector_search(query, query_embedding, limit, filters, options = {})
  @last_vector_search_failed = false
  # Reuse a caller-provided embedding when available; otherwise compute one.
  # NOTE(review): generate_query_embedding is invoked via `send`, which
  # suggests it is private on the embedding manager — confirm that is
  # intentional rather than exposing a public API for it.
  query_embedding ||= embedding_manager.send(:generate_query_embedding, query, options)

  # Use embedding-based search path by default for predictable error handling.
  tags = filters[:tags]
  if tags && !tags.empty?
    embedding_manager.search_by_vector_with_tags(query_embedding, tags, options.merge(limit: limit))
  else
    embedding_manager.search_by_vector(query_embedding, options.merge(limit: limit))
  end
rescue PG::ConnectionBad => e
  # Connection-level failure: unrecoverable here, surface as a service error.
  @last_vector_search_failed = true
  raise ::SmartRAG::Errors::HybridSearchServiceError, "Vector database unavailable: #{e.message}"
rescue PG::Error => e
  # Other PostgreSQL errors degrade gracefully to text-only results.
  @last_vector_search_failed = true
  @logger.warn "Vector search temporarily unavailable, continuing with text-only results: #{e.message}"
  []
rescue StandardError => e
  @last_vector_search_failed = true
  @logger.warn "Vector search failed, continuing with text-only results: #{e.message}"
  []
end
|
|
367
|
+
|
|
368
|
+
# Fuse text and vector result lists with rank-based (RRF-style) scores.
# Each result contributes 1/(k + rank + 1) on its own side; the sides are
# blended as alpha * vector + (1 - alpha) * text and sorted descending.
# FIX: when several results share a dedup key (e.g. sections of the same
# document, since extract_result_key prefers document_id), the previous code
# unconditionally overwrote the score, letting a lower-ranked duplicate
# clobber a higher-ranked one; we now keep the best (highest) score per side.
# @param deduplicate [Boolean] accepted for interface parity with
#   combine_with_weighted_scores; grouping by key already collapses
#   duplicates, so it has no further effect here.
def combine_with_weighted_rrf(text_results, vector_results, alpha:, k:, deduplicate:)
  combined = {}

  # Fold one ranked list into `combined` under the given score slot.
  fold = lambda do |results, score_key|
    results.each_with_index do |result, idx|
      key = extract_result_key(result)
      score = 1.0 / (k + idx + 1)
      entry = (combined[key] ||= { section: normalize_result_section(result), text_score: 0, vector_score: 0 })
      entry[score_key] = [entry[score_key], score].max
    end
  end

  fold.call(text_results, :text_score)
  fold.call(vector_results, :vector_score)

  # Calculate weighted scores and sort best-first.
  combined.map do |_key, data|
    {
      section: data[:section],
      combined_score: alpha * data[:vector_score] + (1 - alpha) * data[:text_score],
      vector_score: data[:vector_score],
      text_score: data[:text_score]
    }
  end.sort_by { |r| -r[:combined_score] }
end
|
|
404
|
+
|
|
405
|
+
# Dispatch to the configured fusion strategy: :rrf uses rank-based fusion,
# anything else falls back to normalized weighted-score fusion.
def combine_results(text_results, vector_results, alpha:, k:, deduplicate:)
  if config[:fusion_method] == :rrf
    combine_with_weighted_rrf(text_results, vector_results, alpha: alpha, k: k, deduplicate: deduplicate)
  else
    combine_with_weighted_scores(text_results, vector_results, alpha: alpha, deduplicate: deduplicate)
  end
end
|
|
413
|
+
|
|
414
|
+
# Fuse the two result lists using min-max-normalized raw scores rather than
# ranks: text scores come from :rank_score, vector scores from
# :boosted_score/:similarity (see build_*_score_map). Final score is
# alpha * vector + (1 - alpha) * text, after per-side boost multipliers.
# NOTE(review): the `deduplicate:` parameter is unused here — grouping by
# extract_result_key is what collapses duplicates — and when two results
# share a key the later assignment overwrites the earlier score; confirm
# both behaviors are intended.
def combine_with_weighted_scores(text_results, vector_results, alpha:, deduplicate:)
  text_scores = build_text_score_map(text_results)
  vector_scores = build_vector_score_map(vector_results)

  # Min-max normalize each side independently to [0, 1].
  normalized_text = normalize_scores(text_scores)
  normalized_vector = normalize_scores(vector_scores)

  combined = {}
  text_results.each do |result|
    key = extract_result_key(result)
    combined[key] ||= { section: normalize_result_section(result), text_score: 0.0, vector_score: 0.0 }
    combined[key][:text_score] = normalized_text[extract_result_section_id(result)] || 0.0
  end

  vector_results.each do |result|
    key = extract_result_key(result)
    combined[key] ||= { section: normalize_result_section(result), text_score: 0.0, vector_score: 0.0 }
    combined[key][:vector_score] = normalized_vector[extract_result_section_id(result)] || 0.0
  end

  # Apply per-side boosts, blend, and sort best-first.
  combined.map do |_key, data|
    text_score = data[:text_score] * config[:fulltext_weight_boost]
    vector_score = data[:vector_score] * config[:vector_weight_boost]
    combined_score = alpha * vector_score + (1 - alpha) * text_score
    {
      section: data[:section],
      combined_score: combined_score,
      vector_score: vector_score,
      text_score: text_score
    }
  end.sort_by { |r| -r[:combined_score] }
end
|
|
446
|
+
|
|
447
|
+
# Normalize the rerank candidate window.
# A non-positive request defaults to twice the page size; the window is never
# smaller than the page size and is rounded up to the next multiple of 64.
def normalize_rerank_limit(rerank_limit, limit)
  wanted = rerank_limit.to_i
  wanted = limit * 2 unless wanted.positive?
  floor = [wanted, limit].max
  (floor + 63) / 64 * 64
end
|
|
453
|
+
|
|
454
|
+
# Lexically rerank the fused candidates.
# Blends a token-overlap score (token_similarity) with the raw vector
# similarity, plus a tag-match bonus (rank_feature_score):
#   rerank = tkweight * token_score + vtweight * vector_score + tag_bonus
# NOTE(review): the `text_results:` keyword is accepted but never used in
# this method — confirm whether it can be dropped from callers.
def rerank_results(results, query, text_results:, vector_results:, vector_similarity_weight:)
  return results if results.empty?

  tkweight = 1.0 - vector_similarity_weight.to_f
  vtweight = vector_similarity_weight.to_f

  vector_map = build_vector_score_map(vector_results)

  query_tokens = tokenize(query)
  return results if query_tokens.empty?
  # Long queries carry enough lexical signal: cap the vector weight at 0.2.
  if query_tokens.length >= 6
    vtweight = [vtweight, 0.2].min
    tkweight = 1.0 - vtweight
  end

  results.map do |result|
    section = result[:section]
    content = extract_section_content(section)
    title = extract_section_title(section)
    tags = extract_section_tags(section)

    token_score = token_similarity(query_tokens, content, title, tags)
    vector_score = vector_map[extract_section_id(section)] || 0.0
    # Penalize pure-vector hits that share no tokens with a multi-token query.
    if token_score <= 0.0 && query_tokens.length >= 3
      vector_score *= 0.3
    end
    # Tag bonus only applies when there is at least some token overlap.
    rank_feature = token_score > 0 ? rank_feature_score(query_tokens, tags) : 0.0
    rerank_score = (tkweight * token_score) + (vtweight * vector_score) + rank_feature

    result.merge(
      rerank_score: rerank_score,
      combined_score: rerank_score
    )
  end.sort_by { |r| -r[:combined_score] }
end
|
|
489
|
+
|
|
490
|
+
# Map section_id => raw full-text rank score (:rank_score, defaulting to 0.0).
# Results without a resolvable section id are skipped.
def build_text_score_map(text_results)
  map = {}
  text_results.each do |result|
    section_id = extract_result_section_id(result)
    map[section_id] = result[:rank_score] || 0.0 if section_id
  end
  map
end
|
|
499
|
+
|
|
500
|
+
# Map section_id => raw vector score, preferring :boosted_score over
# :similarity (0.0 when neither is present). Results without a resolvable
# section id are skipped.
def build_vector_score_map(vector_results)
  map = {}
  vector_results.each do |result|
    section_id = extract_result_section_id(result)
    map[section_id] = result[:boosted_score] || result[:similarity] || 0.0 if section_id
  end
  map
end
|
|
509
|
+
|
|
510
|
+
# Resolve the section id from a raw search result, which may be a flat Hash
# (:section_id), a wrapper Hash with a :section Hash/object, or an object
# responding to #id. Returns nil when no id can be found.
def extract_result_section_id(result)
  if result.is_a?(Hash)
    return result[:section_id] if result[:section_id]

    section = result[:section]
    return section[:id] if section.is_a?(Hash)
    return section.id if section.respond_to?(:id)
  end

  result.respond_to?(:id) ? result.id : nil
end
|
|
518
|
+
|
|
519
|
+
# Resolve the id from a section Hash or an object responding to #id;
# nil otherwise.
def extract_section_id(section)
  if section.is_a?(Hash)
    section[:id]
  elsif section.respond_to?(:id)
    section.id
  end
end
|
|
525
|
+
|
|
526
|
+
# Fetch the body text of a section (Hash with symbol/string keys, or an
# object responding to #content). Always returns a String ('' when absent).
def extract_section_content(section)
  return section[:content] || section['content'] || '' if section.is_a?(Hash)

  section&.content.to_s
end
|
|
533
|
+
|
|
534
|
+
# Fetch the title of a section, trying :title, :section_title, then the
# string key 'title' for Hashes; objects use #section_title. Returns ''
# when no title is present.
def extract_section_title(section)
  return section[:title] || section[:section_title] || section['title'] || '' if section.is_a?(Hash)

  section&.section_title.to_s
end
|
|
541
|
+
|
|
542
|
+
# Extract tag names for a section. Hash tags may be an Array (returned
# as-is) or a comma-separated String (split and stripped); model objects
# yield their tags' names. Anything else yields [].
def extract_section_tags(section)
  if section.is_a?(Hash)
    raw = section[:tags] || section['tags']
    case raw
    when Array then raw
    when String then raw.split(',').map(&:strip)
    else []
    end
  elsif section.respond_to?(:tags)
    section.tags.map(&:name)
  else
    []
  end
end
|
|
553
|
+
|
|
554
|
+
# Tokenize text for lexical rerank scoring.
# ASCII word/number/underscore runs become single lowercased tokens; CJK
# (Han) runs are emitted whole plus, for runs longer than two characters,
# every overlapping character bigram (simple bigram segmentation).
# FIX: both arms of the old `length <= 2` branch pushed the whole chunk, so
# the branch was redundant — collapsed without behavior change.
# @param text [String, nil]
# @return [Array<String>]
def tokenize(text)
  return [] if text.nil?

  tokens = []
  text.scan(/[\p{Han}]+|[A-Za-z0-9_]+/) do |chunk|
    if chunk.match?(/\p{Han}/)
      tokens << chunk
      # Overlapping bigrams for longer CJK runs.
      0.upto(chunk.length - 2) { |idx| tokens << chunk[idx, 2] } if chunk.length > 2
    else
      tokens << chunk.downcase
    end
  end
  tokens
end
|
|
574
|
+
|
|
575
|
+
# Score lexical overlap between the (deduplicated) query tokens and a
# section: body tokens count x1, title tokens x2, tag names x5. The total
# hit weight is averaged over the number of unique query tokens. Returns
# 0.0 for sections with no body tokens.
def token_similarity(query_tokens, content, title, tags)
  unique_query = query_tokens.uniq
  doc_tokens = tokenize(content)
  return 0.0 if doc_tokens.empty?

  weights = Hash.new(0)
  doc_tokens.each { |token| weights[token] += 1 }
  tokenize(title).each { |token| weights[token] += 2 }
  Array(tags).each { |tag| weights[tag.to_s.downcase] += 5 }

  total_hits = unique_query.sum { |token| weights[token] }
  total_hits.to_f / unique_query.length
end
|
|
593
|
+
|
|
594
|
+
# Fraction of query tokens that exactly match a (lowercased) tag name.
# Returns a value in [0.0, 1.0]; 0.0 for an empty query token list.
def rank_feature_score(query_tokens, tags)
  normalized_tags = Array(tags).map { |tag| tag.to_s.downcase }
  matches = query_tokens.count { |token| normalized_tags.include?(token) }
  matches.to_f / [query_tokens.length, 1].max
end
|
|
601
|
+
|
|
602
|
+
# Min-max normalize a {key => score} map into [0.0, 1.0].
# Degenerate case (all scores effectively equal): positive scores map to
# 1.0 and non-positive scores to 0.0.
def normalize_scores(score_map)
  return {} if score_map.empty?

  min, max = score_map.values.minmax
  spread = max - min
  return score_map.transform_values { |v| v > 0 ? 1.0 : 0.0 } if spread.abs < 1e-9

  score_map.transform_values { |v| (v - min) / spread }
end
|
|
614
|
+
|
|
615
|
+
# Unwrap the :section payload from a wrapper Hash; any other result (flat
# Hash, model object, etc.) is returned unchanged.
def normalize_result_section(result)
  if result.is_a?(Hash) && result[:section]
    result[:section]
  else
    result
  end
end
|
|
620
|
+
|
|
621
|
+
# Produce a relaxed variant of the query for the fallback search pass:
# quotes, boolean operators (AND/OR/NOT, case-insensitive) and parentheses
# become spaces, whitespace is collapsed and trimmed. If relaxing would
# leave an empty string, the original query is returned unchanged.
def relax_query(query)
  return query if query.nil?

  relaxed = query.to_s
                 .gsub(/["']/, ' ')
                 .gsub(/\b(AND|OR|NOT)\b/i, ' ')
                 .gsub(/[()]/, ' ')
                 .gsub(/\s+/, ' ')
                 .strip

  relaxed.empty? ? query : relaxed
end
|
|
633
|
+
|
|
634
|
+
# Compute the deduplication key for a search result.
# Preference order favors the owning document id so duplicates collapse per
# document rather than per section; falls back through section/result ids
# and finally object_id so every result yields some key.
def extract_result_key(result)
  unless result.is_a?(Hash)
    # Model objects (SourceSection etc.): document_id first, then id.
    begin
      return result.document_id
    rescue StandardError
      return result.id || result.object_id
    end
  end

  section = result[:section]
  unless section
    # Fulltext-style flat hash.
    return result[:section_id] || result[:id] || result.object_id
  end

  if section.is_a?(Hash)
    # Vector-style wrapper: {embedding, section, similarity}.
    section[:document_id] || section[:id] || section[:section_id] || result[:id]
  else
    # Section is a model object — document_id may raise if unavailable.
    begin
      section.document_id
    rescue StandardError
      result[:id] || result.object_id
    end
  end
end
|
|
667
|
+
|
|
668
|
+
# Optionally attach content, document metadata, and score explanations to
# each final result. Document metadata is fetched from :source_documents
# (best-effort: any DB/JSON failure is logged and skipped, never raised).
def enrich_results(results, include_content, include_metadata, include_explanations)
  @logger.debug "enrich_results called with: include_content=#{include_content}, include_metadata=#{include_metadata}, include_explanations=#{include_explanations}"

  results.map do |result|
    enriched = result.dup
    section = result[:section]

    @logger.debug "Processing result, section class=#{section.class}, section inspect=#{section.inspect[0..200]}"

    if include_content
      # Section may be a Hash (symbol or string keys) or a model object.
      enriched[:content] = if section.is_a?(Hash)
                             section[:content] || section['content'] || ''
                           else
                             section&.content || ''
                           end
    end

    if include_metadata
      document_id = if section.is_a?(Hash)
                      section[:document_id] || section['document_id']
                    else
                      section&.document_id
                    end

      @logger.debug "Document ID extracted: #{document_id.inspect} (section type: #{section.class})"

      base_metadata = if section.is_a?(Hash)
                        {
                          section_id: section[:id] || section['id'] || section[:section_id],
                          document_id: document_id
                        }
                      else
                        {
                          section_id: section&.id,
                          document_id: document_id
                        }
                      end

      if document_id && document_id != ''
        begin
          doc = @fulltext_manager.db[:source_documents].where(id: document_id).first
          @logger.debug "Fetched document for id=#{document_id}: doc=#{doc ? 'found' : 'nil'}"

          if doc
            # Add document title
            base_metadata[:document_title] = doc[:title] if doc[:title]

            # Merge document metadata (may contain category, author, etc.)
            if doc[:metadata]
              @logger.debug "Document metadata found: #{doc[:metadata].inspect}"
              # Metadata may be stored as a JSON string or a pre-parsed Hash;
              # unparseable JSON silently degrades to {}.
              parsed_metadata = if doc[:metadata].is_a?(String)
                                  begin
                                    JSON.parse(doc[:metadata])
                                  rescue StandardError
                                    {}
                                  end
                                else
                                  doc[:metadata]
                                end
              parsed_metadata = symbolize_keys(parsed_metadata) if parsed_metadata.is_a?(Hash)
              @logger.debug "Parsed metadata: #{parsed_metadata.inspect}"
              # NOTE(review): merge! lets document metadata overwrite the
              # :section_id/:document_id keys if it contains them — confirm.
              base_metadata.merge!(parsed_metadata) if parsed_metadata.is_a?(Hash)
            else
              @logger.debug 'Document has no metadata field or is nil'
            end
          else
            @logger.warn "Document not found for id=#{document_id}"
          end
        rescue StandardError => e
          @logger.warn "Failed to fetch document metadata for document_id=#{document_id}: #{e.message}"
          @logger.debug e.backtrace[0..5].join("\n")
        end
      else
        @logger.warn 'Document ID is nil or empty'
      end

      enriched[:metadata] = base_metadata
    end

    if include_explanations
      enriched[:score_explanation] =
        "Combined: #{result[:combined_score].round(4)} (vector: #{result[:vector_score].round(4)}, text: #{result[:text_score].round(4)})"
    end

    enriched
  end
end
|
|
755
|
+
|
|
756
|
+
# Summarize the :combined_score distribution of the final results as
# {min:, max:, avg:} rounded to 4 decimals; {} for an empty result set.
def calculate_score_stats(results)
  return {} if results.empty?

  scores = results.map { |r| r[:combined_score] }
  min, max = scores.minmax
  {
    min: min.round(4),
    max: max.round(4),
    avg: (scores.sum / scores.size.to_f).round(4)
  }
end
|
|
766
|
+
end
|
|
767
|
+
end
|
|
768
|
+
end
|