smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,768 @@
1
+ require_relative '../core/embedding'
2
+ require_relative '../core/fulltext_manager'
3
+ require_relative '../errors'
4
+
5
+ require 'concurrent'
6
+ require 'logger'
7
+ require 'json'
8
+
9
module SmartRAG
  module Services
    # HybridSearchService provides a unified interface for hybrid search,
    # combining vector (embedding) and full-text results. Fusion uses either
    # weighted-sum score blending (the default) or weighted RRF (Reciprocal
    # Rank Fusion), selected by config[:fusion_method].
    class HybridSearchService
      attr_reader :embedding_manager, :fulltext_manager, :config, :logger

      # Default configuration for hybrid search; caller-supplied config is
      # merged over these values in #initialize.
      DEFAULT_CONFIG = {
        # RRF parameters
        rrf_k: 60, # RRF constant (higher = more weight to lower ranks)
        default_alpha: 0.95, # Weight for vector search results (0.0-1.0)
        vector_similarity_weight: 0.3, # fallback rerank weight (see #search)
        rerank_limit: 64, # candidate pool size handed to the reranker
        fusion_method: :weighted_sum, # :weighted_sum or :rrf

        # Search result-count bounds (see #validate_limit)
        default_limit: 20,
        max_limit: 100,
        min_limit: 1,

        # Query length bounds in characters, after strip (see #validate_query)
        min_query_length: 2,
        max_query_length: 1000,

        # Result post-processing defaults
        deduplicate_results: true,
        include_explanations: false,

        # Per-source multiplicative boosts used by weighted-sum fusion
        vector_weight_boost: 1.0,
        fulltext_weight_boost: 1.0
      }.freeze
42
+
43
+ # Initialize HybridSearchService
44
+ # @param embedding_manager [Core::Embedding] Vector embedding manager
45
+ # @param fulltext_manager [Core::FulltextManager] Full-text search manager
46
+ # @param config [Hash] Configuration options
47
def initialize(embedding_manager, fulltext_manager, config = {})
  @embedding_manager = embedding_manager
  @fulltext_manager = fulltext_manager
  # Caller overrides win over DEFAULT_CONFIG; unknown keys are kept as-is.
  @config = DEFAULT_CONFIG.merge(config)
  # Logger is read from the raw caller hash (not the merged config).
  @logger = config[:logger] || Logger.new(STDOUT)
end
53
+
54
+ # Perform hybrid search combining vector and full-text results
55
+ # @param query [String] Search query text
56
+ # @param options [Hash] Search options
57
+ # @option options [String] :language Language code (auto-detect if nil)
58
+ # @option options [Integer] :limit Maximum results (default: 20)
59
+ # @option options [Float] :alpha Vector search weight (0.0-1.0, default: 0.7)
60
+ # @option options [Integer] :rrf_k RRF constant (default: 60)
61
+ # @option options [Hash] :filters Search filters
62
+ # @option options [Array<Integer>] :document_ids Filter by document IDs
63
+ # @option options [Array<Integer>] :tag_ids Filter by tag IDs
64
+ # @option options [Array<Tag>] :tags Tags to filter by
65
+ # @option options [Array<Float>] :query_embedding Pre-computed query embedding (optional)
66
+ # @option options [Boolean] :include_content Include full content
67
+ # @option options [Boolean] :include_metadata Include metadata
68
+ # @option options [Boolean] :enable_deduplication Deduplicate results
69
+ # @option options [Boolean] :include_explanations Include score explanations
70
+ # @return [Hash] Search results with combined rankings
71
# Orchestrates the full hybrid-search pipeline: validate, run both backends,
# fuse (RRF or weighted sum), optionally retry with a relaxed query, rerank
# lexically, enrich, and log. Returns a hash with :query, :results and
# :metadata; on failure it returns the same shape with empty :results and an
# :error message instead of raising — except HybridSearchServiceError, which
# is re-raised to the caller.
def search(query, options = {})
  # Pre-assign everything the rescue clauses below read, so a failure at any
  # point can still build a well-formed error response.
  final_results = []
  start_time = Time.now
  language = options[:language]
  alpha = config[:default_alpha]
  rrf_k = config[:rrf_k]
  @last_vector_search_failed = false
  @last_text_search_failed = false

  begin
    # Reject nil/blank/too-short/too-long queries up front.
    validation_error = validate_query(query)
    if validation_error
      @logger.error "Hybrid search validation failed: #{validation_error}"
      raise ArgumentError, validation_error
    end

    # Resolve effective parameters (options override config defaults).
    language = options[:language] || detect_language(query)
    limit = validate_limit(options[:limit] || config[:default_limit])
    alpha = validate_alpha(options[:alpha] || config[:default_alpha])
    alpha = adjust_alpha_for_query(alpha, query, language)
    rrf_k = options[:rrf_k] || config[:rrf_k]
    filters = options[:filters] || {}
    deduplicate = options.fetch(:enable_deduplication, config[:deduplicate_results])
    include_content = options.fetch(:include_content, false)
    include_metadata = options.fetch(:include_metadata, true)
    include_explanations = options.fetch(:include_explanations, config[:include_explanations])
    query_embedding = options[:query_embedding]

    # Recall more candidates than requested so the reranker has room to work.
    rerank_limit = normalize_rerank_limit(options[:rerank_limit] || config[:rerank_limit], limit)
    recall_limit = [rerank_limit, limit].max

    @logger.info "Hybrid search: '#{query}', language: #{language}, limit: #{limit}, alpha: #{alpha}, recall_limit: #{recall_limit}"

    # Run both backends; each degrades to [] on recoverable failure.
    start_time = Time.now
    @logger.debug 'Starting vector search...'
    vector_results = perform_vector_search(query, query_embedding, recall_limit, filters, options)
    @logger.debug "Vector search completed: #{vector_results.length} results"

    @logger.debug 'Starting text search...'
    text_results = perform_text_search(query, language, recall_limit, filters)
    @logger.debug "Text search completed: #{text_results.length} results"

    combined_results = combine_results(
      text_results,
      vector_results,
      alpha: alpha,
      k: rrf_k,
      deduplicate: deduplicate
    )

    # Fallback: only when both backends succeeded yet nothing matched —
    # retry with a relaxed query and a lowered vector threshold.
    if combined_results.empty? && !@last_vector_search_failed && !@last_text_search_failed
      @logger.info "Hybrid search fallback: relaxing query and thresholds"
      relaxed_query = relax_query(query)
      text_results = perform_text_search(relaxed_query, language, recall_limit, filters)
      vector_results = perform_vector_search(relaxed_query, query_embedding, recall_limit, filters, options.merge(fallback_threshold: 0.05))

      combined_results = combine_results(
        text_results,
        vector_results,
        alpha: alpha,
        k: rrf_k,
        deduplicate: deduplicate
      )
    end

    # Lexical rerank of the top candidates.
    # NOTE(review): alpha doubles as the vector_similarity_weight fallback
    # here, before config[:vector_similarity_weight] — confirm intended.
    reranked = rerank_results(
      combined_results.first(rerank_limit),
      query,
      text_results: text_results,
      vector_results: vector_results,
      vector_similarity_weight: options[:vector_similarity_weight] || alpha || config[:vector_similarity_weight]
    )

    final_results = reranked.first(limit)

    @logger.debug "Before enrichment: final_results count=#{final_results.length}"

    # Attach content/metadata/explanations when requested.
    if include_content || include_metadata || include_explanations
      @logger.debug "Calling enrich_results with include_content=#{include_content}, include_metadata=#{include_metadata}"
      final_results = enrich_results(final_results, include_content, include_metadata, include_explanations)
      @logger.debug "After enrichment: enriched results count=#{final_results.length}"
    end

    execution_time = ((Time.now - start_time) * 1000).round

    # Successful response envelope.
    response = {
      query: query,
      results: final_results,
      metadata: {
        total_count: final_results.length,
        execution_time_ms: execution_time,
        language: language,
        alpha: alpha,
        rrf_k: rrf_k,
        rerank_limit: rerank_limit,
        text_result_count: text_results.length,
        vector_result_count: vector_results.length,
        combined_score_stats: calculate_score_stats(final_results)
      }
    }

    # Persist the search event (best-effort; never raises).
    log_search(query, 'hybrid', response[:results].length, execution_time)

    @logger.info "Hybrid search completed: #{final_results.length} results in #{execution_time}ms"

    response
  rescue ::SmartRAG::Errors::HybridSearchServiceError
    # Hard failures (e.g. vector DB unavailable) propagate to the caller.
    raise
  rescue ArgumentError => e
    # Validation failure: empty result envelope carrying the error message.
    # NOTE(review): :default_language is not a key of DEFAULT_CONFIG, so
    # :language is nil here unless the caller supplied one — confirm.
    execution_time = ((Time.now - start_time) * 1000).round
    {
      query: query,
      results: [],
      metadata: {
        total_count: 0,
        execution_time_ms: execution_time,
        language: language || config[:default_language],
        alpha: alpha || config[:default_alpha],
        rrf_k: rrf_k || config[:rrf_k],
        text_result_count: 0,
        vector_result_count: 0,
        combined_score_stats: {},
        error: e.message
      }
    }
  rescue StandardError => e
    # Any other error: log it, record the failed search, and return an empty
    # envelope instead of crashing the caller.
    @logger.error "Hybrid search failed: #{e.message}"
    @logger.error e.backtrace.join("\n")
    log_search(query, 'hybrid', 0, 0, e.message)
    execution_time = ((Time.now - start_time) * 1000).round
    {
      query: query,
      results: [],
      metadata: {
        total_count: 0,
        execution_time_ms: execution_time,
        language: language || config[:default_language],
        alpha: alpha || config[:default_alpha],
        rrf_k: rrf_k || config[:rrf_k],
        text_result_count: 0,
        vector_result_count: 0,
        combined_score_stats: {},
        error: e.message
      }
    }
  end
end
228
+
229
# Validate a raw query string against the configured length bounds.
# @param query [String, nil]
# @return [String, nil] a human-readable error message, or nil when valid
def validate_query(query)
  return 'Query cannot be nil' if query.nil?

  stripped = query.strip
  return 'Query cannot be empty' if stripped.empty?

  size = stripped.length
  return "Query too short (minimum #{config[:min_query_length]} characters)" if size < config[:min_query_length]
  return "Query too long (maximum #{config[:max_query_length]} characters)" if size > config[:max_query_length]

  nil
end
240
+
241
# Best-effort persistence of a search event into the search_logs table.
# Never raises: logging failures must not break the search path.
# @param query [String] the user query (nil/blank queries are not logged)
# @param search_type [String] e.g. 'hybrid'
# @param result_count [Integer] number of results returned
# @param execution_time [Integer] elapsed milliseconds
# @param error [String, nil] optional error text, stored as JSON in :filters
def log_search(query, search_type, result_count, execution_time, error = nil)
  # Skip validation failures (nil/empty queries) — nothing useful to record.
  return if query.nil? || query.to_s.strip.empty?

  begin
    # Skip silently when no database handle is available.
    return unless fulltext_manager && fulltext_manager.respond_to?(:db) && fulltext_manager.db

    # Build insert hash without an error_message column (not in migration).
    log_data = {
      query: query.to_s,
      search_type: search_type,
      execution_time_ms: execution_time,
      results_count: result_count,
      created_at: Sequel::CURRENT_TIMESTAMP
    }

    # The schema has no error column; tuck errors into :filters as JSON.
    log_data[:filters] = { error: error }.to_json if error

    # FIX: the previous `if db[:search_logs]` guard compared against a Sequel
    # dataset, which is always truthy — it never filtered anything. Removed.
    fulltext_manager.db[:search_logs].insert(log_data)
  rescue StandardError => e
    @logger.error "Failed to log search: #{e.message}"
  end
end
266
+
267
+ private
268
+
269
# Delegate language detection for a query to the full-text manager.
def detect_language(query)
  @fulltext_manager.detect_language(query)
end
272
+
273
# Coerce the requested result count to an Integer and clamp it into the
# configured [min_limit, max_limit] range.
def validate_limit(limit)
  limit.to_i.clamp(config[:min_limit], config[:max_limit])
end
277
+
278
# Coerce alpha to a Float and clamp it into the valid [0.0, 1.0] range.
def validate_alpha(alpha)
  alpha.to_f.clamp(0.0, 1.0)
end
281
+
282
# Tune the vector weight for the query at hand: very short Chinese queries
# are capped at alpha 0.3 because exact keyword matching is strong there.
# @return [Float] the (possibly lowered) alpha
def adjust_alpha_for_query(alpha, query, language)
  return alpha unless query

  length = query.strip.length
  return alpha if length == 0

  lang = language.to_s
  chinese = lang == 'zh' || lang.start_with?('zh_')
  return [alpha, 0.3].min if chinese && length <= 4

  alpha
end
296
+
297
# Pull a document_id from either a hash row (symbol or string key) or a
# model object responding to #document_id; nil-safe.
def extract_document_id(section)
  return section[:document_id] || section['document_id'] if section.is_a?(Hash)

  section&.document_id
end
304
+
305
# Shallow-symbolize the keys of a hash; non-hash values pass through
# untouched. Nested hashes are NOT converted.
def symbolize_keys(hash)
  return hash unless hash.is_a?(Hash)

  hash.transform_keys(&:to_sym)
end
312
+
313
# Run the full-text backend. Tag filters are translated into :tag_ids
# (FulltextManager's expected key) from Tag objects, IDs, or tag names.
# On any error, records the failure flag and degrades to [] so the hybrid
# pipeline can continue with vector-only results.
def perform_text_search(query, language, limit, filters)
  if filters.nil? || filters.empty?
    return fulltext_manager.search_by_text(query, language, limit)
  end

  search_filters = filters.dup
  tags = filters[:tags]
  if tags && !tags.empty?
    # Map heterogeneous tag values (model / id / name) to integer IDs.
    search_filters[:tag_ids] = tags.map do |tag|
      case tag
      when ::SmartRAG::Models::Tag
        tag.id
      when Integer
        tag
      when String
        ::SmartRAG::Models::Tag.find(name: tag)&.id
      end
    end.compact
    # FulltextManager expects :tag_ids, not :tags.
    search_filters.delete(:tags)
  end

  fulltext_manager.search_by_text(query, language, limit, filters: search_filters)
rescue StandardError => e
  @last_text_search_failed = true
  @logger.warn "Text search failed, continuing with vector-only results: #{e.message}"
  []
end
343
+
344
# Run the vector backend, generating a query embedding if the caller did
# not supply one. Connection loss escalates as HybridSearchServiceError;
# every other failure degrades to [] so text-only results can still flow.
def perform_vector_search(query, query_embedding, limit, filters, options = {})
  @last_vector_search_failed = false
  # Reuse a caller-supplied embedding when present; otherwise compute one.
  query_embedding ||= embedding_manager.send(:generate_query_embedding, query, options)

  vector_options = options.merge(limit: limit)
  tags = filters[:tags]
  if tags && !tags.empty?
    embedding_manager.search_by_vector_with_tags(query_embedding, tags, vector_options)
  else
    embedding_manager.search_by_vector(query_embedding, vector_options)
  end
rescue PG::ConnectionBad => e
  # A dead connection is unrecoverable for this request: escalate.
  @last_vector_search_failed = true
  raise ::SmartRAG::Errors::HybridSearchServiceError, "Vector database unavailable: #{e.message}"
rescue PG::Error => e
  # NOTE(review): PG::Error is referenced without a visible require in this
  # file — assumes the pg gem is loaded before an error can occur; confirm.
  @last_vector_search_failed = true
  @logger.warn "Vector search temporarily unavailable, continuing with text-only results: #{e.message}"
  []
rescue StandardError => e
  @last_vector_search_failed = true
  @logger.warn "Vector search failed, continuing with text-only results: #{e.message}"
  []
end
367
+
368
# Fuse the two ranked lists with weighted Reciprocal Rank Fusion: each
# result contributes 1/(k + rank + 1) from its own list, and the two
# contributions are blended with alpha (vector weight).
# NOTE(review): the deduplicate: flag is accepted but unused here — the
# grouping by key already collapses duplicates; confirm intent.
def combine_with_weighted_rrf(text_results, vector_results, alpha:, k:, deduplicate:)
  buckets = {}

  # Record one list's RRF contribution into the shared bucket map.
  register = lambda do |result, rank, slot|
    key = extract_result_key(result)
    buckets[key] ||= { section: normalize_result_section(result), text_score: 0, vector_score: 0 }
    buckets[key][slot] = 1.0 / (k + rank + 1)
  end

  text_results.each_with_index { |result, rank| register.call(result, rank, :text_score) }
  vector_results.each_with_index { |result, rank| register.call(result, rank, :vector_score) }

  # Blend and sort descending by combined score.
  buckets.values.map do |entry|
    {
      section: entry[:section],
      combined_score: alpha * entry[:vector_score] + (1 - alpha) * entry[:text_score],
      vector_score: entry[:vector_score],
      text_score: entry[:text_score]
    }
  end.sort_by { |row| -row[:combined_score] }
end
404
+
405
# Dispatch fusion to the configured strategy: :rrf selects weighted RRF,
# anything else falls back to weighted-sum score blending.
def combine_results(text_results, vector_results, alpha:, k:, deduplicate:)
  if config[:fusion_method] == :rrf
    combine_with_weighted_rrf(text_results, vector_results, alpha: alpha, k: k, deduplicate: deduplicate)
  else
    combine_with_weighted_scores(text_results, vector_results, alpha: alpha, deduplicate: deduplicate)
  end
end
413
+
414
# Fuse the two lists by min-max normalizing each side's raw scores (keyed
# by section id), merging per result key, applying the configured per-source
# boosts, and blending with alpha (vector weight).
def combine_with_weighted_scores(text_results, vector_results, alpha:, deduplicate:)
  normalized_text = normalize_scores(build_text_score_map(text_results))
  normalized_vector = normalize_scores(build_vector_score_map(vector_results))

  merged = {}

  text_results.each do |result|
    key = extract_result_key(result)
    entry = (merged[key] ||= { section: normalize_result_section(result), text_score: 0.0, vector_score: 0.0 })
    entry[:text_score] = normalized_text[extract_result_section_id(result)] || 0.0
  end

  vector_results.each do |result|
    key = extract_result_key(result)
    entry = (merged[key] ||= { section: normalize_result_section(result), text_score: 0.0, vector_score: 0.0 })
    entry[:vector_score] = normalized_vector[extract_result_section_id(result)] || 0.0
  end

  merged.values.map do |entry|
    text_score = entry[:text_score] * config[:fulltext_weight_boost]
    vector_score = entry[:vector_score] * config[:vector_weight_boost]
    {
      section: entry[:section],
      combined_score: alpha * vector_score + (1 - alpha) * text_score,
      vector_score: vector_score,
      text_score: text_score
    }
  end.sort_by { |row| -row[:combined_score] }
end
446
+
447
# Size the rerank candidate pool: non-positive values default to twice the
# requested limit; the result is never below limit and is rounded up to the
# next multiple of 64.
def normalize_rerank_limit(rerank_limit, limit)
  pool = rerank_limit.to_i
  pool = limit * 2 if pool <= 0
  floor = [pool, limit].max
  ((floor + 63) / 64) * 64
end
453
+
454
# Lexically rerank the fused candidates: blend a token-overlap score against
# the backend vector similarity, plus a tag-match rank feature. Sets both
# :rerank_score and :combined_score on each result.
# NOTE(review): text_results: is accepted but never read here — confirm.
def rerank_results(results, query, text_results:, vector_results:, vector_similarity_weight:)
  return results if results.empty?

  # Split the final score between token overlap (tk) and vector (vt).
  tkweight = 1.0 - vector_similarity_weight.to_f
  vtweight = vector_similarity_weight.to_f

  vector_map = build_vector_score_map(vector_results)

  query_tokens = tokenize(query)
  return results if query_tokens.empty?
  # Long queries (>= 6 tokens): lexical evidence dominates; cap vector at 0.2.
  if query_tokens.length >= 6
    vtweight = [vtweight, 0.2].min
    tkweight = 1.0 - vtweight
  end

  results.map do |result|
    section = result[:section]
    content = extract_section_content(section)
    title = extract_section_title(section)
    tags = extract_section_tags(section)

    token_score = token_similarity(query_tokens, content, title, tags)
    vector_score = vector_map[extract_section_id(section)] || 0.0
    # Penalize pure-vector hits sharing no tokens with a multi-token query.
    if token_score <= 0.0 && query_tokens.length >= 3
      vector_score *= 0.3
    end
    # Tag matches only boost results that already have lexical overlap.
    rank_feature = token_score > 0 ? rank_feature_score(query_tokens, tags) : 0.0
    rerank_score = (tkweight * token_score) + (vtweight * vector_score) + rank_feature

    result.merge(
      rerank_score: rerank_score,
      combined_score: rerank_score
    )
  end.sort_by { |r| -r[:combined_score] }
end
489
+
490
# Build a section_id => rank_score map from full-text results; rows without
# a resolvable section id are skipped.
def build_text_score_map(text_results)
  map = {}
  text_results.each do |result|
    section_id = extract_result_section_id(result)
    map[section_id] = result[:rank_score] || 0.0 if section_id
  end
  map
end
499
+
500
# Build a section_id => similarity map from vector results, preferring the
# boosted score when present; rows without a section id are skipped.
def build_vector_score_map(vector_results)
  map = {}
  vector_results.each do |result|
    section_id = extract_result_section_id(result)
    map[section_id] = result[:boosted_score] || result[:similarity] || 0.0 if section_id
  end
  map
end
509
+
510
# Resolve a section id from the various result shapes: a top-level
# :section_id, a nested section hash/object, or an object with #id.
def extract_result_section_id(result)
  if result.is_a?(Hash)
    return result[:section_id] if result[:section_id]

    section = result[:section]
    return section[:id] if section.is_a?(Hash)
    return section.id if section.respond_to?(:id)
  end
  return result.id if result.respond_to?(:id)

  nil
end
518
+
519
# Resolve an id from a section hash or a model object with #id; nil otherwise.
def extract_section_id(section)
  case section
  when Hash
    section[:id]
  else
    section.respond_to?(:id) ? section.id : nil
  end
end
525
+
526
# Resolve section content from a hash (symbol or string key) or a model
# object; always returns a String for the object path ('' when nil).
def extract_section_content(section)
  return section[:content] || section['content'] || '' if section.is_a?(Hash)

  section&.content.to_s
end
533
+
534
# Resolve a title from a hash (:title, :section_title, or 'title') or a
# model object's #section_title; always a String for the object path.
def extract_section_title(section)
  return section[:title] || section[:section_title] || section['title'] || '' if section.is_a?(Hash)

  section&.section_title.to_s
end
541
+
542
# Resolve tags as an Array of strings/objects: hash tags may be an Array
# (returned as-is) or a comma-separated String (split and stripped); model
# objects contribute their tag names. Anything else yields [].
def extract_section_tags(section)
  if section.is_a?(Hash)
    raw = section[:tags] || section['tags']
    return raw if raw.is_a?(Array)
    return raw.split(',').map(&:strip) if raw.is_a?(String)
  elsif section.respond_to?(:tags)
    return section.tags.map(&:name)
  end

  []
end
553
+
554
# Split text into search tokens: ASCII word runs are lowercased whole; CJK
# (Han) runs are emitted whole plus — for runs longer than 2 characters —
# their overlapping bigrams. nil yields [].
def tokenize(text)
  return [] if text.nil?

  tokens = []
  text.scan(/[\p{Han}]+|[A-Za-z0-9_]+/) do |run|
    if run.match?(/\p{Han}/)
      tokens << run
      # Overlapping bigrams for longer Han runs (run.length - 1 of them).
      (run.length - 1).times { |i| tokens << run[i, 2] } if run.length > 2
    else
      tokens << run.downcase
    end
  end
  tokens
end
574
+
575
# Score lexical overlap between query tokens and a section: body tokens
# count 1, title tokens +2, tag names +5; the total hit weight is averaged
# over unique query tokens. Returns 0.0 when the content has no tokens.
def token_similarity(query_tokens, content, title, tags)
  unique_query = query_tokens.uniq
  doc_tokens = tokenize(content)
  return 0.0 if doc_tokens.empty?

  weights = Hash.new(0)
  doc_tokens.each { |tok| weights[tok] += 1 }
  tokenize(title).each { |tok| weights[tok] += 2 }
  Array(tags).each { |tag| weights[tag.to_s.downcase] += 5 }

  total = unique_query.sum { |tok| weights[tok] }
  total.to_f / unique_query.length
end
593
+
594
# Fraction of query tokens that exactly match a (lowercased) tag name.
# Guarded against empty query-token lists via the max-with-1 denominator.
def rank_feature_score(query_tokens, tags)
  tag_tokens = Array(tags).map { |t| t.to_s.downcase }
  matched = query_tokens.count { |token| tag_tokens.include?(token) }
  matched.to_f / [query_tokens.length, 1].max
end
601
+
602
# Min-max normalize a score map to [0, 1]. Degenerate case (all values
# equal within 1e-9): positives map to 1.0, the rest to 0.0.
def normalize_scores(score_map)
  return {} if score_map.empty?

  lo, hi = score_map.values.minmax
  span = hi - lo
  return score_map.transform_values { |v| v > 0 ? 1.0 : 0.0 } if span.abs < 1e-9

  score_map.transform_values { |v| (v - lo) / span }
end
614
+
615
# Unwrap the nested :section from a result hash when present; otherwise
# the result itself already is the section.
def normalize_result_section(result)
  result.is_a?(Hash) && result[:section] ? result[:section] : result
end
620
+
621
# Strip boolean operators (AND/OR/NOT, case-insensitive), quotes and
# parentheses so a fallback search can match loosely; whitespace is
# collapsed. Returns the original query when relaxing would leave nothing.
def relax_query(query)
  return query if query.nil?

  relaxed = query.to_s
                 .gsub(/["']/, ' ')
                 .gsub(/\b(AND|OR|NOT)\b/i, ' ')
                 .gsub(/[()]/, ' ')
                 .gsub(/\s+/, ' ')
                 .strip

  relaxed.empty? ? query : relaxed
end
633
+
634
# Key used to group/deduplicate fused results. Prefers document_id over
# section_id so deduplication happens per document, not per section, and
# falls back to object_id as a last-resort unique key.
def extract_result_key(result)
  # Model objects (SourceSection etc.): document_id first, then id.
  unless result.is_a?(Hash)
    begin
      return result.document_id
    rescue StandardError
      return result.id || result.object_id
    end
  end

  section = result[:section]
  # Fulltext-style rows carry their ids at the top level.
  return result[:section_id] || result[:id] || result.object_id unless section

  if section.is_a?(Hash)
    # Vector-style rows: {embedding, section, similarity} with a hash section.
    section[:document_id] || section[:id] || section[:section_id] || result[:id]
  else
    # Section is a model object — ask it for its document_id.
    begin
      section.document_id
    rescue StandardError
      result[:id] || result.object_id
    end
  end
end
667
+
668
# Decorate raw fused results with section content, document-level metadata
# and human-readable score explanations, per the include_* flags. Document
# rows are read from source_documents; a JSON :metadata column is parsed
# and shallow-symbolized into each result's :metadata. All lookup failures
# are logged and swallowed — enrichment never raises.
def enrich_results(results, include_content, include_metadata, include_explanations)
  @logger.debug "enrich_results called with: include_content=#{include_content}, include_metadata=#{include_metadata}, include_explanations=#{include_explanations}"

  results.map do |result|
    enriched = result.dup
    section = result[:section]

    @logger.debug "Processing result, section class=#{section.class}, section inspect=#{section.inspect[0..200]}"

    # Section content: hash row (symbol/string key) or model object.
    if include_content
      enriched[:content] = if section.is_a?(Hash)
                             section[:content] || section['content'] || ''
                           else
                             section&.content || ''
                           end
    end

    if include_metadata
      document_id = if section.is_a?(Hash)
                      section[:document_id] || section['document_id']
                    else
                      section&.document_id
                    end

      @logger.debug "Document ID extracted: #{document_id.inspect} (section type: #{section.class})"

      # Base metadata always carries the section and document ids.
      base_metadata = if section.is_a?(Hash)
                        {
                          section_id: section[:id] || section['id'] || section[:section_id],
                          document_id: document_id
                        }
                      else
                        {
                          section_id: section&.id,
                          document_id: document_id
                        }
                      end

      if document_id && document_id != ''
        begin
          doc = @fulltext_manager.db[:source_documents].where(id: document_id).first
          @logger.debug "Fetched document for id=#{document_id}: doc=#{doc ? 'found' : 'nil'}"

          if doc
            base_metadata[:document_title] = doc[:title] if doc[:title]

            # Document metadata may be a JSON string or a hash; bad JSON
            # degrades to {} rather than failing the whole result.
            if doc[:metadata]
              @logger.debug "Document metadata found: #{doc[:metadata].inspect}"
              parsed_metadata = if doc[:metadata].is_a?(String)
                                  begin
                                    JSON.parse(doc[:metadata])
                                  rescue StandardError
                                    {}
                                  end
                                else
                                  doc[:metadata]
                                end
              parsed_metadata = symbolize_keys(parsed_metadata) if parsed_metadata.is_a?(Hash)
              @logger.debug "Parsed metadata: #{parsed_metadata.inspect}"
              base_metadata.merge!(parsed_metadata) if parsed_metadata.is_a?(Hash)
            else
              @logger.debug 'Document has no metadata field or is nil'
            end
          else
            @logger.warn "Document not found for id=#{document_id}"
          end
        rescue StandardError => e
          @logger.warn "Failed to fetch document metadata for document_id=#{document_id}: #{e.message}"
          @logger.debug e.backtrace[0..5].join("\n")
        end
      else
        @logger.warn 'Document ID is nil or empty'
      end

      enriched[:metadata] = base_metadata
    end

    if include_explanations
      enriched[:score_explanation] =
        "Combined: #{result[:combined_score].round(4)} (vector: #{result[:vector_score].round(4)}, text: #{result[:text_score].round(4)})"
    end

    enriched
  end
end
755
+
756
# Summarize the combined scores of a result set as min/max/avg, rounded to
# 4 decimal places; an empty set yields {}.
def calculate_score_stats(results)
  return {} if results.empty?

  scores = results.map { |r| r[:combined_score] }
  {
    min: scores.min.round(4),
    max: scores.max.round(4),
    avg: (scores.sum / scores.size.to_f).round(4)
  }
end
766
+ end
767
+ end
768
+ end