smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,745 @@
1
+ require 'securerandom'
2
+ require 'json'
3
+ require 'time'
4
+ require 'digest'
5
+
6
module SmartRAG
  # Executes a RetrievalPlan and formats the response as an EvidencePack.
  #
  # A plan is a Hash (string or symbol keys) holding a non-empty +queries+
  # array plus optional +budget+, +global_filters+, +ranking+ and +output+
  # sections. Every query is sent to the client's +search+ method, the hits
  # are fused per section with reciprocal-rank fusion (RRF), trimmed by
  # budget/diversity rules and returned as evidence hashes.
  class Retrieve
    DEFAULT_TOP_K = 30
    DEFAULT_CANDIDATE_K = 200
    DEFAULT_RRF_K = 60
    DEFAULT_MAX_SNIPPET_CHARS = 800

    # Query modes that map directly onto a client search type.
    SUPPORTED_MODES = %w[exact semantic hybrid].freeze
    private_constant :SUPPORTED_MODES

    # @param client [#search, #logger] object that performs the searches
    def initialize(client)
      @client = client
      @logger = client.logger
    end

    # Runs the plan and builds the evidence pack.
    #
    # @param plan [Hash] retrieval plan (string or symbol keys)
    # @return [Hash] pack with :version, :plan, :plan_id, :request_id,
    #   :generated_at, :evidences, :stats, :explain and :warnings
    # @raise [ArgumentError] when the plan is not a Hash or has no queries
    def execute(plan:)
      # Checked before normalisation: normalize_plan turns any non-Hash into
      # {} and would otherwise hide this error behind "requires queries".
      raise ArgumentError, 'Retrieval plan must be a hash' unless plan.is_a?(Hash)

      normalized_plan = normalize_plan(plan)
      validate_plan!(normalized_plan)

      started_at = monotonic_now
      generated_at = Time.now.utc.iso8601
      request_id = normalized_plan[:request_id] || SecureRandom.uuid
      plan_id = SecureRandom.uuid
      warnings = []
      ignored_fields = []
      applied_filters = {}

      candidates, by_mode_stats = gather_candidates(
        normalized_plan,
        warnings: warnings,
        ignored_fields: ignored_fields,
        applied_filters: applied_filters
      )

      aggregated = aggregate_candidates(candidates, normalized_plan)
      selected = apply_budget_and_diversity(aggregated, normalized_plan, ignored_fields)
      evidences = build_evidences(selected, normalized_plan, generated_at)

      took_ms = ((monotonic_now - started_at) * 1000).round
      stats = {
        candidates: candidates.length,
        returned: evidences.length,
        took_ms: took_ms,
        by_mode: by_mode_stats
      }

      explain = {
        fusion: build_fusion_explain(normalized_plan),
        rerank: build_rerank_explain(normalized_plan),
        filters_applied: applied_filters,
        diversity: build_diversity_explain(normalized_plan),
        ignored_fields: ignored_fields.uniq
      }

      unique_warnings = warnings.uniq
      pack = {
        version: normalized_plan[:version] || '0.1',
        plan: normalized_plan,
        plan_id: plan_id,
        request_id: request_id,
        generated_at: generated_at,
        evidences: evidences,
        stats: stats,
        explain: explain,
        warnings: unique_warnings
      }

      log_retrieve(
        plan: normalized_plan,
        stats: stats,
        explain: explain,
        warnings: unique_warnings,
        request_id: request_id
      )
      pack
    end

    private

    # Runs every non-empty query of the plan through the client and converts
    # the hits into candidate hashes.
    #
    # The three keyword arrays/hashes are mutated in place so the caller can
    # report warnings, ignored fields and applied filters.
    #
    # @return [Array(Array<Hash>, Hash)] candidates and per-mode hit counters
    def gather_candidates(plan, warnings:, ignored_fields:, applied_filters:)
      budget = plan[:budget] || {}
      per_mode_k = symbolize_keys(budget[:per_mode_k] || {})
      candidate_k = positive_int(budget[:candidate_k], DEFAULT_CANDIDATE_K)
      rrf_k = positive_int(plan.dig(:ranking, :fusion, :rrf_k), DEFAULT_RRF_K)
      global_filters = symbolize_keys(plan[:global_filters] || {})
      by_mode = {}
      candidates = []

      plan[:queries].each_with_index do |query, index|
        unless query.is_a?(Hash)
          warnings << "query[#{index}] is not a hash; skipped"
          next
        end

        query_text = query[:text].to_s.strip
        next if query_text.empty?

        requested_mode = query[:mode].to_s
        mode = normalize_mode(requested_mode)
        # normalize_mode maps everything unknown to 'hybrid', so the warning
        # has to be based on what was requested, not on the normalised mode.
        unless requested_mode.empty? || SUPPORTED_MODES.include?(requested_mode)
          warnings << "query[#{index}] mode=#{requested_mode} not supported; fallback to hybrid"
        end
        search_type = mode_to_search_type(mode) || 'hybrid'

        # per_mode_k has symbol keys while mode is a String.
        mode_limit = positive_int(per_mode_k[mode.to_sym], candidate_k)
        query_weight = query[:weight].to_f
        query_weight = 1.0 if query_weight <= 0.0

        query_filters = symbolize_keys(query[:filters] || {})
        merged_filters = merge_filters(global_filters, query_filters)
        search_options, query_ignored, applied = build_search_options(merged_filters)

        ignored_fields.concat(query_ignored)
        applied_filters.merge!(applied) { |_key, old_v, new_v| merge_filter_values(old_v, new_v) }

        response = @client.search(
          query_text,
          search_options.merge(search_type: search_type, limit: mode_limit)
        )
        results = extract_results(response)

        mode_stats = (by_mode[mode] ||= { candidates: 0, returned: 0 })
        mode_stats[:candidates] += results.length
        mode_stats[:returned] += results.length

        results.each_with_index do |result, rank_index|
          candidate = build_candidate(
            result,
            mode: mode,
            query_text: query_text,
            query_index: index,
            rank_index: rank_index,
            query_weight: query_weight,
            rrf_k: rrf_k
          )
          next if candidate.nil?
          next unless candidate_passes_filters?(candidate, merged_filters)

          candidates << candidate
        end
      end

      [candidates, by_mode]
    end

    # Converts one search hit into a candidate hash.
    #
    # @param rank_index [Integer] zero-based position of the hit in its list
    # @param rrf_k [Integer] RRF smoothing constant
    # @return [Hash, nil] nil when the hit has neither section nor document id
    def build_candidate(result, mode:, query_text:, query_index:, rank_index:, query_weight:,
                        rrf_k: DEFAULT_RRF_K)
      section = extract_section(result)
      section_id = extract_section_id(result, section)
      document_id = extract_document_id(result, section)
      return nil if section_id.nil? && document_id.nil?

      snippet = extract_snippet(result, section)
      source_uri = extract_source_uri(result, section)
      vector_score = extract_vector_score(result, mode)
      fts_score = extract_fts_score(result, mode)
      rank = rank_index + 1

      {
        key: evidence_key(section_id, document_id, snippet),
        id: stable_evidence_id(section_id, document_id, snippet),
        section_id: section_id,
        document_id: document_id,
        title: extract_title(result, section),
        snippet: snippet,
        language: extract_language(result, section),
        source_uri: source_uri,
        source_type: extract_source_type(source_uri, result),
        signals: {
          vector_score: vector_score,
          vector_rank: vector_score ? rank : nil,
          fts_score: fts_score,
          fts_rank: fts_score ? rank : nil,
          rrf_score: query_weight / (rrf_k + rank).to_f,
          rerank_score: extract_rerank_score(result),
          tag_score: 0.0,
          topic_score: 0.0
        },
        provenance: {
          mode: mode,
          query_text: query_text,
          query_index: query_index
        },
        metadata: extract_metadata(result, section),
        raw: {
          content_ref: section_id ? "section:#{section_id}" : nil
        }
      }
    end

    # Merges candidates that share a key into one entry.
    #
    # RRF scores are summed, other scores keep their maximum and ranks their
    # minimum. Provenance, snippet and metadata follow the single best hit.
    # +final_score+ is computed from the merged signals so that fusion
    # actually influences ordering. Input candidates are not mutated.
    #
    # @return [Array<Hash>] merged entries, each with :final_score
    def aggregate_candidates(candidates, plan)
      rerank_enabled = !!plan.dig(:ranking, :rerank, :enabled)
      grouped = {}
      best_hit = {}

      candidates.each do |candidate|
        key = candidate[:key]
        hit_score = score_for_sort(candidate, rerank_enabled)
        entry = grouped[key]

        if entry.nil?
          # Copy the signals so accumulating into them leaves the input intact.
          grouped[key] = candidate.merge(signals: candidate[:signals].dup)
          best_hit[key] = hit_score
          next
        end

        merge_signals!(entry[:signals], candidate[:signals])
        next unless hit_score > best_hit[key]

        best_hit[key] = hit_score
        entry[:provenance] = candidate[:provenance]
        entry[:snippet] = candidate[:snippet] unless candidate[:snippet].to_s.empty?
        entry[:metadata] = merge_hash(entry[:metadata], candidate[:metadata])
      end

      grouped.each_value { |entry| entry[:final_score] = score_for_sort(entry, rerank_enabled) }
      grouped.values
    end

    # Folds the signals of another hit for the same evidence into +target+.
    def merge_signals!(target, incoming)
      target[:rrf_score] = target[:rrf_score].to_f + incoming[:rrf_score].to_f
      %i[vector_score fts_score rerank_score].each do |name|
        target[name] = max_numeric(target[name], incoming[name])
      end
      %i[vector_rank fts_rank].each do |name|
        target[name] = min_numeric(target[name], incoming[name])
      end
      target
    end

    # Sorts by final score and applies top_k / candidate_k and the
    # per-document / per-source diversity caps.
    #
    # @param ignored_fields [Array<String>] receives notes about unsupported
    #   or only partially applied budget options
    # @return [Array<Hash>] selected entries, best first
    def apply_budget_and_diversity(candidates, plan, ignored_fields)
      budget = plan[:budget] || {}
      top_k = positive_int(budget[:top_k], DEFAULT_TOP_K)
      candidate_k = positive_int(budget[:candidate_k], DEFAULT_CANDIDATE_K)
      diversity = budget[:diversity].is_a?(Hash) ? budget[:diversity] : {}
      by_document = positive_int(diversity[:by_document], nil)
      by_source = positive_int(diversity[:by_source], nil)
      by_section = positive_int(diversity[:by_section], nil)
      ignored_fields << 'budget.diversity.by_section not supported' if by_section

      # sort_by is not stable; the original position breaks ties deterministically.
      sorted = candidates.each_with_index
                         .sort_by { |item, position| [-item[:final_score].to_f, position] }
                         .map(&:first)
                         .first(candidate_k)
      return sorted.first(top_k) if by_document.nil? && by_source.nil?

      selected = []
      per_document = Hash.new(0)
      per_source = Hash.new(0)
      skipped = 0

      sorted.each do |item|
        doc_id = item[:document_id]
        source_key = item[:source_uri].to_s.strip
        source_key = item[:source_type].to_s if source_key.empty?

        over_document = by_document && doc_id && per_document[doc_id] >= by_document
        over_source = by_source && !source_key.empty? && per_source[source_key] >= by_source
        if over_document || over_source
          skipped += 1
          next
        end

        selected << item
        per_document[doc_id] += 1 if doc_id
        per_source[source_key] += 1 if by_source && !source_key.empty?
        break if selected.length >= top_k
      end

      # Only report when a cap really dropped something; a short result list
      # alone does not mean the constraints were the cause.
      if skipped.positive? && selected.length < top_k
        ignored_fields << 'budget.diversity constraints partially applied due to insufficient diversity in candidates'
      end

      selected
    end

    # Shapes the selected entries into evidence hashes according to the
    # plan's +output+ switches.
    def build_evidences(selected, plan, generated_at)
      output = plan[:output] || {}
      include_signals = output.fetch(:include_signals, true)
      include_snippets = output.fetch(:include_snippets, true)
      include_provenance = output.fetch(:include_provenance, true)
      include_raw = output.fetch(:include_raw, false)
      max_snippet_chars = positive_int(output[:max_snippet_chars], DEFAULT_MAX_SNIPPET_CHARS)
      snippet_policy = output[:snippet_policy] || 'auto'

      selected.map do |item|
        evidence = {
          id: item[:id],
          kind: 'resource_section',
          document_id: item[:document_id],
          section_id: item[:section_id],
          title: item[:title],
          source_uri: item[:source_uri],
          source_type: item[:source_type],
          snippet_policy: snippet_policy,
          language: item[:language],
          metadata: item[:metadata] || {}
        }

        evidence[:snippet] = include_snippets ? truncate_text(item[:snippet], max_snippet_chars) : ''
        evidence[:signals] = sanitize_signals(item[:signals]) if include_signals
        evidence[:provenance] = item[:provenance].merge(retrieved_at: generated_at) if include_provenance
        evidence[:raw] = item[:raw] if include_raw
        evidence
      end
    end

    # Translates merged filters into client search options.
    #
    # source_type, source_uri_prefix and topic_ids are only recorded here;
    # they are enforced afterwards by #candidate_passes_filters?.
    #
    # @return [Array(Hash, Array<String>, Hash)] options, ignored notes and
    #   the filters reported as applied
    def build_search_options(filters)
      ignored = []
      applied = {}
      options = { include_content: true, include_metadata: true }

      %i[document_ids tag_ids].each do |name|
        next unless filters[name]

        options[name] = filters[name]
        applied[name] = filters[name]
      end

      if filters[:language]
        languages = Array(filters[:language])
        options[:language] = languages.first
        applied[:language] = languages
        ignored << 'filters.language: only the first value is passed to search' if languages.length > 1
      end

      time_range = filters[:time_range]
      if time_range.is_a?(Hash)
        time_from = hash_value(time_range, :from)
        time_to = hash_value(time_range, :to)
        options[:date_from] = time_from if time_from
        options[:date_to] = time_to if time_to
        applied[:time_range] = { from: time_from, to: time_to }
      end

      applied[:source_type] = Array(filters[:source_type]).map(&:to_s) if filters[:source_type]
      applied[:source_uri_prefix] = Array(filters[:source_uri_prefix]).map(&:to_s) if filters[:source_uri_prefix]
      applied[:topic_ids] = Array(filters[:topic_ids]).map(&:to_i) if filters[:topic_ids]

      [options, ignored, applied]
    end

    # Reads +key+ from a Hash, trying the symbol and then the string form.
    # Returns nil for non-Hash sources.
    def hash_value(source, key)
      return nil unless source.is_a?(Hash)

      source[key] || source[key.to_s]
    end

    # @return [Array] the :results / 'results' list of a Hash response, else []
    def extract_results(response)
      Array(hash_value(response, :results))
    end

    # The nested section of a hit, or the hit itself when there is none.
    def extract_section(result)
      hash_value(result, :section) || result
    end

    def extract_section_id(result, section)
      from_hash = hash_value(result, :section_id) || hash_value(section, :id)
      return from_hash if from_hash

      section.id if section.respond_to?(:id)
    end

    def extract_document_id(result, section)
      from_hash = hash_value(result, :document_id) || hash_value(section, :document_id)
      return from_hash if from_hash

      section.document_id if section.respond_to?(:document_id)
    end

    # @return [String] content or highlight of the hit, '' when none is found
    def extract_snippet(result, section)
      from_result = hash_value(result, :content) || hash_value(result, :highlight)
      return from_result if from_result
      return hash_value(section, :content) || '' if section.is_a?(Hash)
      return section.content || '' if section.respond_to?(:content)

      ''
    end

    def extract_title(result, section)
      from_result = hash_value(result, :document_title) || hash_value(result, :title)
      return from_result if from_result
      return hash_value(section, :section_title) || hash_value(section, :title) if section.is_a?(Hash)

      section.section_title if section.respond_to?(:section_title)
    end

    def extract_language(result, section)
      from_result = hash_value(result, :language)
      return from_result if from_result
      return hash_value(section, :language) if section.is_a?(Hash)

      section.language if section.respond_to?(:language)
    end

    # Looks for metadata source_uri, then the hit's url, then the url of the
    # section's document object.
    def extract_source_uri(result, section)
      uri = hash_value(hash_value(result, :metadata), :source_uri) || hash_value(result, :url)
      return uri if uri

      document = section.document if section.respond_to?(:document)
      document.url if document.respond_to?(:url)
    end

    # Metadata source_type when present, otherwise inferred from the URI:
    # 'url' for http(s), 'file' for file:// or absolute paths, else 'manual'.
    def extract_source_type(source_uri, result)
      declared = hash_value(hash_value(result, :metadata), :source_type)
      return declared if declared

      uri = source_uri.to_s
      return 'url' if uri.start_with?('http://', 'https://')
      return 'file' if uri.start_with?('file://', '/')

      'manual'
    end

    # Post-search filter for the constraints the client search is not asked
    # to enforce (source_type, source_uri_prefix, topic_ids).
    def candidate_passes_filters?(candidate, filters)
      return true unless filters.is_a?(Hash) && !filters.empty?

      if filters[:source_type]
        allowed = Array(filters[:source_type]).map { |value| value.to_s.downcase }
        return false unless allowed.include?(candidate[:source_type].to_s.downcase)
      end

      if filters[:source_uri_prefix]
        uri = candidate[:source_uri].to_s
        prefixes = Array(filters[:source_uri_prefix]).map(&:to_s)
        return false unless prefixes.any? { |prefix| uri.start_with?(prefix) }
      end

      if filters[:topic_ids]
        required = Array(filters[:topic_ids]).map(&:to_i).uniq
        section_topics = section_topic_ids_for(candidate[:section_id])
        return false if required.any? && (required & section_topics).empty?
      end

      true
    end

    # The SmartRAG database handle, or nil when the module exposes none.
    def smart_rag_db
      ::SmartRAG.db if ::SmartRAG.respond_to?(:db)
    end

    # Topic ids linked to a section, memoised per instance. Returns [] when
    # there is no database or the lookup fails.
    def section_topic_ids_for(section_id)
      return [] if section_id.nil?

      db = smart_rag_db
      return [] unless db

      @section_topic_cache ||= {}
      return @section_topic_cache[section_id] if @section_topic_cache.key?(section_id)

      @section_topic_cache[section_id] = db[:research_topic_sections]
                                         .where(section_id: section_id)
                                         .select_map(:research_topic_id)
                                         .map(&:to_i)
    rescue StandardError
      []
    end

    # Copy of the hit's metadata with section_id / document_id filled in
    # when they are not already present.
    def extract_metadata(result, section)
      metadata = {}
      from_result = hash_value(result, :metadata)
      metadata.merge!(from_result) if from_result.is_a?(Hash)

      section_id = extract_section_id(result, section)
      document_id = extract_document_id(result, section)
      metadata[:section_id] ||= section_id if section_id
      metadata[:document_id] ||= document_id if document_id
      metadata
    end

    def extract_vector_score(result, mode)
      return nil unless result.is_a?(Hash)
      return numeric_or_nil(result[:vector_score]) if result.key?(:vector_score)
      return numeric_or_nil(result[:similarity]) if mode == 'semantic' && result.key?(:similarity)
      return numeric_or_nil(result[:boosted_score]) if result.key?(:boosted_score)

      nil
    end

    def extract_fts_score(result, mode)
      return nil unless result.is_a?(Hash)
      return numeric_or_nil(result[:fts_score]) if result.key?(:fts_score)
      return numeric_or_nil(result[:text_score]) if result.key?(:text_score)
      return numeric_or_nil(result[:rank_score]) if mode == 'exact' && result.key?(:rank_score)

      nil
    end

    # Rerank score of a hit; nil when the hit carries none or cannot be indexed.
    def extract_rerank_score(result)
      return numeric_or_nil(hash_value(result, :rerank_score)) if result.is_a?(Hash)
      return nil unless result.respond_to?(:[])

      numeric_or_nil(result[:rerank_score])
    rescue StandardError
      nil
    end

    # Replaces missing scores with 0.0; ranks and rerank_score may stay nil.
    def sanitize_signals(signals)
      {
        vector_score: signals[:vector_score] || 0.0,
        vector_rank: signals[:vector_rank],
        fts_score: signals[:fts_score] || 0.0,
        fts_rank: signals[:fts_rank],
        rrf_score: signals[:rrf_score] || 0.0,
        rerank_score: signals[:rerank_score],
        tag_score: signals[:tag_score] || 0.0,
        topic_score: signals[:topic_score] || 0.0
      }
    end

    def mode_to_search_type(mode)
      case mode
      when 'exact' then 'fulltext'
      when 'semantic' then 'vector'
      when 'hybrid' then 'hybrid'
      end
    end

    # Returns 'exact', 'semantic' or 'hybrid'; anything else becomes 'hybrid'.
    def normalize_mode(mode)
      name = mode.to_s
      SUPPORTED_MODES.include?(name) ? name : 'hybrid'
    end

    def normalize_plan(plan)
      return {} unless plan.is_a?(Hash)

      deep_symbolize(plan)
    end

    # @raise [ArgumentError] when the plan is not a Hash or lacks queries
    def validate_plan!(plan)
      raise ArgumentError, 'Retrieval plan must be a hash' unless plan.is_a?(Hash)
      return if plan[:queries].is_a?(Array) && !plan[:queries].empty?

      raise ArgumentError, 'Retrieval plan requires queries'
    end

    # Recursively converts Hash keys to symbols inside nested Hashes/Arrays.
    def deep_symbolize(value)
      case value
      when Hash
        value.each_with_object({}) do |(key, child), memo|
          normalized_key = key.respond_to?(:to_sym) ? key.to_sym : key
          memo[normalized_key] = deep_symbolize(child)
        end
      when Array
        value.map { |item| deep_symbolize(item) }
      else
        value
      end
    end

    # Shallow symbol-key copy of a Hash; {} for anything else.
    def symbolize_keys(value)
      return {} unless value.is_a?(Hash)

      value.each_with_object({}) do |(key, val), memo|
        memo[key.respond_to?(:to_sym) ? key.to_sym : key] = val
      end
    end

    # Query-level filters override global ones key by key.
    def merge_filters(global_filters, query_filters)
      global_filters.merge(query_filters)
    end

    # Union of two filter values; a single distinct value is returned bare.
    def merge_filter_values(old_value, new_value)
      old_array = old_value.is_a?(Array) ? old_value : [old_value].compact
      new_array = new_value.is_a?(Array) ? new_value : [new_value].compact
      merged = (old_array + new_array).uniq
      merged.length == 1 ? merged.first : merged
    end

    # Merges +new_hash+ into +base_hash+ without letting nil overwrite a value.
    def merge_hash(base_hash, new_hash)
      return base_hash unless new_hash.is_a?(Hash)

      base = base_hash.is_a?(Hash) ? base_hash : {}
      base.merge(new_hash) { |_key, old_v, new_v| new_v.nil? ? old_v : new_v }
    end

    # NOTE: +weights+ is echoed from the plan; nothing in this class applies
    # it to scoring. +rrf_k+ is the effective value used by #gather_candidates.
    def build_fusion_explain(plan)
      fusion = plan.dig(:ranking, :fusion) || {}
      {
        method: fusion[:method] || 'rrf',
        rrf_k: positive_int(fusion[:rrf_k], DEFAULT_RRF_K),
        weights: fusion[:weights] || { exact: 1.0, semantic: 1.0 }
      }
    end

    def build_rerank_explain(plan)
      rerank = plan.dig(:ranking, :rerank) || {}
      {
        enabled: !!rerank[:enabled],
        model: rerank[:model],
        top_n: rerank[:top_n]
      }
    end

    def build_diversity_explain(plan)
      configured = plan.dig(:budget, :diversity)
      diversity = configured.is_a?(Hash) ? configured : {}
      {
        by_document: diversity[:by_document],
        by_source: diversity[:by_source],
        by_section: diversity[:by_section],
        applied: !diversity.empty?
      }
    end

    # Grouping key used to merge hits that refer to the same evidence.
    def evidence_key(section_id, document_id, snippet)
      return "section:#{section_id}" if section_id
      return "document:#{document_id}:#{snippet.to_s[0, 64]}" if document_id

      "snippet:#{snippet.to_s[0, 64]}"
    end

    # Evidence id that stays the same across runs for the same content.
    def stable_evidence_id(section_id, document_id, snippet)
      return "section:#{section_id}" if section_id

      digest = Digest::SHA1.hexdigest(snippet.to_s)[0, 12]
      return "doc:#{document_id}:#{digest}" if document_id

      "snippet:#{digest}"
    end

    # Rerank score when reranking is enabled and a score exists, else RRF score.
    def score_for_sort(candidate, rerank_enabled)
      rerank_score = candidate.dig(:signals, :rerank_score)
      return rerank_score.to_f if rerank_enabled && !rerank_score.nil?

      candidate.dig(:signals, :rrf_score).to_f
    end

    def max_numeric(a, b)
      return b if a.nil?
      return a if b.nil?

      [a.to_f, b.to_f].max
    end

    def min_numeric(a, b)
      return b if a.nil?
      return a if b.nil?

      [a.to_i, b.to_i].min
    end

    def numeric_or_nil(value)
      return nil if value.nil?

      value.to_f
    rescue StandardError
      nil
    end

    # Cuts +text+ to at most +max_chars+ characters; nil becomes ''.
    def truncate_text(text, max_chars)
      return '' if text.nil?

      str = text.to_s
      return str if max_chars.nil? || max_chars <= 0 || str.length <= max_chars

      str[0, max_chars]
    end

    # +value+ as a positive Integer, or +default_value+ when it is nil,
    # not convertible or not greater than zero.
    def positive_int(value, default_value)
      return default_value if value.nil? || !value.respond_to?(:to_i)

      int_value = value.to_i
      int_value.positive? ? int_value : default_value
    end

    def monotonic_now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end

    # Best-effort insert into the search_logs table. Does nothing without a
    # database; failures are reported to the logger and never raised.
    #
    # @param request_id [String, nil] effective request id; falls back to
    #   the plan's own request_id
    def log_retrieve(plan:, stats:, explain:, warnings:, request_id: nil)
      db = smart_rag_db
      return unless db

      payload = {
        request_id: request_id || plan[:request_id],
        version: plan[:version],
        plan_json: plan,
        stats: stats,
        explain: explain,
        warnings: warnings
      }

      db[:search_logs].insert(
        query: "retrieve:#{plan[:purpose] || 'other'}",
        search_type: 'hybrid',
        execution_time_ms: stats[:took_ms],
        results_count: stats[:returned],
        filters: payload.to_json,
        created_at: Sequel::CURRENT_TIMESTAMP
      )
    rescue StandardError => e
      @logger.warn "Failed to log retrieve plan: #{e.message}" if @logger
    end
  end
end