smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,577 @@
1
+ require_relative '../services/embedding_service'
2
+ require_relative '../services/tag_service'
3
+ require_relative '../services/vector_search_service'
4
+ require_relative '../services/fulltext_search_service'
5
+ require_relative '../services/hybrid_search_service'
6
+ require_relative '../services/summarization_service'
7
+ require_relative '../errors'
8
+
9
+ module SmartRAG
10
+ module Core
11
+ # QueryProcessor handles natural language queries and generates responses
12
+ class QueryProcessor
13
+ attr_reader :embedding_service, :tag_service, :vector_search_service,
14
+ :fulltext_search_service, :hybrid_search_service,
15
+ :summarization_service, :embedding_manager, :config, :logger
16
+
17
# Build a query processor, wiring up every collaborator it needs.
#
# Each collaborator is taken from +config+ when supplied there; otherwise a
# default instance is constructed.
#
# @param config [Hash] Configuration options
# @option config [EmbeddingService] :embedding_service Embedding service instance
# @option config [TagService] :tag_service Tag service instance
# @option config [VectorSearchService] :vector_search_service Vector search service instance
# @option config [FulltextSearchService] :fulltext_search_service Fulltext search service instance
# @option config [HybridSearchService] :hybrid_search_service Hybrid search service instance
# @option config [SummarizationService] :summarization_service Summarization service instance
# @option config [Logger] :logger Logger instance (defaults to a logger on STDOUT)
# @raise [StandardError] re-raises whatever a collaborator constructor raised, after logging it
def initialize(config = {})
  @config = config
  @logger = config[:logger]
  @logger ||= Logger.new(STDOUT)

  # Default services receive the resolved logger alongside the caller's settings.
  service_config = config.merge(logger: @logger)

  @embedding_service = config[:embedding_service]
  @embedding_service ||= Services::EmbeddingService.new(service_config)

  @tag_service = config[:tag_service]
  @tag_service ||= Services::TagService.new(service_config)

  # The embedding manager backs vector search and tag-enhanced search.
  # It is built from the raw config (without the injected logger).
  @embedding_manager = config[:embedding_manager]
  @embedding_manager ||= ::SmartRAG::Core::Embedding.new(config)

  @vector_search_service = config[:vector_search_service]
  @vector_search_service ||= Services::VectorSearchService.new(@embedding_manager, service_config)

  # FulltextManager takes the database connection as its first argument.
  database = config[:db] || ::SmartRAG.db
  fulltext_backend = config[:fulltext_manager]
  fulltext_backend ||= ::SmartRAG::Core::FulltextManager.new(database, config.merge(logger: @logger))
  parser = config[:query_parser] || ::SmartRAG::Parsers::QueryParser.new

  @fulltext_search_service = config[:fulltext_search_service]
  @fulltext_search_service ||= Services::FulltextSearchService.new(fulltext_backend, parser, logger: @logger)

  @hybrid_search_service = config[:hybrid_search_service]
  @hybrid_search_service ||= Services::HybridSearchService.new(@embedding_manager, fulltext_backend, service_config)

  @summarization_service = config[:summarization_service]
  @summarization_service ||= Services::SummarizationService.new(service_config)

  @logger.info 'QueryProcessor initialized with all services'
rescue StandardError => error
  # @logger may still be nil if the failure happened while resolving it.
  @logger.error "Failed to initialize QueryProcessor: #{error.message}" if @logger
  raise
end
64
+
65
# Process a natural language query and return search results.
#
# @param query_text [String] Natural language query
# @param options [Hash] Processing options. When :language is absent, the
#   detected language is written back into this hash.
# @option options [Symbol] :language Query language (:zh_cn, :zh_tw, :en, :ja)
# @option options [Integer] :limit Maximum results (default: 10)
# @option options [Float] :threshold Similarity threshold (default: 0.3)
# @option options [Symbol] :search_type Search type (:vector, :fulltext, :hybrid; default: :hybrid)
# @option options [Array<Integer>] :document_ids Filter by document IDs
# @option options [Array<String>, Array<Tag>] :tags Tags to boost results
# @option options [Boolean] :generate_tags Whether to generate tags from query (default: false)
# @return [Hash] Search results with metadata, as built by enrich_results
# @raise [ArgumentError] if the query is blank or :search_type is not supported
# @raise [SmartRAG::Errors::QueryProcessingError] wrapping any other StandardError
def process_query(query_text, options = {})
  raise ArgumentError, 'Query text cannot be nil or empty' if query_text.to_s.strip.empty?

  logger.info "Processing query: #{query_text[0..100]}..."

  # Validate the search type before any service is called.
  search_type = options[:search_type] || :hybrid
  unless %i[vector fulltext hybrid].include?(search_type)
    raise ArgumentError, "Invalid search type: #{search_type}"
  end

  # Detect language if not provided
  options[:language] ||= detect_language(query_text)
  language = options[:language]
  logger.info "Detected language: #{language}"

  # Generate query tags if requested
  query_tags = []
  if options[:generate_tags]
    logger.info 'Generating tags from query...'
    generated_tags = tag_service.generate_tags(query_text, nil, [language],
                                               max_content_tags: 5, include_category: false)
    query_tags = generated_tags[:content_tags] || []
    logger.info "Generated #{query_tags.size} tags: #{query_tags.join(', ')}"
  end

  # Combine user-provided tags with generated tags
  all_tags = options[:tags] ? ensure_tag_objects(options[:tags]) : []
  all_tags.concat(ensure_tag_objects(query_tags)) if query_tags.any?

  # The fulltext path never receives the embedding, so skip generating it
  # there; this also keeps fulltext search usable when embedding fails.
  query_embedding = search_type == :fulltext ? nil : generate_query_embedding(query_text, options)

  # Execute search based on type
  search_results = case search_type
                   when :vector
                     logger.info 'Performing vector search...'
                     perform_vector_search(query_embedding, all_tags, options)
                   when :fulltext
                     logger.info 'Performing fulltext search...'
                     perform_fulltext_search(query_text, options)
                   when :hybrid
                     logger.info 'Performing hybrid search...'
                     perform_hybrid_search(query_text, query_embedding, all_tags, options)
                   end

  logger.info "Search completed. Found #{search_results[:results].size} results"

  # Enrich results with additional metadata
  enriched = enrich_results(search_results, query_text, options)
  if search_type == :hybrid
    # apply_domain_boost returns a new hash; its result must be kept or the
    # re-ordering is lost.
    enriched = apply_domain_boost(enriched, query_text, options)
    enriched = diversify_results_by_category(enriched, options)
  end
  enriched
rescue ArgumentError
  # Validation errors reach the caller unwrapped.
  raise
rescue StandardError => e
  logger.error "Query processing failed: #{e.message}"
  logger.error e.backtrace.join("\n")
  raise ::SmartRAG::Errors::QueryProcessingError, "Query processing failed: #{e.message}"
end
138
+
139
# Turn search results into a natural language answer.
#
# @param question [String] Original question
# @param search_results [Hash] Results from process_query (reads :results)
# @param options [Hash] Response generation options
# @option options [Symbol] :language Response language
# @option options [Integer] :max_length Maximum response length
# @option options [Boolean] :include_sources Whether to include source references (default: true)
# @return [Hash] Response with answer and metadata; a fixed "not enough
#   information" answer with confidence 0.0 when no context could be extracted
# @raise [ArgumentError] if the question is blank or search_results is nil
# @raise [SmartRAG::Errors::ResponseGenerationError] wrapping any other StandardError
def generate_response(question, search_results, options = {})
  raise ArgumentError, 'Question cannot be nil or empty' if question.to_s.strip.empty?
  raise ArgumentError, 'Search results cannot be nil' if search_results.nil?

  logger.info "Generating response for question: #{question[0..50]}..."
  logger.info "Search results: #{search_results.inspect[0..200]}"

  hits = search_results[:results] || []
  logger.info "Number of results: #{hits.size}"

  passage = extract_context_for_response(hits, options)
  logger.info "Context extracted: #{passage.length} chars"

  unless passage.empty?
    # Delegate the actual answer synthesis to the summarization service.
    logger.info 'Calling summarization service...'
    reply = summarization_service.summarize_search_results(question, passage, options)
    logger.info "Summarization service returned: #{reply.inspect[0..200]}"

    # Source references are attached unless the caller opted out.
    reply[:sources] = extract_sources(hits) if options.fetch(:include_sources, true)

    logger.info 'Response generated successfully'
    return reply
  end

  logger.warn 'No context available for response generation'
  {
    answer: "I don't have enough information to answer this question.",
    sources: [],
    confidence: 0.0
  }
rescue ArgumentError
  # Validation errors reach the caller unwrapped.
  raise
rescue StandardError => error
  logger.error "Response generation failed: #{error.message}"
  raise ::SmartRAG::Errors::ResponseGenerationError, "Response generation failed: #{error.message}"
end
189
+
190
# Process a query and generate a response in one step.
#
# Runs process_query, then generate_response with the same +options+ hash,
# and merges both outcomes into a single result.
#
# @param question [String] Natural language question
# @param options [Hash] Processing and response options
# @return [Hash] Complete response with :question, :answer, :sources,
#   :search_results and :metadata (:search_type, :total_results,
#   :processing_time_ms, :confidence)
# @raise [ArgumentError] if the question is blank (raised by process_query)
# @raise [StandardError] any error from the two steps is logged and re-raised
def ask(question, options = {})
  # to_s keeps this log line from raising NoMethodError on a nil question,
  # so the blank-question ArgumentError from process_query is what surfaces.
  logger.info "Processing ask request: #{question.to_s[0..50]}..."

  # Process the query to get search results
  search_results = process_query(question, options)

  # Generate response from search results
  response = generate_response(question, search_results, options)

  # Combine everything
  {
    question: question,
    answer: response[:answer],
    sources: response[:sources],
    search_results: search_results[:results],
    metadata: {
      search_type: search_results[:search_type],
      total_results: search_results[:total_results],
      # The enriched search result keeps its timing under
      # metadata[:execution_time_ms]; fall back to it when the top-level
      # key is absent so this field is not always nil.
      processing_time_ms: search_results[:processing_time_ms] ||
        search_results.dig(:metadata, :execution_time_ms),
      confidence: response[:confidence]
    }
  }
rescue StandardError => e
  logger.error "Ask request failed: #{e.message}"
  raise
end
220
+
221
+ private
222
+
223
# Guess the language of +text+ from the character ranges it contains.
#
# @param text [String] Text to inspect
# @return [Symbol] :ja, :zh_cn or :en (the default, also used when matching raises)
def detect_language(text)
  # Kana is tested before the ideograph range: it is the more specific
  # signal, since the second range alone is reported as Chinese.
  if text.match?(/[\u3040-\u309f\u30a0-\u30ff]/)
    :ja
  elsif text.match?(/[\u4e00-\u9fff]/)
    :zh_cn
  else
    :en
  end
rescue StandardError => error
  logger.warn "Language detection failed: #{error.message}, defaulting to English"
  :en
end
234
+
235
# Ask the embedding service for the embedding of the query text.
#
# @param query_text [String] Query to embed
# @param options [Hash] Passed through to the embedding service unchanged
# @return [Object] Whatever embedding_service.generate_embedding returns
# @raise [StandardError] the service's error, logged and re-raised as-is
def generate_query_embedding(query_text, options = {})
  logger.debug 'Generating query embedding...'
  embedding_service.generate_embedding(query_text, options)
rescue StandardError => error
  # Log here for context; wrapping is left to the caller.
  logger.error "Failed to generate query embedding: #{error.message}"
  raise
end
242
+
243
# Run a vector search, using the tag-enhanced variant when tags are given.
#
# @param query_embedding [Object] Embedding produced for the query
# @param tags [Array] Tags to boost with; an empty list selects the plain search
# @param options [Hash] Search options (:limit default 10, :threshold default 0.3, :document_ids)
# @return [Hash] { results:, search_type: :vector, total_results: }
# @raise [StandardError] any search error, logged and re-raised as-is
def perform_vector_search(query_embedding, tags, options = {})
  search_options = options.merge(
    limit: options[:limit] || 10,
    threshold: options[:threshold] || 0.3,
    document_ids: options[:document_ids]
  )

  hits =
    if tags.any?
      # Tag-enhanced search goes through the embedding manager.
      embedding_manager.search_by_vector_with_tags(query_embedding, tags, search_options)
    else
      # Plain search goes through the vector search service, which may answer
      # with either a response hash or a bare results array.
      raw = vector_search_service.search_by_vector(query_embedding, search_options)
      raw.is_a?(Hash) ? (raw[:results] || []) : raw
    end

  { results: hits, search_type: :vector, total_results: hits.size }
rescue StandardError => error
  logger.error "Vector search failed: #{error.message}"
  raise
end
278
+
279
# Run a fulltext search and normalise the reply for the rest of the pipeline.
#
# @param query_text [String] Query text
# @param options [Hash] Search options (:language default :en, :limit default 10)
# @return [Hash] { results:, search_type: :fulltext, total_results: }
# @raise [StandardError] any search error, logged and re-raised as-is
def perform_fulltext_search(query_text, options = {})
  search_options = options.merge(
    language: options[:language] || :en,
    limit: options[:limit] || 10
  )
  reply = fulltext_search_service.search(query_text, search_options)

  if reply.is_a?(Array)
    # A bare array (as returned by mocked services in specs) is used directly.
    hits = reply
    total = reply.length
  else
    # A response hash carries its results under :results and may report the
    # overall count under metadata[:total_count].
    hits = reply[:results] || []
    total = reply.dig(:metadata, :total_count) || reply[:results]&.length || 0
  end

  { results: hits, search_type: :fulltext, total_results: total }
rescue StandardError => error
  logger.error "Fulltext search failed: #{error.message}"
  raise
end
312
+
313
# Run a hybrid search, reusing the pre-computed query embedding.
#
# @param query_text [String] Query text
# @param query_embedding [Object] Embedding already generated for the query;
#   forwarded as :query_embedding so the service need not regenerate it
# @param tags [Array, nil] Tags to filter/boost with; ignored when nil or empty
# @param options [Hash] Search options (:limit default 10, :filters, :document_ids)
# @return [Hash] { results:, search_type: :hybrid, total_results: }
# @raise [StandardError] any search error, logged and re-raised as-is
def perform_hybrid_search(query_text, query_embedding, tags, options = {})
  limit = options[:limit] || 10

  # Work on a copy: the caller's :filters hash must not pick up the
  # document_ids/tags entries added below.
  search_filters = (options[:filters] || {}).dup
  search_filters[:document_ids] = options[:document_ids] if options[:document_ids]
  search_filters[:tags] = tags if tags && !tags.empty?

  search_response = hybrid_search_service.search(
    query_text,
    options.merge(
      limit: limit,
      query_embedding: query_embedding,
      filters: search_filters.compact
    )
  )

  # The service may answer with a bare array (mocks) or a hash with :results.
  actual_results = if search_response.is_a?(Array)
                     search_response
                   else
                     search_response[:results] || []
                   end

  {
    results: actual_results,
    search_type: :hybrid,
    total_results: actual_results.size
  }
rescue StandardError => e
  logger.error "Hybrid search failed: #{e.message}"
  raise
end
351
+
352
# Normalise raw search output into the standard response format.
#
# @param search_results [Hash] Raw search output; reads :results, :search_type
#   and :total_results (or :total_count)
# @param query_text [String] The original query, echoed back under :query
# @param options [Hash, nil] Reads :language (default :en)
# @return [Hash] { query:, results:, search_type:, total_results:, metadata: }.
#   If building it fails, a reduced hash of the same shape is returned with the
#   failure message under metadata[:error].
def enrich_results(search_results, query_text, options = {})
  options ||= {}
  results = search_results[:results] || []
  total = search_results[:total_results] || search_results[:total_count] || results.length

  response = {
    query: query_text,
    results: results,
    # Backward-compatible top-level keys expected by existing specs/callers.
    search_type: search_results[:search_type],
    total_results: total,
    metadata: {
      total_count: total,
      execution_time_ms: calculate_processing_time,
      language: options[:language] || :en
    }
  }

  response[:metadata][:search_type] = search_results[:search_type] if search_results[:search_type]
  response[:metadata][:processed_at] = Time.now

  response
rescue StandardError => e
  # Best-effort fallback. It must not raise itself, so it cannot assume that
  # +results+ was assigned or that +search_results+ is a Hash.
  logger.error "Failed to enrich results: #{e.message}"
  fallback_results = Array(results)
  {
    query: query_text,
    results: fallback_results,
    search_type: search_results.is_a?(Hash) ? search_results[:search_type] : nil,
    total_results: fallback_results.length,
    metadata: {
      total_count: fallback_results.length,
      execution_time_ms: 0,
      language: options[:language] || :en,
      error: e.message
    }
  }
end
392
+
393
# Assemble the text context handed to the summarizer from the top results.
#
# Only the first five results are considered. Each usable one contributes its
# section content, prefixed with "Section: <title>" when a title is present.
#
# @param results [Array, nil] Search results; each is a Hash (with :section or
#   :embedding) or an object responding to #section
# @param options [Hash] Reads :max_context_length (default 4000)
# @return [String] Passages joined by a "---" separator line; cut to the
#   maximum length with a "... (truncated)" suffix when longer
def extract_context_for_response(results, options = {})
  budget = options[:max_context_length] || 4000

  passages = Array(results).first(5).each_with_index.map do |hit, position|
    next if hit.nil?

    # Results arrive either as hashes or as objects exposing #section.
    source =
      if hit.is_a?(Hash)
        hit[:section] || hit[:embedding]&.section
      elsif hit.respond_to?(:section)
        hit.section
      else
        logger.warn "Unexpected result format at index #{position}: #{hit.class}"
        nil
      end
    next unless source

    # A section is either a hash (:content / :title) or an object
    # (#content / #section_title).
    hash_section = source.is_a?(Hash)
    body = (hash_section ? source[:content] : source.content).to_s.strip
    next if body.empty?

    heading = hash_section ? source[:title] : source.section_title
    heading && !heading.empty? ? "Section: #{heading}\n#{body}" : body
  end

  joined = passages.compact.join("\n\n---\n\n")
  return joined unless joined.length > budget

  joined[0...budget] + '... (truncated)'
end
452
+
453
# Build source references for the first five results.
#
# Accepts the same result shapes as extract_context_for_response: nil
# entries, Hash results (with :section or :embedding) and objects responding
# to #section. A result is skipped, rather than raising, when no section can
# be resolved, when the section cannot name its document (for example a plain
# Hash section), or when the document is nil.
#
# @param results [Array, nil] Search results
# @return [Array<Hash>] One hash per resolvable result with :document_id,
#   :document_title, :section_id, :section_title, :url and :relevance
def extract_sources(results)
  Array(results).first(5).each_with_object([]) do |result, sources|
    next if result.nil?

    section =
      if result.is_a?(Hash)
        result[:section] || result[:embedding]&.section
      elsif result.respond_to?(:section)
        result.section
      end
    # Only sections that expose #document can be attributed to a source.
    next unless section.respond_to?(:document)

    document = section.document
    next unless document

    # Scores are only read from Hash results.
    # NOTE(review): non-Hash results get relevance 0 — confirm whether such
    # objects carry a score that should be used instead.
    relevance = result.is_a?(Hash) ? (result[:similarity] || result[:boosted_score] || 0) : 0

    sources << {
      document_id: document.id,
      document_title: document.title,
      section_id: section.id,
      section_title: section.section_title,
      url: document.url,
      relevance: relevance
    }
  end
end
475
+
476
# Convert a mixed list of tag references into Tag model objects.
#
# @param tags [Array<Tag, Integer, String>, nil] Tag instances, tag ids or tag names
# @return [Array] One entry per input, in input order; [] when +tags+ is nil
# @raise [ArgumentError] if an id matches no tag or an entry has an unsupported type
def ensure_tag_objects(tags)
  return [] unless tags

  tags.map do |candidate|
    if candidate.is_a?(::SmartRAG::Models::Tag)
      candidate
    elsif candidate.is_a?(Integer)
      # Ids must refer to an existing tag.
      ::SmartRAG::Models::Tag.find(id: candidate) || raise(ArgumentError, "Tag not found: #{candidate}")
    elsif candidate.is_a?(String)
      # Names are resolved through Tag.find_or_create.
      ::SmartRAG::Models::Tag.find_or_create(candidate)
    else
      raise ArgumentError, "Invalid tag type: #{candidate.class}"
    end
  end
end
493
+
494
# Report how long processing took, in milliseconds.
#
# Placeholder: no timing is measured yet, so this always returns 0.
#
# @return [Integer] always 0
def calculate_processing_time
  0
end
499
+
500
# Move results whose category matches an expected category to the front.
#
# Categories are normalised first (via normalize_categories). A result
# matches when its metadata[:category] contains any expected category as a
# substring. Within the matching and the non-matching group the incoming
# order is preserved.
#
# @param response [Hash] Response hash; reads :results
# @param _query_text [String] Unused
# @param options [Hash, nil] Reads :expected_categories or :expected_category
#   (a single value or a list; Strings or Symbols)
# @return [Hash] A new response hash with re-ordered :results. The input
#   response is returned as-is when it has no results.
def apply_domain_boost(response, _query_text, options)
  options ||= {}
  # Stringify so Symbol categories work with String#include?, and drop blanks
  # because an empty string would match every category.
  expected = Array(options[:expected_categories] || options[:expected_category])
             .compact.map(&:to_s).reject(&:empty?)
  return normalize_categories(response) if expected.empty?

  results = response[:results] || []
  return response if results.empty?

  normalize_categories(response)

  # partition keeps the original order inside both groups; sort_by with a
  # 0/1 key gives no such guarantee.
  matching, others = results.partition do |result|
    category = (result[:metadata] || {})[:category].to_s
    expected.any? { |exp| category.include?(exp) }
  end

  response.merge(results: matching + others)
end
519
+
520
# Normalise the category stored in every result's metadata, in place.
#
# Each result ends up with a :metadata hash (an empty one is created when
# missing) whose :category is whatever normalize_category returned for it.
#
# @param response [Hash] Response hash; reads :results
# @return [Hash] A new response hash whose :results is the same, updated, list
def normalize_categories(response)
  entries = response[:results] || []

  entries.each do |entry|
    entry[:metadata] = (entry[:metadata] || {}).tap do |meta|
      cleaned = normalize_category(meta[:category], meta[:document_title])
      meta[:category] = cleaned if cleaned
    end
  end

  response.merge(results: entries)
end
530
+
531
# Interleave results so consecutive entries come from different categories.
#
# Results are grouped by metadata[:category] (blank becomes "uncategorized"),
# with groups ordered by first appearance. The output takes one result from
# each group in turn, round after round, until every group is exhausted.
#
# @param response [Hash] Response hash; reads :results
# @param options [Hash] Reads :diversify_categories (default true)
# @return [Hash] The response itself when there are fewer than two results or
#   diversification is disabled; otherwise a new hash with interleaved :results
def diversify_results_by_category(response, options = {})
  ranked = response[:results] || []
  return response if ranked.length < 2
  return response unless options.fetch(:diversify_categories, true)

  by_category = ranked.group_by do |item|
    label = (item[:metadata] || {})[:category].to_s
    label.empty? ? 'uncategorized' : label
  end
  buckets = by_category.values
  deepest = buckets.map(&:length).max

  # Round N contributes the N-th result of every group that still has one.
  interleaved = (0...deepest).flat_map do |round|
    buckets.select { |bucket| round < bucket.length }.map { |bucket| bucket[round] }
  end

  response.merge(results: interleaved)
end
568
+
569
# Normalise a category value to a String.
#
# @param category [Object, nil] Raw category value
# @param _title [Object] Unused
# @return [String] category.to_s ("" for nil)
def normalize_category(category, _title)
  category.to_s
end
575
+ end
576
+ end
577
+ end
@@ -0,0 +1,88 @@
1
module SmartRAG
  # Exception hierarchy for SmartRAG. Every class descends from BaseError, so
  # `rescue SmartRAG::Errors::BaseError` catches any of them.
  module Errors
    # Base error class for all SmartRAG errors
    class BaseError < StandardError
      # @return [Hash] Extra details supplied when the error was built
      attr_reader :context

      # @param message [String, nil] Error message. Optional, so the bare
      #   `raise SomeError` form works; Ruby then reports the class name.
      # @param context [Hash] Extra details to carry with the error
      def initialize(message = nil, context = {})
        super(message)
        @context = context
      end
    end

    # Document processing errors
    class DocumentProcessingError < BaseError; end
    class DocumentDownloadError < DocumentProcessingError; end
    class DocumentConversionError < DocumentProcessingError; end
    class ChunkingError < DocumentProcessingError; end

    # Search errors (extend existing ones from fulltext_manager.rb)
    class SearchError < BaseError
      # Prefixes the message with "Search failed: ".
      #
      # @param message [String, nil] Failure detail
      # @param context [Hash] Extra details to carry with the error
      def initialize(message = nil, context = {})
        super(message ? "Search failed: #{message}" : 'Search failed', context)
      end
    end

    class VectorSearchError < SearchError; end
    class QueryParseError < SearchError; end
    class LanguageDetectionError < SearchError; end
    class QueryProcessingError < SearchError; end
    class FulltextSearchError < SearchError; end
    class HybridSearchError < SearchError; end

    # Embedding errors
    class EmbeddingError < BaseError; end
    class EmbeddingGenerationError < EmbeddingError; end
    class EmbeddingStorageError < EmbeddingError; end
    class EmbeddingNotFoundError < EmbeddingError; end

    # Tag generation errors
    class TagError < BaseError; end
    class TagGenerationError < TagError; end
    class TagStorageError < TagError; end

    # Database errors
    class DatabaseError < BaseError; end
    class MigrationError < DatabaseError; end
    class ConnectionError < DatabaseError; end

    # Configuration errors
    class ConfigError < BaseError; end
    class InvalidConfigError < ConfigError; end
    class MissingConfigError < ConfigError; end

    # Service errors
    class ServiceError < BaseError
      # Prefixes the message with "Service error: ".
      #
      # @param message [String, nil] Failure detail
      # @param context [Hash] Extra details to carry with the error
      def initialize(message = nil, context = {})
        super(message ? "Service error: #{message}" : 'Service error', context)
      end
    end
    class EmbeddingServiceError < ServiceError; end
    class VectorSearchServiceError < ServiceError; end
    class FulltextSearchServiceError < ServiceError; end
    class HybridSearchServiceError < ServiceError; end
    class SummarizationServiceError < ServiceError; end
    class TagServiceError < ServiceError; end
    class QueryProcessingServiceError < ServiceError; end
    class ResponseGenerationError < ServiceError; end

    # LLM integration errors
    class LLMError < BaseError; end
    class LLMConnectionError < LLMError; end
    class LLMRateLimitError < LLMError; end
    class LLMTimeoutError < LLMError; end
    class LLMResponseError < LLMError; end
    class LLMConfigurationError < LLMError; end
    class ExternalServiceUnavailable < LLMError; end
    class ContextTooLarge < LLMError; end

    # Parser errors
    class ParserError < BaseError; end
    class QueryParserError < ParserError; end

    # Validation errors
    class ValidationError < BaseError; end
    class InvalidQueryError < ValidationError; end
    class InvalidParameterError < ValidationError; end
  end
end