smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
@@ -0,0 +1,577 @@
|
|
|
1
|
+
require_relative '../services/embedding_service'
|
|
2
|
+
require_relative '../services/tag_service'
|
|
3
|
+
require_relative '../services/vector_search_service'
|
|
4
|
+
require_relative '../services/fulltext_search_service'
|
|
5
|
+
require_relative '../services/hybrid_search_service'
|
|
6
|
+
require_relative '../services/summarization_service'
|
|
7
|
+
require_relative '../errors'
|
|
8
|
+
|
|
9
|
+
module SmartRAG
|
|
10
|
+
module Core
|
|
11
|
+
# QueryProcessor handles natural language queries and generates responses
|
|
12
|
+
class QueryProcessor
|
|
13
|
+
attr_reader :embedding_service, :tag_service, :vector_search_service,
|
|
14
|
+
:fulltext_search_service, :hybrid_search_service,
|
|
15
|
+
:summarization_service, :embedding_manager, :config, :logger
|
|
16
|
+
|
|
17
|
+
# Initialize the query processor
|
|
18
|
+
# @param config [Hash] Configuration options
|
|
19
|
+
# @option config [EmbeddingService] :embedding_service Embedding service instance
|
|
20
|
+
# @option config [TagService] :tag_service Tag service instance
|
|
21
|
+
# @option config [VectorSearchService] :vector_search_service Vector search service instance
|
|
22
|
+
# @option config [FulltextSearchService] :fulltext_search_service Fulltext search service instance
|
|
23
|
+
# @option config [HybridSearchService] :hybrid_search_service Hybrid search service instance
|
|
24
|
+
# @option config [SummarizationService] :summarization_service Summarization service instance
|
|
25
|
+
# @option config [Logger] :logger Logger instance
|
|
26
|
+
def initialize(config = {})
  @config = config
  @logger = config[:logger] || Logger.new(STDOUT)

  # Every default-constructed service receives the caller's config plus the
  # resolved logger.
  service_config = config.merge(logger: @logger)

  # Injected collaborators win; anything missing is built with defaults.
  @embedding_service = config[:embedding_service]
  @embedding_service ||= Services::EmbeddingService.new(service_config)

  @tag_service = config[:tag_service]
  @tag_service ||= Services::TagService.new(service_config)

  # The embedding manager backs both the vector search service and the
  # tag-enhanced vector search path. It is built from the raw config.
  @embedding_manager = config[:embedding_manager]
  @embedding_manager ||= ::SmartRAG::Core::Embedding.new(config)

  @vector_search_service = config[:vector_search_service]
  @vector_search_service ||= Services::VectorSearchService.new(@embedding_manager, service_config)

  # FulltextManager takes the database connection as its first argument.
  db = config[:db] || ::SmartRAG.db
  fulltext_manager = config[:fulltext_manager]
  fulltext_manager ||= ::SmartRAG::Core::FulltextManager.new(db, config.merge(logger: @logger))
  query_parser = config[:query_parser] || ::SmartRAG::Parsers::QueryParser.new

  @fulltext_search_service = config[:fulltext_search_service]
  @fulltext_search_service ||= Services::FulltextSearchService.new(
    fulltext_manager, query_parser, logger: @logger
  )

  @hybrid_search_service = config[:hybrid_search_service]
  @hybrid_search_service ||= Services::HybridSearchService.new(
    @embedding_manager, fulltext_manager, service_config
  )

  @summarization_service = config[:summarization_service]
  @summarization_service ||= Services::SummarizationService.new(service_config)

  @logger.info 'QueryProcessor initialized with all services'
rescue StandardError => e
  # @logger may still be nil if the failure happened before it was assigned.
  @logger.error "Failed to initialize QueryProcessor: #{e.message}" if @logger
  raise
end
|
|
64
|
+
|
|
65
|
+
# Process a natural language query and return search results
|
|
66
|
+
# @param query_text [String] Natural language query
|
|
67
|
+
# @param options [Hash] Processing options
|
|
68
|
+
# @option options [Symbol] :language Query language (:zh_cn, :zh_tw, :en, :ja)
|
|
69
|
+
# @option options [Integer] :limit Maximum results (default: 10)
|
|
70
|
+
# @option options [Float] :threshold Similarity threshold (default: 0.3)
|
|
71
|
+
# @option options [Symbol] :search_type Search type (:vector, :fulltext, :hybrid)
|
|
72
|
+
# @option options [Array<Integer>] :document_ids Filter by document IDs
|
|
73
|
+
# @option options [Array<String>, Array<Tag>] :tags Tags to boost results
|
|
74
|
+
# @option options [Boolean] :generate_tags Whether to generate tags from query (default: false)
|
|
75
|
+
# @return [Hash] Search results with metadata
|
|
76
|
+
# Raises ArgumentError for a blank query or an unknown :search_type; any other
# failure is logged and re-raised as SmartRAG::Errors::QueryProcessingError.
def process_query(query_text, options = {})
  raise ArgumentError, 'Query text cannot be nil or empty' if query_text.to_s.strip.empty?

  # to_s: the guard above only validates the string form of the query.
  logger.info "Processing query: #{query_text.to_s[0..100]}..."

  # Validate search type first
  search_type = options[:search_type] || :hybrid
  unless %i[vector fulltext hybrid].include?(search_type)
    raise ArgumentError, "Invalid search type: #{search_type}"
  end

  # Detect language if not provided. The detected value is written back into
  # the caller's options hash, so later stages that receive the same hash see it.
  options[:language] ||= detect_language(query_text)
  language = options[:language]
  logger.info "Detected language: #{language}"

  # Generate query tags if requested
  query_tags = []
  if options[:generate_tags]
    logger.info 'Generating tags from query...'
    generated_tags = tag_service.generate_tags(query_text, nil, [language],
                                               max_content_tags: 5, include_category: false)
    query_tags = generated_tags[:content_tags] || []
    logger.info "Generated #{query_tags.size} tags: #{query_tags.join(', ')}"
  end

  # Combine user-provided tags with generated tags
  all_tags = options[:tags] ? ensure_tag_objects(options[:tags]) : []
  all_tags.concat(ensure_tag_objects(query_tags)) if query_tags.any?

  # NOTE(review): the embedding is generated for every search type, including
  # :fulltext, which never receives it. Confirm whether that is intentional.
  query_embedding = generate_query_embedding(query_text, options)

  # Execute search based on type
  search_results = case search_type
                   when :vector
                     logger.info 'Performing vector search...'
                     perform_vector_search(query_embedding, all_tags, options)
                   when :fulltext
                     logger.info 'Performing fulltext search...'
                     perform_fulltext_search(query_text, options)
                   when :hybrid
                     logger.info 'Performing hybrid search...'
                     perform_hybrid_search(query_text, query_embedding, all_tags, options)
                   end

  logger.info "Search completed. Found #{search_results[:results].size} results"

  # Enrich results with additional metadata
  enriched = enrich_results(search_results, query_text, options)
  if search_type == :hybrid
    # apply_domain_boost returns a new hash with the reordered results; the
    # return value must be kept or the boost is silently dropped.
    enriched = apply_domain_boost(enriched, query_text, options)
    enriched = diversify_results_by_category(enriched, options)
  end
  enriched
rescue ArgumentError
  raise
rescue StandardError => e
  logger.error "Query processing failed: #{e.message}"
  # backtrace is nil for exceptions that were never raised.
  logger.error Array(e.backtrace).join("\n")
  raise ::SmartRAG::Errors::QueryProcessingError, "Query processing failed: #{e.message}"
end
|
|
138
|
+
|
|
139
|
+
# Generate a natural language response based on search results
|
|
140
|
+
# @param question [String] Original question
|
|
141
|
+
# @param search_results [Hash] Results from process_query
|
|
142
|
+
# @param options [Hash] Response generation options
|
|
143
|
+
# @option options [Symbol] :language Response language
|
|
144
|
+
# @option options [Integer] :max_length Maximum response length
|
|
145
|
+
# @option options [Boolean] :include_sources Whether to include source references (default: true)
|
|
146
|
+
# @return [Hash] Response with answer and metadata
|
|
147
|
+
def generate_response(question, search_results, options = {})
  raise ArgumentError, 'Question cannot be nil or empty' if question.to_s.strip.empty?
  raise ArgumentError, 'Search results cannot be nil' if search_results.nil?

  logger.info "Generating response for question: #{question[0..50]}..."
  logger.info "Search results: #{search_results.inspect[0..200]}"

  # Pull the hit list and turn it into a single context string.
  hits = search_results[:results] || []
  logger.info "Number of results: #{hits.size}"

  context_text = extract_context_for_response(hits, options)
  logger.info "Context extracted: #{context_text.length} chars"

  # Without any context there is nothing to summarize; answer with a fixed
  # fallback instead of calling the summarization service.
  if context_text.empty?
    logger.warn 'No context available for response generation'
    return {
      answer: "I don't have enough information to answer this question.",
      sources: [],
      confidence: 0.0
    }
  end

  logger.info 'Calling summarization service...'
  summary = summarization_service.summarize_search_results(question, context_text, options)
  logger.info "Summarization service returned: #{summary.inspect[0..200]}"

  # Source references are attached unless the caller opts out.
  summary[:sources] = extract_sources(hits) if options.fetch(:include_sources, true)

  logger.info 'Response generated successfully'
  summary
rescue ArgumentError
  raise
rescue StandardError => e
  logger.error "Response generation failed: #{e.message}"
  raise ::SmartRAG::Errors::ResponseGenerationError, "Response generation failed: #{e.message}"
end
|
|
189
|
+
|
|
190
|
+
# Process a query and generate a response in one step
|
|
191
|
+
# @param question [String] Natural language question
|
|
192
|
+
# @param options [Hash] Processing and response options
|
|
193
|
+
# @return [Hash] Complete response with answer, sources, and metadata
|
|
194
|
+
# Errors from either stage are logged here and re-raised unchanged.
def ask(question, options = {})
  # to_s keeps a nil question from raising NoMethodError here, so the
  # ArgumentError from process_query's validation is what the caller sees.
  logger.info "Processing ask request: #{question.to_s[0..50]}..."

  # Process the query to get search results
  search_results = process_query(question, options)

  # Generate response from search results
  response = generate_response(question, search_results, options)

  # enrich_results reports timing under metadata[:execution_time_ms]; fall
  # back to it when no top-level :processing_time_ms key is present.
  processing_time = search_results[:processing_time_ms] ||
                    search_results.dig(:metadata, :execution_time_ms)

  # Combine everything
  {
    question: question,
    answer: response[:answer],
    sources: response[:sources],
    search_results: search_results[:results],
    metadata: {
      search_type: search_results[:search_type],
      total_results: search_results[:total_results],
      processing_time_ms: processing_time,
      confidence: response[:confidence]
    }
  }
rescue StandardError => e
  logger.error "Ask request failed: #{e.message}"
  raise
end
|
|
220
|
+
|
|
221
|
+
private
|
|
222
|
+
|
|
223
|
+
# Guess the language of +text+ from the character ranges it contains.
#
# Kana is tested before the CJK ideograph range because it is the more
# specific signal. Text with ideographs but no kana is reported as :zh_cn,
# and everything else as :en. Any failure while matching is logged and also
# yields :en.
#
# @param text [String] text to inspect
# @return [Symbol] :ja, :zh_cn or :en
def detect_language(text)
  if text.match?(/[\u3040-\u309f\u30a0-\u30ff]/)
    :ja
  elsif text.match?(/[\u4e00-\u9fff]/)
    :zh_cn
  else
    :en # Default to English
  end
rescue StandardError => failure
  logger.warn "Language detection failed: #{failure.message}, defaulting to English"
  :en
end
|
|
234
|
+
|
|
235
|
+
# Ask the embedding service for the embedding of +query_text+.
#
# Failures are logged and then re-raised unchanged.
#
# @param query_text [String] query to embed
# @param options [Hash] passed through to the embedding service
# @return [Object] whatever the embedding service returns
def generate_query_embedding(query_text, options = {})
  begin
    logger.debug 'Generating query embedding...'
    return embedding_service.generate_embedding(query_text, options)
  rescue StandardError => failure
    logger.error "Failed to generate query embedding: #{failure.message}"
    raise
  end
end
|
|
242
|
+
|
|
243
|
+
# Run a vector similarity search and wrap the hits in the pipeline's result hash.
#
# With tags present the tag-enhanced search on the embedding manager is used;
# otherwise the plain vector search service is used. Both backends may answer
# with a bare collection or with a hash carrying a :results key, and either
# shape (or nil) is normalised here.
#
# @param query_embedding [Object] embedding of the query
# @param tags [Array, nil] tags that switch on the tag-enhanced search
# @param options [Hash] :limit (default 10), :threshold (default 0.3),
#   :document_ids, plus anything else the backends accept
# @return [Hash] { results:, search_type: :vector, total_results: }
def perform_vector_search(query_embedding, tags, options = {})
  search_options = options.merge(
    limit: options[:limit] || 10,
    threshold: options[:threshold] || 0.3,
    document_ids: options[:document_ids]
  )

  raw = if tags&.any?
          # Use tag-enhanced search if tags are provided (via embedding manager)
          embedding_manager.search_by_vector_with_tags(query_embedding, tags, search_options)
        else
          # Regular vector search (via vector search service)
          vector_search_service.search_by_vector(query_embedding, search_options)
        end

  # Accept a hash response, a direct collection, or nothing at all.
  results = case raw
            when Hash then raw[:results] || []
            when nil then []
            else raw
            end

  {
    results: results,
    search_type: :vector,
    total_results: results.size
  }
rescue StandardError => e
  logger.error "Vector search failed: #{e.message}"
  raise
end
|
|
278
|
+
|
|
279
|
+
# Run a full-text search and reshape the answer for the query pipeline.
#
# The service is called with the caller's options plus a resolved :language
# (default :en) and :limit (default 10). An Array answer is wrapped as-is; a
# hash answer contributes its :results and, when present, the total from
# metadata[:total_count]. Failures are logged and re-raised.
#
# @param query_text [String] query to search for
# @param options [Hash] search options
# @return [Hash] { results:, search_type: :fulltext, total_results: }
def perform_fulltext_search(query_text, options = {})
  service_options = options.merge(
    language: options[:language] || :en,
    limit: options[:limit] || 10
  )
  response = fulltext_search_service.search(query_text, service_options)

  hits, total =
    if response.is_a?(Array)
      # Bare array of hits (the shape used by mocked services in specs).
      [response, response.length]
    else
      # Full service response hash with :results and :metadata.
      [
        response[:results] || [],
        response.dig(:metadata, :total_count) || response[:results]&.length || 0
      ]
    end

  { results: hits, search_type: :fulltext, total_results: total }
rescue StandardError => failure
  logger.error "Fulltext search failed: #{failure.message}"
  raise
end
|
|
312
|
+
|
|
313
|
+
# Run a hybrid search and wrap the hits in the pipeline's result hash.
#
# The caller's :filters are combined with :document_ids and the given tags and
# handed to the hybrid search service together with the precomputed query
# embedding. The filters hash is copied first, so the caller's own hash is
# never modified.
#
# @param query_text [String] query to search for
# @param query_embedding [Object] precomputed embedding of the query
# @param tags [Array, nil] tags to filter on
# @param options [Hash] :limit (default 10), :filters, :document_ids, plus
#   anything else the service accepts
# @return [Hash] { results:, search_type: :hybrid, total_results: }
def perform_hybrid_search(query_text, query_embedding, tags, options = {})
  limit = options[:limit] || 10

  # Work on a copy: writing into options[:filters] directly would leak
  # document_ids and tags into the caller's hash across calls.
  search_filters = (options[:filters] || {}).dup
  search_filters[:document_ids] = options[:document_ids] if options[:document_ids]
  search_filters[:tags] = tags if tags && !tags.empty?

  # The precomputed embedding is passed along with the query text.
  search_response = hybrid_search_service.search(
    query_text,
    options.merge(
      limit: limit,
      query_embedding: query_embedding,
      filters: search_filters.compact
    )
  )

  # Handle both a direct array of results and a hash with a :results key.
  actual_results = if search_response.is_a?(Array)
                     search_response
                   else
                     search_response[:results] || []
                   end

  {
    results: actual_results,
    search_type: :hybrid,
    total_results: actual_results.size
  }
rescue StandardError => e
  logger.error "Hybrid search failed: #{e.message}"
  raise
end
|
|
351
|
+
|
|
352
|
+
# Normalise raw search output into the response format returned to callers.
#
# The total is taken from :total_results, then :total_count, then the number
# of results. If anything fails, the error is logged and a reduced response
# carrying metadata[:error] is returned instead of raising. That fallback is
# built defensively so it cannot raise again on the input that caused the
# failure.
#
# @param search_results [Hash] output of one of the perform_*_search methods
# @param query_text [String] the original query
# @param options [Hash] :language is echoed into the metadata (default :en)
# @return [Hash] { query:, results:, search_type:, total_results:, metadata: }
def enrich_results(search_results, query_text, options = {})
  results = search_results[:results] || []
  total = search_results[:total_results] || search_results[:total_count] || results.length

  # Build the standardized response format
  response = {
    query: query_text,
    results: results,
    # Backward-compatible top-level keys expected by existing specs/callers.
    search_type: search_results[:search_type],
    total_results: total,
    metadata: {
      total_count: total,
      execution_time_ms: calculate_processing_time,
      language: options[:language] || :en
    }
  }

  response[:metadata][:search_type] = search_results[:search_type] if search_results[:search_type]
  response[:metadata][:processed_at] = Time.now

  response
rescue StandardError => e
  # If enrichment fails, return basic results
  logger.error "Failed to enrich results: #{e.message}"

  # results is still nil when the failure happened before it was assigned,
  # and options may itself be the bad input.
  safe_results = results || []
  language = options.is_a?(Hash) ? options[:language] : nil

  {
    query: query_text,
    results: safe_results,
    metadata: {
      total_count: safe_results.respond_to?(:length) ? safe_results.length : 0,
      execution_time_ms: 0,
      language: language || :en,
      error: e.message
    }
  }
end
|
|
392
|
+
|
|
393
|
+
# Assemble the context string for response generation.
#
# Only the first five results are considered. A result may be a hash (its
# :section, or the section of its :embedding) or any object responding to
# #section; other shapes are logged and skipped. Sections may be hashes
# (:title / :content) or objects (#section_title / #content). Blank content
# is skipped and a non-empty title is prepended as "Section: <title>".
#
# @param results [Array, nil] search hits
# @param options [Hash] :max_context_length caps the joined text (default
#   4000); longer text is cut there and a truncation marker appended
# @return [String] joined context, empty when nothing usable was found
def extract_context_for_response(results, options = {})
  budget = options[:max_context_length] || 4000
  pieces = []

  Array(results).first(5).each_with_index do |hit, position|
    next if hit.nil?

    # Anything that is neither a hash nor section-bearing cannot be used.
    unless hit.is_a?(Hash) || hit.respond_to?(:section)
      logger.warn "Unexpected result format at index #{position}: #{hit.class}"
      next
    end

    section = hit.is_a?(Hash) ? (hit[:section] || hit[:embedding]&.section) : hit.section
    next unless section

    hash_section = section.is_a?(Hash)
    body = (hash_section ? section[:content] : section.content).to_s.strip
    next if body.empty?

    # The title is only read once the content is known to be usable.
    title = hash_section ? section[:title] : section.section_title
    pieces << (title && !title.empty? ? "Section: #{title}\n#{body}" : body)
  end

  joined = pieces.join("\n\n---\n\n")
  return joined unless joined.length > budget

  joined[0...budget] + '... (truncated)'
end
|
|
452
|
+
|
|
453
|
+
# Build source references for the first five results.
#
# Results are read the same way extract_context_for_response reads them: a
# hash contributes its :section (or the section of its :embedding), and any
# other object contributes #section when it has one. A section that cannot
# lead to a document (for example a plain hash) is logged and skipped rather
# than raising, so a citation problem does not discard an already generated
# answer.
#
# @param results [Array, nil] search hits
# @return [Array<Hash>] one entry per resolvable source with :document_id,
#   :document_title, :section_id, :section_title, :url and :relevance
def extract_sources(results)
  Array(results).first(5).each_with_object([]) do |result, sources|
    section =
      if result.is_a?(Hash)
        result[:section] || result[:embedding]&.section
      elsif result.respond_to?(:section)
        result.section
      end
    next unless section

    unless section.respond_to?(:document)
      logger.warn "Cannot resolve source document for section of type #{section.class}"
      next
    end

    document = section.document
    next unless document

    # Only hash results carry a score; everything else is reported as 0.
    score = result.is_a?(Hash) ? (result[:similarity] || result[:boosted_score]) : nil

    sources << {
      document_id: document.id,
      document_title: document.title,
      section_id: section.id,
      section_title: section.section_title,
      url: document.url,
      relevance: score || 0
    }
  end
end
|
|
475
|
+
|
|
476
|
+
# Coerce a mixed list of tag references into Tag model instances.
#
# Tag instances pass through untouched, Integers are looked up by id, and
# Strings go through Tag.find_or_create.
#
# @param tags [Array<Tag, Integer, String>, nil] tag references
# @return [Array<Tag>] resolved tags; empty when +tags+ is nil
# @raise [ArgumentError] when an id matches no tag or an entry has an
#   unsupported type
def ensure_tag_objects(tags)
  return [] unless tags

  tags.map do |entry|
    if entry.is_a?(::SmartRAG::Models::Tag)
      entry
    elsif entry.is_a?(Integer)
      found = ::SmartRAG::Models::Tag.find(id: entry)
      raise(ArgumentError, "Tag not found: #{entry}") unless found

      found
    elsif entry.is_a?(String)
      # Use find_or_create for string tags to ensure they exist
      ::SmartRAG::Models::Tag.find_or_create(entry)
    else
      raise ArgumentError, "Invalid tag type: #{entry.class}"
    end
  end
end
|
|
493
|
+
|
|
494
|
+
# Processing time in milliseconds for the current query.
#
# No timing is recorded yet, so this is a fixed placeholder value.
#
# @return [Integer] always 0
def calculate_processing_time
  placeholder_ms = 0
  placeholder_ms
end
|
|
499
|
+
|
|
500
|
+
# Move results whose category matches an expected category to the front.
#
# Categories are normalised first. A result matches when its
# metadata[:category] contains any expected category as a substring. Matching
# results keep their relative order, and so do the rest: the split uses
# partition rather than sort_by, which does not guarantee a stable order.
#
# @param response [Hash] enriched search response with a :results array
# @param _query_text [String] unused
# @param options [Hash, nil] :expected_categories or :expected_category, as
#   strings or symbols
# @return [Hash] the response with normalised categories and, when expected
#   categories were given and there are results, reordered :results
def apply_domain_boost(response, _query_text, options)
  options ||= {}
  # to_s lets callers pass symbols; String#include? only accepts strings.
  expected = Array(options[:expected_categories] || options[:expected_category]).compact.map(&:to_s)
  return normalize_categories(response) if expected.empty?

  results = response[:results] || []
  return response if results.empty?

  # Normalises the result hashes in place; the return value is not needed here.
  normalize_categories(response)

  matching, others = results.partition do |result|
    category = (result[:metadata] || {})[:category].to_s
    expected.any? { |exp| category.include?(exp) }
  end

  response.merge(results: matching + others)
end
|
|
519
|
+
|
|
520
|
+
# Normalise the category on every result of +response+.
#
# Each result hash is updated in place: its metadata (created when missing)
# gets the normalised category written back whenever normalisation yields a
# truthy value.
#
# @param response [Hash] search response with an optional :results array
# @return [Hash] copy of +response+ whose :results is the updated array
def normalize_categories(response)
  items = response[:results] || []

  items.each do |item|
    meta = item[:metadata] || {}
    cleaned = normalize_category(meta[:category], meta[:document_title])
    meta[:category] = cleaned if cleaned
    item[:metadata] = meta
  end

  response.merge(results: items)
end
|
|
530
|
+
|
|
531
|
+
# Interleave results so that categories alternate.
#
# Results are bucketed by metadata[:category] (blank becomes "uncategorized"),
# with buckets ordered by first appearance. The output takes one result from
# each bucket in turn, round after round, until every bucket is exhausted.
# Responses with fewer than two results, or with :diversify_categories set to
# a falsy value, are returned unchanged.
#
# @param response [Hash] search response with a :results array
# @param options [Hash] :diversify_categories (default true)
# @return [Hash] the response with interleaved :results
def diversify_results_by_category(response, options = {})
  results = response[:results] || []
  return response if results.length < 2
  return response unless options.fetch(:diversify_categories, true)

  # group_by keeps keys in first-seen order, which fixes the round order.
  buckets = results.group_by do |result|
    label = (result[:metadata] || {})[:category].to_s
    label.empty? ? "uncategorized" : label
  end.values

  rounds = buckets.map(&:length).max
  interleaved = (0...rounds).flat_map do |round|
    buckets.select { |bucket| round < bucket.length }.map { |bucket| bucket[round] }
  end

  response.merge(results: interleaved)
end
|
|
568
|
+
|
|
569
|
+
# Normalise a category value to its string form.
#
# @param category [Object, nil] raw category value
# @param _title [Object] unused
# @return [String] the category as a string; empty for nil
def normalize_category(category, _title)
  category.to_s
end
|
|
575
|
+
end
|
|
576
|
+
end
|
|
577
|
+
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
module SmartRAG
  # Exception hierarchy for SmartRAG.
  #
  # Every error derives from BaseError, which carries an optional context hash
  # alongside the message. The message is optional, so a bare
  # `raise SomeError` raises that class instead of failing inside the
  # constructor with an ArgumentError.
  module Errors
    # Base error class for all SmartRAG errors
    class BaseError < StandardError
      # @return [Hash] extra details supplied by the raiser
      attr_reader :context

      # @param message [String, nil] error message; nil falls back to the
      #   default Exception message (the class name)
      # @param context [Hash] extra details about the failure
      def initialize(message = nil, context = {})
        super(message)
        @context = context
      end
    end

    # Document processing errors
    class DocumentProcessingError < BaseError; end
    class DocumentDownloadError < DocumentProcessingError; end
    class DocumentConversionError < DocumentProcessingError; end
    class ChunkingError < DocumentProcessingError; end

    # Search errors (extend existing ones from fulltext_manager.rb).
    # Messages are prefixed with "Search failed: ".
    class SearchError < BaseError
      def initialize(message = nil, context = {})
        super(message.nil? ? 'Search failed' : "Search failed: #{message}", context)
      end
    end

    class VectorSearchError < SearchError; end
    class QueryParseError < SearchError; end
    class LanguageDetectionError < SearchError; end
    class QueryProcessingError < SearchError; end
    class FulltextSearchError < SearchError; end
    class HybridSearchError < SearchError; end

    # Embedding errors
    class EmbeddingError < BaseError; end
    class EmbeddingGenerationError < EmbeddingError; end
    class EmbeddingStorageError < EmbeddingError; end
    class EmbeddingNotFoundError < EmbeddingError; end

    # Tag generation errors
    class TagError < BaseError; end
    class TagGenerationError < TagError; end
    class TagStorageError < TagError; end

    # Database errors
    class DatabaseError < BaseError; end
    class MigrationError < DatabaseError; end
    class ConnectionError < DatabaseError; end

    # Configuration errors
    class ConfigError < BaseError; end
    class InvalidConfigError < ConfigError; end
    class MissingConfigError < ConfigError; end

    # Service errors. Messages are prefixed with "Service error: ".
    class ServiceError < BaseError
      def initialize(message = nil, context = {})
        super(message.nil? ? 'Service error' : "Service error: #{message}", context)
      end
    end

    class EmbeddingServiceError < ServiceError; end
    class VectorSearchServiceError < ServiceError; end
    class FulltextSearchServiceError < ServiceError; end
    class HybridSearchServiceError < ServiceError; end
    class SummarizationServiceError < ServiceError; end
    class TagServiceError < ServiceError; end
    class QueryProcessingServiceError < ServiceError; end
    class ResponseGenerationError < ServiceError; end

    # LLM integration errors
    class LLMError < BaseError; end
    class LLMConnectionError < LLMError; end
    class LLMRateLimitError < LLMError; end
    class LLMTimeoutError < LLMError; end
    class LLMResponseError < LLMError; end
    class LLMConfigurationError < LLMError; end
    class ExternalServiceUnavailable < LLMError; end
    class ContextTooLarge < LLMError; end

    # Parser errors
    class ParserError < BaseError; end
    class QueryParserError < ParserError; end

    # Validation errors
    class ValidationError < BaseError; end
    class InvalidQueryError < ValidationError; end
    class InvalidParameterError < ValidationError; end
  end
end
|