smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,456 @@
1
+ require_relative '../core/fulltext_manager'
2
+
3
module SmartRAG
  module Services
    # FulltextSearchService executes full-text keyword search with multi-language support.
    #
    # It is a thin facade: query validation, language detection and parsing are
    # delegated to the query parser, the actual search is delegated to the
    # full-text manager, and this class shapes the response (ranking, optional
    # highlight/content/metadata enrichment) and records each search in the
    # +search_logs+ table.
    class FulltextSearchService
      attr_reader :fulltext_manager, :query_parser, :config, :logger

      # Default service configuration. The nested highlight hash is frozen so
      # the shared default cannot be mutated through one instance's config.
      DEFAULT_CONFIG = {
        default_language: 'en',
        max_results: 100,
        default_limit: 20,
        enable_highlighting: true,
        highlight_options: {
          max_words: 50,
          min_words: 15,
          max_fragments: 3,
          start_sel: '<mark>',
          stop_sel: '</mark>'
        }.freeze,
        enable_spellcheck: false,
        enable_suggestions: false,
        min_search_length: 2, # Minimum query length
        max_search_length: 1000 # Maximum query length
      }.freeze

      # Initialize FulltextSearchService
      # @param fulltext_manager [FulltextManager] Full-text manager instance
      # @param query_parser [QueryParser] Query parser instance (defaults to the manager's parser)
      # @param options [Hash] Service configuration options, merged over DEFAULT_CONFIG
      # @option options [Logger] :logger Logger to use (defaults to a STDOUT logger)
      def initialize(fulltext_manager, query_parser = nil, options = {})
        @fulltext_manager = fulltext_manager
        @query_parser = query_parser || fulltext_manager.query_parser
        @config = build_config(options)
        @logger = options[:logger] || Logger.new(STDOUT)
      end

      # Perform full-text search
      # @param query [String] Search query text
      # @param options [Hash] Search options
      # @option options [String] :language Language code (auto-detect if nil)
      # @option options [Integer] :limit Maximum results (default: config[:default_limit])
      # @option options [Boolean] :enable_highlighting Per-call override of config[:enable_highlighting]
      # @option options [Hash] :filters Search filters
      # @option options [Array<Integer>] :document_ids Filter by document IDs
      # @option options [Array<Integer>] :tag_ids Filter by tag IDs
      # @option options [DateTime] :date_from Filter by start date
      # @option options [DateTime] :date_to Filter by end date
      # @option options [Boolean] :include_content Include full content in results
      # @option options [Boolean] :include_metadata Include metadata in results
      # @return [Hash] Search results with metadata
      # @raise [ArgumentError] when the query fails validation
      # @raise [Errors::FulltextSearchServiceError] when any other error occurs
      def search(query, options = {})
        validation_error = validate_query(query)
        raise ArgumentError, validation_error if validation_error

        query_info = analyze_query(query)

        language = options[:language] || detect_language(query)
        limit = options[:limit] || config[:default_limit]
        filters = options[:filters] || extract_filters(options)

        @logger.info "Full-text search: '#{query}', language: #{language}, limit: #{limit}"

        # Monotonic clock: immune to wall-clock adjustments while timing.
        started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        results = execute_search(query, language, limit, filters)
        execution_time = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at) * 1000).round

        response = {
          query: query,
          query_info: query_info,
          results: format_search_results(results, options),
          metadata: {
            total_count: results.length,
            execution_time_ms: execution_time,
            language: language,
            # Heuristic: a full page suggests more rows may exist.
            has_more: results.length >= limit
          }
        }

        # Add spellcheck/suggestions if enabled and nothing matched
        response[:suggestions] = generate_suggestions(query, language) if config[:enable_spellcheck] && results.empty?

        log_search(query, results.length, execution_time)

        response
      rescue ArgumentError => e
        # Re-raise ArgumentError (validation errors) without wrapping
        log_search(query, 0, 0, e.message)
        raise e
      rescue StandardError => e
        @logger.error "Full-text search failed: #{e.message}"
        @logger.error e.backtrace.join("\n")
        log_search(query, 0, 0, e.message)
        raise Errors::FulltextSearchServiceError, "Search failed: #{e.message}"
      end

      # Quick search without metadata. Errors are logged and yield an empty list.
      # @param query [String] Search query
      # @param limit [Integer] Result limit
      # @return [Array<Hash>] Simple result list ({ id:, rank: })
      def quick_search(query, limit = 10)
        fulltext_manager.search_by_text(query, nil, limit).map { |row| simplify_result(row) }
      rescue StandardError => e
        @logger.error "Quick search failed: #{e.message}"
        []
      end

      # Search with highlighting forced on, regardless of service configuration.
      # @param query [String] Search query
      # @param options [Hash] Search options
      # @return [Hash] Results with highlighted snippets
      def search_with_highlighting(query, options = {})
        search(query, options.merge(enable_highlighting: true))
      end

      # Advanced search with filter support
      # @param query [String] Search query
      # @param filters [Hash] Search filters
      # @param options [Hash] Search options (not modified)
      # @return [Hash] Filtered search results
      def advanced_search(query, filters, options = {})
        # merge builds a new hash, so the caller's options are left untouched
        # (and frozen option hashes are accepted).
        search(query, options.merge(filters: filters))
      end

      # Multi-language search. A failure for one language is logged and skipped.
      # @param query [String] Search query
      # @param languages [Array<String>] Target languages
      # @param options [Hash] Search options
      # @return [Hash] Results from all languages, merged and re-ranked
      def multilingual_search(query, languages, options = {})
        all_results = []
        total_time = 0

        languages.each do |lang|
          begin
            lang_results = search(query, options.merge(language: lang))
            all_results.concat(lang_results[:results].map { |r| r.merge(language: lang) })
            total_time += lang_results[:metadata][:execution_time_ms]
          rescue StandardError => e
            @logger.error "Search failed for language #{lang}: #{e.message}"
          end
        end

        # Sort by rank score across all languages (missing scores sort last)
        all_results.sort_by! { |r| -(r[:rank_score] || 0) }

        limit = options[:limit] || config[:default_limit]
        all_results = all_results.first(limit)

        {
          query: query,
          languages: languages,
          results: all_results,
          metadata: {
            total_count: all_results.length,
            execution_time_ms: total_time,
            multilingual: true
          }
        }
      end

      # Search suggestions (auto-complete)
      #
      # Scans indexed section content for words starting with the prefix and
      # returns the most frequent ones, lower-cased. Errors are logged and
      # yield an empty list.
      # @param prefix [String] Query prefix (fewer than 2 non-blank characters yields [])
      # @param options [Hash] Options
      # @option options [Integer] :limit Maximum suggestions (default: 10)
      # @option options [String] :language Language code (default: config[:default_language])
      # @return [Array<String>] Suggestion list, most frequent first
      def suggestions(prefix, options = {})
        term = prefix.to_s.strip
        return [] if term.length < 2

        limit = options[:limit] || 10
        language = options[:language] || config[:default_language]
        # Escape the user-supplied prefix so it is matched literally.
        content_pattern = /#{Regexp.escape(term)}/i

        # Simple implementation - in production, use a dedicated suggest index
        contents = fulltext_manager.db[:section_fts]
                                   .join(:source_sections, id: Sequel[:section_fts][:section_id])
                                   .select(Sequel[:source_sections][:content])
                                   .where(
                                     (Sequel[:section_fts][:language] =~ language) &
                                     (Sequel[:source_sections][:content] =~ content_pattern)
                                   )
                                   .limit(limit * 10) # Get more to process
                                   .map { |row| row[:content] }

        # Count case-insensitive frequencies of words starting with the prefix
        frequencies = Hash.new(0)
        contents.compact.each do |text|
          extract_words(text, term).each { |word| frequencies[word.downcase] += 1 }
        end

        # Most frequent first; ties broken alphabetically for a stable order
        frequencies
          .sort_by { |word, count| [-count, word] }
          .first(limit)
          .map { |word, _| word }
      rescue StandardError => e
        @logger.error "Suggestions generation failed: #{e.message}"
        []
      end

      # Get search statistics. Errors are logged and yield an empty hash.
      # @return [Hash] Search statistics
      def statistics
        {
          total_indexed: fulltext_manager.stats[:total_indexed],
          search_performance: get_performance_stats,
          language_distribution: get_language_distribution,
          popular_queries: get_popular_queries
        }
      rescue StandardError => e
        @logger.error "Failed to get statistics: #{e.message}"
        {}
      end

      private

      # Merge caller options over DEFAULT_CONFIG, merging :highlight_options
      # key-by-key so a partial override keeps the remaining defaults.
      def build_config(options)
        merged = DEFAULT_CONFIG.merge(options)
        overrides = options[:highlight_options]
        overrides = {} unless overrides.is_a?(Hash)
        merged[:highlight_options] = DEFAULT_CONFIG[:highlight_options].merge(overrides)
        merged
      end

      # Run the search through the manager, using the filtered variant only
      # when filters are present.
      def execute_search(query, language, limit, filters)
        if filters.empty?
          fulltext_manager.search_by_text(query, language, limit)
        else
          fulltext_manager.search_with_filters(query, filters, { language: language, limit: limit })
        end
      end

      # Validate search query
      # @return [String, nil] an error message, or nil when the query is valid
      def validate_query(query)
        return 'Query cannot be nil' if query.nil?

        stripped = query.strip
        return 'Query cannot be empty' if stripped.empty?

        length = stripped.length
        if length < config[:min_search_length]
          return "Query too short (minimum #{config[:min_search_length]} characters)"
        end

        if length > config[:max_search_length]
          return "Query too long (maximum #{config[:max_search_length]} characters)"
        end

        nil
      end

      # Analyze query to extract metadata (delegates to the query parser)
      def analyze_query(query)
        @query_parser.parse_advanced_query(query)
      end

      # Detect language for query (delegates to the query parser)
      def detect_language(query)
        @query_parser.detect_language(query)
      end

      # Extract filters from options, keeping only the filter keys that are set
      def extract_filters(options)
        %i[document_ids tag_ids date_from date_to].each_with_object({}) do |key, filters|
          filters[key] = options[key] if options[key]
        end
      end

      # Format search results with rank, and optional highlight, content and metadata
      def format_search_results(results, options)
        # A per-call :enable_highlighting overrides the service configuration.
        highlighting = options[:enable_highlighting]
        highlighting = config[:enable_highlighting] if highlighting.nil?

        results.map.with_index do |result, index|
          formatted = {
            section_id: result[:section_id],
            rank_score: result[:rank_score],
            rank: index + 1
          }

          formatted[:highlight] = result[:highlight] if result[:highlight] && highlighting

          if options[:include_content]
            section = get_section_content(result[:section_id])
            formatted[:content] = section[:content]
            # get_section_content selects the :section_title column.
            formatted[:title] = section[:title] || section[:section_title]
          end

          formatted.merge!(get_section_metadata(result[:section_id])) if options[:include_metadata]

          formatted
        end
      end

      # Get section content
      # @return [Hash] row with :content and :section_title, or {} when not found
      def get_section_content(section_id)
        @fulltext_manager.db[:source_sections]
                         .where(id: section_id)
                         .select(:content, :section_title)
                         .first || {}
      end

      # Get section metadata: document fields joined onto the section, plus any
      # key/value pairs stored in the document's metadata column.
      # @return [Hash] metadata, or {} when the section is not found
      def get_section_metadata(section_id)
        row = @fulltext_manager.db[:source_sections]
                               .where(Sequel[:source_sections][:id] => section_id)
                               .left_join(:source_documents, id: Sequel[:source_sections][:document_id])
                               .select(
                                 Sequel[:source_documents][:id].as(:document_id),
                                 Sequel[:source_documents][:title].as(:document_title),
                                 Sequel[:source_documents][:author],
                                 Sequel[:source_documents][:publication_date],
                                 Sequel[:source_sections][:section_number],
                                 Sequel[:source_documents][:metadata]
                               )
                               .first
        return {} unless row

        {
          document_id: row[:document_id],
          document_title: row[:document_title],
          author: row[:author],
          publication_date: row[:publication_date],
          section_number: row[:section_number]
        }.merge!(parse_document_metadata(row[:metadata]))
      end

      # Normalize a stored metadata value into a Hash. JSON strings are parsed;
      # hash-like values are converted; anything else (including malformed
      # JSON or non-object JSON) yields {}.
      def parse_document_metadata(raw)
        if raw.is_a?(String)
          parsed = JSON.parse(raw)
          parsed.is_a?(Hash) ? parsed : {}
        elsif raw.respond_to?(:to_hash)
          raw.to_hash
        else
          {}
        end
      rescue JSON::ParserError
        # Ignore malformed metadata strings
        {}
      end

      # Simplify result for quick search
      def simplify_result(result)
        { id: result[:section_id], rank: result[:rank_score] }
      end

      # Generate search suggestions: for each query word of 3+ characters, look
      # up indexed terms starting with its first four characters.
      # @return [Array<String>] up to 3 distinct terms
      def generate_suggestions(query, language)
        candidates = query.strip.split(/\s+/).reject { |word| word.length < 3 }.flat_map do |word|
          # The stem is user input, so escape it before embedding it in a pattern.
          # The pattern is passed as a String in PostgreSQL regex syntax:
          # (?i) = case-insensitive, \y = word boundary.
          # NOTE(review): assumes a PostgreSQL backend - confirm before relying on this.
          pattern = "(?i)\\y#{Regexp.escape(word[0..3])}\\w*"

          @fulltext_manager.db[:section_fts]
                           .join(:source_sections, id: Sequel[:section_fts][:section_id])
                           .select(
                             Sequel.function(:substring, Sequel[:source_sections][:content], pattern).as(:term)
                           )
                           .where(Sequel[:section_fts][:language] =~ language)
                           .map { |row| row[:term] }
                           .compact
        end

        candidates.uniq.first(3)
      end

      # Extract words starting with prefix (case-insensitive)
      def extract_words(text, prefix)
        needle = prefix.downcase
        text.scan(/\b\w+/).select { |word| word.downcase.start_with?(needle) }
      end

      # Get performance statistics (placeholder: always returns zeroed values)
      def get_performance_stats
        {
          average_response_time: 0,
          slowest_queries: [],
          total_searches: 0
        }
      end

      # Get language distribution: indexed row count per language
      def get_language_distribution
        @fulltext_manager.db[:section_fts]
                         .select(:language, Sequel.function(:count, '*').as(:count))
                         .group(:language)
                         .map { |row| { language: row[:language], count: row[:count] } }
      end

      # Get popular search queries: top 10 logged queries by frequency
      def get_popular_queries
        @fulltext_manager.db[:search_logs]
                         .select(:query, Sequel.function(:count, '*').as(:count))
                         .where(Sequel[:created_at] > (Time.now - 86_400)) # Last 24 hours
                         .group(:query)
                         .order(Sequel.desc(:count))
                         .limit(10)
                         .map { |row| { query: row[:query], count: row[:count] } }
      end

      # Log search query to the search_logs table. Best effort: any failure is
      # reported to the logger and never propagated to the caller.
      def log_search(query, result_count, execution_time, error = nil)
        # Skip logging validation errors (nil/empty queries)
        return if query.nil? || query.to_s.strip.empty?
        # Skip logging if database or fulltext_manager is not available
        return unless @fulltext_manager && @fulltext_manager.respond_to?(:db) && @fulltext_manager.db

        # Build insert hash without error_message column (not in migration)
        log_data = {
          query: query.to_s,
          search_type: 'fulltext',
          execution_time_ms: execution_time,
          results_count: result_count,
          created_at: Sequel::CURRENT_TIMESTAMP
        }

        # There is no error column, so an error message is stored as JSON in :filters
        log_data[:filters] = { error: error }.to_json if error

        search_logs = @fulltext_manager.db[:search_logs]
        search_logs.insert(log_data) if search_logs
      rescue StandardError => e
        @logger.error "Failed to log search: #{e.message}"
      end
    end

    # Custom errors
    module Errors
      # Raised by FulltextSearchService#search when a non-validation error occurs.
      class FulltextSearchServiceError < StandardError; end
    end
  end
end