smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,340 @@
1
+ require_relative "../services/embedding_service"
2
+ require_relative "../models/embedding"
3
+ require_relative "../models/source_section"
4
+ require_relative "../errors"
5
+
6
+ module SmartRAG
7
+ module Core
8
# Core embedding management class for the SmartRAG library.
#
# Delegates embedding generation to Services::EmbeddingService, runs vector
# similarity searches against Models::Embedding, and offers maintenance
# helpers (per-document statistics, orphan cleanup, age-based deletion).
class Embedding
  # Defaults applied when the corresponding search option is not supplied.
  DEFAULT_LIMIT = 10
  DEFAULT_THRESHOLD = 0.3
  DEFAULT_FALLBACK_THRESHOLD = 0.1
  DEFAULT_FALLBACK_POOL_SIZE = 1000
  DEFAULT_TAG_BOOST_WEIGHT = 1.1

  # @return [Services::EmbeddingService] service used to generate embeddings
  # @return [Hash] configuration passed to the constructor
  attr_reader :embedding_service, :config

  # Initialize the embedding manager.
  #
  # @param config [Hash] Configuration options; forwarded unchanged to
  #   Services::EmbeddingService. `config[:logger]` overrides the default logger.
  def initialize(config = {})
    @embedding_service = Services::EmbeddingService.new(config)
    @config = config
    # NOTE(review): this file does not require "logger" itself; presumably it
    # is loaded by the gem entry point - confirm.
    @logger = config[:logger] || Logger.new(STDOUT)
  end

  # Generate embeddings for every section of a document.
  #
  # @param document [SourceDocument] The document to process
  # @param options [Hash] Options forwarded to EmbeddingService#batch_generate
  # @return [Integer] Number of embeddings generated (0 when the document has no sections)
  # @raise [ArgumentError] if document is nil
  # @raise [StandardError] any failure is logged and re-raised unchanged
  def generate_for_document(document, options = {})
    raise ArgumentError, "Document cannot be nil" unless document

    @logger.info "Generating embeddings for document: #{document.title}"

    sections = Models::SourceSection.where(document_id: document.id).all
    return 0 if sections.empty?

    @embedding_service.batch_generate(sections, options).size
  rescue StandardError => e
    @logger.error "Failed to generate embeddings for document #{document_label(document)}: #{e.message}"
    raise
  end

  # Generate embeddings for multiple documents, continuing past failures.
  #
  # @param documents [Array<SourceDocument>] Documents to process
  # @param options [Hash] Options forwarded to #generate_for_document
  # @return [Hash] `{ success: Integer, failed: Integer, errors: Array<Hash> }`;
  #   each error entry is `{ document_id:, error: }` (document_id is nil when
  #   the entry has no `id`, e.g. a nil element)
  def batch_generate_for_documents(documents, options = {})
    results = { success: 0, failed: 0, errors: [] }

    documents.each do |document|
      count = generate_for_document(document, options)
      results[:success] += 1
      @logger.info "Generated #{count} embeddings for document #{document.id}"
    rescue StandardError => e
      # Never call document.id blindly here: the failure may be *because*
      # the entry is nil, and raising from the rescue would abort the batch.
      document_id = document.respond_to?(:id) ? document.id : nil
      results[:failed] += 1
      results[:errors] << { document_id: document_id, error: e.message }
      @logger.error "Failed to process document #{document_label(document)}: #{e.message}"
    end

    results
  end

  # Search for content similar to a text query.
  #
  # @param query [String] Query text
  # @param options [Hash] Search options (also forwarded to the embedding service)
  # @option options [Integer] :limit Maximum results (default: 10)
  # @option options [Float] :threshold Similarity threshold (default: 0.3)
  # @option options [Array<Integer>] :document_ids Filter by document IDs
  # @option options [Array<Integer>] :tag_ids Filter by tag IDs
  # @option options [String] :model Filter by model
  # @return [Array<Hash>] Results with embedding and similarity score
  # @raise [ArgumentError] if query is nil or blank
  # @raise [SmartRAG::Errors::VectorSearchError] wrapping any other failure
  def search_similar(query, options = {})
    raise ArgumentError, "Query cannot be nil or empty" if query.to_s.strip.empty?

    query_embedding = generate_query_embedding(query, options)
    search_by_vector(query_embedding, options)
  rescue ArgumentError
    raise
  rescue StandardError => e
    logger.error "Vector search failed: #{e.message}"
    raise ::SmartRAG::Errors::VectorSearchError, "Search failed: #{e.message}"
  end

  # Get embedding stats for a document.
  #
  # @param document [SourceDocument] Document
  # @return [Hash] `document_id`, `total_sections`, `embedded_sections`
  #   (distinct sections that have at least one embedding), `embedding_rate`
  #   (percentage, rounded to 2 decimals), `models_used`, `latest_embedding`
  # @raise [ArgumentError] if document is nil
  def document_stats(document)
    raise ArgumentError, "Document cannot be nil" unless document

    section_ids = Models::SourceSection.where(document_id: document.id).map(:id)

    if section_ids.empty?
      return {
        document_id: document.id,
        total_sections: 0,
        embedded_sections: 0,
        embedding_rate: 0.0,
        models_used: [],
        latest_embedding: nil,
      }
    end

    embeddings = Models::Embedding.where(source_id: section_ids).all
    # Count sections, not embedding rows: a section may carry several
    # embeddings, which would otherwise push the rate above 100%.
    embedded_count = embeddings.map(&:source_id).uniq.size

    {
      document_id: document.id,
      total_sections: section_ids.size,
      embedded_sections: embedded_count,
      embedding_rate: (embedded_count.to_f / section_ids.size * 100).round(2),
      models_used: embeddings.map(&:model).uniq.compact,
      latest_embedding: embeddings.max_by(&:created_at)&.created_at,
    }
  rescue StandardError => e
    @logger.error "Failed to get stats for document #{document_label(document)}: #{e.message}"
    raise
  end

  # Delete embeddings whose source_id matches no existing section.
  #
  # @return [Integer] Number of cleaned embeddings
  def cleanup_orphaned_embeddings
    # OPTIMIZE: this materialises every section id in memory before building
    # the exclusion filter; a subquery may scale better - verify before changing.
    all_section_ids = Models::SourceSection.map(:id)
    deleted_count = Models::Embedding.exclude(source_id: all_section_ids).delete
    @logger.info "Cleaned up #{deleted_count} orphaned embeddings"

    deleted_count
  rescue StandardError => e
    @logger.error "Cleanup failed: #{e.message}"
    raise
  end

  # Delete old embeddings.
  #
  # @param days [Integer] Delete embeddings older than X days
  # @return [Integer] Number of deleted embeddings
  def delete_old_embeddings(days = 30)
    deleted_count = Models::Embedding.delete_old_embeddings(days: days)
    @logger.info "Deleted #{deleted_count} embeddings older than #{days} days"

    deleted_count
  rescue StandardError => e
    @logger.error "Failed to delete old embeddings: #{e.message}"
    raise
  end

  # Search similar embeddings by vector directly.
  #
  # Tries progressively looser strategies until one yields results:
  # the requested threshold, then `:fallback_threshold`, then nearest
  # neighbours without a threshold, then an in-memory ranking of up to
  # `:fallback_pool_size` embeddings.
  #
  # @param vector [Array<Float>] Query vector
  # @param options [Hash] Search options
  # @option options [Integer] :limit Maximum results (default: 10)
  # @option options [Float] :threshold Similarity threshold (default: 0.3)
  # @option options [Float] :fallback_threshold Looser retry threshold (default: 0.1)
  # @option options [Boolean] :fallback_to_nearest (default: true)
  # @option options [Boolean] :fallback_to_in_memory (default: true)
  # @option options [Integer] :fallback_pool_size (default: 1000)
  # @option options [Array<Integer>] :document_ids Filter by document IDs
  # @option options [Array<Integer>] :tag_ids Filter by tag IDs
  # @option options [String] :model Filter by model
  # @return [Array<Hash>] `{ embedding:, section:, similarity:, rank: }` per result
  # @raise [ArgumentError] if vector is nil or not an Array
  def search_by_vector(vector, options = {})
    raise ArgumentError, "Vector cannot be nil" if vector.nil?
    raise ArgumentError, "Vector must be an array" unless vector.is_a?(Array)

    limit = options[:limit] || DEFAULT_LIMIT
    threshold = options[:threshold] || DEFAULT_THRESHOLD
    fallback_threshold = options[:fallback_threshold] || DEFAULT_FALLBACK_THRESHOLD

    results = Models::Embedding.similar_to(vector, limit: limit, threshold: threshold)

    if results.empty? && fallback_threshold < threshold
      logger.info "No results at threshold=#{threshold}, retrying with fallback_threshold=#{fallback_threshold}"
      results = Models::Embedding.similar_to(vector, limit: limit, threshold: fallback_threshold)
    end

    if results.empty? && options.fetch(:fallback_to_nearest, true)
      logger.info "No results after threshold fallback, returning nearest neighbors without threshold"
      results = Models::Embedding.nearest_to(vector, limit: limit)
    end

    if results.empty? && options.fetch(:fallback_to_in_memory, true)
      pool_size = options[:fallback_pool_size] || DEFAULT_FALLBACK_POOL_SIZE
      logger.info "No results from database search, falling back to in-memory similarity (pool_size=#{pool_size})"
      candidates = Models::Embedding.limit(pool_size).all
      results = candidates.sort_by { |emb| -calculate_similarity(vector, emb) }.first(limit)
    end

    # NOTE(review): filters run after the limit has been applied, so a
    # filtered search can return fewer than `limit` results.
    results = apply_filters(results, options)

    results.map.with_index(1) do |embedding, rank|
      {
        embedding: embedding,
        section: embedding.section,
        similarity: calculate_similarity(vector, embedding),
        rank: rank,
      }
    end
  rescue ArgumentError
    # Validation errors propagate without being logged as search failures.
    raise
  rescue StandardError => e
    logger.error "Vector search failed: #{e.message}"
    raise
  end

  # Search similar by vector, boosting results whose section carries any of
  # the given tags.
  #
  # @param vector [Array<Float>] Query vector
  # @param tags [Array<Tag, Integer, String>] Tags to boost by (Tag objects,
  #   tag ids, or tag names; unknown names and unsupported types are ignored)
  # @param options [Hash] Search options (forwarded to #search_by_vector)
  # @option options [Integer] :limit Maximum results (default: 10)
  # @option options [Float] :threshold Minimum boosted score to keep (default: 0.3)
  # @option options [Array<Integer>] :document_ids Filter by document IDs
  # @option options [String] :model Filter by model
  # @option options [Float] :tag_boost_weight Boost factor for matching tags (default: 1.1)
  # @return [Array<Hash>] #search_by_vector results plus `:boosted_score`,
  #   sorted by boosted score descending
  # @raise [ArgumentError] if vector is nil / not an Array, or tags is nil
  def search_by_vector_with_tags(vector, tags, options = {})
    raise ArgumentError, "Vector cannot be nil" if vector.nil?
    raise ArgumentError, "Vector must be an array" unless vector.is_a?(Array)
    raise ArgumentError, "Tags cannot be nil" if tags.nil?

    limit = options[:limit] || DEFAULT_LIMIT
    threshold = options[:threshold] || DEFAULT_THRESHOLD
    tag_boost_weight = options[:tag_boost_weight] || DEFAULT_TAG_BOOST_WEIGHT

    results = search_by_vector(vector, options)

    # Resolve the requested tags once instead of once per result; name
    # lookups hit the Tag model and are invariant across the loop.
    tag_ids = results.empty? ? [] : resolve_tag_ids(tags)

    boosted = results.map do |result|
      score = result[:similarity]
      score *= tag_boost_weight if section_tagged_with?(result[:section], tag_ids)
      result.merge(boosted_score: score)
    end

    final_results = boosted
      .sort_by { |r| -r[:boosted_score] }
      .select { |r| r[:boosted_score] >= threshold }
      .first(limit)

    logger.info "Tag-enhanced search returned #{final_results.size} results (tag boost: #{tag_boost_weight})"

    final_results
  rescue ArgumentError
    # Validation errors propagate without being logged as search failures.
    raise
  rescue StandardError => e
    logger.error "Vector search failed: #{e.message}"
    raise
  end

  private

  # Only the logger is private; `config` is part of the public interface and
  # must not be re-declared here (doing so would make it private).
  attr_reader :logger

  # Identifier used in log messages; tolerates nil / id-less objects so that
  # error handlers never raise while reporting another error.
  def document_label(document)
    document.respond_to?(:id) ? document.id : "unknown"
  end

  # Embed the query text via the embedding service.
  def generate_query_embedding(query, options = {})
    # NOTE(review): `send` bypasses visibility; presumably generate_embedding
    # is private on the service - confirm whether a public entry point exists.
    @embedding_service.send(:generate_embedding, query, options)
  end

  # Convert a mixed list of Tag / Integer / String into Tag objects.
  #
  # @raise [ArgumentError] for unknown ids/names or unsupported element types
  def ensure_tag_objects(tags)
    tags.map do |tag|
      case tag
      when ::SmartRAG::Models::Tag
        tag
      when Integer
        ::SmartRAG::Models::Tag.find(id: tag) || raise(ArgumentError, "Tag not found: #{tag}")
      when String
        ::SmartRAG::Models::Tag.find(name: tag) || raise(ArgumentError, "Tag not found: #{tag}")
      else
        raise ArgumentError, "Invalid tag type: #{tag.class}"
      end
    end
  end

  # Lenient counterpart of #ensure_tag_objects: returns tag ids and silently
  # drops names that do not resolve and elements of unsupported types.
  def resolve_tag_ids(tags)
    ids = tags.map do |tag|
      case tag
      when ::SmartRAG::Models::Tag then tag.id
      when Integer then tag
      when String then ::SmartRAG::Models::Tag.find(name: tag)&.id
      end
    end
    ids.compact
  end

  # True when the section is linked (via SectionTag) to any of tag_ids.
  def section_tagged_with?(section, tag_ids)
    return false if tag_ids.empty? || section.nil?

    section_tag_ids = Models::SectionTag.where(section_id: section.id).map(&:tag_id)
    (tag_ids & section_tag_ids).any?
  end

  # Narrow results by :document_ids, :tag_ids and :model when given.
  # Entries may be embedding models or result hashes holding one under :embedding.
  def apply_filters(results, options)
    if options[:document_ids]
      allowed_section_ids = Models::SourceSection.where(
        document_id: options[:document_ids]
      ).map(:id)

      results = results.select do |entry|
        allowed_section_ids.include?(embedding_of(entry).source_id)
      end
    end

    if options[:tag_ids]
      tagged_section_ids = Models::SectionTag.where(
        tag_id: options[:tag_ids]
      ).map(:section_id)

      results = results.select do |entry|
        tagged_section_ids.include?(embedding_of(entry).source_id)
      end
    end

    if options[:model]
      results = results.select { |entry| embedding_of(entry).model == options[:model] }
    end

    results
  end

  # Unwrap a result hash to its embedding; pass embeddings through unchanged.
  def embedding_of(entry)
    entry.is_a?(Hash) ? entry[:embedding] : entry
  end

  # Similarity between the query vector and a stored embedding.
  def calculate_similarity(query_vector, embedding)
    # Prefer the model's own implementation when it provides one.
    return embedding.similarity_to(query_vector) if embedding.respond_to?(:similarity_to)

    # Fallback to manual calculation
    cosine_similarity(query_vector, embedding.vector_array)
  end

  # Cosine similarity of two numeric vectors.
  #
  # @return [Float] 0.0 for nil, empty, zero-magnitude or differently sized
  #   vectors (a size mismatch cannot be compared meaningfully)
  def cosine_similarity(v1, v2)
    return 0.0 if v1.nil? || v2.nil? || v1.empty? || v2.empty?
    return 0.0 if v1.size != v2.size

    dot_product = v1.zip(v2).sum { |a, b| a * b }
    magnitude1 = Math.sqrt(v1.sum { |x| x * x })
    magnitude2 = Math.sqrt(v2.sum { |x| x * x })
    return 0.0 if magnitude1 == 0 || magnitude2 == 0

    dot_product / (magnitude1 * magnitude2)
  end
end
339
+ end
340
+ end