smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
require_relative "../services/embedding_service"
|
|
2
|
+
require_relative "../models/embedding"
|
|
3
|
+
require_relative "../models/source_section"
|
|
4
|
+
require_relative "../errors"
|
|
5
|
+
|
|
6
|
+
module SmartRAG
|
|
7
|
+
module Core
|
|
8
|
+
# Core embedding management class for the SmartRAG library
|
|
9
|
+
class Embedding
|
|
10
|
+
attr_reader :embedding_service, :config
|
|
11
|
+
|
|
12
|
+
# Build the embedding manager.
#
# @param config [Hash] Configuration options. The hash is passed unchanged to
#   Services::EmbeddingService.new and kept for later inspection; its optional
#   :logger entry becomes this object's logger (a STDOUT logger is created
#   when the entry is missing or falsy).
def initialize(config = {})
  @embedding_service = Services::EmbeddingService.new(config)
  @config = config

  @logger = config[:logger]
  @logger ||= Logger.new(STDOUT)
end
|
|
19
|
+
|
|
20
|
+
# Generate embeddings for every section that belongs to a document.
#
# @param document [SourceDocument] The document to process
# @param options [Hash] Options forwarded to the embedding service's batch_generate
# @return [Integer] Number of embeddings generated (0 when the document has no sections)
# @raise [ArgumentError] if document is nil/false
# @raise [StandardError] any failure is logged and then re-raised unchanged
def generate_for_document(document, options = {})
  raise(ArgumentError, "Document cannot be nil") unless document

  @logger.info "Generating embeddings for document: #{document.title}"

  document_sections = Models::SourceSection.where(document_id: document.id).all
  if document_sections.empty?
    0
  else
    @embedding_service.batch_generate(document_sections, options).size
  end
rescue StandardError => e
  # The document may be nil (or not a model at all) when we get here, so
  # only ask for an id if the object can provide one.
  failed_id = "unknown"
  failed_id = document.id if document.respond_to?(:id)
  @logger.error "Failed to generate embeddings for document #{failed_id}: #{e.message}"
  raise
end
|
|
39
|
+
|
|
40
|
+
# Generate embeddings for multiple documents, continuing past failures.
#
# Each document is processed with #generate_for_document. A failure for one
# document is recorded in the result and does not stop the remaining
# documents from being processed.
#
# @param documents [Array<SourceDocument>] Documents to process
# @param options [Hash] Options forwarded to #generate_for_document
# @return [Hash] { success: Integer, failed: Integer, errors: Array<Hash> }
#   where each error entry is { document_id:, error: } (document_id is nil
#   when the entry had no usable id, e.g. a nil element)
def batch_generate_for_documents(documents, options = {})
  results = { success: 0, failed: 0, errors: [] }

  documents.each do |document|
    document_id = nil

    begin
      # Resolve the id defensively: a nil (or otherwise invalid) entry must be
      # reported as a failed document, not raise NoMethodError from the
      # error handler and abort the whole batch.
      document_id = document.id if document.respond_to?(:id)

      count = generate_for_document(document, options)
      results[:success] += 1
      @logger.info "Generated #{count} embeddings for document #{document_id}"
    rescue StandardError => e
      results[:failed] += 1
      results[:errors] << { document_id: document_id, error: e.message }
      @logger.error "Failed to process document #{document_id || 'unknown'}: #{e.message}"
    end
  end

  results
end
|
|
59
|
+
|
|
60
|
+
# Search for content similar to a text query.
#
# The query is turned into a vector via #generate_query_embedding and the
# vector is then passed, together with the same options, to
# #search_by_vector, whose result is returned as-is.
#
# @param query [String] Query text
# @param options [Hash] Search options
# @option options [Integer] :limit Maximum results (default: 10)
# @option options [Float] :threshold Similarity threshold (default: 0.3,
#   applied by #search_by_vector)
# @option options [Array<Integer>] :document_ids Filter by document IDs
# @option options [Array<Integer>] :tag_ids Filter by tag IDs
# @option options [String] :model Filter by model
# @return [Array<Hash>] Results with embedding and similarity score
# @raise [ArgumentError] if the query is nil or blank (ArgumentErrors raised
#   further down are passed through unchanged as well)
# @raise [SmartRAG::Errors::VectorSearchError] for any other failure
def search_similar(query, options = {})
  raise ArgumentError, "Query cannot be nil or empty" if query.to_s.strip.empty?

  # Generate embedding for query
  query_embedding = generate_query_embedding(query, options)

  # Search by vector similarity
  search_by_vector(query_embedding, options)
rescue ArgumentError
  # Validation problems are the caller's to handle; do not wrap them.
  raise
rescue StandardError => e
  logger.error "Vector search failed: #{e.message}"
  raise ::SmartRAG::Errors::VectorSearchError, "Search failed: #{e.message}"
end
|
|
83
|
+
|
|
84
|
+
# Get embedding statistics for a document.
#
# @param document [SourceDocument] Document
# @return [Hash] Statistics with the keys
#   :document_id, :total_sections, :embedded_sections (number of distinct
#   sections that have at least one embedding), :embedding_rate (percentage,
#   rounded to 2 decimals), :models_used and :latest_embedding (nil when
#   there are no embeddings)
# @raise [ArgumentError] if document is nil/false
# @raise [StandardError] any other failure is logged and re-raised unchanged
def document_stats(document)
  raise ArgumentError, "Document cannot be nil" unless document

  section_ids = Models::SourceSection.where(document_id: document.id).map(:id)

  if section_ids.empty?
    return {
      document_id: document.id,
      total_sections: 0,
      embedded_sections: 0,
      embedding_rate: 0.0,
      models_used: [],
      latest_embedding: nil,
    }
  end

  embeddings = Models::Embedding.where(source_id: section_ids).all

  # Count sections, not embedding rows: a section embedded more than once
  # (e.g. with several models) must not push the rate above 100%.
  embedded_section_count = embeddings.map(&:source_id).uniq.size

  {
    document_id: document.id,
    total_sections: section_ids.size,
    embedded_sections: embedded_section_count,
    embedding_rate: (embedded_section_count.to_f / section_ids.size * 100).round(2),
    models_used: embeddings.map(&:model).uniq.compact,
    latest_embedding: embeddings.max_by(&:created_at)&.created_at,
  }
rescue StandardError => e
  # `document` may be nil here (the guard above raises into this handler),
  # so never dereference it unconditionally.
  document_id = document.respond_to?(:id) ? document.id : "unknown"
  @logger.error "Failed to get stats for document #{document_id}: #{e.message}"
  raise
end
|
|
117
|
+
|
|
118
|
+
# Clean up embeddings whose source section no longer exists.
#
# The set of live section ids is expressed as a sub-select rather than being
# loaded into Ruby and sent back as a literal list, so the work stays in the
# database and does not grow a huge IN list as the sections table grows.
# NOTE(review): assumes the models are Sequel models (as the dataset-style
# calls elsewhere in this class suggest) so that a dataset used as a filter
# value becomes a sub-query — confirm against Models::SourceSection.
#
# @return [Integer] Number of cleaned embeddings
# @raise [StandardError] any failure is logged and re-raised unchanged
def cleanup_orphaned_embeddings
  live_section_ids = Models::SourceSection.select(:id)
  deleted_count = Models::Embedding.exclude(source_id: live_section_ids).delete

  @logger.info "Cleaned up #{deleted_count} orphaned embeddings"

  deleted_count
rescue StandardError => e
  @logger.error "Cleanup failed: #{e.message}"
  raise
end
|
|
132
|
+
|
|
133
|
+
# Remove embeddings that are older than the given age.
#
# Delegates to Models::Embedding.delete_old_embeddings and logs the outcome.
#
# @param days [Integer] Delete embeddings older than X days
# @return [Integer] Number of deleted embeddings
# @raise [StandardError] any failure is logged and re-raised unchanged
def delete_old_embeddings(days = 30)
  Models::Embedding.delete_old_embeddings(days: days).tap do |removed|
    @logger.info "Deleted #{removed} embeddings older than #{days} days"
  end
rescue StandardError => e
  @logger.error "Failed to delete old embeddings: #{e.message}"
  raise
end
|
|
145
|
+
|
|
146
|
+
# Search for embeddings similar to a raw query vector.
#
# Candidates are looked up in stages, each stage running only when the
# previous one produced nothing:
#   1. similarity search at :threshold (default 0.3)
#   2. similarity search at :fallback_threshold (default 0.1), if lower
#   3. nearest neighbours without a threshold (unless :fallback_to_nearest is false)
#   4. in-memory ranking of up to :fallback_pool_size (default 1000) embeddings
#      (unless :fallback_to_in_memory is false)
# The :document_ids / :tag_ids / :model filters are applied afterwards.
#
# @param vector [Array<Float>] Query vector
# @param options [Hash] Search options
# @return [Array<Hash>] Results, each with :embedding, :section, :similarity and :rank
# @raise [ArgumentError] if vector is nil or not an Array
def search_by_vector(vector, options = {})
  raise ArgumentError, "Vector cannot be nil" if vector.nil?
  raise ArgumentError, "Vector must be an array" unless vector.is_a?(Array)

  max_results = options[:limit] || 10
  primary_threshold = options[:threshold] || 0.3
  relaxed_threshold = options[:fallback_threshold] || 0.1

  matches = Models::Embedding.similar_to(vector, limit: max_results, threshold: primary_threshold)

  if matches.empty? && relaxed_threshold < primary_threshold
    logger.info "No results at threshold=#{primary_threshold}, retrying with fallback_threshold=#{relaxed_threshold}"
    matches = Models::Embedding.similar_to(vector, limit: max_results, threshold: relaxed_threshold)
  end

  if matches.empty? && options.fetch(:fallback_to_nearest, true)
    logger.info "No results after threshold fallback, returning nearest neighbors without threshold"
    matches = Models::Embedding.nearest_to(vector, limit: max_results)
  end

  if matches.empty? && options.fetch(:fallback_to_in_memory, true)
    pool_size = options[:fallback_pool_size] || 1000
    logger.info "No results from database search, falling back to in-memory similarity (pool_size=#{pool_size})"
    pool = Models::Embedding.limit(pool_size).all
    matches = pool.sort_by { |candidate| -calculate_similarity(vector, candidate) }.first(max_results)
  end

  # Apply filters
  matches = apply_filters(matches, options)

  matches.each_with_index.map do |embedding, position|
    {
      embedding: embedding,
      section: embedding.section,
      similarity: calculate_similarity(vector, embedding),
      rank: position + 1,
    }
  end
rescue StandardError => e
  # Validation errors pass through silently; everything else is logged first.
  logger.error "Vector search failed: #{e.message}" unless e.is_a?(ArgumentError)
  raise
end
|
|
195
|
+
|
|
196
|
+
# Search similar by vector, boosting results whose section carries one of
# the given tags.
#
# Base results come from #search_by_vector. The requested tags are resolved
# to ids once, the matching section/tag links are fetched in a single query,
# and every result whose section is linked to at least one requested tag has
# its similarity multiplied by :tag_boost_weight. Results are then ordered by
# the boosted score, filtered by :threshold and cut to :limit.
#
# @param vector [Array<Float>] Query vector
# @param tags [Array<Tag, Integer, String>] Tags to filter by (Tag objects,
#   tag ids, or tag names; names that cannot be found and entries of any
#   other type are ignored)
# @param options [Hash] Search options (also forwarded to #search_by_vector)
# @option options [Integer] :limit Maximum results (default: 10)
# @option options [Float] :threshold Similarity threshold (default: 0.3)
# @option options [Array<Integer>] :document_ids Filter by document IDs
# @option options [String] :model Filter by model
# @option options [Float] :tag_boost_weight Boost factor for matching tags (default: 1.1)
# @return [Array<Hash>] Base results with an added :boosted_score
# @raise [ArgumentError] if vector is nil / not an Array, or tags is nil
def search_by_vector_with_tags(vector, tags, options = {})
  raise ArgumentError, "Vector cannot be nil" if vector.nil?
  raise ArgumentError, "Vector must be an array" unless vector.is_a?(Array)
  raise ArgumentError, "Tags cannot be nil" if tags.nil?

  limit = options[:limit] || 10
  threshold = options[:threshold] || 0.3
  tag_boost_weight = options[:tag_boost_weight] || 1.1

  # Get base results
  results = search_by_vector(vector, options)

  # Sections may be missing (e.g. an embedding whose section was deleted);
  # such results simply cannot match a tag.
  section_ids = results.map { |result| result[:section]&.id }.compact.uniq

  # Resolve the requested tags once instead of once per result; skip the
  # lookups entirely when there is nothing to boost.
  tag_ids =
    if section_ids.empty?
      []
    else
      tags.map do |tag|
        case tag
        when ::SmartRAG::Models::Tag then tag.id
        when Integer then tag
        when String then ::SmartRAG::Models::Tag.find(name: tag)&.id
        end
      end.compact.uniq
    end

  # One query for all results rather than one query per result.
  tagged_section_ids =
    if tag_ids.empty?
      []
    else
      Models::SectionTag.where(section_id: section_ids, tag_id: tag_ids).map(:section_id)
    end

  # Boost results that match tags
  results_with_boosts = results.map do |result|
    section_id = result[:section]&.id
    boosted_score = result[:similarity]
    boosted_score *= tag_boost_weight if section_id && tagged_section_ids.include?(section_id)

    result.merge(boosted_score: boosted_score)
  end

  # Sort by boosted score, drop everything under the threshold, then limit
  final_results = results_with_boosts
                  .sort_by { |r| -r[:boosted_score] }
                  .select { |r| r[:boosted_score] >= threshold }
                  .first(limit)

  logger.info "Tag-enhanced search returned #{final_results.size} results (tag boost: #{tag_boost_weight})"

  final_results
rescue ArgumentError
  # Re-raise validation errors without logging
  raise
rescue StandardError => e
  logger.error "Vector search failed: #{e.message}"
  raise
end
|
|
265
|
+
|
|
266
|
+
private
|
|
267
|
+
|
|
268
|
+
# Only the logger reader is private. `config` already has a public reader
# declared at the top of the class; declaring it again in this private
# section would redefine that reader as private and hide it from callers.
attr_reader :logger
|
|
269
|
+
|
|
270
|
+
# Turn a query string into an embedding vector via the embedding service.
# The service's generate_embedding is invoked through `send`, so the call
# is made without a visibility check.
#
# @param query [String] Query text
# @param options [Hash] Options passed through to the service
# @return [Object] whatever the service's generate_embedding returns
def generate_query_embedding(query, options = {})
  service = @embedding_service
  service.send(:generate_embedding, query, options)
end
|
|
273
|
+
|
|
274
|
+
# Convert a mixed list of tag references into Tag model instances.
#
# @param tags [Array<Tag, Integer, String>] Tag objects, tag ids or tag names
# @return [Array<Tag>] One Tag per input element, in the same order
# @raise [ArgumentError] if an id/name cannot be found or an element has an
#   unsupported type
def ensure_tag_objects(tags)
  tags.map do |tag|
    next tag if tag.is_a?(::SmartRAG::Models::Tag)

    lookup =
      if tag.is_a?(Integer)
        { id: tag }
      elsif tag.is_a?(String)
        { name: tag }
      else
        raise ArgumentError, "Invalid tag type: #{tag.class}"
      end

    ::SmartRAG::Models::Tag.find(**lookup) || raise(ArgumentError, "Tag not found: #{tag}")
  end
end
|
|
288
|
+
|
|
289
|
+
# Narrow a result list by the optional :document_ids, :tag_ids and :model
# options. Each element may be either an embedding object or a Hash that
# holds the embedding under :embedding.
#
# @param results [Array] Embeddings or result hashes
# @param options [Hash] Filter options; a filter is skipped when its key is absent/nil
# @return [Array] The elements that passed every active filter
def apply_filters(results, options)
  # Both result shapes are supported, so unwrap the embedding first.
  unwrap = ->(item) { item.is_a?(Hash) ? item[:embedding] : item }

  # Filter by document IDs
  if options[:document_ids]
    allowed_by_document = Models::SourceSection.where(document_id: options[:document_ids]).map(:id)
    results = results.select { |item| allowed_by_document.include?(unwrap.call(item).source_id) }
  end

  # Filter by tag IDs
  if options[:tag_ids]
    allowed_by_tag = Models::SectionTag.where(tag_id: options[:tag_ids]).map(:section_id)
    results = results.select { |item| allowed_by_tag.include?(unwrap.call(item).source_id) }
  end

  # Filter by model
  if options[:model]
    results = results.select { |item| unwrap.call(item).model == options[:model] }
  end

  results
end
|
|
319
|
+
|
|
320
|
+
# Similarity between the query vector and a stored embedding.
#
# Prefers the embedding's own similarity_to when it offers one; otherwise
# falls back to a manual cosine similarity over embedding.vector_array.
#
# @param query_vector [Array<Float>] Query vector
# @param embedding [Object] Stored embedding
# @return [Object] the similarity value from whichever path was taken
def calculate_similarity(query_vector, embedding)
  if embedding.respond_to?(:similarity_to)
    embedding.similarity_to(query_vector)
  else
    cosine_similarity(query_vector, embedding.vector_array)
  end
end
|
|
327
|
+
|
|
328
|
+
# Cosine similarity of two numeric vectors.
#
# @param v1 [Array<Numeric>, nil] First vector
# @param v2 [Array<Numeric>, nil] Second vector
# @return [Float] dot(v1, v2) / (|v1| * |v2|); 0.0 when either vector is
#   nil, empty or all zeros, or when the two vectors differ in length
def cosine_similarity(v1, v2)
  return 0.0 if v1.nil? || v2.nil? || v1.empty? || v2.empty?

  # Vectors of different dimensions are not comparable. Without this guard
  # `zip` pads the shorter side with nil (TypeError) or silently drops the
  # tail of the longer one, yielding a meaningless score.
  return 0.0 unless v1.size == v2.size

  dot_product = v1.zip(v2).sum { |a, b| a * b }
  magnitude1 = Math.sqrt(v1.sum { |x| x * x })
  magnitude2 = Math.sqrt(v2.sum { |x| x * x })
  return 0.0 if magnitude1.zero? || magnitude2.zero?

  dot_product / (magnitude1 * magnitude2)
end
|
|
338
|
+
end
|
|
339
|
+
end
|
|
340
|
+
end
|