smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
require_relative '../parsers/query_parser'
|
|
2
|
+
|
|
3
|
+
module SmartRAG
|
|
4
|
+
module Core
|
|
5
|
+
# FulltextManager maintains the +section_fts+ full-text index table and runs
# PostgreSQL full-text queries (tsvector / tsquery) against it.
#
# Indexing builds weighted tsvector SQL expressions for a section's title and
# content, optionally under several text-search configurations when the text
# mixes CJK scripts. Searching delegates query parsing and language detection
# to the injected query parser.
class FulltextManager
  attr_reader :db, :query_parser, :logger

  # Weights for tsvector fields (A-D, A highest)
  WEIGHTS = {
    title: 'A',
    content: 'B'
  }.freeze

  # Default search configuration
  DEFAULT_CONFIG = {
    max_results: 100,
    default_language: 'en',
    result_limits: 20
  }.freeze

  # Languages that may be added as secondary text-search configurations, in
  # the order they are probed.
  SECONDARY_LANGUAGES = %w[zh ja ko].freeze

  # Minimum share of counted CJK characters a script needs before its
  # language is added as a secondary configuration.
  SECONDARY_LANGUAGE_THRESHOLD = 0.2

  # Initialize FulltextManager
  # @param db [Sequel::Database] Database connection
  # @param options [Hash] Configuration options; +:query_parser+ and +:logger+
  #   override the defaults, and the whole hash is merged over DEFAULT_CONFIG
  def initialize(db, options = {})
    @db = db
    @query_parser = options[:query_parser] || Parsers::QueryParser.new
    @logger = options[:logger] || Logger.new(STDOUT)
    @config = DEFAULT_CONFIG.merge(options)
  end

  # Store or update full-text index for a section
  # @param section_id [Integer] Section ID
  # @param title [String, nil] Section title (blank titles are indexed as an empty vector)
  # @param content [String] Section content
  # @param language [String] Language code
  # @return [Boolean] true on success, false when the database reports a Sequel::Error
  # @raise [ArgumentError] if section_id or content is nil
  # @raise [Errors::FulltextSearchError] on any other failure
  def update_fulltext_index(section_id, title, content, language = 'en')
    raise ArgumentError, 'Section ID cannot be nil' unless section_id
    raise ArgumentError, 'Content cannot be nil' unless content

    # Decide configs for single or mixed-language tsvector
    configs = build_language_configs(language, "#{title}\n\n#{content}")

    title_sql = build_weighted_vector(configs, title, WEIGHTS[:title]) unless title.to_s.strip.empty?
    content_sql = build_weighted_vector(configs, content, WEIGHTS[:content])

    # The helpers return SQL expressions, not values. They must be wrapped in
    # Sequel.lit so the database evaluates setweight(to_tsvector(...)) instead
    # of receiving the expression text as a quoted string.
    ts_title = title_sql ? Sequel.lit(title_sql) : ''
    ts_content = Sequel.lit(content_sql)
    ts_combined = title_sql ? Sequel.lit("#{title_sql} || #{content_sql}") : ts_content

    vectors = {
      language: language,
      fts_title: ts_title,
      fts_content: ts_content,
      fts_combined: ts_combined
    }

    # Update or insert into section_fts table
    db[:section_fts]
      .insert_conflict(target: :section_id, update: vectors.merge(updated_at: Sequel::CURRENT_TIMESTAMP))
      .insert({ section_id: section_id }.merge(vectors))

    @logger.info "Updated full-text index for section #{section_id}"
    true
  rescue ArgumentError
    # Programming errors are surfaced unchanged
    raise
  rescue Sequel::Error => e
    @logger.error "Failed to update full-text index for section #{section_id}: #{e.message}"
    @logger.error e.backtrace.join("\n")
    false
  rescue StandardError => e
    @logger.error "Failed to update full-text index for section #{section_id}: #{e.message}"
    @logger.error e.backtrace.join("\n")
    raise Errors::FulltextSearchError, e.message
  end

  # Batch update full-text indexes
  # @param sections [Array<Hash>] Section data with :id, :title, :content and optional :language
  # @return [Hash] :success and :failed counts plus an :errors list
  def batch_update_fulltext(sections)
    results = { success: 0, failed: 0, errors: [] }

    sections.each do |section|
      success = update_fulltext_index(
        section[:id],
        section[:title],
        section[:content],
        section[:language] || 'en'
      )

      if success
        results[:success] += 1
      else
        results[:failed] += 1
        results[:errors] << { section_id: section[:id], error: 'Update failed' }
      end
    rescue StandardError => e
      # One bad section must not stop the rest of the batch
      results[:failed] += 1
      results[:errors] << { section_id: section[:id], error: e.message }
      @logger.error "Batch update failed for section #{section[:id]}: #{e.message}"
    end

    @logger.info "Batch updated #{results[:success]} full-text indexes, #{results[:failed]} failed"
    results
  end

  # Basic full-text search
  # @param query [String] Search query text
  # @param language [String] Language code (auto-detect if nil)
  # @param limit [Integer] Maximum results
  # @param options [Hash] Additional options; +:filters+ is passed to the filter step
  # @return [Array<Hash>] Search results; empty when the database or the
  #   section_fts table is unavailable
  # @raise [ArgumentError] if the query is nil or blank
  # @raise [Errors::FulltextSearchError] on any other failure
  def search_by_text(query, language = nil, limit = 20, options = {})
    raise ArgumentError, 'Query cannot be nil' if query.nil?
    raise ArgumentError, 'Query cannot be empty' if query.strip.empty?

    # Detect language if not provided
    language ||= detect_language_from_query(query)

    # NOTE(review): tsquery is spliced into SQL verbatim below, so the query
    # parser is presumably responsible for returning a safe SQL expression.
    tsquery = build_tsquery(query, language)

    # Check if db is valid and table exists before querying
    return [] if db.nil? || !db.respond_to?(:table_exists?) || !db.table_exists?(:section_fts)

    dataset = db[:section_fts]
      .select(
        Sequel[:section_fts][:section_id],
        Sequel[:section_fts][:language],
        Sequel[:source_sections][:section_title],
        Sequel[:source_sections][:content],
        Sequel[:source_sections][:document_id],
        Sequel.function(
          :ts_rank,
          Sequel[:section_fts][:fts_combined],
          Sequel.lit(tsquery)
        ).as(:rank_score),
        Sequel.function(
          :ts_headline,
          Sequel[:source_sections][:content],
          Sequel.lit(tsquery),
          'MaxWords=50, MinWords=15, MaxFragments=3'
        ).as(:highlight)
      )
      .join(:source_sections, id: Sequel[:section_fts][:section_id])
      .where(Sequel.lit("section_fts.fts_combined @@ #{tsquery}"))
      .order(Sequel.desc(:rank_score))
      .limit(limit)

    # Apply filters first so the logged SQL is the statement actually executed
    dataset = apply_search_filters(dataset, options[:filters])

    @logger.debug "Fulltext search SQL: #{dataset.sql}"
    @logger.debug "Fulltext search tsquery: #{tsquery}"

    results = dataset.all.map { |row| format_search_result(row, query) }

    @logger.info "Full-text search returned #{results.length} results" if results.any?

    results
  rescue ArgumentError
    # Programming errors are surfaced unchanged
    raise
  rescue StandardError => e
    @logger.error "Full-text search failed: #{e.message}"
    @logger.error e.backtrace.join("\n")
    raise Errors::FulltextSearchError, "Search failed: #{e.message}"
  end

  # Full-text search with filters
  # @param query [String] Search query
  # @param filters [Hash] Filter options
  # @param options [Hash] Search options (+:limit+ defaults to 20); not modified
  # @return [Array] Filtered results
  def search_with_filters(query, filters, options = {})
    # Merge into a copy so the caller's hash is left untouched
    search_options = options.merge(filters: filters)
    search_by_text(query, nil, search_options[:limit] || 20, search_options)
  end

  # Hybrid search combining text and vector search
  #
  # Vector search is not performed here: when a block is given it is called
  # with (vector_query, limit * 2) and must return the ranked vector results.
  # Without a block only the text results are fused.
  #
  # @param text_query [String] Full-text query
  # @param vector_query [Array] Vector embedding
  # @param options [Hash] Search options (+:limit+, default 20; +:rrf_k+, default 60)
  # @return [Array] Combined results
  # @raise [Errors::HybridSearchError] on any failure, including invalid arguments
  def hybrid_search(text_query, vector_query, options = {})
    raise ArgumentError, 'Text query or vector query must be provided' if text_query.nil? && vector_query.nil?

    limit = options[:limit] || 20
    k = options[:rrf_k] || 60 # RRF fusion parameter

    # Over-fetch from each source so fusion has candidates beyond the final limit
    text_results = text_query ? search_by_text(text_query, nil, limit * 2) : []

    # A block that finds nothing may return nil; treat that as "no results"
    vector_results = block_given? ? (yield(vector_query, limit * 2) || []) : []

    # Combine results using RRF (Reciprocal Rank Fusion), then cap
    combine_results_with_rrf(text_results, vector_results, k: k).first(limit)
  rescue StandardError => e
    @logger.error "Hybrid search failed: #{e.message}"
    @logger.error e.backtrace.join("\n")
    raise Errors::HybridSearchError, "Hybrid search failed: #{e.message}"
  end

  # Detect language for given text
  # @param text [String] Text to analyze
  # @return [String] Language code
  def detect_language(text)
    @query_parser.detect_language(text)
  end

  # Build tsquery for given text and language
  # @param text [String] Search text
  # @param language [String] Language code
  # @return [String] tsquery string
  def build_tsquery(text, language)
    @logger.debug "FulltextManager.build_tsquery called with text='#{text}', language='#{language}' (class: #{language.class})"
    @query_parser.build_tsquery(text, language)
  end

  # Parse advanced query
  # @param text [String] Query text
  # @return [Hash] Parsed query structure
  def parse_advanced_query(text)
    @query_parser.parse_advanced_query(text)
  end

  # Get search statistics
  # @return [Hash] :total_indexed, :languages and :last_updated; empty hash on failure
  def stats
    {
      total_indexed: db[:section_fts].count,
      languages: db[:section_fts].select(:language).distinct.map(:language),
      last_updated: db[:section_fts].select { max(:updated_at) }.first.values.first
    }
  rescue StandardError => e
    @logger.error "Failed to get stats: #{e.message}"
    {}
  end

  # Remove full-text index for a section
  # @param section_id [Integer] Section ID
  # @return [Boolean] true if a row was deleted, false if none existed or the delete failed
  # @raise [ArgumentError] if section_id is nil
  def remove_index(section_id)
    raise ArgumentError, 'Section ID cannot be nil' unless section_id

    deleted = db[:section_fts].where(section_id: section_id).delete

    if deleted > 0
      @logger.info "Removed full-text index for section #{section_id}"
      true
    else
      @logger.warn "No full-text index found for section #{section_id}"
      false
    end
  rescue ArgumentError
    # Programming errors are surfaced unchanged
    raise
  rescue StandardError => e
    @logger.error "Failed to remove index for section #{section_id}: #{e.message}"
    false
  end

  # Clean up orphaned indexes
  # @return [Integer] Number of cleaned indexes (0 on failure)
  def cleanup_orphaned_indexes
    # Delete rows from section_fts that don't have corresponding source_sections
    count = db[:section_fts]
      .where(
        Sequel[:section_fts][:section_id] => db[:source_sections].select(:id)
      )
      .invert # This negates the WHERE, giving us NOT IN behavior
      .or(section_id: nil) # Also clean up NULL section_id rows
      .delete

    @logger.info "Cleaned up #{count} orphaned full-text indexes"
    count
  rescue StandardError => e
    @logger.error "Failed to cleanup orphaned indexes: #{e.message}"
    0
  end

  private

  # Detect language from query (delegates to the query parser)
  def detect_language_from_query(query)
    detect_language(query)
  end

  # Apply search filters to dataset
  #
  # Recognised keys: :document_ids, :tag_ids, :date_from, :date_to.
  def apply_search_filters(dataset, filters)
    return dataset unless filters && !filters.empty?

    # Filter by document IDs
    if filters[:document_ids]
      dataset = dataset.where(
        Sequel[:source_sections][:document_id] => filters[:document_ids]
      )
    end

    # Filter by tags. A subquery is used instead of a join so that a section
    # carrying several of the requested tags is still returned only once.
    if filters[:tag_ids]
      tagged_section_ids = db[:section_tags]
        .where(tag_id: filters[:tag_ids])
        .select(:section_id)
      dataset = dataset.where(Sequel[:section_fts][:section_id] => tagged_section_ids)
    end

    # Filter by date range
    dataset = dataset.where(Sequel[:source_sections][:created_at] >= filters[:date_from]) if filters[:date_from]
    dataset = dataset.where(Sequel[:source_sections][:created_at] <= filters[:date_to]) if filters[:date_to]

    dataset
  end

  # Get text search configuration name for a language code.
  # Falls back to 'pg_catalog.simple' when no configuration is registered,
  # when the 'jieba' configuration cannot be used, or when the lookup fails.
  def get_text_search_config(language)
    config = Models::TextSearchConfig.first(language_code: language.to_s)&.config_name
    return 'pg_catalog.simple' unless config
    return config unless config == 'jieba'

    jieba_available? ? 'jieba' : 'pg_catalog.simple'
  rescue StandardError => e
    @logger.warn "Failed to get text search config for #{language}: #{e.message}, using simple"
    'pg_catalog.simple'
  end

  # Probe whether the 'jieba' text search configuration works.
  # The probe runs inside a savepoint (or its own transaction when none is
  # open) so that a failing probe cannot abort a transaction the caller is in.
  def jieba_available?
    db.transaction(savepoint: true) do
      db.fetch("SELECT to_tsvector('jieba', 'test')").first
    end
    true
  rescue StandardError => e
    @logger.warn "pg_jieba extension not available, falling back to simple: #{e.message}"
    false
  end

  # Wrap a tsvector SQL expression in setweight(); blank input yields ''
  def setweight(vector, weight)
    return '' if vector.nil? || vector.to_s.strip.empty?

    "setweight(#{vector}, '#{weight}')"
  end

  # Build a to_tsvector() SQL expression for the given config and text
  def to_tsvector(config, text)
    "to_tsvector('#{config}', #{escape_quote(text)})"
  end

  # Choose the text-search configurations used to index +text+: always the
  # primary language's, plus one per secondary CJK language whose script makes
  # up at least SECONDARY_LANGUAGE_THRESHOLD of the counted CJK characters.
  def build_language_configs(primary_language, text)
    primary = primary_language.to_s
    configs = [get_text_search_config(primary)]

    mix = language_mix_ratios(text)
    return configs if mix[:total].zero?

    SECONDARY_LANGUAGES.each do |lang|
      next if lang == primary || mix[lang.to_sym] < SECONDARY_LANGUAGE_THRESHOLD

      configs << get_text_search_config(lang)
    end

    configs.uniq
  end

  # Build one weighted tsvector expression per config and concatenate them
  # with the SQL || operator
  def build_weighted_vector(configs, text, weight)
    configs.map { |config| setweight(to_tsvector(config, text), weight) }.join(' || ')
  end

  # Share of hiragana/katakana (ja), hangul syllables (ko) and CJK unified
  # ideographs (zh) among the characters of those three groups in +text+.
  # @return [Hash] :zh, :ja, :ko ratios and the :total character count
  def language_mix_ratios(text)
    empty_mix = { zh: 0.0, ja: 0.0, ko: 0.0, total: 0 }
    return empty_mix if text.nil? || text.empty?

    ja_count = text.scan(/[\u3040-\u309f\u30a0-\u30ff]/).length
    ko_count = text.scan(/[\uac00-\ud7af]/).length
    zh_count = text.scan(/[\u4e00-\u9fff]/).length
    total = ja_count + ko_count + zh_count
    return empty_mix if total.zero?

    {
      zh: zh_count.to_f / total,
      ja: ja_count.to_f / total,
      ko: ko_count.to_f / total,
      total: total
    }
  end

  # Quote text as a SQL string literal by doubling single quotes.
  # Non-String input is converted with to_s first.
  def escape_quote(text)
    "'#{text.to_s.gsub("'", "''")}'"
  end

  # Combine results using RRF: each list contributes 1 / (k + rank) per
  # section, with rank starting at 1. Results are ordered by the summed
  # score; ties keep first-seen order so the output is deterministic.
  def combine_results_with_rrf(text_results, vector_results, k:)
    scores = {}

    # Score text results
    text_results.each_with_index do |result, index|
      scores[result[:section_id]] = {
        text_score: 1.0 / (k + index + 1),
        vector_score: 0,
        data: result
      }
    end

    # Score vector results, merging with a text hit for the same section
    vector_results.each_with_index do |result, index|
      reciprocal = 1.0 / (k + index + 1)
      entry = scores[result[:section_id]]

      if entry
        entry[:vector_score] = reciprocal
      else
        scores[result[:section_id]] = {
          text_score: 0,
          vector_score: reciprocal,
          data: result
        }
      end
    end

    # Sort by combined score; the position is a tie-breaker because sort_by
    # alone does not guarantee a stable order
    scores.values
      .each_with_index
      .sort_by { |score, position| [-(score[:text_score] + score[:vector_score]), position] }
      .map { |score, _position| score[:data] }
  end

  # Convert a result row into the hash returned to callers
  def format_search_result(row, query)
    {
      section_id: row[:section_id],
      section_title: row[:section_title],
      content: row[:content],
      document_id: row[:document_id],
      language: row[:language],
      rank_score: row[:rank_score] || 0,
      highlight: row[:highlight] || '',
      query: query
    }
  end
end
|
|
482
|
+
end
|
|
483
|
+
end
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
require 'json'
|
|
2
|
+
require 'tempfile'
|
|
3
|
+
require 'open3'
|
|
4
|
+
|
|
5
|
+
module SmartRAG
|
|
6
|
+
module Core
|
|
7
|
+
# Bridge to Python markitdown library.
#
# Conversion is done by running a short Python script in a subprocess with
# the first interpreter (python3, then python) that can import markitdown.
class MarkitdownBridge
  class ConversionError < StandardError; end
  class UnsupportedFormatError < StandardError; end

  # Interpreter names tried in order when looking for markitdown
  PYTHON_CANDIDATES = %w[python3 python].freeze

  # Python program run for each conversion. The file path is read from
  # sys.argv[1] rather than embedded in the source, so paths containing
  # quotes or backslashes cannot break or alter the script.
  CONVERT_SCRIPT = <<~'PYTHON'
    import sys
    from markitdown import MarkItDown

    try:
        md = MarkItDown()
        result = md.convert(sys.argv[1])
        # Return just the text content
        print(result.text_content)
    except Exception as e:
        print(f"ERROR: {str(e)}", file=sys.stderr)
        sys.exit(1)
  PYTHON

  def initialize
    @python_cmd = detect_python_cmd
    @python_available = !@python_cmd.nil?
  end

  # Convert a file to markdown
  # @param [String] file_path Path to the file to convert
  # @return [String] Converted markdown content
  # @raise [ConversionError] if markitdown is unavailable, the file is
  #   missing, the conversion fails or it produces no output
  def convert(file_path)
    raise ConversionError, "Markitdown is not available" unless @python_available
    raise ConversionError, "File not found: #{file_path}" unless File.exist?(file_path)

    result = call_python_convert(file_path)

    if result.nil? || result.empty?
      raise ConversionError, "Conversion returned empty result"
    end

    result
  rescue StandardError => e
    # Every failure, including the guards above, is reported with the file path
    raise ConversionError, "Failed to convert #{file_path}: #{e.message}"
  end

  # Check if markitdown is available
  # @return [Boolean]
  def available?
    @python_available
  end

  private

  # Find the first interpreter that can import markitdown, or nil if none can.
  # `system` returns nil when the command does not exist, which counts as "no".
  def detect_python_cmd
    PYTHON_CANDIDATES.find do |cmd|
      system(cmd, "-c", "import markitdown", out: File::NULL, err: File::NULL)
    end
  end

  # Re-check that the detected interpreter can still import markitdown
  def check_python_markitdown
    return false if @python_cmd.nil?

    system(@python_cmd, "-c", "import markitdown", out: File::NULL, err: File::NULL)
  end

  # Run the conversion script and return its standard output.
  # stderr is captured separately so interpreter or library warnings do not
  # end up inside the returned markdown.
  # @raise [ConversionError] with the subprocess error output on a non-zero exit
  def call_python_convert(file_path)
    stdout, stderr, status = Open3.capture3(@python_cmd, "-c", CONVERT_SCRIPT, file_path.to_s)
    return stdout if status.success?

    raise ConversionError, (stderr.empty? ? stdout : stderr)
  end
end
|
|
84
|
+
end
|
|
85
|
+
end
|