smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
require 'logger'

require_relative '../models/text_search_config'
|
|
2
|
+
|
|
3
|
+
module SmartRAG
|
|
4
|
+
module Parsers
|
|
5
|
+
# Query parser for full-text search queries
|
|
6
|
+
# Handles language detection and tsquery building
|
|
7
|
+
class QueryParser
|
|
8
|
+
# Language patterns for detection
|
|
9
|
+
LANGUAGE_PATTERNS = {
|
|
10
|
+
'zh' => /[\u4e00-\u9fff]/, # Chinese characters
|
|
11
|
+
'ja' => /[\u3040-\u309f\u30a0-\u30ff]/, # Japanese hiragana/katakana
|
|
12
|
+
'ko' => /[\uac00-\ud7af]/, # Korean hangul
|
|
13
|
+
'en' => /[a-zA-Z]/ # English letters
|
|
14
|
+
}.freeze
|
|
15
|
+
|
|
16
|
+
# Advanced query operators
|
|
17
|
+
ADVANCED_OPERATORS = %w[AND OR NOT ""].freeze
|
|
18
|
+
|
|
19
|
+
def initialize
|
|
20
|
+
@logger = Logger.new(STDOUT)
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
# Detect language of the given text
|
|
24
|
+
# @param text [String] Text to analyze
|
|
25
|
+
# @return [String] Language code (en/zh/ja/ko/default)
|
|
26
|
+
def detect_language(text)
|
|
27
|
+
return 'en' if text.nil? || text.strip.empty?
|
|
28
|
+
|
|
29
|
+
text = text.strip
|
|
30
|
+
char_counts = {}
|
|
31
|
+
|
|
32
|
+
# Count characters for each language
|
|
33
|
+
LANGUAGE_PATTERNS.each do |lang, pattern|
|
|
34
|
+
count = text.scan(pattern).length
|
|
35
|
+
char_counts[lang] = count if count > 0
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# Return the language with most characters
|
|
39
|
+
# If no clear winner, default to 'en'
|
|
40
|
+
if char_counts.empty?
|
|
41
|
+
'en'
|
|
42
|
+
else
|
|
43
|
+
char_counts.max_by { |_, count| count }[0]
|
|
44
|
+
end
|
|
45
|
+
rescue StandardError => e
|
|
46
|
+
@logger.error "Language detection failed: #{e.message}"
|
|
47
|
+
'en' # Default to English on error
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
# Build tsquery from text query
|
|
51
|
+
# @param text [String] Search query text
|
|
52
|
+
# @param language [String] Language code
|
|
53
|
+
# @return [String] tsquery string
|
|
54
|
+
def build_tsquery(text, language = 'en')
|
|
55
|
+
raise ArgumentError, 'Query text cannot be nil' if text.nil?
|
|
56
|
+
raise ArgumentError, 'Query text cannot be empty' if text.strip.empty?
|
|
57
|
+
|
|
58
|
+
@logger.debug "QueryParser.build_tsquery called with text='#{text}', language='#{language}' (class: #{language.class})"
|
|
59
|
+
|
|
60
|
+
# Detect query type and build appropriate tsquery
|
|
61
|
+
if advanced_query?(text)
|
|
62
|
+
build_advanced_tsquery(text, language)
|
|
63
|
+
elsif phrase_query?(text)
|
|
64
|
+
build_phrase_tsquery(text, language)
|
|
65
|
+
else
|
|
66
|
+
build_plain_tsquery(text, language)
|
|
67
|
+
end
|
|
68
|
+
rescue StandardError => e
|
|
69
|
+
@logger.error "Failed to build tsquery: #{e.message}"
|
|
70
|
+
raise Errors::QueryParseError, "Failed to parse query: #{e.message}"
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# Parse advanced query with operators
|
|
74
|
+
# @param text [String] Query text with operators
|
|
75
|
+
# @return [Hash] Parsed query structure
|
|
76
|
+
def parse_advanced_query(text)
|
|
77
|
+
if text.nil? || text.to_s.strip.empty?
|
|
78
|
+
return {
|
|
79
|
+
original: '',
|
|
80
|
+
tokens: [{ type: 'text', value: '' }],
|
|
81
|
+
phrases: [],
|
|
82
|
+
has_boolean: false,
|
|
83
|
+
has_phrases: false
|
|
84
|
+
}
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# Remove extra whitespace
|
|
88
|
+
text = text.strip
|
|
89
|
+
|
|
90
|
+
# Parse quoted phrases
|
|
91
|
+
phrases = extract_quoted_phrases(text)
|
|
92
|
+
|
|
93
|
+
# Parse boolean operators
|
|
94
|
+
tokens = tokenize_query(text)
|
|
95
|
+
|
|
96
|
+
{
|
|
97
|
+
original: text,
|
|
98
|
+
tokens: tokens,
|
|
99
|
+
phrases: phrases,
|
|
100
|
+
has_boolean: tokens.any? { |t| %w[AND OR NOT].include?(t[:type]) },
|
|
101
|
+
has_phrases: phrases.any?
|
|
102
|
+
}
|
|
103
|
+
rescue StandardError => e
|
|
104
|
+
@logger.error "Advanced query parsing failed: #{e.message}"
|
|
105
|
+
{
|
|
106
|
+
original: text || '',
|
|
107
|
+
tokens: [{ type: 'text', value: text || '' }],
|
|
108
|
+
phrases: [],
|
|
109
|
+
has_boolean: false,
|
|
110
|
+
has_phrases: false
|
|
111
|
+
}
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
private
|
|
115
|
+
|
|
116
|
+
# Check if query contains advanced operators
|
|
117
|
+
def advanced_query?(text)
|
|
118
|
+
# Check for boolean operators (case insensitive)
|
|
119
|
+
return true if text =~ /\b(AND|OR|NOT)\b/i
|
|
120
|
+
|
|
121
|
+
# Check for quotes
|
|
122
|
+
return true if text.include?('"')
|
|
123
|
+
|
|
124
|
+
false
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
# Check if query is a phrase query (wrapped in quotes)
|
|
128
|
+
def phrase_query?(text)
|
|
129
|
+
text.strip.start_with?('"') && text.strip.end_with?('"')
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
# Build tsquery for natural language queries
|
|
133
|
+
def build_plain_tsquery(text, language)
|
|
134
|
+
# Get text search configuration for language
|
|
135
|
+
config = get_text_search_config(language)
|
|
136
|
+
|
|
137
|
+
# Keep plain query behavior consistent for both single-term and multi-term text.
|
|
138
|
+
# This also aligns with documentation/tests that expect the original full query text.
|
|
139
|
+
"plainto_tsquery('#{config}', #{escape_quote(text)})"
|
|
140
|
+
end
|
|
141
|
+
|
|
142
|
+
# Build tsquery for phrase queries
|
|
143
|
+
def build_phrase_tsquery(text, language)
|
|
144
|
+
config = get_text_search_config(language)
|
|
145
|
+
|
|
146
|
+
# Remove quotes and use phraseto_tsquery for phrase queries
|
|
147
|
+
phrase = text.strip[1...-1] # Remove surrounding quotes
|
|
148
|
+
"phraseto_tsquery('#{config}', #{escape_quote(phrase)})"
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
# Build tsquery for advanced queries with operators
|
|
152
|
+
def build_advanced_tsquery(text, language)
|
|
153
|
+
config = get_text_search_config(language)
|
|
154
|
+
parsed = parse_advanced_query(text)
|
|
155
|
+
|
|
156
|
+
# Convert parsed query to tsquery format
|
|
157
|
+
if parsed[:has_phrases] || parsed[:has_boolean]
|
|
158
|
+
build_complex_tsquery(parsed, config)
|
|
159
|
+
else
|
|
160
|
+
build_plain_tsquery(text, language)
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
# Build complex tsquery from parsed structure
|
|
165
|
+
def build_complex_tsquery(parsed, config)
|
|
166
|
+
# This is a simplified implementation
|
|
167
|
+
# In production, you might want more sophisticated parsing
|
|
168
|
+
query_parts = []
|
|
169
|
+
|
|
170
|
+
# Process phrases first
|
|
171
|
+
parsed[:phrases].each do |phrase|
|
|
172
|
+
query_parts << "phraseto_tsquery('#{config}', #{escape_quote(phrase)})"
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
# Process tokens - handle NOT as unary operator
|
|
176
|
+
# "a NOT b" should become "a & !b"
|
|
177
|
+
tokens = parsed[:tokens].dup
|
|
178
|
+
until tokens.empty?
|
|
179
|
+
token = tokens.shift
|
|
180
|
+
case token[:type]
|
|
181
|
+
when 'text'
|
|
182
|
+
query_parts << "plainto_tsquery('#{config}', #{escape_quote(token[:value])})"
|
|
183
|
+
when 'AND'
|
|
184
|
+
query_parts << '&&'
|
|
185
|
+
when 'OR'
|
|
186
|
+
query_parts << '||'
|
|
187
|
+
when 'NOT'
|
|
188
|
+
# NOT is unary - add & before it and apply to next token
|
|
189
|
+
query_parts << '&&' if !query_parts.last.to_s.include?('&&') && !query_parts.last.to_s.empty?
|
|
190
|
+
query_parts << '!!'
|
|
191
|
+
# Next token should be text, we need to negate it
|
|
192
|
+
next_token = tokens.shift
|
|
193
|
+
if next_token && next_token[:type] == 'text'
|
|
194
|
+
query_parts << "plainto_tsquery('#{config}', #{escape_quote(next_token[:value])})"
|
|
195
|
+
end
|
|
196
|
+
end
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
# Wrap in parentheses to ensure proper precedence and type handling
|
|
200
|
+
operators = ['&&', '||', '!!']
|
|
201
|
+
normalized = []
|
|
202
|
+
query_parts.each do |part|
|
|
203
|
+
next if part.to_s.strip.empty?
|
|
204
|
+
|
|
205
|
+
if operators.include?(part)
|
|
206
|
+
normalized << part
|
|
207
|
+
next
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
if normalized.any?
|
|
211
|
+
prev = normalized.last
|
|
212
|
+
if !operators.include?(prev)
|
|
213
|
+
normalized << '&&'
|
|
214
|
+
end
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
normalized << part
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
while normalized.any? && operators.include?(normalized.last)
|
|
221
|
+
normalized.pop
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
return "plainto_tsquery('#{config}', #{escape_quote(parsed[:original])})" if normalized.empty?
|
|
225
|
+
|
|
226
|
+
query_expr = normalized.join(' ')
|
|
227
|
+
"(#{query_expr})::tsquery"
|
|
228
|
+
end
|
|
229
|
+
|
|
230
|
+
# Extract quoted phrases from text
|
|
231
|
+
def extract_quoted_phrases(text)
|
|
232
|
+
phrases = []
|
|
233
|
+
# Match quoted strings, handling escaped quotes
|
|
234
|
+
text.scan(/"([^"\\]*(?:\\.[^"\\]*)*)"/).each do |match|
|
|
235
|
+
phrases << match[0].gsub('\\"', '"') # Unescape quotes
|
|
236
|
+
end
|
|
237
|
+
phrases
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
# Tokenize query into operators and text
|
|
241
|
+
def tokenize_query(text)
|
|
242
|
+
tokens = []
|
|
243
|
+
|
|
244
|
+
# Remove quoted phrases first
|
|
245
|
+
without_quotes = text.gsub(/"[^"]*"/, 'PHRASE_PLACEHOLDER')
|
|
246
|
+
|
|
247
|
+
# Split by operators
|
|
248
|
+
parts = without_quotes.split(/\b(AND|OR|NOT)\b/i)
|
|
249
|
+
|
|
250
|
+
parts.each_with_index do |part, _index|
|
|
251
|
+
part = part.strip
|
|
252
|
+
next if part.empty?
|
|
253
|
+
|
|
254
|
+
if part =~ /^(AND|OR|NOT)$/i
|
|
255
|
+
tokens << { type: part.upcase, value: part.upcase }
|
|
256
|
+
elsif part == 'PHRASE_PLACEHOLDER'
|
|
257
|
+
# Skip placeholders (phrases handled separately)
|
|
258
|
+
next
|
|
259
|
+
else
|
|
260
|
+
tokens << { type: 'text', value: part }
|
|
261
|
+
end
|
|
262
|
+
end
|
|
263
|
+
|
|
264
|
+
tokens
|
|
265
|
+
end
|
|
266
|
+
|
|
267
|
+
# Get text search configuration for language
|
|
268
|
+
def get_text_search_config(language)
|
|
269
|
+
# Try to get config from database
|
|
270
|
+
config = Models::TextSearchConfig.first(language_code: language.to_s)
|
|
271
|
+
return config.config_name if config
|
|
272
|
+
|
|
273
|
+
# Fallback to simple config
|
|
274
|
+
'pg_catalog.simple'
|
|
275
|
+
rescue StandardError => e
|
|
276
|
+
@logger.error "Failed to get text search config: #{e.message}"
|
|
277
|
+
'pg_catalog.simple'
|
|
278
|
+
end
|
|
279
|
+
|
|
280
|
+
# Escape single quotes for SQL
|
|
281
|
+
def escape_quote(text)
|
|
282
|
+
"'" + text.gsub("'", "''") + "'"
|
|
283
|
+
end
|
|
284
|
+
end
|
|
285
|
+
|
|
286
|
+
# Custom error for query parsing
|
|
287
|
+
module Errors
|
|
288
|
+
class QueryParseError < StandardError; end
|
|
289
|
+
end
|
|
290
|
+
end
|
|
291
|
+
end
|