smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,291 @@
require 'logger'

require_relative '../models/text_search_config'

module SmartRAG
  module Parsers
    # Query parser for full-text search queries.
    #
    # Detects the dominant script of a query and turns user-entered search
    # text into a SQL expression string that evaluates to a PostgreSQL
    # +tsquery+ (built from +plainto_tsquery+ / +phraseto_tsquery+ calls
    # combined with the tsquery operators +&&+, +||+ and +!!+).
    class QueryParser
      # Language patterns for detection, checked in this order (ties are won
      # by the earlier entry).
      # NOTE(review): the 'zh' range is the shared CJK ideograph block, so
      # Japanese kanji are counted as Chinese characters - confirm intended.
      LANGUAGE_PATTERNS = {
        'zh' => /[\u4e00-\u9fff]/, # Chinese characters
        'ja' => /[\u3040-\u309f\u30a0-\u30ff]/, # Japanese hiragana/katakana
        'ko' => /[\uac00-\ud7af]/, # Korean hangul
        'en' => /[a-zA-Z]/ # English letters
      }.freeze

      # Advanced query operators
      ADVANCED_OPERATORS = %w[AND OR NOT ""].freeze

      # Token types that represent boolean operators.
      BOOLEAN_OPERATORS = %w[AND OR NOT].freeze

      # SQL tsquery operator emitted for each binary boolean keyword.
      BINARY_SQL_OPERATORS = { 'AND' => '&&', 'OR' => '||' }.freeze

      # A double-quoted phrase (backslash-escaped quotes allowed); group 1 is
      # the text between the quotes.
      QUOTED_PHRASE = /"([^"\\]*(?:\\.[^"\\]*)*)"/.freeze

      # Splits a query on quoted phrases and boolean keywords. The single
      # capture group makes String#split keep the delimiters, so the result
      # alternates plain text (even index) and delimiter (odd index).
      TOKEN_DELIMITER = /("[^"\\]*(?:\\.[^"\\]*)*"|\b(?:AND|OR|NOT)\b)/i.freeze

      private_constant :BOOLEAN_OPERATORS, :BINARY_SQL_OPERATORS,
                       :QUOTED_PHRASE, :TOKEN_DELIMITER

      def initialize
        @logger = Logger.new(STDOUT)
      end

      # Detect language of the given text by counting the characters that
      # match each entry of LANGUAGE_PATTERNS and picking the largest count.
      #
      # @param text [String, nil] Text to analyze
      # @return [String] Language code ('zh', 'ja', 'ko' or 'en'); 'en' is
      #   also returned for nil/blank text, when nothing matches, and when
      #   detection raises
      def detect_language(text)
        return 'en' if text.nil? || text.strip.empty?

        sample = text.strip
        counts = LANGUAGE_PATTERNS.each_with_object({}) do |(lang, pattern), found|
          hits = sample.scan(pattern).length
          found[lang] = hits if hits > 0
        end
        return 'en' if counts.empty?

        counts.max_by { |_lang, hits| hits }.first
      rescue StandardError => e
        @logger.error "Language detection failed: #{e.message}"
        'en' # Default to English on error
      end

      # Build a tsquery SQL expression from a text query.
      #
      # @param text [String] Search query text
      # @param language [String] Language code used to look up the text
      #   search configuration
      # @return [String] SQL expression string evaluating to a tsquery
      # @raise [Errors::QueryParseError] on any failure; this includes nil or
      #   blank text, because the ArgumentError raised for those is wrapped by
      #   the rescue clause below
      def build_tsquery(text, language = 'en')
        raise ArgumentError, 'Query text cannot be nil' if text.nil?
        raise ArgumentError, 'Query text cannot be empty' if text.strip.empty?

        @logger.debug "QueryParser.build_tsquery called with text='#{text}', language='#{language}' (class: #{language.class})"

        # Detect query type and build appropriate tsquery
        if advanced_query?(text)
          build_advanced_tsquery(text, language)
        elsif phrase_query?(text)
          # Currently unreachable: any text containing a double quote is
          # already classified as advanced above. Kept as a safety net.
          build_phrase_tsquery(text, language)
        else
          build_plain_tsquery(text, language)
        end
      rescue StandardError => e
        @logger.error "Failed to build tsquery: #{e.message}"
        raise Errors::QueryParseError, "Failed to parse query: #{e.message}"
      end

      # Parse advanced query with operators.
      #
      # @param text [String, nil] Query text with operators
      # @return [Hash] Parsed query structure with keys :original (String),
      #   :tokens (Array of { type:, value: } hashes for operators and plain
      #   text; quoted phrases are excluded), :phrases (Array<String>),
      #   :has_boolean and :has_phrases. Never raises: on failure the whole
      #   input is returned as a single text token.
      def parse_advanced_query(text)
        return unparsed_result('') if text.nil? || text.to_s.strip.empty?

        # Remove extra whitespace
        text = text.strip

        phrases = extract_quoted_phrases(text)
        tokens = tokenize_query(text)

        {
          original: text,
          tokens: tokens,
          phrases: phrases,
          has_boolean: tokens.any? { |t| BOOLEAN_OPERATORS.include?(t[:type]) },
          has_phrases: phrases.any?
        }
      rescue StandardError => e
        @logger.error "Advanced query parsing failed: #{e.message}"
        unparsed_result(text || '')
      end

      private

      # Result structure used when a query cannot (or need not) be parsed:
      # the raw input as one text token, no phrases, no operators.
      def unparsed_result(raw)
        {
          original: raw,
          tokens: [{ type: 'text', value: raw }],
          phrases: [],
          has_boolean: false,
          has_phrases: false
        }
      end

      # Check if query contains advanced operators: a boolean keyword
      # (case insensitive) or a double quote.
      def advanced_query?(text)
        return true if text =~ /\b(AND|OR|NOT)\b/i
        return true if text.include?('"')

        false
      end

      # Check if query is a phrase query (wrapped in quotes)
      def phrase_query?(text)
        trimmed = text.strip
        trimmed.start_with?('"') && trimmed.end_with?('"')
      end

      # Build tsquery for natural language queries
      def build_plain_tsquery(text, language)
        config = get_text_search_config(language)

        # Keep plain query behavior consistent for both single-term and multi-term text.
        # This also aligns with documentation/tests that expect the original full query text.
        "plainto_tsquery('#{config}', #{escape_quote(text)})"
      end

      # Build tsquery for phrase queries
      def build_phrase_tsquery(text, language)
        config = get_text_search_config(language)

        # Remove quotes and use phraseto_tsquery for phrase queries
        phrase = text.strip[1...-1] # Remove surrounding quotes
        "phraseto_tsquery('#{config}', #{escape_quote(phrase)})"
      end

      # Build tsquery for advanced queries with operators. Falls back to the
      # plain form when parsing found neither phrases nor boolean operators
      # (e.g. a stray unbalanced quote).
      def build_advanced_tsquery(text, language)
        config = get_text_search_config(language)
        parsed = parse_advanced_query(text)

        if parsed[:has_phrases] || parsed[:has_boolean]
          build_complex_tsquery(parsed, config)
        else
          build_plain_tsquery(text, language)
        end
      end

      # Build complex tsquery from parsed structure.
      #
      # Operands (plain text and quoted phrases) are emitted in the order they
      # appear in the original query so every operator applies to the operand
      # the user wrote it next to. Rules:
      # * AND / OR join the previous operand with the next one; adjacent
      #   operands without an operator are joined with AND.
      # * NOT negates the next operand (two NOTs cancel out).
      # * Operators with no operand to apply to (leading AND/OR, trailing
      #   operators) are dropped; of several consecutive AND/OR the last wins.
      #
      # NOTE(review): && and || share a single precedence level in
      # PostgreSQL, so mixed chains evaluate left to right - confirm whether
      # AND should bind tighter than OR.
      def build_complex_tsquery(parsed, config)
        expression = []
        pending_operator = nil
        negate = false

        ordered_tokens(parsed[:original].to_s).each do |token|
          case token[:type]
          when 'AND', 'OR'
            # A binary operator needs a left-hand operand.
            pending_operator = BINARY_SQL_OPERATORS[token[:type]] unless expression.empty?
          when 'NOT'
            negate = !negate
          else
            next if token[:value].strip.empty? # e.g. an empty "" phrase

            expression << (pending_operator || '&&') unless expression.empty?
            expression << '!!' if negate
            expression << operand_sql(token, config)
            pending_operator = nil
            negate = false
          end
        end

        # Nothing usable (only operators / empty phrases): search the raw text.
        return "plainto_tsquery('#{config}', #{escape_quote(parsed[:original])})" if expression.empty?

        "(#{expression.join(' ')})::tsquery"
      end

      # SQL call for one operand token: phraseto_tsquery for quoted phrases,
      # plainto_tsquery for plain text.
      def operand_sql(token, config)
        function = token[:type] == 'phrase' ? 'phraseto_tsquery' : 'plainto_tsquery'
        "#{function}('#{config}', #{escape_quote(token[:value])})"
      end

      # Extract quoted phrases from text, unescaping \" inside them.
      def extract_quoted_phrases(text)
        text.scan(QUOTED_PHRASE).map { |(inner)| inner.gsub('\\"', '"') }
      end

      # Tokenize query into operators and text. Quoted phrases are left out
      # (they are reported separately by extract_quoted_phrases).
      def tokenize_query(text)
        ordered_tokens(text).reject { |token| token[:type] == 'phrase' }
      end

      # Split a query into tokens in source order. Token types are 'text',
      # 'phrase' (value without the surrounding quotes), 'AND', 'OR', 'NOT'.
      # Keywords inside a quoted phrase stay part of the phrase.
      def ordered_tokens(text)
        text.split(TOKEN_DELIMITER).each_with_index.each_with_object([]) do |(piece, index), tokens|
          piece = piece.strip
          next if piece.empty?

          tokens <<
            if index.even?
              { type: 'text', value: piece }
            elsif piece.start_with?('"')
              { type: 'phrase', value: piece[1...-1].gsub('\\"', '"') }
            else
              { type: piece.upcase, value: piece.upcase }
            end
        end
      end

      # Get text search configuration name for a language, falling back to
      # 'pg_catalog.simple' when no row is found or the lookup raises.
      def get_text_search_config(language)
        config = Models::TextSearchConfig.first(language_code: language.to_s)
        return config.config_name if config

        'pg_catalog.simple'
      rescue StandardError => e
        @logger.error "Failed to get text search config: #{e.message}"
        'pg_catalog.simple'
      end

      # Wrap text in single quotes for SQL, doubling embedded single quotes.
      # Non-string input is converted with to_s instead of raising.
      def escape_quote(text)
        "'" + text.to_s.gsub("'", "''") + "'"
      end
    end

    # Custom error for query parsing
    module Errors
      class QueryParseError < StandardError; end
    end
  end
end