smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,322 @@
require "json"
require "logger"
require "timeout"

require "smart_prompt"
require_relative "../errors"

module SmartRAG
  module Services
    # Service for generating natural language summaries and responses.
    #
    # Builds a language-specific prompt, sends it to the SmartPrompt
    # +:generate_content+ worker (with per-attempt timeout and retries) and
    # parses the reply into an answer plus a confidence score.
    class SummarizationService
      attr_reader :config, :logger, :smart_prompt_engine

      # Initialize the summarization service
      # @param config [Hash] Configuration options
      # @option config [String] :config_path Path to smart_prompt config (default: config/llm_config.yml)
      # @option config [Integer] :max_retries Maximum attempts for API calls (default: 3)
      # @option config [Integer] :timeout Timeout in seconds for each API call attempt (default: 30)
      # @option config [Logger] :logger Logger instance (default: Logger.new(STDOUT))
      # @option config [Integer] :max_context_length Maximum context length in characters (default: 4000)
      # @raise [StandardError] re-raises whatever fails during setup, after logging it
      def initialize(config = {})
        config ||= {}
        # A logger is assigned first so that failures below can still be logged.
        @logger = Logger.new(STDOUT)
        @config = default_config.merge(config)
        @logger = @config[:logger] || @logger
        @max_context_length = @config[:max_context_length]

        # Load workers
        workers_dir = File.join(File.dirname(__FILE__), '..', '..', '..', 'workers')
        Dir.glob(File.join(workers_dir, '*.rb')).each { |file| require file }

        # Initialize SmartPrompt engine
        config_path = @config[:config_path] || "config/llm_config.yml"
        @smart_prompt_engine = SmartPrompt::Engine.new(config_path)

        @logger.info "SummarizationService initialized"
      rescue StandardError => e
        log_error("Failed to initialize SummarizationService", e)
        raise
      end

      # Summarize search results into a coherent answer
      # @param question [String] The original question
      # @param context [String] Search results context
      # @param options [Hash] Summarization options
      # @option options [Symbol, String] :language Output language (:zh_cn, :zh_tw, :en, :ja; default :en)
      # @option options [Integer] :max_length Maximum response length (default: 1000)
      # @option options [String, Symbol] :tone Response tone (formal, casual, technical; default formal)
      # @option options [Boolean] :include_citations Whether to include citations (default: true)
      # @option options [Integer] :retries Overrides the configured :max_retries for this call
      # @option options [Integer] :timeout Overrides the configured :timeout for this call
      # @return [Hash] +:answer+, +:confidence+ and +:raw_response+
      # @raise [ArgumentError] if question is blank or context is nil
      # @raise [SmartRAG::Errors::SummarizationServiceError] on any other failure
      def summarize_search_results(question, context, options = {})
        raise ArgumentError, "Question cannot be nil or empty" if question.to_s.strip.empty?
        raise ArgumentError, "Context cannot be nil" if context.nil?

        logger.info "Summarizing search results for question: #{question.to_s[0..50]}..."
        logger.info "Context length: #{context.length} chars"

        # Truncate context if too long
        truncated_context = truncate_context(context)
        language = options[:language] || :en
        max_length = options[:max_length] || 1000
        tone = options[:tone] || 'formal'

        prompt = build_summarization_prompt(question, truncated_context, language, max_length, tone, options)
        response = call_llm_for_summary(prompt, options)
        parsed_response = parse_summary_response(response, options)

        # to_s: the model may return a non-string JSON value as the answer.
        answer_size = parsed_response[:answer].to_s.length
        logger.info "Successfully generated summary (#{answer_size} chars), confidence: #{parsed_response[:confidence]}"

        parsed_response
      rescue ArgumentError
        raise
      rescue StandardError => e
        log_error("Failed to summarize search results", e)
        raise ::SmartRAG::Errors::SummarizationServiceError, "Summarization failed: #{e.message}"
      end

      # Generate a standalone summary of a text
      # @param text [String] Text to summarize
      # @param options [Hash] Summarization options
      # @option options [Symbol, String] :language Output language (default: guessed from the text)
      # @option options [Integer] :max_length Maximum summary length (default: 500)
      # @return [String] Summary text
      # @raise [ArgumentError] if text is blank
      # @raise [SmartRAG::Errors::SummarizationServiceError] on any other failure
      def summarize_text(text, options = {})
        raise ArgumentError, "Text cannot be nil or empty" if text.to_s.strip.empty?

        logger.info "Summarizing text (#{text.length} chars)..."

        language = options[:language] || detect_language(text)
        max_length = options[:max_length] || 500

        prompt = build_standalone_summary_prompt(text, language, max_length)
        response = call_llm_for_summary(prompt, options)

        summary = extract_text_from_response(response)
        logger.info "Generated summary (#{summary.to_s.length} chars)"

        summary
      rescue ArgumentError
        raise
      rescue StandardError => e
        log_error("Failed to summarize text", e)
        raise ::SmartRAG::Errors::SummarizationServiceError, "Text summarization failed: #{e.message}"
      end

      private

      # Pick the prompt builder for the requested language.
      # The language may be a Symbol or a String; anything unrecognized falls
      # back to English with a warning.
      def build_summarization_prompt(question, context, language, max_length, tone, options)
        include_citations = options.fetch(:include_citations, true)

        case language.to_s
        when 'zh_cn'
          build_chinese_summarization_prompt(question, context, max_length, tone, include_citations)
        when 'zh_tw'
          build_traditional_chinese_summarization_prompt(question, context, max_length, tone, include_citations)
        when 'en'
          build_english_summarization_prompt(question, context, max_length, tone, include_citations)
        when 'ja'
          build_japanese_summarization_prompt(question, context, max_length, tone, include_citations)
        else
          logger.warn "Unsupported language: #{language}, defaulting to English"
          build_english_summarization_prompt(question, context, max_length, tone, include_citations)
        end
      end

      # Translate a tone name into a localized label.
      # 'formal' and 'casual' (String or Symbol) map to their labels; every
      # other value uses +other+.
      def localized_tone(tone, formal:, casual:, other:)
        case tone.to_s
        when 'formal' then formal
        when 'casual' then casual
        else other
        end
      end

      # Simplified Chinese question-answering prompt.
      def build_chinese_summarization_prompt(question, context, max_length, tone, include_citations)
        tone_label = localized_tone(tone, formal: '正式', casual: '随意', other: '专业')
        citation_rule = include_citations ? '使用[1]、[2]等格式引用来源' : '不需要引用来源'

        # String.new yields a mutable buffer even when string literals are frozen.
        prompt = String.new("基于以下搜索结果,回答问题并提供详细解释。")
        prompt << "\n\n问题:#{question}\n\n"
        prompt << "搜索结果:\n#{context}\n\n"
        prompt << "要求:\n"
        prompt << "1. 提供直接、准确的答案\n"
        prompt << "2. 使用搜索结果中的信息支持你的回答\n"
        prompt << "3. 答案长度不超过#{max_length}个字符\n"
        prompt << "4. 语气:#{tone_label}\n"
        prompt << "5. #{citation_rule}\n\n"
        prompt << "请提供结构化的回答:\n"
        prompt << "- 简要答案(1-2句话)\n"
        prompt << "- 详细解释\n"
        prompt << "- 来源引用\n" if include_citations
        prompt << "\n以JSON格式输出:{\"answer\": \"...\", \"confidence\": 0.0-1.0}"
      end

      # Traditional Chinese question-answering prompt.
      def build_traditional_chinese_summarization_prompt(question, context, max_length, tone, include_citations)
        tone_label = localized_tone(tone, formal: '正式', casual: '隨意', other: '專業')
        citation_rule = include_citations ? '使用[1]、[2]等格式引用來源' : '不需要引用來源'

        prompt = String.new("基於以下搜尋結果,回答問題並提供詳細解釋。")
        prompt << "\n\n問題:#{question}\n\n"
        prompt << "搜尋結果:\n#{context}\n\n"
        prompt << "要求:\n"
        prompt << "1. 提供直接、準確的答案\n"
        prompt << "2. 使用搜尋結果中的資訊支持你的回答\n"
        prompt << "3. 答案長度不超過#{max_length}個字元\n"
        prompt << "4. 語氣:#{tone_label}\n"
        prompt << "5. #{citation_rule}\n\n"
        prompt << "請提供結構化的回答:\n"
        prompt << "- 簡要答案(1-2句話)\n"
        prompt << "- 詳細解釋\n"
        prompt << "- 來源引用\n" if include_citations
        prompt << "\n以JSON格式輸出:{\"answer\": \"...\", \"confidence\": 0.0-1.0}"
      end

      # English question-answering prompt. The tone is interpolated verbatim.
      def build_english_summarization_prompt(question, context, max_length, tone, include_citations)
        citation_rule = include_citations ? 'Cite sources using [1], [2] format' : 'No citations needed'

        prompt = String.new("Based on the following search results, answer the question and provide detailed explanation.")
        prompt << "\n\nQuestion: #{question}\n\n"
        prompt << "Search Results:\n#{context}\n\n"
        prompt << "Requirements:\n"
        prompt << "1. Provide a direct, accurate answer\n"
        prompt << "2. Support your answer with information from the search results\n"
        prompt << "3. Keep answer under #{max_length} characters\n"
        prompt << "4. Tone: #{tone}\n"
        prompt << "5. #{citation_rule}\n\n"
        prompt << "Provide a structured response:\n"
        prompt << "- Brief answer (1-2 sentences)\n"
        prompt << "- Detailed explanation\n"
        prompt << "- Source citations\n" if include_citations
        prompt << "\nOutput in JSON format: {\"answer\": \"...\", \"confidence\": 0.0-1.0}"
      end

      # Japanese question-answering prompt.
      def build_japanese_summarization_prompt(question, context, max_length, tone, include_citations)
        tone_label = localized_tone(tone, formal: 'フォーマル', casual: 'カジュアル', other: '専門的')
        citation_rule = include_citations ? '[1]、[2]などの形式で情報源を引用' : '引用は不要'

        prompt = String.new("以下の検索結果に基づいて、質問に答えて詳細な説明を提供してください。")
        prompt << "\n\n質問:#{question}\n\n"
        prompt << "検索結果:\n#{context}\n\n"
        prompt << "要件:\n"
        prompt << "1. 直接的で正確な答えを提供する\n"
        prompt << "2. 検索結果の情報を使用して回答をサポートする\n"
        prompt << "3. 回答は#{max_length}文字以内にする\n"
        prompt << "4. トーン:#{tone_label}\n"
        prompt << "5. #{citation_rule}\n\n"
        prompt << "構造化された回答を提供:\n"
        prompt << "- 簡潔な答え(1-2文)\n"
        prompt << "- 詳細な説明\n"
        prompt << "- 情報源の引用\n" if include_citations
        prompt << "\nJSON形式で出力:{\"answer\": \"...\", \"confidence\": 0.0-1.0}"
      end

      # Prompt asking for a plain summary of +text+ in the given language.
      # Unrecognized languages get a language-neutral English instruction.
      def build_standalone_summary_prompt(text, language, max_length)
        case language.to_s
        when 'zh_cn'
          "用#{max_length}字以内的简洁中文总结以下内容:\n\n#{text}"
        when 'zh_tw'
          "用#{max_length}字以內的簡潔繁體中文總結以下內容:\n\n#{text}"
        when 'en'
          "Summarize the following content in English within #{max_length} characters:\n\n#{text}"
        when 'ja'
          "次の内容を日本語で#{max_length}文字以内に要約してください:\n\n#{text}"
        else
          "Summarize the following content within #{max_length} characters:\n\n#{text}"
        end
      end

      # Send the prompt to the :generate_content worker, retrying on failure.
      # @raise [RuntimeError] if the worker returns nil/false on the final attempt
      def call_llm_for_summary(prompt, options = {})
        max_retries = options[:retries] || config[:max_retries]
        timeout = options[:timeout] || config[:timeout]

        with_retry(max_retries: max_retries, timeout: timeout) do
          result = smart_prompt_engine.call_worker(:generate_content, { content: prompt })
          raise "No response from LLM" unless result

          result
        end
      rescue StandardError => e
        logger.error "LLM call for summarization failed: #{e.message}"
        raise
      end

      # Find a JSON object in an LLM reply.
      # Markdown code-fence markers are removed, then the whole reply is tried,
      # then the span from the first "{" to the last "}" so that surrounding
      # prose does not defeat parsing.
      # @return [Hash, nil] the parsed object, or nil when none can be parsed
      def parse_json_object(response)
        text = response.to_s
        return nil unless text.include?('{')

        unfenced = text.gsub(/```[a-z]*\n?/, '').strip
        [unfenced, unfenced[/\{.*\}/m]].compact.uniq.each do |candidate|
          begin
            parsed = JSON.parse(candidate)
            return parsed if parsed.is_a?(Hash)
          rescue JSON::ParserError
            next
          end
        end

        nil
      end

      # Turn an LLM reply into +{answer:, confidence:, raw_response:}+.
      # Falls back to the raw reply with confidence 0.8 when no JSON object is
      # found; key order inside the JSON object does not matter.
      def parse_summary_response(response, options = {})
        parsed = parse_json_object(response)

        unless parsed
          logger.warn "Failed to parse JSON response, using raw response" if response.to_s.match?(/\{.*\}/m)
          return { answer: response, confidence: 0.8, raw_response: response }
        end

        reported = parsed["confidence"]
        # Only numbers and strings are converted; anything else uses the default.
        confidence = reported.is_a?(Numeric) || reported.is_a?(String) ? reported.to_f : 0.8

        {
          answer: parsed["answer"] || parsed["response"] || response,
          confidence: confidence,
          raw_response: response
        }
      end

      # Reduce an LLM reply to plain summary text.
      # Uses the "answer"/"response" field when the reply carries a JSON object,
      # otherwise strips code-fence markers and surrounding whitespace.
      def extract_text_from_response(response)
        parsed = parse_json_object(response)
        answer = parsed && (parsed["answer"] || parsed["response"])
        return answer if answer

        # The fence pattern consumes the language tag with the opening fence so
        # that "\n```text" does not leave a stray "text" behind.
        response.to_s.gsub(/```[a-z]*\n?/, '').strip
      end

      # Cut the context down to max_context_length characters, appending a
      # marker when anything was removed.
      def truncate_context(context)
        return context if context.length <= max_context_length

        logger.warn "Context too long (#{context.length} chars), truncating to #{max_context_length}"
        context[0...max_context_length] + "... (truncated)"
      end

      # Guess the language of +text+ from its characters.
      # Kana means Japanese; otherwise any CJK ideograph means Simplified
      # Chinese; otherwise English. :zh_tw is never returned, and Japanese
      # written only in kanji is reported as :zh_cn.
      def detect_language(text)
        # Check for Japanese hiragana/katakana first (more specific than Chinese kanji)
        return :ja if text.match?(/[\u3040-\u309f\u30a0-\u30ff]/)
        return :zh_cn if text.match?(/[\u4e00-\u9fff]/)

        :en
      end

      # Maximum context length taken from the configuration at construction.
      def max_context_length
        @max_context_length
      end

      # Run the block with a timeout, retrying with exponential backoff.
      # @param max_retries [Integer] total number of attempts; values below 1
      #   (or nil) still run the block once
      # @param timeout [Numeric, nil] seconds allowed per attempt
      # @return the block's value from the first successful attempt
      # @raise [StandardError] the error from the last attempt when all fail
      def with_retry(max_retries:, timeout:)
        attempts = [max_retries.to_i, 1].max
        last_exception = nil

        attempts.times do |attempt|
          begin
            return Timeout.timeout(timeout) { yield }
          rescue StandardError => e
            last_exception = e
            logger.warn "Attempt #{attempt + 1} failed: #{e.message}"

            # Exponential backoff: 1s, 2s, 4s, ... skipped after the final attempt
            sleep(2**attempt) if attempt < attempts - 1
          end
        end

        raise last_exception
      end

      # Log an error message followed by the exception's backtrace, if it has one.
      def log_error(message, exception)
        active_logger = logger || @logger || Logger.new(STDOUT)
        active_logger.error "#{message}: #{exception.message}"

        # backtrace is nil for exceptions that were never raised
        backtrace = exception.backtrace
        active_logger.error backtrace.join("\n ") if backtrace
      end

      # Baseline configuration; caller-supplied keys override these.
      def default_config
        {
          config_path: "config/llm_config.yml",
          max_retries: 3,
          timeout: 30,
          max_context_length: 4000,
          logger: Logger.new(STDOUT)
        }
      end
    end
  end
end