smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,126 @@
1
+ require 'yaml'
2
+ require 'erb'
3
+
4
module SmartRAG
  # Loads SmartRAG configuration from YAML files.
  #
  # Every file is rendered through ERB before being parsed, and the parsed
  # result has its Hash keys converted to symbols (recursively through nested
  # Hashes; Hashes stored inside Arrays are left untouched).
  class Config
    class << self
      # Load the main configuration.
      #
      # @param file_path [String, Hash, nil] path to a YAML file, an already
      #   parsed configuration Hash (returned with symbolized keys and without
      #   validation), or nil to use the default smart_rag.yml
      # @return [Hash] configuration with symbol keys
      # @raise [RuntimeError] when the file is missing, is empty / not a
      #   mapping, or lacks the required 'database' section
      def load(file_path = nil)
        # If file_path is a Hash, return it directly (already a config hash)
        return symbolize_keys(file_path) if file_path.is_a?(Hash)

        file_path ||= default_config_path
        raise "Configuration file not found: #{file_path}" unless File.exist?(file_path)

        config = read_yaml(file_path, permitted_classes: [Symbol, Time])

        # An empty file parses to nil; returning that would only defer the
        # failure to the first caller that indexes into the config.
        unless config.is_a?(Hash)
          raise "Invalid configuration in #{file_path}: expected a top-level mapping, got #{config.class}"
        end

        validate_config(config)
        config
      end

      # Load the database settings for one environment.
      #
      # Reads config/database.yml when it exists; otherwise falls back to the
      # :database section of the main configuration.
      #
      # @param env [String, Symbol, nil] environment name; defaults to
      #   ENV['RACK_ENV'] or 'development'
      # @return [Hash] the section for +env+, else the :default section, else
      #   the whole database configuration
      # @raise [RuntimeError] when no usable database configuration is found
      def load_database_config(env = nil)
        env ||= ENV['RACK_ENV'] || 'development'
        env = env.to_sym if env.respond_to?(:to_sym)
        database_config_path = File.join(config_dir, 'database.yml')

        unless File.exist?(database_config_path)
          # Fallback to main config
          config = load
          return config[:database] if config[:database]

          raise "Database configuration file not found: #{database_config_path}"
        end

        config = read_yaml(database_config_path, permitted_classes: [Symbol])
        unless config.is_a?(Hash)
          raise "Invalid database configuration in #{database_config_path}: expected a top-level mapping"
        end

        config[env] || config[:default] || config
      end

      # Load the full-text search settings.
      #
      # Reads config/fulltext_search.yml when it exists; otherwise falls back
      # to the :fulltext_search section of the main configuration.
      #
      # @return [Hash] settings with symbol keys; {} when nothing is configured
      def load_fulltext_config
        fulltext_config_path = File.join(config_dir, 'fulltext_search.yml')

        # Fallback to main config
        return load[:fulltext_search] || {} unless File.exist?(fulltext_config_path)

        config = read_yaml(fulltext_config_path, permitted_classes: [Symbol])
        config.is_a?(Hash) ? config : {}
      end

      private

      def default_config_path
        @default_config_path ||= File.join(config_dir, 'smart_rag.yml')
      end

      def config_dir
        @config_dir ||= File.join(__dir__, '..', '..', 'config')
      end

      # Read +path+, render it through ERB and parse it as YAML.
      #
      # Aliases are enabled so files may share settings through YAML anchors
      # and merge keys; safe_load rejects them unless asked to allow them.
      #
      # @return [Object] parsed document with symbolized Hash keys (nil for an
      #   empty file)
      def read_yaml(path, permitted_classes:)
        rendered = ERB.new(File.read(path)).result
        parsed = YAML.safe_load(rendered, permitted_classes: permitted_classes, aliases: true)
        symbolize_keys(parsed)
      end

      # Check the main configuration and fill in defaults.
      #
      # Mutates +config+: a missing embedding dimension is set to 1024.
      # Warnings go to stderr so they never mix with program output.
      #
      # @raise [RuntimeError] when the 'database' section is missing
      def validate_config(config)
        return unless config.is_a?(Hash)

        # Validate required sections
        raise "Missing required 'database' configuration" unless config[:database]

        # Validate embedding configuration
        embedding = config[:embedding]
        if embedding.is_a?(Hash)
          warn 'Warning: Missing embedding provider configuration' unless embedding[:provider]

          unless embedding[:dimensions]
            warn 'Warning: Missing embedding dimensions, defaulting to 1024'
            embedding[:dimensions] = 1024
          end
        end

        true
      end

      # Return a copy of +hash+ whose keys are symbols, recursing into nested
      # Hash values. Non-Hash input is returned unchanged.
      def symbolize_keys(hash)
        return hash unless hash.is_a?(Hash)

        hash.each_with_object({}) do |(key, value), result|
          key = key.to_sym if key.respond_to?(:to_sym)
          value = symbolize_keys(value) if value.is_a?(Hash)
          result[key] = value
        end
      end
    end
  end
end
@@ -0,0 +1,537 @@
1
+ require 'uri'
2
+ require 'net/http'
3
+ require 'fileutils'
4
+ require 'tempfile'
5
+ require 'digest'
6
+ require_relative '../../smart_rag'
7
+ require_relative '../models'
8
+ require_relative '../chunker/markdown_chunker'
9
+ require_relative '../smart_chunking/pipeline'
10
+
11
+ module SmartRAG
12
+ module Core
13
+ # DocumentProcessor handles document downloading, conversion, chunking, and storage
14
+ class DocumentProcessor
15
+ attr_reader :config, :embedding_manager, :tag_service
16
+
17
# Build a processor from a configuration hash.
#
# Recognised keys: :embedding_manager, :tag_service, :logger, :download_dir,
# :chunk_size and :overlap. The resolved defaults are written back into the
# same hash object that was passed in, so +config+ is mutated.
#
# @param [Hash] config Processor configuration
def initialize(config = {})
  @config = config
  @embedding_manager = config[:embedding_manager]
  @tag_service = config[:tag_service]

  # Resolve every tunable first, falling back to the built-in default.
  resolved = {
    logger: config[:logger] || Logger.new(STDOUT),
    download_dir: config[:download_dir] || Dir.tmpdir,
    chunk_size: config[:chunk_size] || 2000,
    overlap: config[:overlap] || 200
  }

  @logger = resolved[:logger]
  @download_dir = resolved[:download_dir]
  @default_chunk_size = resolved[:chunk_size]
  @default_overlap = resolved[:overlap]

  # Update config with defaults
  resolved.each { |key, value| @config[key] = value }
end
32
+
33
# Process a document from URL or local file.
#
# Downloads the source when it is an http(s) URL, extracts metadata, converts
# the file to markdown, stores the document record, chunks the content and
# saves the sections, then marks the document as completed.
#
# @param [String] source URL or file path
# @param [Hash] options Processing options (passed on to every step)
# @return [::SmartRAG::Models::SourceDocument] processed document
# @raise [ArgumentError] when +source+ is neither an http(s) URL nor an existing file
def process(source, options = {})
  # Per-call state lives in instance variables that the helper methods set.
  # Clear it first: otherwise a failure that happens before a new record is
  # created would mark the document of a *previous* call as failed.
  @document = nil
  @downloaded_file = nil

  @logger.info "Processing document from: #{source}"

  # Step 1: Download if it's a URL
  file_path =
    if source.to_s.match?(%r{\Ahttps?://})
      download_from_url(source, options)
    elsif File.exist?(source)
      source
    else
      raise ArgumentError, "Invalid source: #{source}. Must be a valid URL or file path."
    end

  # Step 2: Extract metadata
  metadata = extract_metadata(file_path, options)

  # Step 3: Convert to markdown
  markdown_content = convert_to_markdown(file_path, options)

  # Add markdown content to metadata for language detection
  metadata[:content] = markdown_content if metadata[:content].nil? || metadata[:content].empty?

  # Step 4: Create or update document record
  document = create_or_update_document(source, metadata, options)

  # Step 5: Chunk content
  chunks = chunk_content(markdown_content, options)

  # Step 6: Save sections
  save_sections(document, chunks, options)

  # Step 7: Update document status
  document.set_download_state(:completed)

  @logger.info "Successfully processed document: #{document.title}"
  document
rescue StandardError => e
  @logger.error "Failed to process document #{source}: #{e.message}"
  @logger.error Array(e.backtrace).join("\n")

  # Mark as failed only if this call got as far as creating the record
  @document&.set_download_state(:failed)

  raise
ensure
  # Clean up temporary downloaded files
  if @downloaded_file && File.exist?(@downloaded_file)
    File.delete(@downloaded_file)
    @logger.debug "Cleaned up temporary file: #{@downloaded_file}"
  end
  @downloaded_file = nil
end
87
+
88
# Create a document and return it together with its saved sections.
#
# Runs the same pipeline as a plain processing call (download, metadata,
# markdown conversion, record creation, chunking, section storage) but also
# hands the stored sections back to the caller.
#
# @param [String] source URL or file path
# @param [Hash] options Processing options (passed on to every step)
# @return [Hash] Document and sections { document: SourceDocument, sections: [] }
# @raise [ArgumentError] when +source+ is neither an http(s) URL nor an existing file
def create_document(source, options = {})
  # Per-call state lives in instance variables that the helper methods set.
  # Clear it first: otherwise a failure that happens before a new record is
  # created would mark the document of a *previous* call as failed.
  @document = nil
  @downloaded_file = nil

  @logger.info "Creating document from: #{source}"

  # Step 1: Download if it's a URL
  file_path =
    if source.to_s.match?(%r{\Ahttps?://})
      download_from_url(source, options)
    elsif File.exist?(source)
      source
    else
      raise ArgumentError, "Invalid source: #{source}. Must be a valid URL or file path."
    end

  # Step 2: Extract metadata
  metadata = extract_metadata(file_path, options)

  # Step 3: Convert to markdown
  markdown_content = convert_to_markdown(file_path, options)

  # Add markdown content to metadata for language detection
  metadata[:content] = markdown_content if metadata[:content].nil? || metadata[:content].empty?

  # Step 4: Create or update document record
  document = create_or_update_document(source, metadata, options)

  # Step 5: Chunk content
  chunks = chunk_content(markdown_content, options)

  # Step 6: Save sections (and optionally generate embeddings/tags)
  sections = save_sections(document, chunks, options)

  # Step 7: Update document status
  document.set_download_state(:completed)

  @logger.info "Successfully created document: #{document.title} with #{sections.length} sections"

  # Return hash with document and sections as expected by the API
  { document: document, sections: sections }
rescue StandardError => e
  @logger.error "Failed to create document #{source}: #{e.message}"
  @logger.error Array(e.backtrace).join("\n")

  # Mark as failed only if this call got as far as creating the record
  @document&.set_download_state(:failed)

  raise
ensure
  # Clean up temporary downloaded files
  if @downloaded_file && File.exist?(@downloaded_file)
    File.delete(@downloaded_file)
    @logger.debug "Cleaned up temporary file: #{@downloaded_file}"
  end
  @downloaded_file = nil
end
147
+
148
# Download document from URL into a temporary file.
#
# The file is created with Tempfile.create, which (unlike Tempfile.new) does
# not attach a finalizer, so the file cannot disappear when the Ruby object is
# garbage-collected while only its path is still in use. The path is recorded
# in @downloaded_file.
#
# Redirects (301/302/303/307/308) are followed, resolving relative Location
# headers against the current URL, up to options[:max_redirects] hops.
#
# @param [String] url Source URL
# @param [Hash] options Download options
# @option options [Integer] :max_redirects Redirect hops still allowed (default 5)
# @return [String] Path to downloaded file
# @raise [RuntimeError] on a non-200 final status, a bad redirect, or too many redirects
def download_from_url(url, options = {})
  uri = URI.parse(url)
  @logger.info "Downloading from URL: #{url}"

  response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
    request = Net::HTTP::Get.new(uri)
    # Set user agent to avoid being blocked
    request['User-Agent'] = 'SmartRAG Document Processor/1.0'
    http.request(request)
  end

  case response.code
  when '200'
    # Create temp file with appropriate extension (only once we have a body,
    # so redirects and errors leave nothing behind)
    ext = File.extname(uri.path.to_s)
    ext = '.html' if ext.empty?
    temp_file = Tempfile.create(['doc_', ext], @download_dir)
    # Record the path immediately so cleanup can find it even if the write fails
    @downloaded_file = temp_file.path
    begin
      temp_file.binmode # documents may be binary (PDF, DOCX)
      temp_file.write(response.body.to_s)
    ensure
      temp_file.close
    end

    @logger.info "Downloaded file to: #{@downloaded_file}"
    @downloaded_file
  when '301', '302', '303', '307', '308'
    # Follow redirect
    redirects_left = options.fetch(:max_redirects, 5).to_i
    raise "Too many redirects while downloading #{url}" if redirects_left <= 0

    location = response['Location'].to_s
    raise "HTTP Error: #{response.code} - redirect without a Location header" if location.empty?

    redirect_uri = URI.join(url, location)
    # URI::HTTPS is a subclass of URI::HTTP, so this admits http and https only
    raise "Unsupported redirect target: #{redirect_uri}" unless redirect_uri.is_a?(URI::HTTP)

    @logger.info "Redirecting to: #{redirect_uri}"
    download_from_url(redirect_uri.to_s, options.merge(max_redirects: redirects_left - 1))
  else
    raise "HTTP Error: #{response.code} - #{response.message}"
  end
rescue StandardError => e
  @logger.error "Download failed: #{e.message}"
  raise
end
191
+
192
# Extract metadata from file.
#
# Collects file-system facts, adds format-specific metadata for PDF, DOC(X)
# and HTML files, then applies the caller's overrides. Extraction is
# best-effort: on failure a warning is logged and whatever was gathered so far
# is returned, so the result is always a Hash.
#
# @param [String] file_path Path to file
# @param [Hash] options Metadata options
# @option options [String] :title Title override
# @option options [Hash] :metadata Entries merged over the extracted ones
# @return [Hash] Extracted metadata (at minimum { file_path: file_path })
def extract_metadata(file_path, options = {})
  # Seed the hash before touching the file system so the rescue below always
  # has a Hash to return, even when the very first File call raises.
  metadata = { file_path: file_path }
  metadata[:file_size] = File.size(file_path)
  metadata[:file_type] = File.extname(file_path).downcase
  metadata[:created_at] = File.ctime(file_path)
  metadata[:modified_at] = File.mtime(file_path)

  # Try to extract more metadata based on file type
  case metadata[:file_type]
  when '.pdf'
    metadata.merge!(extract_pdf_metadata(file_path))
  when '.docx', '.doc'
    metadata.merge!(extract_docx_metadata(file_path))
  when '.html', '.htm'
    metadata.merge!(extract_html_metadata(file_path))
  end

  # Use provided title if available (for backward compatibility)
  metadata[:title] = options[:title] if options[:title]

  # Override with provided metadata
  metadata.merge!(options[:metadata] || {})

  @logger.debug "Extracted metadata: #{metadata.reject { |key, _| key == :file_path }}"
  metadata
rescue StandardError => e
  @logger.warn "Failed to extract metadata: #{e.message}"
  metadata || { file_path: file_path }
end
227
+
228
# Convert document to markdown using markitdown.
#
# Markdown sources (.md / .markdown) are read and returned as-is. Anything
# else goes through MarkitdownBridge, retrying failed attempts.
#
# @param [String] file_path Path to source file
# @param [Hash] options Conversion options
# @option options [Integer] :max_retries Attempt limit (default 3)
# @option options [Numeric] :retry_delay Seconds to sleep between attempts (default 1)
# @return [String] Converted markdown content
# @raise [LoadError] when the markitdown Python package is unavailable
# @raise [RuntimeError] when the file is missing or every attempt fails
def convert_to_markdown(file_path, options = {})
  @logger.info "Converting #{file_path} to markdown"

  if %w[.md .markdown].include?(File.extname(file_path).downcase)
    @logger.info "Detected markdown source; skipping conversion"
    return File.read(file_path)
  end

  # Use markitdown bridge for conversion (loaded lazily, only when needed)
  require_relative 'markitdown_bridge'

  attempt_limit = options[:max_retries] || 3
  pause = options[:retry_delay] || 1

  converter = MarkitdownBridge.new
  unless converter.available?
    raise LoadError, 'markitdown Python package is not installed. Install with: pip install markitdown'
  end

  # Check if file exists before attempting conversion
  raise "File not found: #{file_path}" unless File.exist?(file_path)

  attempts = 0
  begin
    converted = converter.convert(file_path)
    raise 'Conversion failed: empty result' if converted.nil? || converted.strip.empty?

    @logger.info "Successfully converted to markdown (#{converted.length} chars)"
    converted
  rescue StandardError => failure
    attempts += 1
    if attempts < attempt_limit
      @logger.warn "Conversion attempt #{attempts} failed: #{failure.message}. Retrying in #{pause}s..."
      sleep pause
      retry
    end

    @logger.error "All conversion attempts failed: #{failure.message}"
    raise "Conversion failed after #{attempt_limit} attempts: #{failure.message}"
  end
rescue LoadError => e
  # LoadError is not a StandardError, so it needs its own clause
  @logger.error e.message
  raise
rescue StandardError => e
  @logger.error "Conversion failed: #{e.message}"
  raise
end
283
+
284
# Create or update the document record for a source.
#
# Builds the attribute hash from +metadata+ and +options+ (options win for
# :url, :source_uri and :source_type), stores it through
# SourceDocument.create_or_update and keeps the record in @document.
#
# @param [String] source Original source
# @param [Hash] metadata Document metadata
# @param [Hash] options Document options
# @return [::SmartRAG::Models::SourceDocument]
# @raise [RuntimeError] when the record has no id or does not exist after saving
def create_or_update_document(source, metadata, options = {})
  url = options[:url] || metadata[:url] || source
  source_uri = options[:source_uri] || url
  body_text = metadata[:content]

  attributes = {
    url: url,
    title: metadata[:title] || File.basename(source),
    author: metadata[:author],
    description: metadata[:description],
    publication_date: metadata[:publication_date],
    language: metadata[:language] || detect_language(body_text || ''),
    download_state: ::SmartRAG::Models::SourceDocument::DOWNLOAD_STATES[:pending],
    source_type: options[:source_type] || infer_source_type(source_uri, source),
    source_uri: source_uri,
    content_hash: metadata[:content_hash] || Digest::SHA256.hexdigest((body_text || '').to_s),
    metadata: metadata.to_json
  }

  @document = ::SmartRAG::Models::SourceDocument.create_or_update(attributes)

  saved = !@document.id.nil? && @document.exists?
  unless saved
    @logger.error "Document save failed: #{@document.errors.inspect}"
    raise "Failed to save document: #{@document.errors.inspect}"
  end

  @logger.info "Created document record: #{@document.id}"
  @document
rescue StandardError => e
  @logger.error "Exception creating document: #{e.message}"
  raise
end
321
+
322
# Chunk markdown content into sections.
#
# Smart chunking is used unless options[:smart_chunking] is false. When it
# produces nothing for non-blank content, or when it is disabled, the legacy
# markdown chunker is used instead.
#
# @param [String] markdown_content Content to chunk
# @param [Hash] options Chunking options
# @option options [Boolean] :smart_chunking Use the smart pipeline (default true)
# @option options [#chunk] :chunker Legacy chunker to use instead of MarkdownChunker
# @option options [Integer] :chunk_size Legacy chunk size
# @option options [Integer] :overlap Legacy chunk overlap
# @option options [Integer] :chunk_token_num Smart-pipeline token limit (default 400)
# @option options [Symbol] :doc_type Smart-pipeline document type (default :general)
# @return [Array<Hash>] Array of chunk hashes
def chunk_content(markdown_content, options = {})
  fallback_chunker = options[:chunker] || ::SmartRAG::Chunker::MarkdownChunker.new(
    chunk_size: options[:chunk_size] || @default_chunk_size,
    overlap: options[:overlap] || @default_overlap
  )

  chunks =
    if options.fetch(:smart_chunking, true)
      pipeline = ::SmartRAG::SmartChunking::Pipeline.new(token_limit: options[:chunk_token_num] || 400)
      smart_chunks = pipeline.chunk(markdown_content, doc_type: options[:doc_type] || :general, options: options)

      # Fallback for plain text or heading-less content where smart chunking yields no sections.
      if smart_chunks.empty? && !markdown_content.to_s.strip.empty?
        @logger.info 'Smart chunking returned no chunks, falling back to MarkdownChunker'
        fallback_chunker.chunk(markdown_content)
      else
        smart_chunks
      end
    else
      fallback_chunker.chunk(markdown_content)
    end

  @logger.info "Created #{chunks.length} chunks"
  chunks
end
349
+
350
# Save chunk sections to database.
#
# Inserts one section row per chunk (numbered from 1 in chunk order), then
# reloads every section stored for the document and optionally generates
# embeddings and tags for them.
#
# @param [::SmartRAG::Models::SourceDocument] document Document record
# @param [Array<Hash>] chunks Array of chunk hashes (:title, :content)
# @param [Hash] options Save options
# @option options [Boolean] :generate_embeddings Whether to generate embeddings for sections
# @option options [Boolean] :generate_tags Whether to generate tags for sections
# @return [Array] all sections found for the document after the insert
def save_sections(document, chunks, options = {})
  rows = chunks.each_with_index.map do |chunk, position|
    {
      document_id: document.id,
      section_title: chunk[:title],
      section_number: position + 1,
      content: chunk[:content],
      created_at: Time.now,
      updated_at: Time.now
    }
  end

  ::SmartRAG::Models::SourceSection.batch_insert(rows)
  @logger.info "Saved #{rows.length} sections to database"

  # Get the created sections with their IDs
  stored = ::SmartRAG::Models::SourceSection.where(document_id: document.id).all

  # Both post-processing steps need the option switched on and the matching collaborator configured
  wants_embeddings = options[:generate_embeddings] && @embedding_manager
  wants_tags = options[:generate_tags] && @tag_service

  generate_embeddings_for_sections(stored) if wants_embeddings
  generate_tags_for_sections(stored) if wants_tags

  stored
end
382
+
383
# Guess the source type recorded on a document.
#
# @param [#to_s] source_uri Normalized source URI
# @param [#to_s] source Original source as given by the caller
# @return [String] 'url' for http(s) URIs, 'file' when the source starts with
#   '/' or contains a '.', otherwise 'manual'
def infer_source_type(source_uri, source)
  uri_text = source_uri.to_s
  source_text = source.to_s

  if uri_text.start_with?('http://', 'https://')
    'url'
  elsif source_text.start_with?('/') || source_text.include?('.')
    'file'
  else
    'manual'
  end
end
389
+
390
# Generate embeddings for sections.
#
# Asks @embedding_manager for a vector per section and stores every non-empty
# Array result as an Embedding row. A failure on one section is logged and
# does not stop the remaining sections.
#
# @param [Array] sections Sections responding to #id and #content
def generate_embeddings_for_sections(sections)
  @logger.info "Generating embeddings for #{sections.length} sections..."

  sections.each_with_index do |section, index|
    begin
      vector = @embedding_manager.generate_embedding(section.content)
      # Skip nil, non-Array and empty results
      next unless vector.is_a?(Array) && !vector.empty?

      ::SmartRAG::Models::Embedding.create(
        source_id: section.id,
        vector: "[#{vector.join(',')}]"
      )
      @logger.debug "Generated embedding for section #{index + 1}/#{sections.length}"
    rescue StandardError => e
      @logger.warn "Failed to generate embedding for section #{section.id}: #{e.message}"
    end
  end

  @logger.info 'Embeddings generation completed'
end
409
+
410
# Generate tags for sections.
#
# Asks @tag_service for tags per section (passing the detected language) and
# links each returned content tag to the section, creating the tag when
# needed and skipping links that already exist. A failure on one section is
# logged and does not stop the remaining sections.
#
# @param [Array] sections Sections responding to #id, #content and #section_title
def generate_tags_for_sections(sections)
  @logger.info "Generating tags for #{sections.length} sections..."

  sections.each_with_index do |section, index|
    begin
      tags = @tag_service.generate_tags(section.content, section.section_title,
                                        [detect_language(section.content)])
      next unless tags && tags[:content_tags] && !tags[:content_tags].empty?

      # Create or find tags and associate with section
      tags[:content_tags].each do |tag_name|
        tag = ::SmartRAG::Models::Tag.find_or_create(name: tag_name)
        link = { section_id: section.id, tag_id: tag.id }

        # Create association only if it doesn't exist yet
        ::SmartRAG::Models::SectionTag.create(link) if ::SmartRAG::Models::SectionTag.find(link).nil?
      end
      @logger.debug "Generated #{tags[:content_tags].length} tags for section #{index + 1}/#{sections.length}"
    rescue StandardError => e
      @logger.warn "Failed to generate tags for section #{section.id}: #{e.message}"
    end
  end

  @logger.info 'Tags generation completed'
end
445
+
446
# Detect language from text.
#
# Heuristic based on how the CJK characters in the text split between kana,
# hangul and han ideographs; text without any of them is reported as English.
#
# @param [String] text Text to analyze
# @return [String] Language code (ISO 639-1): 'en', 'ja', 'ko' or 'zh'
def detect_language(text)
  return 'en' if text.nil? || text.empty?

  kana = text.scan(/[\u3040-\u309f\u30a0-\u30ff]/).length
  hangul = text.scan(/[\uac00-\ud7af]/).length
  han = text.scan(/[\u4e00-\u9fff]/).length
  total = kana + ko_placeholder(hangul) + han

  return 'en' if total.zero?

  # Shares are relative to CJK characters only, to avoid short mixed-language bias.
  kana_share, hangul_share, han_share = [kana, hangul, han].map { |count| count.to_f / total }

  if kana_share >= 0.3 && kana_share > han_share && kana_share > hangul_share
    'ja'
  elsif hangul_share >= 0.3 && hangul_share > han_share
    'ko'
  else
    'zh'
  end
rescue StandardError => e
  @logger.warn "Language detection failed: #{e.message}, defaulting to 'en'"
  'en'
end

# Identity helper kept private to the counting step above.
def ko_placeholder(count)
  count
end
472
+
473
# Extract metadata from PDF files.
#
# Placeholder: real extraction would need a PDF parsing library, so no
# metadata is produced yet.
#
# @param [String] _file_path Path to PDF (currently unused)
# @return [Hash] PDF metadata (always empty for now)
def extract_pdf_metadata(_file_path)
  Hash.new
end
481
+
482
# Extract metadata from DOCX files.
#
# Placeholder: real extraction would need a DOCX parsing library, so no
# metadata is produced yet.
#
# @param [String] _file_path Path to DOCX (currently unused)
# @return [Hash] DOCX metadata (always empty for now)
def extract_docx_metadata(_file_path)
  Hash.new
end
490
+
491
# Extract metadata from HTML files.
#
# Regex-based, best-effort extraction (no HTML parser is used):
# - :title from <title>, falling back to the first <h1> (inner tags removed)
# - :author, :description and :keywords from <meta name=... content=...>,
#   with the attributes in any order and either quote style
# - :content, the tag-stripped body text, used later for language detection
#
# Invalid UTF-8 bytes are replaced before matching so that one bad byte does
# not discard all metadata. Any other failure is logged and yields {}.
#
# @param [String] file_path Path to HTML
# @return [Hash] HTML metadata
def extract_html_metadata(file_path)
  html = File.read(file_path, encoding: 'utf-8').scrub
  metadata = {}

  # Extract title (tags may carry attributes, e.g. <h1 class="...">)
  if (title = html[%r{<title\b[^>]*>(.*?)</title>}mi, 1])
    metadata[:title] = title.strip
  elsif (heading = html[%r{<h1\b[^>]*>(.*?)</h1>}mi, 1])
    metadata[:title] = heading.gsub(/<[^>]+>/, ' ').gsub(/\s+/, ' ').strip
  end

  # Extract meta tags: parse each tag's attributes separately so attribute
  # order does not matter and a quoted value may contain the other quote
  # character (content="Bob's page").
  html.scan(/<meta\b[^>]*>/i).each do |tag|
    attributes = {}
    tag.scan(/([a-z][\w:-]*)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i) do |key, double_quoted, single_quoted, bare|
      attributes[key.downcase] = double_quoted || single_quoted || bare
    end

    meta_name = attributes['name']
    meta_value = attributes['content']
    next if meta_name.nil? || meta_value.nil? || meta_value.empty?

    case meta_name.downcase
    when 'author'
      metadata[:author] = meta_value
    when 'description'
      metadata[:description] = meta_value
    when 'keywords'
      metadata[:keywords] = meta_value.split(',').map(&:strip)
    end
  end

  # Extract body content for language detection
  # Remove script and style tags, then extract text
  without_code = html.gsub(%r{<script[^>]*>.*?</script>}mi, '')
                     .gsub(%r{<style[^>]*>.*?</style>}mi, '')
  body_markup =
    without_code[%r{<body[^>]*>(.*?)</body>}mi, 1] || # well-formed body
    without_code[/<body[^>]*>(.*)/mi, 1] ||           # body never closed
    html                                              # fallback: any text content
  metadata[:content] = body_markup.gsub(/<[^>]+>/, ' ').strip.gsub(/\s+/, ' ')

  metadata
rescue StandardError => e
  @logger.warn "Failed to extract HTML metadata: #{e.message}"
  {}
end
535
+ end
536
+ end
537
+ end