smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
require 'yaml'
|
|
2
|
+
require 'erb'
|
|
3
|
+
|
|
4
|
+
module SmartRAG
  # Loads and validates SmartRAG YAML configuration files.
  #
  # All loaders run the file through ERB first (so YAML values may embed
  # `<%= ENV[...] %>` expressions) and parse the result with YAML.safe_load.
  # String keys are recursively converted to symbols for consistent access.
  class Config
    class << self
      # Load the main configuration.
      #
      # @param [String, Hash, nil] file_path Path to a YAML file, an
      #   already-built config Hash (returned symbolized, without file I/O
      #   or validation), or nil to use the default config path.
      # @return [Hash] symbolized configuration hash
      # @raise [RuntimeError] when the file is missing or the required
      #   'database' section is absent
      def load(file_path = nil)
        # If file_path is a Hash, treat it as an in-memory config and
        # return it directly (already a config hash).
        return symbolize_keys(file_path) if file_path.is_a?(Hash)

        file_path ||= default_config_path

        raise "Configuration file not found: #{file_path}" unless File.exist?(file_path)

        config = read_yaml(file_path, permitted_classes: [Symbol, Time])

        # Convert string keys to symbols for consistency
        config = symbolize_keys(config) if config.is_a?(Hash)

        validate_config(config)
        config
      end

      # Load database configuration for the given environment.
      #
      # @param [String, Symbol, nil] env Environment name; defaults to
      #   ENV['RACK_ENV'] or 'development'.
      # @return [Hash] the env-specific section, the :default section, or
      #   the whole parsed hash when neither key exists
      # @raise [RuntimeError] when neither database.yml nor a :database
      #   section of the main config exists
      def load_database_config(env = nil)
        env ||= ENV['RACK_ENV'] || 'development'
        env = env.to_sym if env.respond_to?(:to_sym)
        database_config_path = File.join(config_dir, 'database.yml')

        unless File.exist?(database_config_path)
          # Fallback to the :database section of the main config
          config = load
          return config[:database] if config[:database]

          raise "Database configuration file not found: #{database_config_path}"
        end

        config = read_yaml(database_config_path, permitted_classes: [Symbol])

        # Convert string keys to symbols for consistency
        config = symbolize_keys(config) if config.is_a?(Hash)

        config[env] || config[:default] || config
      end

      # Load full-text search configuration.
      #
      # @return [Hash] parsed configuration, or {} when nothing is configured
      def load_fulltext_config
        fulltext_config_path = File.join(config_dir, 'fulltext_search.yml')

        unless File.exist?(fulltext_config_path)
          # Fallback to the :fulltext_search section of the main config
          config = load
          return config[:fulltext_search] || {}
        end

        config = read_yaml(fulltext_config_path, permitted_classes: [Symbol]) || {}

        # Fix: previously returned nil when the YAML root was not a Hash;
        # always return a Hash so callers can index into the result safely.
        config.is_a?(Hash) ? symbolize_keys(config) : {}
      end

      private

      # ERB-render and safely parse a YAML file (shared by all loaders).
      def read_yaml(path, permitted_classes:)
        YAML.safe_load(
          ERB.new(File.read(path)).result,
          permitted_classes: permitted_classes
        )
      end

      def default_config_path
        @default_config_path ||= File.join(config_dir, 'smart_rag.yml')
      end

      def config_dir
        @config_dir ||= File.join(__dir__, '..', '..', 'config')
      end

      # Validate structural requirements of the main config.
      # Non-fatal issues are reported on stderr (warn) rather than stdout.
      #
      # @raise [RuntimeError] when the 'database' section is missing
      def validate_config(config)
        return unless config.is_a?(Hash)

        # Validate required sections
        raise "Missing required 'database' configuration" unless config[:database]

        # Validate embedding configuration
        if config[:embedding]
          warn 'Warning: Missing embedding provider configuration' unless config[:embedding][:provider]

          unless config[:embedding][:dimensions]
            warn 'Warning: Missing embedding dimensions, defaulting to 1024'
            config[:embedding][:dimensions] = 1024
          end
        end

        true
      end

      # Recursively convert Hash keys to symbols; non-Hash values pass
      # through unchanged.
      def symbolize_keys(hash)
        return hash unless hash.is_a?(Hash)

        hash.each_with_object({}) do |(key, value), result|
          key = key.to_sym if key.respond_to?(:to_sym)
          value = symbolize_keys(value) if value.is_a?(Hash)
          result[key] = value
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,537 @@
|
|
|
1
|
+
require 'uri'
|
|
2
|
+
require 'net/http'
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require 'tempfile'
|
|
5
|
+
require 'digest'
|
|
6
|
+
require_relative '../../smart_rag'
|
|
7
|
+
require_relative '../models'
|
|
8
|
+
require_relative '../chunker/markdown_chunker'
|
|
9
|
+
require_relative '../smart_chunking/pipeline'
|
|
10
|
+
|
|
11
|
+
module SmartRAG
  module Core
    # DocumentProcessor handles document downloading, conversion, chunking, and storage
    #
    # Pipeline (see #process / #create_document):
    #   download (if URL) -> extract metadata -> convert to markdown ->
    #   persist SourceDocument -> chunk -> persist SourceSections ->
    #   optionally generate embeddings and tags.
    #
    # NOTE(review): Logger, Dir.tmpdir and #to_json are used but not
    # required here — presumably loaded by the require_relative'd files;
    # verify against lib/smart_rag.rb.
    class DocumentProcessor
      attr_reader :config, :embedding_manager, :tag_service

      # @param [Hash] config Options: :embedding_manager, :tag_service,
      #   :logger (default: Logger to STDOUT), :download_dir (default:
      #   Dir.tmpdir), :chunk_size (default 2000), :overlap (default 200)
      def initialize(config = {})
        @config = config
        @embedding_manager = config[:embedding_manager]
        @tag_service = config[:tag_service]
        @logger = config[:logger] || Logger.new(STDOUT)
        @download_dir = config[:download_dir] || Dir.tmpdir
        @default_chunk_size = config[:chunk_size] || 2000
        @default_overlap = config[:overlap] || 200

        # Update config with defaults
        # (mutates the caller-supplied hash so it reflects resolved values)
        @config[:logger] = @logger
        @config[:download_dir] = @download_dir
        @config[:chunk_size] = @default_chunk_size
        @config[:overlap] = @default_overlap
      end

      # Process a document from URL or local file
      #
      # NOTE(review): this duplicates #create_document almost line-for-line
      # (only the return value differs) — candidate for consolidation.
      #
      # @param [String] source URL or file path
      # @param [Hash] options Processing options
      # @return [::SmartRAG::Models::SourceDocument] processed document
      # @raise [ArgumentError] when source is neither a URL nor an existing file
      def process(source, options = {})
        @logger.info "Processing document from: #{source}"

        # Step 1: Download if it's a URL
        file_path = if source =~ %r{\Ahttps?://}
                      download_from_url(source, options)
                    elsif File.exist?(source)
                      source
                    else
                      raise ArgumentError, "Invalid source: #{source}. Must be a valid URL or file path."
                    end

        # Step 2: Extract metadata
        metadata = extract_metadata(file_path, options)

        # Step 3: Convert to markdown
        markdown_content = convert_to_markdown(file_path, options)

        # Add markdown content to metadata for language detection
        metadata[:content] = markdown_content if metadata[:content].nil? || metadata[:content].empty?

        # Step 4: Create or update document record
        # (also sets @document, which the rescue clause below relies on)
        document = create_or_update_document(source, metadata, options)

        # Step 5: Chunk content
        chunks = chunk_content(markdown_content, options)

        # Step 6: Save sections
        save_sections(document, chunks, options)

        # Step 7: Update document status
        document.set_download_state(:completed)

        @logger.info "Successfully processed document: #{document.title}"
        document
      rescue StandardError => e
        @logger.error "Failed to process document #{source}: #{e.message}"
        @logger.error e.backtrace.join("\n")

        # Mark as failed if document was created
        @document.set_download_state(:failed) if defined?(@document) && @document

        raise e
      ensure
        # Clean up temporary downloaded files
        # (@downloaded_file is set by #download_from_url)
        if defined?(@downloaded_file) && @downloaded_file && File.exist?(@downloaded_file)
          File.delete(@downloaded_file)
          @logger.debug "Cleaned up temporary file: #{@downloaded_file}"
        end
      end

      # Create a document and return document with sections
      # @param [String] source URL or file path
      # @param [Hash] options Processing options
      # @return [Hash] Document and sections { document: SourceDocument, sections: [] }
      # @raise [ArgumentError] when source is neither a URL nor an existing file
      def create_document(source, options = {})
        @logger.info "Creating document from: #{source}"

        # Step 1: Download if it's a URL
        file_path = if source =~ %r{\Ahttps?://}
                      download_from_url(source, options)
                    elsif File.exist?(source)
                      source
                    else
                      raise ArgumentError, "Invalid source: #{source}. Must be a valid URL or file path."
                    end

        # Step 2: Extract metadata
        metadata = extract_metadata(file_path, options)

        # Step 3: Convert to markdown
        markdown_content = convert_to_markdown(file_path, options)

        # Add markdown content to metadata for language detection
        metadata[:content] = markdown_content if metadata[:content].nil? || metadata[:content].empty?

        # Step 4: Create or update document record
        document = create_or_update_document(source, metadata, options)

        # Step 5: Chunk content
        chunks = chunk_content(markdown_content, options)

        # Step 6: Save sections (and optionally generate embeddings/tags)
        sections = save_sections(document, chunks, options)

        # Step 7: Update document status
        document.set_download_state(:completed)

        @logger.info "Successfully created document: #{document.title} with #{sections.length} sections"

        # Return hash with document and sections as expected by the API
        {
          document: document,
          sections: sections
        }
      rescue StandardError => e
        @logger.error "Failed to create document #{source}: #{e.message}"
        @logger.error e.backtrace.join("\n")

        # Mark as failed if document was created
        @document.set_download_state(:failed) if defined?(@document) && @document

        raise e
      ensure
        # Clean up temporary downloaded files
        if defined?(@downloaded_file) && @downloaded_file && File.exist?(@downloaded_file)
          File.delete(@downloaded_file)
          @logger.debug "Cleaned up temporary file: #{@downloaded_file}"
        end
      end

      # Download document from URL
      #
      # NOTE(review): redirects recurse with no depth limit — a redirect
      # loop would raise SystemStackError; consider a bounded counter.
      # NOTE(review): each redirect hop creates a new tempfile but only the
      # final path is remembered in @downloaded_file, so intermediate
      # tempfiles are left for the OS to reap.
      #
      # @param [String] url Source URL
      # @param [Hash] options Download options
      # @return [String] Path to downloaded file
      def download_from_url(url, options = {})
        uri = URI.parse(url)
        @logger.info "Downloading from URL: #{url}"

        # Create temp file with appropriate extension
        ext = File.extname(uri.path)
        ext = '.html' if ext.empty?
        temp_file = Tempfile.new(['doc_', ext], @download_dir)
        temp_path = temp_file.path
        temp_file.close

        # Download the file
        Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == 'https') do |http|
          request = Net::HTTP::Get.new(uri)
          # Set user agent to avoid being blocked
          request['User-Agent'] = 'SmartRAG Document Processor/1.0'

          response = http.request(request)

          case response.code
          when '200'
            File.write(temp_path, response.body)
          when '301', '302', '303', '307', '308'
            # Follow redirect
            redirect_url = response['Location']
            @logger.info "Redirecting to: #{redirect_url}"
            return download_from_url(redirect_url, options)
          else
            raise "HTTP Error: #{response.code} - #{response.message}"
          end
        end

        # Remember the path so process/create_document can clean it up.
        @downloaded_file = temp_path
        @logger.info "Downloaded file to: #{temp_path}"
        temp_path
      rescue StandardError => e
        @logger.error "Download failed: #{e.message}"
        raise e
      end

      # Extract metadata from file
      #
      # NOTE(review): if File.size/ctime/mtime raise (e.g. file vanished),
      # the rescue returns nil because metadata was never assigned — callers
      # would then fail on nil; confirm whether that path is reachable.
      #
      # @param [String] file_path Path to file
      # @param [Hash] options Metadata options (:title and :metadata override
      #   extracted values)
      # @return [Hash] Extracted metadata
      def extract_metadata(file_path, options = {})
        metadata = {
          file_path: file_path,
          file_size: File.size(file_path),
          file_type: File.extname(file_path).downcase,
          created_at: File.ctime(file_path),
          modified_at: File.mtime(file_path)
        }

        # Try to extract more metadata based on file type
        case metadata[:file_type]
        when '.pdf'
          metadata.merge!(extract_pdf_metadata(file_path))
        when '.docx', '.doc'
          metadata.merge!(extract_docx_metadata(file_path))
        when '.html', '.htm'
          metadata.merge!(extract_html_metadata(file_path))
        end

        # Use provided title if available (for backward compatibility)
        metadata[:title] = options[:title] if options[:title]

        # Override with provided metadata
        metadata.merge!(options[:metadata] || {})

        @logger.debug "Extracted metadata: #{metadata.except(:file_path)}"
        metadata
      rescue StandardError => e
        @logger.warn "Failed to extract metadata: #{e.message}"
        metadata
      end

      # Convert document to markdown using markitdown
      #
      # Markdown sources are returned as-is; everything else goes through
      # the Python markitdown bridge with bounded retries.
      #
      # @param [String] file_path Path to source file
      # @param [Hash] options Conversion options (:max_retries default 3,
      #   :retry_delay seconds, default 1)
      # @return [String] Converted markdown content
      # @raise [LoadError] when the markitdown Python package is missing
      # @raise [RuntimeError] when all conversion attempts fail
      def convert_to_markdown(file_path, options = {})
        @logger.info "Converting #{file_path} to markdown"

        ext = File.extname(file_path).downcase
        if ['.md', '.markdown'].include?(ext)
          @logger.info "Detected markdown source; skipping conversion"
          return File.read(file_path)
        end

        # Use markitdown bridge for conversion (lazy-loaded on purpose so
        # markdown-only workflows never touch the Python bridge)
        require_relative 'markitdown_bridge'

        max_retries = options[:max_retries] || 3
        retry_delay = options[:retry_delay] || 1

        bridge = MarkitdownBridge.new
        unless bridge.available?
          raise LoadError, 'markitdown Python package is not installed. Install with: pip install markitdown'
        end

        # Check if file exists before attempting conversion
        raise "File not found: #{file_path}" unless File.exist?(file_path)

        retries = 0
        begin
          markdown = bridge.convert(file_path)

          raise 'Conversion failed: empty result' if markdown.nil? || markdown.strip.empty?

          @logger.info "Successfully converted to markdown (#{markdown.length} chars)"
          markdown
        rescue StandardError => e
          retries += 1
          if retries < max_retries
            @logger.warn "Conversion attempt #{retries} failed: #{e.message}. Retrying in #{retry_delay}s..."
            sleep retry_delay
            retry
          end

          @logger.error "All conversion attempts failed: #{e.message}"
          raise "Conversion failed after #{max_retries} attempts: #{e.message}"
        end
      rescue LoadError => e
        @logger.error e.message
        raise e
      rescue StandardError => e
        @logger.error "Conversion failed: #{e.message}"
        raise e
      end

      # Create or update document record
      #
      # Side effect: stores the record in @document so the error paths of
      # #process / #create_document can mark it failed.
      #
      # @param [String] source Original source
      # @param [Hash] metadata Document metadata
      # @param [Hash] options Document options (:url, :source_uri,
      #   :source_type override the inferred values)
      # @return [::SmartRAG::Models::SourceDocument]
      # @raise [RuntimeError] when the record cannot be persisted
      def create_or_update_document(source, metadata, options = {})
        original_url = options[:url] || metadata[:url] || source
        normalized_source_uri = options[:source_uri] || original_url
        source_type = options[:source_type] || infer_source_type(normalized_source_uri, source)
        # Content hash enables dedup/update of previously ingested documents.
        content_hash = metadata[:content_hash] || Digest::SHA256.hexdigest((metadata[:content] || '').to_s)
        doc_attributes = {
          url: original_url,
          title: metadata[:title] || File.basename(source),
          author: metadata[:author],
          description: metadata[:description],
          publication_date: metadata[:publication_date],
          language: metadata[:language] || detect_language(metadata[:content] || ''),
          download_state: ::SmartRAG::Models::SourceDocument::DOWNLOAD_STATES[:pending],
          source_type: source_type,
          source_uri: normalized_source_uri,
          content_hash: content_hash,
          metadata: metadata.to_json
        }

        @document = ::SmartRAG::Models::SourceDocument.create_or_update(doc_attributes)

        if @document.id.nil? || !@document.exists?
          @logger.error "Document save failed: #{@document.errors.inspect}"
          raise "Failed to save document: #{@document.errors.inspect}"
        end

        @logger.info "Created document record: #{@document.id}"
        @document
      rescue StandardError => e
        @logger.error "Exception creating document: #{e.message}"
        raise e
      end

      # Chunk markdown content into sections
      #
      # Uses the smart-chunking pipeline by default (token-budgeted,
      # structure-aware) and falls back to the legacy MarkdownChunker when
      # smart chunking is disabled or yields nothing.
      #
      # @param [String] markdown_content Content to chunk
      # @param [Hash] options Chunking options (:smart_chunking default true,
      #   :chunk_token_num default 400, :doc_type default :general,
      #   :chunker / :chunk_size / :overlap for the legacy path)
      # @return [Array<Hash>] Array of chunk hashes
      def chunk_content(markdown_content, options = {})
        use_smart = options.fetch(:smart_chunking, true)
        legacy_chunker = options[:chunker] || ::SmartRAG::Chunker::MarkdownChunker.new(
          chunk_size: options[:chunk_size] || @default_chunk_size,
          overlap: options[:overlap] || @default_overlap
        )

        if use_smart
          token_limit = options[:chunk_token_num] || 400
          doc_type = options[:doc_type] || :general
          pipeline = ::SmartRAG::SmartChunking::Pipeline.new(token_limit: token_limit)
          chunks = pipeline.chunk(markdown_content, doc_type: doc_type, options: options)
          # Fallback for plain text or heading-less content where smart chunking yields no sections.
          if chunks.empty? && markdown_content.to_s.strip.length > 0
            @logger.info 'Smart chunking returned no chunks, falling back to MarkdownChunker'
            chunks = legacy_chunker.chunk(markdown_content)
          end
        else
          chunks = legacy_chunker.chunk(markdown_content)
        end
        @logger.info "Created #{chunks.length} chunks"
        chunks
      end

      # Save chunk sections to database
      #
      # NOTE(review): re-reads ALL sections for the document after insert —
      # on a re-ingest this presumably includes pre-existing sections, not
      # just the batch inserted here; verify intended behavior.
      #
      # @param [::SmartRAG::Models::SourceDocument] document Document record
      # @param [Array<Hash>] chunks Array of chunk hashes (:title, :content)
      # @param [Hash] options Save options
      # @option options [Boolean] :generate_embeddings Whether to generate embeddings for sections
      # @option options [Boolean] :generate_tags Whether to generate tags for sections
      # @return [Array<::SmartRAG::Models::SourceSection>] persisted sections
      def save_sections(document, chunks, options = {})
        sections = chunks.each_with_index.map do |chunk, index|
          {
            document_id: document.id,
            section_title: chunk[:title],
            section_number: index + 1,
            content: chunk[:content],
            created_at: Time.now,
            updated_at: Time.now
          }
        end

        ::SmartRAG::Models::SourceSection.batch_insert(sections)
        @logger.info "Saved #{sections.length} sections to database"

        # Get the created sections with their IDs
        created_sections = ::SmartRAG::Models::SourceSection.where(document_id: document.id).all

        # Generate embeddings if requested
        generate_embeddings_for_sections(created_sections) if options[:generate_embeddings] && @embedding_manager

        # Generate tags if requested
        generate_tags_for_sections(created_sections) if options[:generate_tags] && @tag_service

        created_sections
      end

      # Classify the origin of a document as 'url', 'file' or 'manual'.
      # NOTE(review): the 'file' heuristic (leading '/' or any '.') also
      # matches bare domain-like strings; confirm acceptable.
      def infer_source_type(source_uri, source)
        return 'url' if source_uri.to_s.start_with?('http://', 'https://')
        return 'file' if source.to_s.start_with?('/') || source.to_s.include?('.')

        'manual'
      end

      # Generate embeddings for sections
      #
      # Best-effort: a failure on one section is logged and the loop
      # continues (the rescue is scoped to each block iteration).
      def generate_embeddings_for_sections(sections)
        @logger.info "Generating embeddings for #{sections.length} sections..."

        sections.each_with_index do |section, index|
          vector = @embedding_manager.generate_embedding(section.content)
          if vector && vector.is_a?(Array) && !vector.empty?
            # pgvector-style text literal: "[v1,v2,...]"
            ::SmartRAG::Models::Embedding.create(
              source_id: section.id,
              vector: "[#{vector.join(',')}]"
            )
            @logger.debug "Generated embedding for section #{index + 1}/#{sections.length}"
          end
        rescue StandardError => e
          @logger.warn "Failed to generate embedding for section #{section.id}: #{e.message}"
        end

        @logger.info 'Embeddings generation completed'
      end

      # Generate tags for sections
      #
      # Best-effort per section, like embeddings above. Tags are
      # find-or-created globally, then linked via SectionTag join records
      # (skipping links that already exist).
      def generate_tags_for_sections(sections)
        @logger.info "Generating tags for #{sections.length} sections..."

        sections.each_with_index do |section, index|
          tags = @tag_service.generate_tags(section.content, section.section_title,
                                            [detect_language(section.content)])

          if tags && tags[:content_tags] && !tags[:content_tags].empty?
            # Create or find tags and associate with section
            tags[:content_tags].each do |tag_name|
              tag = ::SmartRAG::Models::Tag.find_or_create(name: tag_name)

              # Check if association already exists
              existing = ::SmartRAG::Models::SectionTag.find(
                section_id: section.id,
                tag_id: tag.id
              )

              # Create association if it doesn't exist
              next unless existing.nil?

              ::SmartRAG::Models::SectionTag.create(
                section_id: section.id,
                tag_id: tag.id
              )
            end
            @logger.debug "Generated #{tags[:content_tags].length} tags for section #{index + 1}/#{sections.length}"
          end
        rescue StandardError => e
          @logger.warn "Failed to generate tags for section #{section.id}: #{e.message}"
        end

        @logger.info 'Tags generation completed'
      end

      # Detect language from text
      #
      # @param [String] text Text to analyze
      # @return [String] Language code (ISO 639-1): 'en', 'ja', 'ko' or 'zh'
      def detect_language(text)
        return 'en' if text.nil? || text.empty?

        # Heuristic: decide by CJK character ratios to avoid short mixed-language bias.
        # ja: hiragana + katakana; ko: hangul syllables; zh: CJK unified ideographs
        # (kanji also fall in the zh range, hence the ratio thresholds below).
        ja_count = text.scan(/[\u3040-\u309f\u30a0-\u30ff]/).length
        ko_count = text.scan(/[\uac00-\ud7af]/).length
        zh_count = text.scan(/[\u4e00-\u9fff]/).length
        cjk_total = ja_count + ko_count + zh_count

        return 'en' if cjk_total.zero?

        ja_ratio = ja_count.to_f / cjk_total
        ko_ratio = ko_count.to_f / cjk_total
        zh_ratio = zh_count.to_f / cjk_total

        return 'ja' if ja_ratio >= 0.3 && ja_ratio > zh_ratio && ja_ratio > ko_ratio
        return 'ko' if ko_ratio >= 0.3 && ko_ratio > zh_ratio

        'zh'
      rescue StandardError => e
        @logger.warn "Language detection failed: #{e.message}, defaulting to 'en'"
        'en'
      end

      # Extract metadata from PDF files
      # @param [String] file_path Path to PDF
      # @return [Hash] PDF metadata
      def extract_pdf_metadata(_file_path)
        # This would require a PDF parsing library
        # For now, return empty hash
        {}
      end

      # Extract metadata from DOCX files
      # @param [String] file_path Path to DOCX
      # @return [Hash] DOCX metadata
      def extract_docx_metadata(_file_path)
        # This would require a DOCX parsing library
        # For now, return empty hash
        {}
      end

      # Extract metadata from HTML files
      #
      # Pulls :title (from <title> or first <h1>), :author/:description/
      # :keywords meta tags, and :content (tag-stripped body text used for
      # language detection). Returns {} on any parse error.
      #
      # @param [String] file_path Path to HTML
      # @return [Hash] HTML metadata
      def extract_html_metadata(file_path)
        content = File.read(file_path, encoding: 'utf-8')
        metadata = {}

        # Extract title
        if content =~ %r{<title>(.*?)</title>}mi
          metadata[:title] = ::Regexp.last_match(1).strip
        elsif content =~ %r{<h1>(.*?)</h1>}mi
          metadata[:title] = ::Regexp.last_match(1).strip
        end

        # Extract meta tags - improved regex to handle quotes properly
        # NOTE(review): the block param `content` shadows the outer file
        # content variable; harmless here but worth renaming.
        content.scan(%r{<meta\s+name=["']?([^"'\s]+)["']?\s+content=["']?([^"']+)["']?\s*/?\s*>}i).each do |name, content|
          case name.downcase
          when 'author'
            metadata[:author] = content
          when 'description'
            metadata[:description] = content
          when 'keywords'
            metadata[:keywords] = content.split(',').map(&:strip)
          end
        end

        # Extract body content for language detection
        # Remove script and style tags, then extract text
        body_content = content.gsub(%r{<script[^>]*>.*?</script>}mi, '')
                              .gsub(%r{<style[^>]*>.*?</style>}mi, '')
        metadata[:content] = if body_content =~ %r{<body[^>]*>(.*?)</body>}mi
                               ::Regexp.last_match(1).gsub(/<[^>]+>/, ' ').strip.gsub(/\s+/, ' ')
                             elsif body_content =~ /<body[^>]*>(.*)/mi
                               ::Regexp.last_match(1).gsub(/<[^>]+>/, ' ').strip.gsub(/\s+/, ' ')
                             else
                               # Fallback: extract any text content
                               content.gsub(/<[^>]+>/, ' ').strip.gsub(/\s+/, ' ')
                             end

        metadata
      rescue StandardError => e
        @logger.warn "Failed to extract HTML metadata: #{e.message}"
        {}
      end
    end
  end
end
|