ragdoll 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +318 -40
- data/Rakefile +15 -4
- data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
- data/db/migrate/004_create_ragdoll_documents.rb +70 -0
- data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
- data/db/migrate/006_create_ragdoll_contents.rb +47 -0
- data/lib/ragdoll/core/client.rb +315 -0
- data/lib/ragdoll/core/configuration.rb +273 -0
- data/lib/ragdoll/core/database.rb +141 -0
- data/lib/ragdoll/core/document_management.rb +110 -0
- data/lib/ragdoll/core/document_processor.rb +344 -0
- data/lib/ragdoll/core/embedding_service.rb +183 -0
- data/lib/ragdoll/core/errors.rb +11 -0
- data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
- data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
- data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
- data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
- data/lib/ragdoll/core/metadata_schemas.rb +334 -0
- data/lib/ragdoll/core/models/audio_content.rb +175 -0
- data/lib/ragdoll/core/models/content.rb +126 -0
- data/lib/ragdoll/core/models/document.rb +678 -0
- data/lib/ragdoll/core/models/embedding.rb +204 -0
- data/lib/ragdoll/core/models/image_content.rb +227 -0
- data/lib/ragdoll/core/models/text_content.rb +169 -0
- data/lib/ragdoll/core/search_engine.rb +50 -0
- data/lib/ragdoll/core/services/image_description_service.rb +230 -0
- data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
- data/lib/ragdoll/core/shrine_config.rb +71 -0
- data/lib/ragdoll/core/text_chunker.rb +210 -0
- data/lib/ragdoll/core/text_generation_service.rb +360 -0
- data/lib/ragdoll/core/version.rb +8 -0
- data/lib/ragdoll/core.rb +73 -0
- data/lib/ragdoll-core.rb +3 -0
- data/lib/ragdoll.rb +243 -6
- data/lib/tasks/annotate.rake +126 -0
- data/lib/tasks/db.rake +338 -0
- metadata +40 -37
- data/app/models/ragdoll/document.rb +0 -9
- data/app/models/ragdoll/embedding.rb +0 -9
- data/config/initializers/ragdoll.rb +0 -6
- data/config/routes.rb +0 -5
- data/db/migrate/20250218123456_create_documents.rb +0 -20
- data/lib/config/database.yml +0 -28
- data/lib/config/ragdoll.yml +0 -31
- data/lib/ragdoll/engine.rb +0 -16
- data/lib/ragdoll/import_job.rb +0 -15
- data/lib/ragdoll/ingestion.rb +0 -30
- data/lib/ragdoll/search.rb +0 -18
- data/lib/ragdoll/version.rb +0 -7
- data/lib/tasks/import_task.thor +0 -32
- data/lib/tasks/jobs_task.thor +0 -40
- data/lib/tasks/ragdoll_tasks.thor +0 -7
- data/lib/tasks/search_task.thor +0 -55
@@ -0,0 +1,210 @@
# frozen_string_literal: true

module Ragdoll
  module Core
    # Splits long text into overlapping chunks suitable for embedding.
    #
    # The primary entry point is +TextChunker.chunk+, which produces chunks of
    # roughly +chunk_size+ characters, preferring to break at paragraph,
    # sentence, or word boundaries, with +chunk_overlap+ characters of
    # carry-over between consecutive chunks.
    class TextChunker
      DEFAULT_CHUNK_SIZE = 1000
      DEFAULT_CHUNK_OVERLAP = 200

      # Convenience wrapper: chunk +text+ in one call.
      #
      # @param text [String] the text to split (nil is treated as "")
      # @param chunk_size [Integer] target maximum characters per chunk
      # @param chunk_overlap [Integer] characters shared between adjacent chunks
      # @return [Array<String>] non-empty, stripped chunks
      def self.chunk(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP)
        new(text, chunk_size: chunk_size, chunk_overlap: chunk_overlap).chunk
      end

      def initialize(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP)
        @text = text.to_s
        @chunk_size = chunk_size
        @chunk_overlap = chunk_overlap
      end

      # Split the text into overlapping chunks.
      #
      # @return [Array<String>] chunks in document order; [] for empty input
      def chunk
        return [] if @text.empty?

        # Ensure chunk_size and chunk_overlap are valid integers
        @chunk_size = (@chunk_size || DEFAULT_CHUNK_SIZE).to_i
        @chunk_overlap = (@chunk_overlap || DEFAULT_CHUNK_OVERLAP).to_i

        # Ensure chunk_overlap is not greater than or equal to chunk_size to
        # prevent infinite loops
        @chunk_overlap = [@chunk_size - 1, 0].max if @chunk_overlap >= @chunk_size

        return [@text] if @text.length <= @chunk_size

        chunks = []
        start_pos = 0

        while start_pos < @text.length
          end_pos = start_pos + @chunk_size

          # If this is the last chunk, take everything remaining
          if end_pos >= @text.length
            chunks << @text[start_pos..].strip
            break
          end

          # Try to find a good breaking point (sentence, paragraph, or word
          # boundary)
          chunk_text = @text[start_pos...end_pos]
          break_pos = find_break_position(chunk_text, @text, start_pos, end_pos)

          # Extract the chunk
          actual_end_pos = start_pos + break_pos
          chunk_content = @text[start_pos...actual_end_pos].strip

          chunks << chunk_content unless chunk_content.empty?

          # Move to next chunk with overlap
          next_start_pos = actual_end_pos - @chunk_overlap
          next_start_pos = [next_start_pos, 0].max # Ensure we don't go negative

          # Ensure forward progress - if we're not advancing, force a step forward
          next_start_pos = start_pos + 1 if next_start_pos <= start_pos

          start_pos = next_start_pos
        end

        chunks.reject(&:empty?)
      end

      private

      # Find the best offset (within +chunk_text+) at which to end the current
      # chunk. Priority order for breaking points:
      #   1. Double newline (paragraph break)
      #   2. Single newline + sentence ending
      #   3. Sentence ending punctuation
      #   4. Word boundary
      #   5. Character boundary (fallback: the full chunk_size)
      def find_break_position(chunk_text, _full_text, _start_pos, _end_pos)
        # Look for paragraph breaks in the back half of the chunk
        paragraph_break = chunk_text.rindex("\n\n")
        return paragraph_break + 2 if paragraph_break && paragraph_break > @chunk_size * 0.5

        # Look for sentence endings (near newlines first, then mid-text)
        sentence_patterns = [
          /[.!?]\s*\n/,
          /[.!?]\s+[A-Z]/,
          /[.!?]$/
        ]

        sentence_patterns.each do |pattern|
          # Collect the end offsets of every match of this pattern
          matches = chunk_text.enum_for(:scan, pattern).map { Regexp.last_match.end(0) }
          next unless matches.any?

          # Find the best sentence break (closest to chunk_size but not too small)
          best_break = matches.select { |pos| pos > @chunk_size * 0.5 }.max
          return best_break if best_break
        end

        # Look for word boundaries
        word_break = chunk_text.rindex(/\s/)
        return word_break + 1 if word_break && word_break > @chunk_size * 0.3

        # Fallback to character boundary
        @chunk_size
      end

      # Alternative chunking method for structured documents: splits on
      # paragraphs, then sentences, then words, only descending a level when a
      # unit alone exceeds +max_chunk_size+.
      def self.chunk_by_structure(text, max_chunk_size: DEFAULT_CHUNK_SIZE)
        chunks = []
        current_chunk = ""

        # Split by paragraphs first
        paragraphs = text.split(/\n\s*\n/)

        paragraphs.each do |paragraph|
          paragraph = paragraph.strip
          next if paragraph.empty?

          # If adding this paragraph would exceed chunk size, start new chunk
          if !current_chunk.empty? && (current_chunk.length + paragraph.length + 2) > max_chunk_size
            chunks << current_chunk.strip
            current_chunk = ""
          end

          # If single paragraph is too large, split it
          if paragraph.length > max_chunk_size
            # Split large paragraph into sentences
            sentences = paragraph.split(/(?<=[.!?])\s+/)

            sentences.each do |sentence|
              sentence = sentence.strip
              next if sentence.empty?

              if !current_chunk.empty? && (current_chunk.length + sentence.length + 1) > max_chunk_size
                chunks << current_chunk.strip
                current_chunk = ""
              end

              if sentence.length > max_chunk_size
                # Split very long sentences by words
                words = sentence.split(/\s+/)
                words.each do |word|
                  if !current_chunk.empty? && (current_chunk.length + word.length + 1) > max_chunk_size
                    chunks << current_chunk.strip
                    current_chunk = ""
                  end
                  current_chunk += (current_chunk.empty? ? "" : " ") + word
                end
              else
                current_chunk += (current_chunk.empty? ? "" : " ") + sentence
              end
            end
          else
            current_chunk += (current_chunk.empty? ? "" : "\n\n") + paragraph
          end
        end

        chunks << current_chunk.strip unless current_chunk.strip.empty?
        chunks.reject(&:empty?)
      end

      # Specialized chunking for code documents: groups lines into blocks that
      # start at definition keywords (def/class/function/const/let/var) or at a
      # dedent back to the current block's indent level.
      def self.chunk_code(text, max_chunk_size: DEFAULT_CHUNK_SIZE)
        chunks = []
        current_chunk = ""

        # Split by functions, classes, or logical blocks
        lines = text.split("\n")
        current_block = []
        block_indent = nil

        lines.each do |line|
          line_indent = line[/^\s*/].length

          # Detect block boundaries (functions, classes, etc.)
          if line.match?(/^\s*(def|class|function|const|let|var)\s/) ||
             (block_indent && line_indent <= block_indent && !line.strip.empty?)

            # Process current block
            if current_block.any?
              block_text = current_block.join("\n")

              if !current_chunk.empty? && (current_chunk.length + block_text.length + 1) > max_chunk_size
                chunks << current_chunk.strip
                current_chunk = ""
              end

              current_chunk += (current_chunk.empty? ? "" : "\n") + block_text
            end

            current_block = [line]
            block_indent = line_indent
          else
            current_block << line
          end
        end

        # Process final block
        if current_block.any?
          block_text = current_block.join("\n")
          if !current_chunk.empty? && (current_chunk.length + block_text.length + 1) > max_chunk_size
            chunks << current_chunk.strip
            current_chunk = ""
          end
          current_chunk += (current_chunk.empty? ? "" : "\n") + block_text
        end

        chunks << current_chunk.strip unless current_chunk.strip.empty?
        chunks.reject(&:empty?)
      end
    end
  end
end
@@ -0,0 +1,360 @@
# frozen_string_literal: true

require "ruby_llm"

module Ragdoll
  module Core
    # Generates document summaries and keyword lists, preferring an LLM (via
    # the RubyLLM gem or an injected client) and falling back to simple
    # heuristic text processing when no LLM is configured or a call fails.
    class TextGenerationService
      class GenerationError < StandardError; end

      # @param client [Object, nil] optional chat client (used in tests); when
      #   nil, RubyLLM is configured from Ragdoll.config if an API key exists
      def initialize(client: nil)
        @configuration = Ragdoll.config
        @client = client
        configure_ruby_llm_if_possible unless @client
      end

      # Summarize +text+.
      #
      # Returns "" for blank input, the raw text when summarization is
      # disabled (truncated to 500 chars) or when the text is shorter than the
      # configured minimum, otherwise an LLM summary — falling back to
      # #generate_basic_summary on any error.
      def generate_summary(text, max_length: nil)
        return "" if text.nil? || text.strip.empty?

        # Skip summarization if not enabled
        unless @configuration.summarization_config[:enable]
          puts "⚠️ LLM summarization disabled, using fallback (first 500 chars)"
          return text[0..500]
        end

        # Skip if content is too short
        min_length = @configuration.summarization_config[:min_content_length]
        return text if text.length < min_length

        max_length ||= @configuration.summarization_config[:max_length]

        # Clean and prepare text
        cleaned_text = clean_text(text)

        # Create summarization prompt
        prompt = build_summary_prompt(cleaned_text, max_length)

        begin
          if @client == :ruby_llm_configured
            # Use summary model from models config, fallback to default;
            # parse provider/model and use only the model part for RubyLLM
            model_string = @configuration.models[:summary] || @configuration.models[:default]
            parsed = @configuration.parse_provider_model(model_string)
            model = parsed[:model] || model_string

            chat = RubyLLM.chat.with_model(model)
                          .with_temperature(0.3)
            chat.add_message(role: "user", content: prompt)
            extract_response_content(chat.complete)
          elsif @client
            # Use custom client for testing
            model = @configuration.models[:summary] || @configuration.models[:default]

            response = @client.chat(
              model: model,
              messages: [
                { role: "user", content: prompt }
              ],
              max_tokens: max_length + 50,
              temperature: 0.3
            )
            extract_response_content(response)
          else
            # Fallback to basic summarization for testing/dev environments
            puts "⚠️ No LLM client configured, using fallback summarization"
            generate_basic_summary(cleaned_text, max_length)
          end
        rescue StandardError => e
          # Fall back to basic summarization if API fails
          puts "❌ LLM summary generation failed, using fallback: #{e.message}"
          generate_basic_summary(cleaned_text, max_length)
        end
      end

      # Extract keywords from +text+.
      #
      # @return [Array<String>] up to 20 keywords (see #parse_keywords_response)
      #   from the LLM, or #extract_basic_keywords output on error/no client
      def extract_keywords(text, max_keywords: 20)
        return [] if text.nil? || text.strip.empty?

        # Clean and prepare text
        cleaned_text = clean_text(text)

        # Create keyword extraction prompt
        prompt = build_keyword_prompt(cleaned_text, max_keywords)

        begin
          if @client == :ruby_llm_configured
            # Use keywords model from models config, fallback to default;
            # parse provider/model and use only the model part for RubyLLM
            model_string = @configuration.models[:keywords] || @configuration.models[:default]
            parsed = @configuration.parse_provider_model(model_string)
            model = parsed[:model] || model_string

            chat = RubyLLM.chat.with_model(model).with_temperature(0.1)
            chat.add_message(role: "user", content: prompt)
            parse_keywords_response(extract_response_content(chat.complete))
          elsif @client
            # Use custom client for testing
            model = @configuration.models[:keywords] || @configuration.models[:default]

            response = @client.chat(
              model: model,
              messages: [
                { role: "user", content: prompt }
              ],
              max_tokens: 200,
              temperature: 0.1
            )
            parse_keywords_response(extract_response_content(response))
          else
            # Fallback to basic keyword extraction for testing/dev environments
            puts "⚠️ No LLM client configured, using fallback keyword extraction"
            extract_basic_keywords(cleaned_text, max_keywords)
          end
        rescue StandardError => e
          # Fall back to basic keyword extraction if API fails
          puts "❌ LLM keyword extraction failed, using fallback: #{e.message}"
          puts "Error class: #{e.class}"
          puts "Backtrace: #{e.backtrace.first(3).join(', ')}"
          extract_basic_keywords(cleaned_text, max_keywords)
        end
      end

      private

      # Normalize the various response shapes returned by LLM clients into the
      # stripped text content, raising GenerationError when unrecognized.
      # Handles: objects with #content, objects with #message.content, and
      # hashes in OpenAI ("choices") or plain ("content") formats.
      def extract_response_content(response)
        if response.respond_to?(:content)
          response.content.strip
        elsif response.respond_to?(:message) && response.message.respond_to?(:content)
          response.message.content.strip
        elsif response && response["choices"]&.first
          response["choices"].first["message"]["content"].strip
        elsif response && response["content"]
          response["content"].strip
        else
          raise GenerationError, "Invalid response format from text generation API"
        end
      end

      # Configure the RubyLLM module from Ragdoll's configuration when an API
      # key (or a local Ollama endpoint) is available; on success marks the
      # client as :ruby_llm_configured, otherwise leaves @client nil so the
      # heuristic fallbacks are used.
      def configure_ruby_llm_if_possible
        # Use embedding_config provider, fallback to :openai
        provider = @configuration.embedding_config[:provider] || :openai
        config = @configuration.ruby_llm_config[provider] || {}

        # Check if we have the necessary API key for the provider
        has_api_key =
          case provider
          when :ollama
            true # Ollama doesn't need API key for local setup
          when :openai, :anthropic, :google, :azure, :huggingface, :openrouter
            config[:api_key] && !config[:api_key].empty?
          else
            false
          end

        return unless has_api_key

        begin
          RubyLLM.configure do |ruby_llm_config|
            case provider
            when :openai
              ruby_llm_config.openai_api_key = config[:api_key]
              ruby_llm_config.openai_organization = config[:organization] if config[:organization]
              ruby_llm_config.openai_project = config[:project] if config[:project]
            when :anthropic
              ruby_llm_config.anthropic_api_key = config[:api_key]
            when :google
              ruby_llm_config.google_api_key = config[:api_key]
              ruby_llm_config.google_project_id = config[:project_id] if config[:project_id]
            when :azure
              ruby_llm_config.azure_api_key = config[:api_key]
              ruby_llm_config.azure_endpoint = config[:endpoint] if config[:endpoint]
              ruby_llm_config.azure_api_version = config[:api_version] if config[:api_version]
            when :ollama
              ruby_llm_config.ollama_endpoint = config[:endpoint] if config[:endpoint]
            when :huggingface
              ruby_llm_config.huggingface_api_key = config[:api_key]
            when :openrouter
              ruby_llm_config.openrouter_api_key = config[:api_key]
            end
          end

          # RubyLLM uses module-level methods, not individual provider classes
          @client = :ruby_llm_configured
        rescue StandardError => e
          # If configuration fails, don't set client (will use fallback)
          puts "❌ RubyLLM configuration failed: #{e.message}"
          puts "   Will use fallback text processing for summaries and keywords"
        end
      end

      # Normalize whitespace and truncate to a conservative character budget
      # so prompts stay within typical model token limits.
      def clean_text(text)
        return "" if text.nil?

        # Remove excessive whitespace and normalize
        cleaned = text.strip
                      .gsub(/\s+/, " ") # Multiple spaces to single space
                      .gsub(/\n+/, "\n") # Multiple newlines to single newline
                      .gsub(/\t+/, " ") # Tabs to spaces

        # Truncate if too long (most models have token limits)
        max_chars = 12_000 # Conservative limit for most language models
        cleaned.length > max_chars ? cleaned[0, max_chars] : cleaned
      end

      def build_summary_prompt(text, max_length)
        <<~PROMPT
          Please provide a concise summary of the following text. The summary should:
          - Be approximately #{max_length} characters or less
          - Capture the main topics and key points
          - Be written in clear, professional language
          - Focus on the most important information

          Text to summarize:
          #{text}

          Summary:
        PROMPT
      end

      def build_keyword_prompt(text, max_keywords)
        <<~PROMPT
          Please extract the most important keywords and key phrases from the following text.#{' '}
          Provide up to #{max_keywords} keywords that best represent the content.

          Requirements:
          - Focus on nouns, important concepts, and technical terms
          - Avoid common stop words and articles
          - Include both single words and meaningful phrases
          - Separate keywords with commas
          - Order by importance (most important first)

          Text to analyze:
          #{text}

          Keywords (comma-separated):
        PROMPT
      end

      # Extract keywords from the response, handling various formats.
      # NOTE: hard-caps the result at 20 entries regardless of the
      # max_keywords requested in the prompt.
      def parse_keywords_response(content)
        content
          .gsub(/^(keywords?:?\s*)/i, "") # Remove "Keywords:" prefix
          .split(/[,\n]/) # Split by commas or newlines
          .map(&:strip) # Remove whitespace
          .reject(&:empty?) # Remove empty strings
          .reject { |k| k.match?(/^\d+\./) } # Remove numbered list items
          .map { |k| k.gsub(/^\d+\.\s*/, "") } # Remove numbering from start
          .reject { |k| k.length < 2 } # Remove very short words
          .first(20) # Limit to 20 keywords
      end

      # Fallback summarization: take the first few sentences (up to 3) that
      # fit within max_length characters.
      def generate_basic_summary(text, max_length)
        clean_text = text.gsub(/\s+/, " ").strip

        # Split into sentences
        sentences = clean_text.split(/[.!?]+/).map(&:strip).reject(&:empty?)

        # If content is short, use the whole thing
        return clean_text if clean_text.length <= max_length

        # Take first 2-3 sentences or up to max_length characters
        summary_sentences = []
        total_length = 0

        sentences.each do |sentence|
          break unless total_length + sentence.length <= max_length && summary_sentences.length < 3

          summary_sentences << sentence
          total_length += sentence.length
        end

        summary = summary_sentences.join(". ")
        summary += "." unless summary.end_with?(".", "!", "?")
        summary
      end

      # Fallback keyword extraction: frequency-ranked words with stop words,
      # short words, and pure numbers removed.
      def extract_basic_keywords(text, max_keywords)
        stop_words = %w[
          a an and are as at be by for from has he in is it its of on that the
          to was will with the this these those they them their there where when
          what who why how which would could should shall might may can must
          do does did done doing go goes went gone going get gets got gotten
          getting have had having has been being am was were are is was been
          but or not no yes if then else also too very much many most some any
          all each every each other another one two three first second third
          last next previous before after during while until since through
          above below under over between among within without across around
          near far close distant here there everywhere nowhere somewhere anywhere
        ]

        # Clean and normalize text
        cleaned_text = text.downcase
                           .gsub(/[^\w\s]/, " ") # Remove punctuation
                           .gsub(/\s+/, " ") # Normalize whitespace
                           .strip

        # Split into words and filter
        words = cleaned_text.split(" ")
                            .reject { |word| word.length < 3 } # Remove short words
                            .reject { |word| stop_words.include?(word) } # Remove stop words
                            .reject { |word| word.match?(/^\d+$/) } # Remove pure numbers

        # Count word frequencies
        word_counts = Hash.new(0)
        words.each { |word| word_counts[word] += 1 }

        # Get top keywords (words that appear more than once or are significant)
        word_counts
          .select { |word, count| count > 1 || word.length > 6 }
          .sort_by { |word, count| [-count, word] }
          .first(max_keywords) # Limit to max_keywords
          .map { |word, _count| word }
      end
    end
  end
end
data/lib/ragdoll/core.rb
ADDED
@@ -0,0 +1,73 @@
# frozen_string_literal: true

require "delegate"
require "debug_me"
include DebugMe # NOTE: mixes DebugMe into Object — global side effect kept from original
$DEBUG_ME = true

# require_relative "../extensions/openstruct_merge" # File doesn't exist

require_relative "core/version"
require_relative "core/errors"
require_relative "core/configuration"
require_relative "core/database"
require_relative "core/shrine_config"
require_relative "core/models/document"
require_relative "core/models/embedding"
require_relative "core/models/text_content"
require_relative "core/models/audio_content"
require_relative "core/models/image_content"
require_relative "core/document_processor"
require_relative "core/document_management"
require_relative "core/text_chunker"
require_relative "core/embedding_service"
require_relative "core/text_generation_service"
require_relative "core/search_engine"
require_relative "core/services/image_description_service"
require_relative "core/jobs/generate_embeddings"
require_relative "core/jobs/generate_summary"
require_relative "core/jobs/extract_keywords"
require_relative "core/client"

module Ragdoll
  # Top-level configuration accessor.
  #
  # Delegates to Ragdoll::Core.config so the whole library shares a single
  # Configuration instance. (Previously this memoized its OWN separate
  # Configuration, so `Ragdoll::Core.configure { ... }` changes were invisible
  # to services reading `Ragdoll.config`, such as TextGenerationService.)
  def self.config
    Core.config
  end

  module Core
    extend SingleForwardable

    # Lazily-created shared configuration instance.
    def self.config
      @config ||= Configuration.new
    end

    # Alias-style accessor for the shared configuration.
    def self.configuration
      config
    end

    # Yields the shared configuration for block-style setup.
    def self.configure
      yield(config)
    end

    # Reset configuration and cached default client (useful for testing).
    def self.reset_configuration!
      @config = Configuration.new
      @default_client = nil
    end

    # Factory method for creating clients.
    def self.client(config = nil)
      Client.new(config)
    end

    # Delegate high-level API methods to the default client.
    def_delegators :default_client, :add_document, :search, :enhance_prompt,
                   :get_document, :document_status, :list_documents, :delete_document,
                   :update_document, :get_context, :search_similar_content,
                   :add_directory, :stats, :healthy?, :hybrid_search

    # Lazily-created client backing the delegated module-level API.
    def self.default_client
      @default_client ||= Client.new
    end
  end
end
data/lib/ragdoll-core.rb
ADDED