ragdoll 0.1.1 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +4 -4
  2. data/README.md +1 -1
  3. data/Rakefile +52 -1
  4. data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
  5. data/app/jobs/ragdoll/extract_text_job.rb +38 -0
  6. data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
  7. data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
  8. data/app/lib/ragdoll/metadata_schemas.rb +332 -0
  9. data/app/models/ragdoll/audio_content.rb +142 -0
  10. data/app/models/ragdoll/content.rb +95 -0
  11. data/app/models/ragdoll/document.rb +611 -0
  12. data/app/models/ragdoll/embedding.rb +176 -0
  13. data/app/models/ragdoll/image_content.rb +194 -0
  14. data/app/models/ragdoll/text_content.rb +137 -0
  15. data/app/services/ragdoll/configuration_service.rb +113 -0
  16. data/app/services/ragdoll/document_management.rb +108 -0
  17. data/app/services/ragdoll/document_processor.rb +342 -0
  18. data/app/services/ragdoll/embedding_service.rb +202 -0
  19. data/app/services/ragdoll/image_description_service.rb +230 -0
  20. data/app/services/ragdoll/metadata_generator.rb +329 -0
  21. data/app/services/ragdoll/model_resolver.rb +72 -0
  22. data/app/services/ragdoll/search_engine.rb +51 -0
  23. data/app/services/ragdoll/text_chunker.rb +208 -0
  24. data/app/services/ragdoll/text_generation_service.rb +355 -0
  25. data/lib/ragdoll/core/client.rb +32 -41
  26. data/lib/ragdoll/core/configuration.rb +140 -156
  27. data/lib/ragdoll/core/database.rb +1 -1
  28. data/lib/ragdoll/core/model.rb +45 -0
  29. data/lib/ragdoll/core/version.rb +1 -1
  30. data/lib/ragdoll/core.rb +35 -17
  31. data/lib/ragdoll.rb +1 -1
  32. data/lib/tasks/annotate.rake +1 -1
  33. data/lib/tasks/db.rake +2 -2
  34. metadata +24 -20
  35. data/lib/ragdoll/core/document_management.rb +0 -110
  36. data/lib/ragdoll/core/document_processor.rb +0 -344
  37. data/lib/ragdoll/core/embedding_service.rb +0 -183
  38. data/lib/ragdoll/core/jobs/extract_keywords.rb +0 -32
  39. data/lib/ragdoll/core/jobs/extract_text.rb +0 -42
  40. data/lib/ragdoll/core/jobs/generate_embeddings.rb +0 -32
  41. data/lib/ragdoll/core/jobs/generate_summary.rb +0 -29
  42. data/lib/ragdoll/core/metadata_schemas.rb +0 -334
  43. data/lib/ragdoll/core/models/audio_content.rb +0 -175
  44. data/lib/ragdoll/core/models/content.rb +0 -126
  45. data/lib/ragdoll/core/models/document.rb +0 -678
  46. data/lib/ragdoll/core/models/embedding.rb +0 -204
  47. data/lib/ragdoll/core/models/image_content.rb +0 -227
  48. data/lib/ragdoll/core/models/text_content.rb +0 -169
  49. data/lib/ragdoll/core/search_engine.rb +0 -50
  50. data/lib/ragdoll/core/services/image_description_service.rb +0 -230
  51. data/lib/ragdoll/core/services/metadata_generator.rb +0 -335
  52. data/lib/ragdoll/core/text_chunker.rb +0 -210
  53. data/lib/ragdoll/core/text_generation_service.rb +0 -360
data/app/services/ragdoll/text_chunker.rb
@@ -0,0 +1,208 @@
+ # frozen_string_literal: true
+
+ module Ragdoll
+   class TextChunker
+     DEFAULT_CHUNK_SIZE = 1000
+     DEFAULT_CHUNK_OVERLAP = 200
+
+     def self.chunk(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP)
+       new(text, chunk_size: chunk_size, chunk_overlap: chunk_overlap).chunk
+     end
+
+     def initialize(text, chunk_size: DEFAULT_CHUNK_SIZE, chunk_overlap: DEFAULT_CHUNK_OVERLAP)
+       @text = text.to_s
+       @chunk_size = chunk_size
+       @chunk_overlap = chunk_overlap
+     end
+
+     def chunk
+       return [] if @text.empty?
+
+       # Ensure chunk_size and chunk_overlap are valid integers
+       @chunk_size = (@chunk_size || DEFAULT_CHUNK_SIZE).to_i
+       @chunk_overlap = (@chunk_overlap || DEFAULT_CHUNK_OVERLAP).to_i
+
+       # Ensure chunk_overlap is not greater than or equal to chunk_size to prevent infinite loops
+       @chunk_overlap = [@chunk_size - 1, 0].max if @chunk_overlap >= @chunk_size
+
+       return [@text] if @text.length <= @chunk_size
+
+       chunks = []
+       start_pos = 0
+
+       while start_pos < @text.length
+         end_pos = start_pos + @chunk_size
+
+         # If this is the last chunk, take everything remaining
+         if end_pos >= @text.length
+           chunks << @text[start_pos..].strip
+           break
+         end
+
+         # Try to find a good breaking point (sentence, paragraph, or word boundary)
+         chunk_text = @text[start_pos...end_pos]
+         break_pos = find_break_position(chunk_text, @text, start_pos, end_pos)
+
+         # Extract the chunk
+         actual_end_pos = start_pos + break_pos
+         chunk_content = @text[start_pos...actual_end_pos].strip
+
+         chunks << chunk_content unless chunk_content.empty?
+
+         # Move to next chunk with overlap
+         next_start_pos = actual_end_pos - @chunk_overlap
+         next_start_pos = [next_start_pos, 0].max # Ensure we don't go negative
+
+         # Ensure forward progress - if we're not advancing, force a step forward
+         next_start_pos = start_pos + 1 if next_start_pos <= start_pos
+
+         start_pos = next_start_pos
+       end
+
+       chunks.reject(&:empty?)
+     end
+
+     private
+
+     def find_break_position(chunk_text, _full_text, _start_pos, _end_pos)
+       # Priority order for breaking points:
+       # 1. Double newline (paragraph break)
+       # 2. Single newline + sentence ending
+       # 3. Sentence ending punctuation
+       # 4. Word boundary
+       # 5. Character boundary (fallback)
+
+       # Look for paragraph breaks
+       paragraph_break = chunk_text.rindex("\n\n")
+       return paragraph_break + 2 if paragraph_break && paragraph_break > @chunk_size * 0.5
+
+       # Look for sentence endings near newlines
+       sentence_patterns = [
+         /[.!?]\s*\n/,
+         /[.!?]\s+[A-Z]/,
+         /[.!?]$/
+       ]
+
+       sentence_patterns.each do |pattern|
+         matches = chunk_text.enum_for(:scan, pattern).map { Regexp.last_match.end(0) }
+         next unless matches.any?
+
+         # Find the best sentence break (closest to chunk_size but not too small)
+         best_break = matches.select { |pos| pos > @chunk_size * 0.5 }.max
+         return best_break if best_break
+       end
+
+       # Look for word boundaries
+       word_break = chunk_text.rindex(/\s/)
+       return word_break + 1 if word_break && word_break > @chunk_size * 0.3
+
+       # Fallback to character boundary
+       @chunk_size
+     end
+
+     # Alternative chunking method for structured documents
+     def self.chunk_by_structure(text, max_chunk_size: DEFAULT_CHUNK_SIZE)
+       chunks = []
+       current_chunk = ""
+
+       # Split by paragraphs first
+       paragraphs = text.split(/\n\s*\n/)
+
+       paragraphs.each do |paragraph|
+         paragraph = paragraph.strip
+         next if paragraph.empty?
+
+         # If adding this paragraph would exceed chunk size, start new chunk
+         if !current_chunk.empty? && (current_chunk.length + paragraph.length + 2) > max_chunk_size
+           chunks << current_chunk.strip
+           current_chunk = ""
+         end
+
+         # If single paragraph is too large, split it
+         if paragraph.length > max_chunk_size
+           # Split large paragraph into sentences
+           sentences = paragraph.split(/(?<=[.!?])\s+/)
+
+           sentences.each do |sentence|
+             sentence = sentence.strip
+             next if sentence.empty?
+
+             if !current_chunk.empty? && (current_chunk.length + sentence.length + 1) > max_chunk_size
+               chunks << current_chunk.strip
+               current_chunk = ""
+             end
+
+             if sentence.length > max_chunk_size
+               # Split very long sentences by words
+               words = sentence.split(/\s+/)
+               words.each do |word|
+                 if !current_chunk.empty? && (current_chunk.length + word.length + 1) > max_chunk_size
+                   chunks << current_chunk.strip
+                   current_chunk = ""
+                 end
+                 current_chunk += (current_chunk.empty? ? "" : " ") + word
+               end
+             else
+               current_chunk += (current_chunk.empty? ? "" : " ") + sentence
+             end
+           end
+         else
+           current_chunk += (current_chunk.empty? ? "" : "\n\n") + paragraph
+         end
+       end
+
+       chunks << current_chunk.strip unless current_chunk.strip.empty?
+       chunks.reject(&:empty?)
+     end
+
+     # Specialized chunking for code documents
+     def self.chunk_code(text, max_chunk_size: DEFAULT_CHUNK_SIZE)
+       chunks = []
+       current_chunk = ""
+
+       # Split by functions, classes, or logical blocks
+       lines = text.split("\n")
+       current_block = []
+       block_indent = nil
+
+       lines.each do |line|
+         line_indent = line[/^\s*/].length
+
+         # Detect block boundaries (functions, classes, etc.)
+         if line.match?(/^\s*(def|class|function|const|let|var)\s/) ||
+            (block_indent && line_indent <= block_indent && !line.strip.empty?)
+
+           # Process current block
+           if current_block.any?
+             block_text = current_block.join("\n")
+
+             if !current_chunk.empty? && (current_chunk.length + block_text.length + 1) > max_chunk_size
+               chunks << current_chunk.strip
+               current_chunk = ""
+             end
+
+             current_chunk += (current_chunk.empty? ? "" : "\n") + block_text
+           end
+
+           current_block = [line]
+           block_indent = line_indent
+         else
+           current_block << line
+         end
+       end
+
+       # Process final block
+       if current_block.any?
+         block_text = current_block.join("\n")
+         if !current_chunk.empty? && (current_chunk.length + block_text.length + 1) > max_chunk_size
+           chunks << current_chunk.strip
+           current_chunk = ""
+         end
+         current_chunk += (current_chunk.empty? ? "" : "\n") + block_text
+       end
+
+       chunks << current_chunk.strip unless current_chunk.strip.empty?
+       chunks.reject(&:empty?)
+     end
+   end
+ end
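The chunker above depends only on the Ruby standard library, so it can be exercised directly once the gem is loaded. A minimal usage sketch follows; the file paths, parameter values, and the assumption that requiring the gem exposes Ragdoll::TextChunker are illustrative, not taken from the package:

require "ragdoll"                 # assumes the gem autoloads Ragdoll::TextChunker

text = File.read("docs/guide.md") # hypothetical input document

# Sliding-window chunking with overlap (defaults: 1000 chars, 200-char overlap)
chunks = Ragdoll::TextChunker.chunk(text, chunk_size: 500, chunk_overlap: 100)

# Structure-aware variants defined in the same class
paragraph_chunks = Ragdoll::TextChunker.chunk_by_structure(text, max_chunk_size: 500)
code_chunks      = Ragdoll::TextChunker.chunk_code(File.read("lib/example.rb"), max_chunk_size: 500)

chunks.each_with_index { |c, i| puts "chunk #{i}: #{c.length} chars" }

Because chunk_overlap is clamped below chunk_size and the loop forces a one-character step whenever it would otherwise stall, chunk terminates even for degenerate parameter combinations.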
data/app/services/ragdoll/text_generation_service.rb
@@ -0,0 +1,355 @@
+ # frozen_string_literal: true
+
+ require "ruby_llm"
+
+ module Ragdoll
+   class TextGenerationService
+     class GenerationError < StandardError; end
+
+     def initialize(client: nil, config_service: nil, model_resolver: nil)
+       @config_service = config_service || Ragdoll::ConfigurationService.new
+       @model_resolver = model_resolver || Ragdoll::ModelResolver.new(@config_service)
+       @client = client
+       configure_ruby_llm_if_possible unless @client
+     end
+
+     def generate_summary(text, max_length: nil)
+       return "" if text.nil? || text.strip.empty?
+
+       # Skip summarization if not enabled
+       unless @config_service.config.summarization[:enable]
+         puts "⚠️ LLM summarization disabled, using fallback (first 500 chars)"
+         return text[0..500]
+       end
+
+       # Skip if content is too short
+       min_length = @config_service.config.summarization[:min_content_length]
+       return text if text.length < min_length
+
+       max_length ||= @config_service.config.summarization[:max_length]
+
+       # Clean and prepare text
+       cleaned_text = clean_text(text)
+
+       # Create summarization prompt
+       prompt = build_summary_prompt(cleaned_text, max_length)
+
+       begin
+         if @client == :ruby_llm_configured
+           # Use RubyLLM for text generation
+           # Use model resolver to get summary model with inheritance
+           model_obj = @model_resolver.resolve_for_task(:summary)
+           model = model_obj.model
+
+           chat = RubyLLM.chat.with_model(model)
+                         .with_temperature(0.3)
+           chat.add_message(role: "user", content: prompt)
+           response = chat.complete
+
+           if response.respond_to?(:content)
+             response.content.strip
+           elsif response.respond_to?(:message) && response.message.respond_to?(:content)
+             response.message.content.strip
+           elsif response && response["choices"]&.first
+             response["choices"].first["message"]["content"].strip
+           elsif response && response["content"]
+             response["content"].strip
+           else
+             raise GenerationError, "Invalid response format from text generation API"
+           end
+         elsif @client
+           # Use custom client for testing
+           model_obj = @model_resolver.resolve_for_task(:summary)
+           model = model_obj.model
+
+           response = @client.chat(
+             model: model,
+             messages: [
+               { role: "user", content: prompt }
+             ],
+             max_tokens: max_length + 50,
+             temperature: 0.3
+           )
+
+           if response && response["choices"]&.first
+             response["choices"].first["message"]["content"].strip
+           elsif response && response["content"]
+             response["content"].strip
+           else
+             raise GenerationError, "Invalid response format from text generation API"
+           end
+         else
+           # Fallback to basic summarization for testing/dev environments
+           puts "⚠️ No LLM client configured, using fallback summarization"
+           generate_basic_summary(cleaned_text, max_length)
+         end
+       rescue StandardError => e
+         # Fall back to basic summarization if API fails
+         puts "❌ LLM summary generation failed, using fallback: #{e.message}"
+         generate_basic_summary(cleaned_text, max_length)
+       end
+     end
+
+     def extract_keywords(text, max_keywords: 20)
+       return [] if text.nil? || text.strip.empty?
+
+       # Clean and prepare text
+       cleaned_text = clean_text(text)
+
+       # Create keyword extraction prompt
+       prompt = build_keyword_prompt(cleaned_text, max_keywords)
+
+       begin
+         if @client == :ruby_llm_configured
+           # Use RubyLLM for keyword extraction
+           # Use keywords model from models config, fallback to default
+           model_obj = @model_resolver.resolve_for_task(:keywords)
+           model = model_obj.model
+
+           chat = RubyLLM.chat.with_model(model).with_temperature(0.1)
+           chat.add_message(role: "user", content: prompt)
+           response = chat.complete
+
+           if response.respond_to?(:content)
+             content = response.content.strip
+             parse_keywords_response(content)
+           elsif response.respond_to?(:message) && response.message.respond_to?(:content)
+             content = response.message.content.strip
+             parse_keywords_response(content)
+           elsif response && response["choices"]&.first
+             content = response["choices"].first["message"]["content"].strip
+             parse_keywords_response(content)
+           elsif response && response["content"]
+             content = response["content"].strip
+             parse_keywords_response(content)
+           else
+             raise GenerationError, "Invalid response format from text generation API"
+           end
+         elsif @client
+           # Use custom client for testing
+           model_obj = @model_resolver.resolve_for_task(:keywords)
+           model = model_obj.model
+
+           response = @client.chat(
+             model: model,
+             messages: [
+               { role: "user", content: prompt }
+             ],
+             max_tokens: 200,
+             temperature: 0.1
+           )
+
+           if response && response["choices"]&.first
+             content = response["choices"].first["message"]["content"].strip
+             parse_keywords_response(content)
+           elsif response && response["content"]
+             content = response["content"].strip
+             parse_keywords_response(content)
+           else
+             raise GenerationError, "Invalid response format from text generation API"
+           end
+         else
+           # Fallback to basic keyword extraction for testing/dev environments
+           puts "⚠️ No LLM client configured, using fallback keyword extraction"
+           extract_basic_keywords(cleaned_text, max_keywords)
+         end
+       rescue StandardError => e
+         # Fall back to basic keyword extraction if API fails
+         puts "❌ LLM keyword extraction failed, using fallback: #{e.message}"
+         puts "Error class: #{e.class}"
+         puts "Backtrace: #{e.backtrace.first(3).join(', ')}"
+         extract_basic_keywords(cleaned_text, max_keywords)
+       end
+     end
+
+     private
+
+     def configure_ruby_llm_if_possible
+       # Only configure if we have valid configuration
+       # Use embedding provider, fallback to :openai
+       provider = @config_service.config.models[:embedding][:provider] || :openai
+       config = @config_service.config.llm_providers[provider] || {}
+
+       # Check if we have the necessary API key for the provider
+       has_api_key = case provider
+                     when :openai
+                       config[:api_key] && !config[:api_key].empty?
+                     when :anthropic
+                       config[:api_key] && !config[:api_key].empty?
+                     when :google
+                       config[:api_key] && !config[:api_key].empty?
+                     when :azure
+                       config[:api_key] && !config[:api_key].empty?
+                     when :ollama
+                       true # Ollama doesn't need API key for local setup
+                     when :huggingface
+                       config[:api_key] && !config[:api_key].empty?
+                     when :openrouter
+                       config[:api_key] && !config[:api_key].empty?
+                     else
+                       false
+                     end
+
+       return unless has_api_key
+
+       begin
+         RubyLLM.configure do |ruby_llm_config|
+           case provider
+           when :openai
+             ruby_llm_config.openai_api_key = config[:api_key]
+             ruby_llm_config.openai_organization = config[:organization] if config[:organization]
+             ruby_llm_config.openai_project = config[:project] if config[:project]
+           when :anthropic
+             ruby_llm_config.anthropic_api_key = config[:api_key]
+           when :google
+             ruby_llm_config.google_api_key = config[:api_key]
+             ruby_llm_config.google_project_id = config[:project_id] if config[:project_id]
+           when :azure
+             ruby_llm_config.azure_api_key = config[:api_key]
+             ruby_llm_config.azure_endpoint = config[:endpoint] if config[:endpoint]
+             ruby_llm_config.azure_api_version = config[:api_version] if config[:api_version]
+           when :ollama
+             ruby_llm_config.ollama_endpoint = config[:endpoint] if config[:endpoint]
+           when :huggingface
+             ruby_llm_config.huggingface_api_key = config[:api_key]
+           when :openrouter
+             ruby_llm_config.openrouter_api_key = config[:api_key]
+           end
+         end
+
+         # RubyLLM uses module-level methods, not individual provider classes
+         @client = :ruby_llm_configured
+       rescue StandardError => e
+         # If configuration fails, don't set client (will use fallback)
+         puts "❌ RubyLLM configuration failed: #{e.message}"
+         puts "   Will use fallback text processing for summaries and keywords"
+       end
+     end
+
+     def clean_text(text)
+       return "" if text.nil?
+
+       # Remove excessive whitespace and normalize
+       cleaned = text.strip
+                     .gsub(/\s+/, " ")  # Multiple spaces to single space
+                     .gsub(/\n+/, "\n") # Multiple newlines to single newline
+                     .gsub(/\t+/, " ")  # Tabs to spaces
+
+       # Truncate if too long (most models have token limits)
+       max_chars = 12_000 # Conservative limit for most language models
+       cleaned.length > max_chars ? cleaned[0, max_chars] : cleaned
+     end
+
+     def build_summary_prompt(text, max_length)
+       <<~PROMPT
+         Please provide a concise summary of the following text. The summary should:
+         - Be approximately #{max_length} characters or less
+         - Capture the main topics and key points
+         - Be written in clear, professional language
+         - Focus on the most important information
+
+         Text to summarize:
+         #{text}
+
+         Summary:
+       PROMPT
+     end
+
+     def build_keyword_prompt(text, max_keywords)
+       <<~PROMPT
+         Please extract the most important keywords and key phrases from the following text.#{' '}
+         Provide up to #{max_keywords} keywords that best represent the content.
+
+         Requirements:
+         - Focus on nouns, important concepts, and technical terms
+         - Avoid common stop words and articles
+         - Include both single words and meaningful phrases
+         - Separate keywords with commas
+         - Order by importance (most important first)
+
+         Text to analyze:
+         #{text}
+
+         Keywords (comma-separated):
+       PROMPT
+     end
+
+     def parse_keywords_response(content)
+       # Extract keywords from the response, handling various formats
+       content
+         .gsub(/^(keywords?:?\s*)/i, "")      # Remove "Keywords:" prefix
+         .split(/[,\n]/)                      # Split by commas or newlines
+         .map(&:strip)                        # Remove whitespace
+         .reject(&:empty?)                    # Remove empty strings
+         .reject { |k| k.match?(/^\d+\./) }   # Remove numbered list items
+         .map { |k| k.gsub(/^\d+\.\s*/, "") } # Remove numbering from start
+         .reject { |k| k.length < 2 }         # Remove very short words
+         .first(20)                           # Limit to 20 keywords
+     end
+
+     def generate_basic_summary(text, max_length)
+       # Fallback summarization method (same as before)
+       clean_text = text.gsub(/\s+/, " ").strip
+
+       # Split into sentences
+       sentences = clean_text.split(/[.!?]+/).map(&:strip).reject(&:empty?)
+
+       # If content is short, use the whole thing
+       return clean_text if clean_text.length <= max_length
+
+       # Take first 2-3 sentences or up to max_length characters
+       summary_sentences = []
+       total_length = 0
+
+       sentences.each do |sentence|
+         break unless total_length + sentence.length <= max_length && summary_sentences.length < 3
+
+         summary_sentences << sentence
+         total_length += sentence.length
+       end
+
+       summary = summary_sentences.join(". ")
+       summary += "." unless summary.end_with?(".", "!", "?")
+       summary
+     end
+
+     def extract_basic_keywords(text, max_keywords)
+       # Fallback keyword extraction method (same as before)
+       stop_words = %w[
+         a an and are as at be by for from has he in is it its of on that the
+         to was will with the this these those they them their there where when
+         what who why how which would could should shall might may can must
+         do does did done doing go goes went gone going get gets got gotten
+         getting have had having has been being am was were are is was been
+         but or not no yes if then else also too very much many most some any
+         all each every each other another one two three first second third
+         last next previous before after during while until since through
+         above below under over between among within without across around
+         near far close distant here there everywhere nowhere somewhere anywhere
+       ]
+
+       # Clean and normalize text
+       cleaned_text = text.downcase
+                          .gsub(/[^\w\s]/, " ") # Remove punctuation
+                          .gsub(/\s+/, " ")     # Normalize whitespace
+                          .strip
+
+       # Split into words and filter
+       words = cleaned_text.split(" ")
+                           .reject { |word| word.length < 3 }           # Remove short words
+                           .reject { |word| stop_words.include?(word) } # Remove stop words
+                           .reject { |word| word.match?(/^\d+$/) }      # Remove pure numbers
+
+       # Count word frequencies
+       word_counts = Hash.new(0)
+       words.each { |word| word_counts[word] += 1 }
+
+       # Get top keywords (words that appear more than once or are significant)
+       word_counts
+         .select { |word, count| count > 1 || word.length > 6 }
+         .sort_by { |word, count| [-count, word] }
+         .first(max_keywords) # Limit to max_keywords
+         .map { |word, _count| word }
+     end
+   end
+ end
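Because the constructor accepts an injected client (and, optionally, a config_service and model_resolver), the service can be driven without configuring RubyLLM. A rough test-style sketch; the stub class and its response hash are assumptions modeled on the elsif @client branches above, and it presumes the default ConfigurationService and ModelResolver can resolve a :keywords model:

# Hypothetical stub that returns the hash shape handled by the custom-client branches
class StubChatClient
  def chat(model:, messages:, max_tokens:, temperature:)
    { "choices" => [{ "message" => { "content" => "ragdoll, chunking, embeddings, summaries" } }] }
  end
end

service = Ragdoll::TextGenerationService.new(client: StubChatClient.new)

service.extract_keywords("Ragdoll splits documents into chunks and generates embeddings.")
# => ["ragdoll", "chunking", "embeddings", "summaries"]

long_document_text = File.read("docs/guide.md") # hypothetical input
service.generate_summary(long_document_text, max_length: 200)

If no client is injected and no provider API key is available, both public methods fall back to generate_basic_summary and extract_basic_keywords rather than raising.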