ragdoll 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/README.md +353 -0
  3. data/Rakefile +21 -0
  4. data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
  5. data/db/migrate/004_create_ragdoll_documents.rb +70 -0
  6. data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
  7. data/db/migrate/006_create_ragdoll_contents.rb +47 -0
  8. data/lib/ragdoll/core/client.rb +315 -0
  9. data/lib/ragdoll/core/configuration.rb +273 -0
  10. data/lib/ragdoll/core/database.rb +141 -0
  11. data/lib/ragdoll/core/document_management.rb +110 -0
  12. data/lib/ragdoll/core/document_processor.rb +344 -0
  13. data/lib/ragdoll/core/embedding_service.rb +183 -0
  14. data/lib/ragdoll/core/errors.rb +11 -0
  15. data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
  16. data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
  17. data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
  18. data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
  19. data/lib/ragdoll/core/metadata_schemas.rb +334 -0
  20. data/lib/ragdoll/core/models/audio_content.rb +175 -0
  21. data/lib/ragdoll/core/models/content.rb +126 -0
  22. data/lib/ragdoll/core/models/document.rb +678 -0
  23. data/lib/ragdoll/core/models/embedding.rb +204 -0
  24. data/lib/ragdoll/core/models/image_content.rb +227 -0
  25. data/lib/ragdoll/core/models/text_content.rb +169 -0
  26. data/lib/ragdoll/core/search_engine.rb +50 -0
  27. data/lib/ragdoll/core/services/image_description_service.rb +230 -0
  28. data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
  29. data/lib/ragdoll/core/shrine_config.rb +71 -0
  30. data/lib/ragdoll/core/text_chunker.rb +210 -0
  31. data/lib/ragdoll/core/text_generation_service.rb +360 -0
  32. data/lib/ragdoll/core/version.rb +8 -0
  33. data/lib/ragdoll/core.rb +73 -0
  34. data/lib/ragdoll-core.rb +3 -0
  35. data/lib/ragdoll.rb +249 -0
  36. data/lib/tasks/annotate.rake +126 -0
  37. data/lib/tasks/db.rake +338 -0
  38. metadata +80 -0
@@ -0,0 +1,315 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+
5
+ module Ragdoll
6
+ module Core
7
+ class Client
8
+ def initialize
9
+ # Setup logging
10
+ setup_logging
11
+
12
+ # Setup database connection
13
+ Database.setup(Ragdoll.config.database_config)
14
+
15
+ @embedding_service = EmbeddingService.new
16
+ @search_engine = SearchEngine.new(@embedding_service)
17
+ end
18
+
19
+ # Primary method for RAG applications
20
+ # Returns context-enhanced content for AI prompts
21
+ def enhance_prompt(prompt:, context_limit: 5, **options)
22
+ context_data = get_context(query: prompt, limit: context_limit, **options)
23
+
24
+ if context_data[:context_chunks].any?
25
+ enhanced_prompt = build_enhanced_prompt(prompt, context_data[:combined_context])
26
+ {
27
+ enhanced_prompt: enhanced_prompt,
28
+ original_prompt: prompt,
29
+ context_sources: context_data[:context_chunks].map { |chunk| chunk[:source] },
30
+ context_count: context_data[:total_chunks]
31
+ }
32
+ else
33
+ {
34
+ enhanced_prompt: prompt,
35
+ original_prompt: prompt,
36
+ context_sources: [],
37
+ context_count: 0
38
+ }
39
+ end
40
+ end
41
+
42
+ # Get relevant context without prompt enhancement
43
+ def get_context(query:, limit: 10, **options)
44
+ results = search_similar_content(query: query, limit: limit, **options)
45
+
46
+ context_chunks = results.map do |result|
47
+ {
48
+ content: result[:content],
49
+ source: result[:document_location],
50
+ similarity: result[:similarity],
51
+ chunk_index: result[:chunk_index]
52
+ }
53
+ end
54
+
55
+ combined_context = context_chunks.map { |chunk| chunk[:content] }.join("\n\n")
56
+
57
+ {
58
+ context_chunks: context_chunks,
59
+ combined_context: combined_context,
60
+ total_chunks: context_chunks.length
61
+ }
62
+ end
63
+
64
+ # FIXME: This high-level API method should be able to take a query that is
65
+ # a string or a file. If its a file, then the downstream Process will
66
+ # be responsible for reading the file and passing the contents to the
67
+ # search method based upon whether the content is text, image or audio.
68
+
69
+ # Semantic search++ should incorporate hybrid search
70
+ def search(query:, **options)
71
+ results = search_similar_content(query: query, **options)
72
+
73
+ {
74
+ query: query,
75
+ results: results,
76
+ total_results: results.length
77
+ }
78
+ end
79
+
80
+ # Search similar content (core functionality)
81
+ def search_similar_content(query:, **options)
82
+ @search_engine.search_similar_content(query, **options)
83
+ end
84
+
85
+ # Hybrid search combining semantic and full-text search
86
+ def hybrid_search(query:, **options)
87
+ # Generate embedding for the query
88
+ query_embedding = @embedding_service.generate_embedding(query)
89
+
90
+ # Perform hybrid search
91
+ results = Models::Document.hybrid_search(query, query_embedding: query_embedding, **options)
92
+
93
+ {
94
+ query: query,
95
+ search_type: "hybrid",
96
+ results: results,
97
+ total_results: results.length,
98
+ semantic_weight: options[:semantic_weight] || 0.7,
99
+ text_weight: options[:text_weight] || 0.3
100
+ }
101
+ rescue StandardError => e
102
+ {
103
+ query: query,
104
+ search_type: "hybrid",
105
+ results: [],
106
+ total_results: 0,
107
+ error: "Hybrid search failed: #{e.message}"
108
+ }
109
+ end
110
+
111
+ # Document management
112
+ def add_document(path:)
113
+ # Parse the document
114
+ parsed = DocumentProcessor.parse(path)
115
+
116
+ # Extract title from metadata or use filename
117
+ title = parsed[:metadata][:title] ||
118
+ File.basename(path, File.extname(path))
119
+
120
+ # Add document to database
121
+ doc_id = DocumentManagement.add_document(path, parsed[:content], {
122
+ title: title,
123
+ document_type: parsed[:document_type],
124
+ **parsed[:metadata]
125
+ })
126
+
127
+
128
+ # Queue background jobs for processing if content is available
129
+ embeddings_queued = false
130
+ if parsed[:content].present?
131
+ Ragdoll::Core::Jobs::GenerateEmbeddings.perform_later(doc_id)
132
+ Ragdoll::Core::Jobs::GenerateSummary.perform_later(doc_id)
133
+ Ragdoll::Core::Jobs::ExtractKeywords.perform_later(doc_id)
134
+ embeddings_queued = true
135
+ end
136
+
137
+
138
+ # Return success information
139
+ {
140
+ success: true,
141
+ document_id: doc_id,
142
+ title: title,
143
+ document_type: parsed[:document_type],
144
+ content_length: parsed[:content]&.length || 0,
145
+ embeddings_queued: embeddings_queued,
146
+ message: "Document '#{title}' added successfully with ID #{doc_id}"
147
+ }
148
+ rescue StandardError => e # StandardError => e
149
+ {
150
+ success: false,
151
+ error: e.message,
152
+ message: "Failed to add document: #{e.message}"
153
+ }
154
+ end
155
+
156
+ def add_text(content:, title:, **options)
157
+ # Add document to database
158
+ doc_id = DocumentManagement.add_document(title, content, {
159
+ title: title,
160
+ document_type: "text",
161
+ **options
162
+ })
163
+
164
+ # Queue background job for embeddings
165
+ Ragdoll::Core::Jobs::GenerateEmbeddings.perform_later(doc_id,
166
+ chunk_size: options[:chunk_size],
167
+ chunk_overlap: options[:chunk_overlap])
168
+
169
+ doc_id
170
+ end
171
+
172
+ def add_directory(path:, recursive: false)
173
+ results = []
174
+ pattern = recursive ? File.join(path, "**", "*") : File.join(path, "*")
175
+
176
+ Dir.glob(pattern).each do |file_path|
177
+ next unless File.file?(file_path)
178
+
179
+ begin
180
+ doc_id = add_document(path: file_path)
181
+ results << { file: file_path, document_id: doc_id, status: "success" }
182
+ rescue StandardError => e
183
+ results << { file: file_path, error: e.message, status: "error" }
184
+ end
185
+ end
186
+
187
+ results
188
+ end
189
+
190
+ def get_document(id:)
191
+ document_hash = DocumentManagement.get_document(id)
192
+ return nil unless document_hash
193
+
194
+ # DocumentManagement.get_document already returns a hash with all needed info
195
+ document_hash
196
+ end
197
+
198
+ def document_status(id:)
199
+ document = Models::Document.find(id)
200
+ embeddings_count = document.all_embeddings.count
201
+
202
+ {
203
+ id: document.id,
204
+ title: document.title,
205
+ status: document.status,
206
+ embeddings_count: embeddings_count,
207
+ embeddings_ready: embeddings_count.positive?,
208
+ content_preview: document.content&.first(200) || "No content",
209
+ message: case document.status
210
+ when "processed"
211
+ "Document processed successfully with #{embeddings_count} embeddings"
212
+ when "processing"
213
+ "Document is being processed"
214
+ when "pending"
215
+ "Document is pending processing"
216
+ when "error"
217
+ "Document processing failed"
218
+ else
219
+ "Document status: #{document.status}"
220
+ end
221
+ }
222
+ rescue ActiveRecord::RecordNotFound
223
+ {
224
+ success: false,
225
+ error: "Document not found",
226
+ message: "Document with ID #{id} does not exist"
227
+ }
228
+ end
229
+
230
+ def update_document(id:, **updates)
231
+ DocumentManagement.update_document(id, **updates)
232
+ end
233
+
234
+ def delete_document(id:)
235
+ DocumentManagement.delete_document(id)
236
+ end
237
+
238
+ def list_documents(**options)
239
+ DocumentManagement.list_documents(options)
240
+ end
241
+
242
+ # Analytics and stats
243
+ def stats
244
+ DocumentManagement.get_document_stats
245
+ end
246
+
247
+ def search_analytics(days: 30)
248
+ # This could be implemented with additional database queries
249
+ Models::Embedding.where("returned_at > ?", days.days.ago)
250
+ .group("DATE(returned_at)")
251
+ .count
252
+ end
253
+
254
+ # Health check
255
+ def healthy?
256
+ Database.connected? && stats[:total_documents] >= 0
257
+ rescue StandardError
258
+ false
259
+ end
260
+
261
+ private
262
+
263
+ def setup_logging
264
+ require "logger"
265
+ require "active_job"
266
+
267
+ # Create log directory if it doesn't exist
268
+ # FIXME: log_file is not in current config structure
269
+ log_file = Ragdoll.config.logging_config[:filepath] || File.join(Dir.home, ".ragdoll", "ragdoll.log")
270
+ log_dir = File.dirname(log_file)
271
+ FileUtils.mkdir_p(log_dir) unless Dir.exist?(log_dir)
272
+
273
+ # Set up logger with appropriate level
274
+ logger = Logger.new(log_file)
275
+ logger.level = case Ragdoll.config.logging_config[:level]
276
+ when :debug then Logger::DEBUG
277
+ when :info then Logger::INFO
278
+ when :warn then Logger::WARN
279
+ when :error then Logger::ERROR
280
+ when :fatal then Logger::FATAL
281
+ else Logger::WARN
282
+ end
283
+
284
+ # Configure ActiveJob to use our logger and reduce verbosity
285
+ ActiveJob::Base.logger = logger
286
+ ActiveJob::Base.logger.level = Logger::WARN
287
+
288
+ # Set up ActiveJob queue adapter - use inline for immediate execution
289
+ ActiveJob::Base.queue_adapter = :inline
290
+ end
291
+
292
+ def build_enhanced_prompt(original_prompt, context)
293
+ # FIXME: prompt_template is not in current config structure
294
+ template = default_prompt_template
295
+
296
+ template
297
+ .gsub("{{context}}", context)
298
+ .gsub("{{prompt}}", original_prompt)
299
+ end
300
+
301
+ def default_prompt_template
302
+ <<~TEMPLATE
303
+ You are an AI assistant. Use the following context to help answer the user's question. If the context doesn't contain relevant information, say so.
304
+
305
+ Context:
306
+ {{context}}
307
+
308
+ Question: {{prompt}}
309
+
310
+ Answer:
311
+ TEMPLATE
312
+ end
313
+ end
314
+ end
315
+ end
@@ -0,0 +1,273 @@
1
# frozen_string_literal: true

require "yaml"
require "fileutils"
require "ostruct"

module Ragdoll
  module Core
    # Runtime configuration for Ragdoll.
    #
    # DEFAULT is deep-merged with any user-supplied hash; lambda values
    # (used for lazy ENV lookups) are resolved at construction time and the
    # result is wrapped in an OpenStruct this class delegates to via
    # method_missing.
    class Configuration
      class ConfigurationFileNotFoundError < StandardError; end
      class ConfigurationSaveError < StandardError; end
      class ConfigurationLoadUnknownError < StandardError; end

      DEFAULT = {
        directory: File.join(Dir.home, ".ragdoll"),
        filepath: File.join(Dir.home, ".ragdoll", "config.yml"),
        models: {
          default: "openai/gpt-4o",
          summary: "openai/gpt-4o",
          keywords: "openai/gpt-4o",
          embedding: {
            text: "text-embedding-3-small",
            image: "image-embedding-3-small", # FIXME
            audio: "audio-embedding-3-small", # FIXME
          },
        },
        chunking: {
          text: {
            max_tokens: 1000,
            overlap: 200,
          },
          image: {
            max_tokens: 4096,
            overlap: 128,
          },
          audio: {
            max_tokens: 4096,
            overlap: 128,
          },
          default: {
            max_tokens: 4096,
            overlap: 128,
          },
        },
        ruby_llm_config: {
          openai: {
            api_key: -> { ENV["OPENAI_API_KEY"] },
            organization: -> { ENV["OPENAI_ORGANIZATION"] },
            project: -> { ENV["OPENAI_PROJECT"] },
          },
          anthropic: {
            api_key: -> { ENV["ANTHROPIC_API_KEY"] },
          },
          google: {
            api_key: -> { ENV["GOOGLE_API_KEY"] },
            project_id: -> { ENV["GOOGLE_PROJECT_ID"] },
          },
          azure: {
            api_key: -> { ENV["AZURE_OPENAI_API_KEY"] },
            endpoint: -> { ENV["AZURE_OPENAI_ENDPOINT"] },
            api_version: -> { ENV["AZURE_OPENAI_API_VERSION"] || "2024-02-01" },
          },
          ollama: {
            endpoint: -> { ENV["OLLAMA_ENDPOINT"] || "http://localhost:11434/v1" },
          },
          huggingface: {
            api_key: -> { ENV["HUGGINGFACE_API_KEY"] },
          },
          openrouter: {
            api_key: -> { ENV["OPENROUTER_API_KEY"] },
          },
        },
        embedding_config: {
          provider: :openai,
          cache_embeddings: true,
          max_embedding_dimensions: 3072, # Support up to text-embedding-3-large
        },
        summarization_config: {
          enable: true,
          max_length: 300,
          min_content_length: 300,
        },
        database_config: {
          adapter: "postgresql",
          database: "ragdoll_development",
          username: "ragdoll",
          password: -> { ENV["DATABASE_PASSWORD"] },
          host: "localhost",
          port: 5432,
          auto_migrate: true,
          logger: nil, # Set to Logger.new(STDOUT) for debugging
        },
        logging_config: {
          log_level: :warn,
          log_directory: File.join(Dir.home, ".ragdoll"),
          log_filepath: File.join(Dir.home, ".ragdoll", "ragdoll.log"),
        },
        search: {
          similarity_threshold: 0.7,
          max_results: 10,
          enable_analytics: true,
          enable_usage_tracking: true,
          usage_ranking_enabled: true,
          usage_recency_weight: 0.3,
          usage_frequency_weight: 0.7,
          usage_similarity_weight: 1.0,
        },
      }.freeze

      # Build a configuration by deep-merging +config+ over DEFAULT and
      # resolving any lambda values (ENV lookups).
      def initialize(config = {})
        merged_config = deep_merge(DEFAULT, config)
        @config = OpenStruct.new(resolve_procs(merged_config))
      end

      # Load configuration from a YAML file (defaults to DEFAULT[:filepath]).
      #
      # Raises ConfigurationFileNotFoundError when the file is missing and
      # ConfigurationLoadUnknownError for any other failure.
      def self.load(path: nil)
        path ||= DEFAULT[:filepath]

        unless File.exist?(path)
          raise ConfigurationFileNotFoundError, "Configuration file not found: #{path}"
        end

        # Symbols are used pervasively for keys and values, so they must be
        # explicitly permitted; symbolize_names keeps loaded keys compatible
        # with the symbol-keyed DEFAULT for the deep merge in #initialize.
        data = YAML.safe_load_file(path, permitted_classes: [Symbol], symbolize_names: true)
        new(data || {})
      rescue ConfigurationFileNotFoundError
        # Don't let the generic rescue below rewrap the not-found error.
        raise
      rescue Errno::ENOENT
        raise ConfigurationFileNotFoundError, "Configuration file not found: #{path}"
      rescue StandardError => e
        raise ConfigurationLoadUnknownError, "Failed to load configuration from #{path}: #{e.message}"
      end

      # Persist the configuration as YAML. When +path+ is given, it also
      # becomes the new @config.filepath; on failure the previous filepath is
      # restored and ConfigurationSaveError is raised.
      def save(path: nil)
        previous_filepath = @config.filepath
        if path.nil?
          path = @config.filepath
        else
          @config.filepath = path
        end

        FileUtils.mkdir_p(File.dirname(path))

        # Dump a plain Hash: serializing the OpenStruct itself would emit a
        # !ruby/object:OpenStruct tag that the safe load in .load rejects.
        File.write(path, @config.to_h.to_yaml)
      rescue StandardError => e
        @config.filepath = previous_filepath
        raise ConfigurationSaveError, "Failed to save configuration to #{path}: #{e.message}"
      end

      # SMELL: isn't this method more of a utility?

      # Parse a provider/model string into its components.
      # Format: "provider/model" -> { provider: :provider, model: "model" }
      # Format: "model" -> { provider: nil, model: "model" } (RubyLLM determines provider)
      def parse_provider_model(provider_model_string)
        return { provider: nil, model: nil } if provider_model_string.nil? || provider_model_string.empty?

        provider, model = provider_model_string.split("/", 2)
        if model
          { provider: provider.to_sym, model: model }
        else
          # No slash: let RubyLLM determine the provider from the model name.
          { provider: nil, model: provider_model_string }
        end
      end

      # Delegate unknown methods to the internal OpenStruct so settings read
      # like attributes (e.g. config.database_config).
      def method_missing(method_name, *args, &block)
        @config.send(method_name, *args, &block)
      end

      def respond_to_missing?(method_name, include_private = false)
        @config.respond_to?(method_name, include_private) || super
      end

      private

      # Recursively replace Proc values (ENV lookups) with their results.
      # Arrays are walked too, in case a list ever contains a lambda.
      def resolve_procs(obj)
        case obj
        when Hash
          obj.transform_values { |v| resolve_procs(v) }
        when Array
          obj.map { |v| resolve_procs(v) }
        when Proc
          obj.call
        else
          obj
        end
      end

      # Merge +hash2+ over +hash1+, recursing into nested hashes so partial
      # overrides don't clobber whole sections of DEFAULT.
      def deep_merge(hash1, hash2)
        hash1.merge(hash2) do |_key, oldval, newval|
          oldval.is_a?(Hash) && newval.is_a?(Hash) ? deep_merge(oldval, newval) : newval
        end
      end
    end
  end
end
193
+
194
+ __END__
195
+
196
+ {
197
+ directory: "/Users/dewayne/.ragdoll",
198
+ filepath: "/Users/dewayne/.ragdoll/config.yml",
199
+ embedding_config:
200
+ {default:
201
+ {model: "openai/gpt-4o-mini", summary_model: "openai/gpt-4o-mini", keywords_model: "openai/gpt-4o-mini", max_dimensions: 3072},
202
+ text: {model: "openai/text-embedding-3-small", max_tokens: 1000, overlap: 200},
203
+ image: {model: "laion/CLIP-ViT-H-14", max_tokens: 4096, overlap: 128},
204
+ audio: {model: "openl3", transcription_model: "openai/whisper-large-v2", max_tokens: 4096, overlap: 128}},
205
+ chunking: {text: {max_tokens: 1000, overlap: 200}, default: {max_tokens: 4096, overlap: 128}},
206
+ ruby_llm_config:
207
+ {openai: {api_key: "***", organization: nil, project: nil},
208
+ anthropic:
209
+ {api_key: "***"},
210
+ google: {api_key: "***", project_id: nil},
211
+ azure: {api_key: nil, endpoint: nil, api_version: "2024-02-01"},
212
+ ollama: {endpoint: "http://localhost:11434/v1"},
213
+ huggingface: {api_key: nil},
214
+ openrouter: {api_key: nil}},
215
+ summarization_config: {enable: true, model: nil, max_length: 300, min_content_length: 300},
216
+ database_config:
217
+ {adapter: "postgresql",
218
+ database: "ragdoll_development",
219
+ username: "ragdoll",
220
+ password: "ragdoll",
221
+ host: "localhost",
222
+ port: 5432,
223
+ pool: 20,
224
+ timeout: 5000,
225
+ auto_migrate: true,
226
+ logger: nil},
227
+ logging_config: {level: :warn, directory: "/Users/dewayne/.ragdoll", filepath: "/Users/dewayne/.ragdoll/ragdoll.log"},
228
+ search:
229
+ {similarity_threshold: 0.7,
230
+ max_results: 10,
231
+ enable_analytics: true,
232
+ enable_usage_tracking: true,
233
+ usage_ranking_enabled: true,
234
+ usage_recency_weight: 0.3,
235
+ usage_frequency_weight: 0.7,
236
+ usage_similarity_weight: 1.0},
237
+ llm_provider: :openai,
238
+ openai_api_key: "***",
239
+ llm_config:
240
+ {openai: {api_key: "***", organization: nil, project: nil},
241
+ anthropic:
242
+ {api_key: "***"},
243
+ google: {api_key: "***", project_id: nil},
244
+ azure: {api_key: nil, endpoint: nil, api_version: "2024-02-01"},
245
+ ollama: {endpoint: "http://localhost:11434"},
246
+ huggingface: {api_key: nil},
247
+ openrouter: {api_key: nil}},
248
+ embedding_provider: :openai,
249
+ embedding_model: "text-embedding-3-small",
250
+ max_embedding_dimensions: 3072,
251
+ cache_embeddings: true,
252
+ default_model: "gpt-4o-mini",
253
+ summary_provider_model: "openai/gpt-4o-mini",
254
+ keywords_provider_model: "openai/gpt-4o-mini",
255
+ embeddings_provider_model: "openai/text-embedding-3-small",
256
+ summary_model: nil,
257
+ chunk_size: 1000,
258
+ chunk_overlap: 200,
259
+ enable_document_summarization: true,
260
+ summary_max_length: 300,
261
+ summary_min_content_length: 300,
262
+ prompt_template: nil,
263
+ search_similarity_threshold: 0.7,
264
+ max_search_results: 10,
265
+ enable_search_analytics: true,
266
+ enable_usage_tracking: true,
267
+ usage_ranking_enabled: true,
268
+ usage_recency_weight: 0.3,
269
+ usage_frequency_weight: 0.7,
270
+ usage_similarity_weight: 1.0,
271
+ log_level: :warn,
272
+ log_file: "/Users/dewayne/.ragdoll/ragdoll.log"
273
+ }