ragdoll 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +7 -0
  2. data/README.md +353 -0
  3. data/Rakefile +21 -0
  4. data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
  5. data/db/migrate/004_create_ragdoll_documents.rb +70 -0
  6. data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
  7. data/db/migrate/006_create_ragdoll_contents.rb +47 -0
  8. data/lib/ragdoll/core/client.rb +315 -0
  9. data/lib/ragdoll/core/configuration.rb +273 -0
  10. data/lib/ragdoll/core/database.rb +141 -0
  11. data/lib/ragdoll/core/document_management.rb +110 -0
  12. data/lib/ragdoll/core/document_processor.rb +344 -0
  13. data/lib/ragdoll/core/embedding_service.rb +183 -0
  14. data/lib/ragdoll/core/errors.rb +11 -0
  15. data/lib/ragdoll/core/jobs/extract_keywords.rb +32 -0
  16. data/lib/ragdoll/core/jobs/extract_text.rb +42 -0
  17. data/lib/ragdoll/core/jobs/generate_embeddings.rb +32 -0
  18. data/lib/ragdoll/core/jobs/generate_summary.rb +29 -0
  19. data/lib/ragdoll/core/metadata_schemas.rb +334 -0
  20. data/lib/ragdoll/core/models/audio_content.rb +175 -0
  21. data/lib/ragdoll/core/models/content.rb +126 -0
  22. data/lib/ragdoll/core/models/document.rb +678 -0
  23. data/lib/ragdoll/core/models/embedding.rb +204 -0
  24. data/lib/ragdoll/core/models/image_content.rb +227 -0
  25. data/lib/ragdoll/core/models/text_content.rb +169 -0
  26. data/lib/ragdoll/core/search_engine.rb +50 -0
  27. data/lib/ragdoll/core/services/image_description_service.rb +230 -0
  28. data/lib/ragdoll/core/services/metadata_generator.rb +335 -0
  29. data/lib/ragdoll/core/shrine_config.rb +71 -0
  30. data/lib/ragdoll/core/text_chunker.rb +210 -0
  31. data/lib/ragdoll/core/text_generation_service.rb +360 -0
  32. data/lib/ragdoll/core/version.rb +8 -0
  33. data/lib/ragdoll/core.rb +73 -0
  34. data/lib/ragdoll-core.rb +3 -0
  35. data/lib/ragdoll.rb +249 -0
  36. data/lib/tasks/annotate.rake +126 -0
  37. data/lib/tasks/db.rake +338 -0
  38. metadata +80 -0
@@ -0,0 +1,315 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "fileutils"
4
+
5
+ module Ragdoll
6
+ module Core
7
+ class Client
8
+ def initialize
9
+ # Setup logging
10
+ setup_logging
11
+
12
+ # Setup database connection
13
+ Database.setup(Ragdoll.config.database_config)
14
+
15
+ @embedding_service = EmbeddingService.new
16
+ @search_engine = SearchEngine.new(@embedding_service)
17
+ end
18
+
19
+ # Primary method for RAG applications
20
+ # Returns context-enhanced content for AI prompts
21
+ def enhance_prompt(prompt:, context_limit: 5, **options)
22
+ context_data = get_context(query: prompt, limit: context_limit, **options)
23
+
24
+ if context_data[:context_chunks].any?
25
+ enhanced_prompt = build_enhanced_prompt(prompt, context_data[:combined_context])
26
+ {
27
+ enhanced_prompt: enhanced_prompt,
28
+ original_prompt: prompt,
29
+ context_sources: context_data[:context_chunks].map { |chunk| chunk[:source] },
30
+ context_count: context_data[:total_chunks]
31
+ }
32
+ else
33
+ {
34
+ enhanced_prompt: prompt,
35
+ original_prompt: prompt,
36
+ context_sources: [],
37
+ context_count: 0
38
+ }
39
+ end
40
+ end
41
+
42
+ # Get relevant context without prompt enhancement
43
+ def get_context(query:, limit: 10, **options)
44
+ results = search_similar_content(query: query, limit: limit, **options)
45
+
46
+ context_chunks = results.map do |result|
47
+ {
48
+ content: result[:content],
49
+ source: result[:document_location],
50
+ similarity: result[:similarity],
51
+ chunk_index: result[:chunk_index]
52
+ }
53
+ end
54
+
55
+ combined_context = context_chunks.map { |chunk| chunk[:content] }.join("\n\n")
56
+
57
+ {
58
+ context_chunks: context_chunks,
59
+ combined_context: combined_context,
60
+ total_chunks: context_chunks.length
61
+ }
62
+ end
63
+
64
+ # FIXME: This high-level API method should be able to take a query that is
65
+ # a string or a file. If its a file, then the downstream Process will
66
+ # be responsible for reading the file and passing the contents to the
67
+ # search method based upon whether the content is text, image or audio.
68
+
69
+ # Semantic search++ should incorporate hybrid search
70
+ def search(query:, **options)
71
+ results = search_similar_content(query: query, **options)
72
+
73
+ {
74
+ query: query,
75
+ results: results,
76
+ total_results: results.length
77
+ }
78
+ end
79
+
80
+ # Search similar content (core functionality)
81
+ def search_similar_content(query:, **options)
82
+ @search_engine.search_similar_content(query, **options)
83
+ end
84
+
85
+ # Hybrid search combining semantic and full-text search
86
+ def hybrid_search(query:, **options)
87
+ # Generate embedding for the query
88
+ query_embedding = @embedding_service.generate_embedding(query)
89
+
90
+ # Perform hybrid search
91
+ results = Models::Document.hybrid_search(query, query_embedding: query_embedding, **options)
92
+
93
+ {
94
+ query: query,
95
+ search_type: "hybrid",
96
+ results: results,
97
+ total_results: results.length,
98
+ semantic_weight: options[:semantic_weight] || 0.7,
99
+ text_weight: options[:text_weight] || 0.3
100
+ }
101
+ rescue StandardError => e
102
+ {
103
+ query: query,
104
+ search_type: "hybrid",
105
+ results: [],
106
+ total_results: 0,
107
+ error: "Hybrid search failed: #{e.message}"
108
+ }
109
+ end
110
+
111
+ # Document management
112
+ def add_document(path:)
113
+ # Parse the document
114
+ parsed = DocumentProcessor.parse(path)
115
+
116
+ # Extract title from metadata or use filename
117
+ title = parsed[:metadata][:title] ||
118
+ File.basename(path, File.extname(path))
119
+
120
+ # Add document to database
121
+ doc_id = DocumentManagement.add_document(path, parsed[:content], {
122
+ title: title,
123
+ document_type: parsed[:document_type],
124
+ **parsed[:metadata]
125
+ })
126
+
127
+
128
+ # Queue background jobs for processing if content is available
129
+ embeddings_queued = false
130
+ if parsed[:content].present?
131
+ Ragdoll::Core::Jobs::GenerateEmbeddings.perform_later(doc_id)
132
+ Ragdoll::Core::Jobs::GenerateSummary.perform_later(doc_id)
133
+ Ragdoll::Core::Jobs::ExtractKeywords.perform_later(doc_id)
134
+ embeddings_queued = true
135
+ end
136
+
137
+
138
+ # Return success information
139
+ {
140
+ success: true,
141
+ document_id: doc_id,
142
+ title: title,
143
+ document_type: parsed[:document_type],
144
+ content_length: parsed[:content]&.length || 0,
145
+ embeddings_queued: embeddings_queued,
146
+ message: "Document '#{title}' added successfully with ID #{doc_id}"
147
+ }
148
+ rescue StandardError => e # StandardError => e
149
+ {
150
+ success: false,
151
+ error: e.message,
152
+ message: "Failed to add document: #{e.message}"
153
+ }
154
+ end
155
+
156
+ def add_text(content:, title:, **options)
157
+ # Add document to database
158
+ doc_id = DocumentManagement.add_document(title, content, {
159
+ title: title,
160
+ document_type: "text",
161
+ **options
162
+ })
163
+
164
+ # Queue background job for embeddings
165
+ Ragdoll::Core::Jobs::GenerateEmbeddings.perform_later(doc_id,
166
+ chunk_size: options[:chunk_size],
167
+ chunk_overlap: options[:chunk_overlap])
168
+
169
+ doc_id
170
+ end
171
+
172
+ def add_directory(path:, recursive: false)
173
+ results = []
174
+ pattern = recursive ? File.join(path, "**", "*") : File.join(path, "*")
175
+
176
+ Dir.glob(pattern).each do |file_path|
177
+ next unless File.file?(file_path)
178
+
179
+ begin
180
+ doc_id = add_document(path: file_path)
181
+ results << { file: file_path, document_id: doc_id, status: "success" }
182
+ rescue StandardError => e
183
+ results << { file: file_path, error: e.message, status: "error" }
184
+ end
185
+ end
186
+
187
+ results
188
+ end
189
+
190
+ def get_document(id:)
191
+ document_hash = DocumentManagement.get_document(id)
192
+ return nil unless document_hash
193
+
194
+ # DocumentManagement.get_document already returns a hash with all needed info
195
+ document_hash
196
+ end
197
+
198
+ def document_status(id:)
199
+ document = Models::Document.find(id)
200
+ embeddings_count = document.all_embeddings.count
201
+
202
+ {
203
+ id: document.id,
204
+ title: document.title,
205
+ status: document.status,
206
+ embeddings_count: embeddings_count,
207
+ embeddings_ready: embeddings_count.positive?,
208
+ content_preview: document.content&.first(200) || "No content",
209
+ message: case document.status
210
+ when "processed"
211
+ "Document processed successfully with #{embeddings_count} embeddings"
212
+ when "processing"
213
+ "Document is being processed"
214
+ when "pending"
215
+ "Document is pending processing"
216
+ when "error"
217
+ "Document processing failed"
218
+ else
219
+ "Document status: #{document.status}"
220
+ end
221
+ }
222
+ rescue ActiveRecord::RecordNotFound
223
+ {
224
+ success: false,
225
+ error: "Document not found",
226
+ message: "Document with ID #{id} does not exist"
227
+ }
228
+ end
229
+
230
+ def update_document(id:, **updates)
231
+ DocumentManagement.update_document(id, **updates)
232
+ end
233
+
234
+ def delete_document(id:)
235
+ DocumentManagement.delete_document(id)
236
+ end
237
+
238
+ def list_documents(**options)
239
+ DocumentManagement.list_documents(options)
240
+ end
241
+
242
+ # Analytics and stats
243
+ def stats
244
+ DocumentManagement.get_document_stats
245
+ end
246
+
247
+ def search_analytics(days: 30)
248
+ # This could be implemented with additional database queries
249
+ Models::Embedding.where("returned_at > ?", days.days.ago)
250
+ .group("DATE(returned_at)")
251
+ .count
252
+ end
253
+
254
+ # Health check
255
+ def healthy?
256
+ Database.connected? && stats[:total_documents] >= 0
257
+ rescue StandardError
258
+ false
259
+ end
260
+
261
+ private
262
+
263
+ def setup_logging
264
+ require "logger"
265
+ require "active_job"
266
+
267
+ # Create log directory if it doesn't exist
268
+ # FIXME: log_file is not in current config structure
269
+ log_file = Ragdoll.config.logging_config[:filepath] || File.join(Dir.home, ".ragdoll", "ragdoll.log")
270
+ log_dir = File.dirname(log_file)
271
+ FileUtils.mkdir_p(log_dir) unless Dir.exist?(log_dir)
272
+
273
+ # Set up logger with appropriate level
274
+ logger = Logger.new(log_file)
275
+ logger.level = case Ragdoll.config.logging_config[:level]
276
+ when :debug then Logger::DEBUG
277
+ when :info then Logger::INFO
278
+ when :warn then Logger::WARN
279
+ when :error then Logger::ERROR
280
+ when :fatal then Logger::FATAL
281
+ else Logger::WARN
282
+ end
283
+
284
+ # Configure ActiveJob to use our logger and reduce verbosity
285
+ ActiveJob::Base.logger = logger
286
+ ActiveJob::Base.logger.level = Logger::WARN
287
+
288
+ # Set up ActiveJob queue adapter - use inline for immediate execution
289
+ ActiveJob::Base.queue_adapter = :inline
290
+ end
291
+
292
+ def build_enhanced_prompt(original_prompt, context)
293
+ # FIXME: prompt_template is not in current config structure
294
+ template = default_prompt_template
295
+
296
+ template
297
+ .gsub("{{context}}", context)
298
+ .gsub("{{prompt}}", original_prompt)
299
+ end
300
+
301
+ def default_prompt_template
302
+ <<~TEMPLATE
303
+ You are an AI assistant. Use the following context to help answer the user's question. If the context doesn't contain relevant information, say so.
304
+
305
+ Context:
306
+ {{context}}
307
+
308
+ Question: {{prompt}}
309
+
310
+ Answer:
311
+ TEMPLATE
312
+ end
313
+ end
314
+ end
315
+ end
@@ -0,0 +1,273 @@
1
# frozen_string_literal: true

require "yaml"
require "fileutils"
require "ostruct"

module Ragdoll
  module Core
    # Runtime configuration for Ragdoll.
    #
    # DEFAULT is deep-merged with any user-supplied hash; lambda values
    # (used for lazy ENV lookups) are resolved at construction time and the
    # result is wrapped in an OpenStruct this class delegates to via
    # method_missing.
    class Configuration
      class ConfigurationFileNotFoundError < StandardError; end
      class ConfigurationSaveError < StandardError; end
      class ConfigurationLoadUnknownError < StandardError; end

      DEFAULT = {
        directory: File.join(Dir.home, ".ragdoll"),
        filepath: File.join(Dir.home, ".ragdoll", "config.yml"),
        models: {
          default: "openai/gpt-4o",
          summary: "openai/gpt-4o",
          keywords: "openai/gpt-4o",
          embedding: {
            text: "text-embedding-3-small",
            image: "image-embedding-3-small", # FIXME
            audio: "audio-embedding-3-small", # FIXME
          },
        },
        chunking: {
          text: {
            max_tokens: 1000,
            overlap: 200,
          },
          image: {
            max_tokens: 4096,
            overlap: 128,
          },
          audio: {
            max_tokens: 4096,
            overlap: 128,
          },
          default: {
            max_tokens: 4096,
            overlap: 128,
          },
        },
        ruby_llm_config: {
          openai: {
            api_key: -> { ENV["OPENAI_API_KEY"] },
            organization: -> { ENV["OPENAI_ORGANIZATION"] },
            project: -> { ENV["OPENAI_PROJECT"] },
          },
          anthropic: {
            api_key: -> { ENV["ANTHROPIC_API_KEY"] },
          },
          google: {
            api_key: -> { ENV["GOOGLE_API_KEY"] },
            project_id: -> { ENV["GOOGLE_PROJECT_ID"] },
          },
          azure: {
            api_key: -> { ENV["AZURE_OPENAI_API_KEY"] },
            endpoint: -> { ENV["AZURE_OPENAI_ENDPOINT"] },
            api_version: -> { ENV["AZURE_OPENAI_API_VERSION"] || "2024-02-01" },
          },
          ollama: {
            endpoint: -> { ENV["OLLAMA_ENDPOINT"] || "http://localhost:11434/v1" },
          },
          huggingface: {
            api_key: -> { ENV["HUGGINGFACE_API_KEY"] },
          },
          openrouter: {
            api_key: -> { ENV["OPENROUTER_API_KEY"] },
          },
        },
        embedding_config: {
          provider: :openai,
          cache_embeddings: true,
          max_embedding_dimensions: 3072, # Support up to text-embedding-3-large
        },
        summarization_config: {
          enable: true,
          max_length: 300,
          min_content_length: 300,
        },
        database_config: {
          adapter: "postgresql",
          database: "ragdoll_development",
          username: "ragdoll",
          password: -> { ENV["DATABASE_PASSWORD"] },
          host: "localhost",
          port: 5432,
          auto_migrate: true,
          logger: nil, # Set to Logger.new(STDOUT) for debugging
        },
        logging_config: {
          log_level: :warn,
          log_directory: File.join(Dir.home, ".ragdoll"),
          log_filepath: File.join(Dir.home, ".ragdoll", "ragdoll.log"),
        },
        search: {
          similarity_threshold: 0.7,
          max_results: 10,
          enable_analytics: true,
          enable_usage_tracking: true,
          usage_ranking_enabled: true,
          usage_recency_weight: 0.3,
          usage_frequency_weight: 0.7,
          usage_similarity_weight: 1.0,
        },
      }.freeze

      # Build a configuration by deep-merging +config+ over DEFAULT and
      # resolving any lambda values (ENV lookups).
      def initialize(config = {})
        merged_config = deep_merge(DEFAULT, config)
        @config = OpenStruct.new(resolve_procs(merged_config))
      end

      # Load configuration from a YAML file (defaults to DEFAULT[:filepath]).
      #
      # Raises ConfigurationFileNotFoundError when the file is missing and
      # ConfigurationLoadUnknownError for any other failure.
      def self.load(path: nil)
        path ||= DEFAULT[:filepath]

        unless File.exist?(path)
          raise ConfigurationFileNotFoundError, "Configuration file not found: #{path}"
        end

        # Symbols are used pervasively for keys and values, so they must be
        # explicitly permitted; symbolize_names keeps loaded keys compatible
        # with the symbol-keyed DEFAULT for the deep merge in #initialize.
        data = YAML.safe_load_file(path, permitted_classes: [Symbol], symbolize_names: true)
        new(data || {})
      rescue ConfigurationFileNotFoundError
        # Don't let the generic rescue below rewrap the not-found error.
        raise
      rescue Errno::ENOENT
        raise ConfigurationFileNotFoundError, "Configuration file not found: #{path}"
      rescue StandardError => e
        raise ConfigurationLoadUnknownError, "Failed to load configuration from #{path}: #{e.message}"
      end

      # Persist the configuration as YAML. When +path+ is given, it also
      # becomes the new @config.filepath; on failure the previous filepath is
      # restored and ConfigurationSaveError is raised.
      def save(path: nil)
        previous_filepath = @config.filepath
        if path.nil?
          path = @config.filepath
        else
          @config.filepath = path
        end

        FileUtils.mkdir_p(File.dirname(path))

        # Dump a plain Hash: serializing the OpenStruct itself would emit a
        # !ruby/object:OpenStruct tag that the safe load in .load rejects.
        File.write(path, @config.to_h.to_yaml)
      rescue StandardError => e
        @config.filepath = previous_filepath
        raise ConfigurationSaveError, "Failed to save configuration to #{path}: #{e.message}"
      end

      # SMELL: isn't this method more of a utility?

      # Parse a provider/model string into its components.
      # Format: "provider/model" -> { provider: :provider, model: "model" }
      # Format: "model" -> { provider: nil, model: "model" } (RubyLLM determines provider)
      def parse_provider_model(provider_model_string)
        return { provider: nil, model: nil } if provider_model_string.nil? || provider_model_string.empty?

        provider, model = provider_model_string.split("/", 2)
        if model
          { provider: provider.to_sym, model: model }
        else
          # No slash: let RubyLLM determine the provider from the model name.
          { provider: nil, model: provider_model_string }
        end
      end

      # Delegate unknown methods to the internal OpenStruct so settings read
      # like attributes (e.g. config.database_config).
      def method_missing(method_name, *args, &block)
        @config.send(method_name, *args, &block)
      end

      def respond_to_missing?(method_name, include_private = false)
        @config.respond_to?(method_name, include_private) || super
      end

      private

      # Recursively replace Proc values (ENV lookups) with their results.
      # Arrays are walked too, in case a list ever contains a lambda.
      def resolve_procs(obj)
        case obj
        when Hash
          obj.transform_values { |v| resolve_procs(v) }
        when Array
          obj.map { |v| resolve_procs(v) }
        when Proc
          obj.call
        else
          obj
        end
      end

      # Merge +hash2+ over +hash1+, recursing into nested hashes so partial
      # overrides don't clobber whole sections of DEFAULT.
      def deep_merge(hash1, hash2)
        hash1.merge(hash2) do |_key, oldval, newval|
          oldval.is_a?(Hash) && newval.is_a?(Hash) ? deep_merge(oldval, newval) : newval
        end
      end
    end
  end
end
193
+
194
+ __END__
195
+
196
+ {
197
+ directory: "/Users/dewayne/.ragdoll",
198
+ filepath: "/Users/dewayne/.ragdoll/config.yml",
199
+ embedding_config:
200
+ {default:
201
+ {model: "openai/gpt-4o-mini", summary_model: "openai/gpt-4o-mini", keywords_model: "openai/gpt-4o-mini", max_dimensions: 3072},
202
+ text: {model: "openai/text-embedding-3-small", max_tokens: 1000, overlap: 200},
203
+ image: {model: "laion/CLIP-ViT-H-14", max_tokens: 4096, overlap: 128},
204
+ audio: {model: "openl3", transcription_model: "openai/whisper-large-v2", max_tokens: 4096, overlap: 128}},
205
+ chunking: {text: {max_tokens: 1000, overlap: 200}, default: {max_tokens: 4096, overlap: 128}},
206
+ ruby_llm_config:
207
+ {openai: {api_key: "***", organization: nil, project: nil},
208
+ anthropic:
209
+ {api_key: "***"},
210
+ google: {api_key: "***", project_id: nil},
211
+ azure: {api_key: nil, endpoint: nil, api_version: "2024-02-01"},
212
+ ollama: {endpoint: "http://localhost:11434/v1"},
213
+ huggingface: {api_key: nil},
214
+ openrouter: {api_key: nil}},
215
+ summarization_config: {enable: true, model: nil, max_length: 300, min_content_length: 300},
216
+ database_config:
217
+ {adapter: "postgresql",
218
+ database: "ragdoll_development",
219
+ username: "ragdoll",
220
+ password: "ragdoll",
221
+ host: "localhost",
222
+ port: 5432,
223
+ pool: 20,
224
+ timeout: 5000,
225
+ auto_migrate: true,
226
+ logger: nil},
227
+ logging_config: {level: :warn, directory: "/Users/dewayne/.ragdoll", filepath: "/Users/dewayne/.ragdoll/ragdoll.log"},
228
+ search:
229
+ {similarity_threshold: 0.7,
230
+ max_results: 10,
231
+ enable_analytics: true,
232
+ enable_usage_tracking: true,
233
+ usage_ranking_enabled: true,
234
+ usage_recency_weight: 0.3,
235
+ usage_frequency_weight: 0.7,
236
+ usage_similarity_weight: 1.0},
237
+ llm_provider: :openai,
238
+ openai_api_key: "***",
239
+ llm_config:
240
+ {openai: {api_key: "***", organization: nil, project: nil},
241
+ anthropic:
242
+ {api_key: "***"},
243
+ google: {api_key: "***", project_id: nil},
244
+ azure: {api_key: nil, endpoint: nil, api_version: "2024-02-01"},
245
+ ollama: {endpoint: "http://localhost:11434"},
246
+ huggingface: {api_key: nil},
247
+ openrouter: {api_key: nil}},
248
+ embedding_provider: :openai,
249
+ embedding_model: "text-embedding-3-small",
250
+ max_embedding_dimensions: 3072,
251
+ cache_embeddings: true,
252
+ default_model: "gpt-4o-mini",
253
+ summary_provider_model: "openai/gpt-4o-mini",
254
+ keywords_provider_model: "openai/gpt-4o-mini",
255
+ embeddings_provider_model: "openai/text-embedding-3-small",
256
+ summary_model: nil,
257
+ chunk_size: 1000,
258
+ chunk_overlap: 200,
259
+ enable_document_summarization: true,
260
+ summary_max_length: 300,
261
+ summary_min_content_length: 300,
262
+ prompt_template: nil,
263
+ search_similarity_threshold: 0.7,
264
+ max_search_results: 10,
265
+ enable_search_analytics: true,
266
+ enable_usage_tracking: true,
267
+ usage_ranking_enabled: true,
268
+ usage_recency_weight: 0.3,
269
+ usage_frequency_weight: 0.7,
270
+ usage_similarity_weight: 1.0,
271
+ log_level: :warn,
272
+ log_file: "/Users/dewayne/.ragdoll/ragdoll.log"
273
+ }