ragdoll 0.1.0 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +318 -40
- data/Rakefile +66 -4
- data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
- data/app/jobs/ragdoll/extract_text_job.rb +38 -0
- data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
- data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
- data/app/lib/ragdoll/metadata_schemas.rb +332 -0
- data/app/models/ragdoll/audio_content.rb +142 -0
- data/app/models/ragdoll/content.rb +95 -0
- data/app/models/ragdoll/document.rb +606 -4
- data/app/models/ragdoll/embedding.rb +172 -5
- data/app/models/ragdoll/image_content.rb +194 -0
- data/app/models/ragdoll/text_content.rb +137 -0
- data/app/services/ragdoll/configuration_service.rb +113 -0
- data/app/services/ragdoll/document_management.rb +108 -0
- data/app/services/ragdoll/document_processor.rb +342 -0
- data/app/services/ragdoll/embedding_service.rb +202 -0
- data/app/services/ragdoll/image_description_service.rb +230 -0
- data/app/services/ragdoll/metadata_generator.rb +329 -0
- data/app/services/ragdoll/model_resolver.rb +72 -0
- data/app/services/ragdoll/search_engine.rb +51 -0
- data/app/services/ragdoll/text_chunker.rb +208 -0
- data/app/services/ragdoll/text_generation_service.rb +355 -0
- data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
- data/db/migrate/004_create_ragdoll_documents.rb +70 -0
- data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
- data/db/migrate/006_create_ragdoll_contents.rb +47 -0
- data/lib/ragdoll/core/client.rb +306 -0
- data/lib/ragdoll/core/configuration.rb +257 -0
- data/lib/ragdoll/core/database.rb +141 -0
- data/lib/ragdoll/core/errors.rb +11 -0
- data/lib/ragdoll/core/model.rb +45 -0
- data/lib/ragdoll/core/shrine_config.rb +71 -0
- data/lib/ragdoll/core/version.rb +8 -0
- data/lib/ragdoll/core.rb +91 -0
- data/lib/ragdoll-core.rb +3 -0
- data/lib/ragdoll.rb +243 -6
- data/lib/tasks/annotate.rake +126 -0
- data/lib/tasks/db.rake +338 -0
- metadata +42 -35
- data/config/initializers/ragdoll.rb +0 -6
- data/config/routes.rb +0 -5
- data/db/migrate/20250218123456_create_documents.rb +0 -20
- data/lib/config/database.yml +0 -28
- data/lib/config/ragdoll.yml +0 -31
- data/lib/ragdoll/engine.rb +0 -16
- data/lib/ragdoll/import_job.rb +0 -15
- data/lib/ragdoll/ingestion.rb +0 -30
- data/lib/ragdoll/search.rb +0 -18
- data/lib/ragdoll/version.rb +0 -7
- data/lib/tasks/import_task.thor +0 -32
- data/lib/tasks/jobs_task.thor +0 -40
- data/lib/tasks/ragdoll_tasks.thor +0 -7
- data/lib/tasks/search_task.thor +0 -55
@@ -0,0 +1,47 @@
|
|
1
|
+
# Creates the ragdoll_contents table: a single-table-inheritance (STI) store
# for the text / image / audio content rows that belong to a document and
# feed the embedding pipeline. The `type` column is the STI discriminator.
class CreateRagdollContents < ActiveRecord::Migration[7.0]
  def change
    create_table :ragdoll_contents,
                 comment: "Content storage for polymorphic embedding architecture using STI" do |t|
      # STI discriminator column.
      t.string :type, null: false,
                      comment: "Type of content (e.g., AudioContent, ImageContent, TextContent)"

      t.references :document, null: false, foreign_key: { to_table: :ragdoll_documents },
                              comment: "Reference to parent document"

      t.string :embedding_model, null: false,
                                 comment: "Embedding model to use for this content"

      t.text :content,
             comment: "Text content or description of the file"

      t.text :data,
             comment: "Raw data from file"

      t.json :metadata, default: {},
                        comment: "Additional metadata about the file's raw data"

      # Audio-specific attributes; NULL for non-audio content rows.
      t.float :duration,
              comment: "Duration of audio in seconds (for audio content)"

      t.integer :sample_rate,
                comment: "Audio sample rate in Hz (for audio content)"

      t.timestamps null: false,
                   comment: "Standard creation and update timestamps"

      # -- Indexes -----------------------------------------------------------

      t.index :embedding_model,
              comment: "Index for filtering by embedding model"

      t.index :type,
              comment: "Index for filtering by content type"

      # Expression index (GIN over a tsvector) — PostgreSQL-specific.
      t.index "to_tsvector('english', COALESCE(content, ''))", using: :gin,
              name: "index_ragdoll_contents_on_fulltext_search",
              comment: "Full-text search index for text content"
    end
  end
end
|
@@ -0,0 +1,306 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "fileutils"
|
4
|
+
|
5
|
+
module Ragdoll
|
6
|
+
module Core
|
7
|
+
# High-level facade for the Ragdoll RAG system: document ingestion,
# semantic / hybrid search, and prompt enhancement. Constructing a client
# wires up configuration, logging, the database connection, and the
# embedding/search services.
class Client
  def initialize
    # Shared configuration + model-resolution services used by all collaborators.
    @config_service = Ragdoll::ConfigurationService.new
    @model_resolver = Ragdoll::ModelResolver.new(@config_service)

    setup_logging

    # Establish the database connection before any model access.
    Database.setup(@config_service.config.database)

    @embedding_service = Ragdoll::EmbeddingService.new(
      client: nil,
      config_service: @config_service,
      model_resolver: @model_resolver
    )
    @search_engine = Ragdoll::SearchEngine.new(@embedding_service, config_service: @config_service)
  end

  # Primary method for RAG applications.
  # Returns context-enhanced content for AI prompts.
  #
  # @param prompt [String] the user prompt to enhance
  # @param context_limit [Integer] maximum number of context chunks to retrieve
  # @return [Hash] :enhanced_prompt, :original_prompt, :context_sources, :context_count
  def enhance_prompt(prompt:, context_limit: 5, **options)
    context_data = get_context(query: prompt, limit: context_limit, **options)

    if context_data[:context_chunks].any?
      enhanced_prompt = build_enhanced_prompt(prompt, context_data[:combined_context])
      {
        enhanced_prompt: enhanced_prompt,
        original_prompt: prompt,
        context_sources: context_data[:context_chunks].map { |chunk| chunk[:source] },
        context_count: context_data[:total_chunks]
      }
    else
      # No relevant context found — return the prompt unchanged.
      {
        enhanced_prompt: prompt,
        original_prompt: prompt,
        context_sources: [],
        context_count: 0
      }
    end
  end

  # Get relevant context without prompt enhancement.
  #
  # @return [Hash] :context_chunks, :combined_context, :total_chunks
  def get_context(query:, limit: 10, **options)
    results = search_similar_content(query: query, limit: limit, **options)

    context_chunks = results.map do |result|
      {
        content: result[:content],
        source: result[:document_location],
        similarity: result[:similarity],
        chunk_index: result[:chunk_index]
      }
    end

    combined_context = context_chunks.map { |chunk| chunk[:content] }.join("\n\n")

    {
      context_chunks: context_chunks,
      combined_context: combined_context,
      total_chunks: context_chunks.length
    }
  end

  # FIXME: This high-level API method should be able to take a query that is
  #        a string or a file. If it's a file, then the downstream process will
  #        be responsible for reading the file and passing the contents to the
  #        search method based upon whether the content is text, image or audio.

  # Semantic search; should eventually incorporate hybrid search.
  def search(query:, **options)
    results = search_similar_content(query: query, **options)

    {
      query: query,
      results: results,
      total_results: results.length
    }
  end

  # Search similar content (core functionality).
  def search_similar_content(query:, **options)
    @search_engine.search_similar_content(query, **options)
  end

  # Hybrid search combining semantic and full-text search.
  # Degrades gracefully: failures are reported in the result hash, not raised.
  def hybrid_search(query:, **options)
    # Generate embedding for the query.
    query_embedding = @embedding_service.generate_embedding(query)

    results = Ragdoll::Document.hybrid_search(query, query_embedding: query_embedding, **options)

    {
      query: query,
      search_type: "hybrid",
      results: results,
      total_results: results.length,
      semantic_weight: options[:semantic_weight] || 0.7,
      text_weight: options[:text_weight] || 0.3
    }
  rescue StandardError => e
    {
      query: query,
      search_type: "hybrid",
      results: [],
      total_results: 0,
      error: "Hybrid search failed: #{e.message}"
    }
  end

  # Parse, store, and (asynchronously) process a document from disk.
  # Always returns a result hash; errors are captured under :error, not raised.
  def add_document(path:)
    parsed = Ragdoll::DocumentProcessor.parse(path)

    # Extract title from metadata or fall back to the filename.
    title = parsed[:metadata][:title] ||
            File.basename(path, File.extname(path))

    doc_id = Ragdoll::DocumentManagement.add_document(path, parsed[:content], {
      title: title,
      document_type: parsed[:document_type],
      **parsed[:metadata]
    })

    # Queue background jobs for processing if content is available.
    embeddings_queued = false
    if parsed[:content].present?
      Ragdoll::GenerateEmbeddingsJob.perform_later(doc_id)
      Ragdoll::GenerateSummaryJob.perform_later(doc_id)
      Ragdoll::ExtractKeywordsJob.perform_later(doc_id)
      embeddings_queued = true
    end

    {
      success: true,
      document_id: doc_id,
      title: title,
      document_type: parsed[:document_type],
      content_length: parsed[:content]&.length || 0,
      embeddings_queued: embeddings_queued,
      message: "Document '#{title}' added successfully with ID #{doc_id}"
    }
  rescue StandardError => e
    {
      success: false,
      error: e.message,
      message: "Failed to add document: #{e.message}"
    }
  end

  # Add a raw text document. Returns the new document id.
  def add_text(content:, title:, **options)
    doc_id = Ragdoll::DocumentManagement.add_document(title, content, {
      title: title,
      document_type: "text",
      **options
    })

    # Queue background job for embeddings.
    Ragdoll::GenerateEmbeddingsJob.perform_later(doc_id,
                                                 chunk_size: options[:chunk_size],
                                                 chunk_overlap: options[:chunk_overlap])

    doc_id
  end

  # Add every file in a directory (optionally recursing into subdirectories).
  # Returns one result hash per file with :status "success" or "error".
  def add_directory(path:, recursive: false)
    results = []
    pattern = recursive ? File.join(path, "**", "*") : File.join(path, "*")

    Dir.glob(pattern).each do |file_path|
      next unless File.file?(file_path)

      # BUGFIX: add_document returns a result hash and rescues its own errors,
      # so inspect :success rather than relying on an exception being raised
      # (the old rescue branch was dead code and every file reported "success").
      result = add_document(path: file_path)
      results << if result[:success]
                   { file: file_path, document_id: result[:document_id], status: "success" }
                 else
                   { file: file_path, error: result[:error], status: "error" }
                 end
    end

    results
  end

  # Fetch a document as a hash, or nil when it does not exist.
  def get_document(id:)
    document_hash = Ragdoll::DocumentManagement.get_document(id)
    return nil unless document_hash

    # DocumentManagement.get_document already returns a hash with all needed info.
    document_hash
  end

  # Report processing status and embedding readiness for a document.
  def document_status(id:)
    document = Ragdoll::Document.find(id)
    embeddings_count = document.all_embeddings.count

    {
      id: document.id,
      title: document.title,
      status: document.status,
      embeddings_count: embeddings_count,
      embeddings_ready: embeddings_count.positive?,
      content_preview: document.content&.first(200) || "No content",
      message: case document.status
               when "processed"
                 "Document processed successfully with #{embeddings_count} embeddings"
               when "processing"
                 "Document is being processed"
               when "pending"
                 "Document is pending processing"
               when "error"
                 "Document processing failed"
               else
                 "Document status: #{document.status}"
               end
    }
  rescue ActiveRecord::RecordNotFound
    {
      success: false,
      error: "Document not found",
      message: "Document with ID #{id} does not exist"
    }
  end

  def update_document(id:, **updates)
    Ragdoll::DocumentManagement.update_document(id, **updates)
  end

  def delete_document(id:)
    Ragdoll::DocumentManagement.delete_document(id)
  end

  def list_documents(**options)
    Ragdoll::DocumentManagement.list_documents(options)
  end

  # Analytics and stats.
  def stats
    Ragdoll::DocumentManagement.get_document_stats
  end

  # Count embedding usages per day over the trailing window.
  def search_analytics(days: 30)
    Ragdoll::Embedding.where("returned_at > ?", days.days.ago)
                      .group("DATE(returned_at)")
                      .count
  end

  # Health check: database reachable and stats queryable.
  def healthy?
    Database.connected? && stats[:total_documents] >= 0
  rescue StandardError
    false
  end

  private

  # Configure the file logger and ActiveJob (inline adapter, reduced verbosity).
  def setup_logging
    require "logger"
    require "active_job"

    # Create log directory if it doesn't exist.
    log_file = @config_service.config.logging[:filepath]
    log_dir = File.dirname(log_file)
    FileUtils.mkdir_p(log_dir) unless Dir.exist?(log_dir)

    # Set up logger with the configured level (defaults to WARN).
    logger = Logger.new(log_file)
    logger.level = case @config_service.config.logging[:level]
                   when :debug then Logger::DEBUG
                   when :info then Logger::INFO
                   when :warn then Logger::WARN
                   when :error then Logger::ERROR
                   when :fatal then Logger::FATAL
                   else Logger::WARN
                   end

    # Configure ActiveJob to use our logger and reduce verbosity.
    ActiveJob::Base.logger = logger
    ActiveJob::Base.logger.level = Logger::WARN

    # Use the inline adapter so jobs execute immediately in-process.
    ActiveJob::Base.queue_adapter = :inline
  end

  # Substitute context and prompt into the configured RAG template.
  def build_enhanced_prompt(original_prompt, context)
    template = @config_service.config.prompt_template(:rag_enhancement)

    template
      .gsub("{{context}}", context)
      .gsub("{{prompt}}", original_prompt)
  end
end
|
305
|
+
end
|
306
|
+
end
|
@@ -0,0 +1,257 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "yaml"
|
4
|
+
require "fileutils"
|
5
|
+
require "ostruct"
|
6
|
+
require_relative "model"
|
7
|
+
|
8
|
+
module Ragdoll
|
9
|
+
module Core
|
10
|
+
# Central configuration object. Defaults live in DEFAULT; user overrides are
# deep-merged on top, lazily-evaluated procs (ENV lookups, Model builders)
# are resolved at construction, and the result is exposed via an OpenStruct
# with method delegation.
class Configuration
  # Raised when the configuration file does not exist at the given path.
  class ConfigurationFileNotFoundError < StandardError; end
  # Raised when the configuration cannot be written to disk.
  class ConfigurationSaveError < StandardError; end
  # Raised for any other failure while loading the configuration.
  class ConfigurationLoadUnknownError < StandardError; end

  DEFAULT = {
    # Base directory for all Ragdoll files - single source of truth
    base_directory: File.join(Dir.home, ".config", "ragdoll"),

    # Configuration file path derived from base directory
    config_filepath: File.join(Dir.home, ".config", "ragdoll", "config.yml"),

    # Model configurations organized by purpose with inheritance support
    models: {
      text_generation: {
        default: -> { Model.new(ENV.fetch("RAGDOLL_DEFAULT_TEXT_MODEL", "openai/gpt-4o")) },
        summary: -> { Model.new(ENV.fetch("RAGDOLL_SUMMARY_MODEL", "openai/gpt-4o")) },
        keywords: -> { Model.new(ENV.fetch("RAGDOLL_KEYWORDS_MODEL", "openai/gpt-4o")) }
      },
      embedding: {
        provider: :openai,
        text: -> { Model.new(ENV.fetch("RAGDOLL_TEXT_EMBEDDING_MODEL", "openai/text-embedding-3-small")) },
        image: -> { Model.new(ENV.fetch("RAGDOLL_IMAGE_EMBEDDING_MODEL", "openai/clip-vit-base-patch32")) },
        audio: -> { Model.new(ENV.fetch("RAGDOLL_AUDIO_EMBEDDING_MODEL", "openai/whisper-1")) },
        max_dimensions: 3072,
        cache_embeddings: true
      }
    },

    # Processing configuration by content type
    processing: {
      text: {
        chunking: {
          max_tokens: 1000,
          overlap: 200
        }
      },
      default: {
        chunking: {
          max_tokens: 4096,
          overlap: 128
        }
      },
      search: {
        similarity_threshold: 0.7,
        max_results: 10,
        analytics: {
          enable: true,
          usage_tracking_enabled: true,
          ranking_enabled: true,
          recency_weight: 0.3,
          frequency_weight: 0.7,
          similarity_weight: 1.0
        }
      }
    },

    # LLM provider configurations (renamed from ruby_llm_config)
    llm_providers: {
      default_provider: :openai,
      openai: {
        api_key: -> { ENV.fetch("OPENAI_API_KEY", nil) },
        organization: -> { ENV.fetch("OPENAI_ORGANIZATION", nil) },
        project: -> { ENV.fetch("OPENAI_PROJECT", nil) }
      },
      anthropic: {
        api_key: -> { ENV.fetch("ANTHROPIC_API_KEY", nil) }
      },
      google: {
        api_key: -> { ENV.fetch("GOOGLE_API_KEY", nil) },
        project_id: -> { ENV.fetch("GOOGLE_PROJECT_ID", nil) }
      },
      azure: {
        api_key: -> { ENV.fetch("AZURE_OPENAI_API_KEY", nil) },
        endpoint: -> { ENV.fetch("AZURE_OPENAI_ENDPOINT", nil) },
        api_version: -> { ENV.fetch("AZURE_OPENAI_API_VERSION", "2024-02-01") }
      },
      ollama: {
        endpoint: -> { ENV.fetch("OLLAMA_ENDPOINT", "http://localhost:11434") }
      },
      huggingface: {
        api_key: -> { ENV.fetch("HUGGINGFACE_API_KEY", nil) }
      },
      openrouter: {
        api_key: -> { ENV.fetch("OPENROUTER_API_KEY", nil) }
      }
    },

    # Summarization configuration
    summarization: {
      enable: true,
      max_length: 300,
      min_content_length: 300
    },

    # Database configuration with standardized ENV variable name
    database: {
      adapter: "postgresql",
      database: "ragdoll_development",
      username: "ragdoll",
      password: -> { ENV.fetch("RAGDOLL_DATABASE_PASSWORD", nil) },
      host: "localhost",
      port: 5432,
      auto_migrate: true,
      logger: nil
    },

    # Logging configuration with corrected key names and path derivation
    logging: {
      level: :warn, # Fixed: was log_level, now matches usage
      directory: File.join(Dir.home, ".config", "ragdoll", "logs"),
      filepath: File.join(Dir.home, ".config", "ragdoll", "logs", "ragdoll.log")
    },

    # Prompt templates for customizable text generation
    prompt_templates: {
      rag_enhancement: <<~TEMPLATE.strip
        You are an AI assistant. Use the following context to help answer the user's question.
        If the context doesn't contain relevant information, say so.

        Context:
        {{context}}

        Question: {{prompt}}

        Answer:
      TEMPLATE
    }

  }.freeze

  # Deep-merge user config over DEFAULT, resolve lazy procs, and wrap in an
  # OpenStruct for dotted access.
  def initialize(config = {})
    merged_config = deep_merge(self.class::DEFAULT, config)
    resolved_config = resolve_procs(merged_config, [])
    @config = OpenStruct.new(resolved_config)
  end

  # Load configuration from a YAML file (defaults to DEFAULT[:config_filepath]).
  #
  # @raise [ConfigurationFileNotFoundError] when the file does not exist
  # @raise [ConfigurationLoadUnknownError] for any other load failure
  def self.load(path: nil)
    path ||= DEFAULT[:config_filepath]

    raise ConfigurationFileNotFoundError, "Configuration file not found: #{path}" unless File.exist?(path)

    # BUGFIX: symbolize_names so loaded keys line up with the symbol-keyed
    # DEFAULT tree — with plain string keys deep_merge never overrode any
    # default. Symbol is permitted so files written by #save (symbol keys)
    # load as well.
    new(YAML.safe_load_file(path, permitted_classes: [Symbol], symbolize_names: true) || {})
  rescue ConfigurationFileNotFoundError
    # BUGFIX: re-raise as-is — the StandardError rescue below used to catch
    # this and re-wrap it as ConfigurationLoadUnknownError.
    raise
  rescue Errno::ENOENT
    raise ConfigurationFileNotFoundError, "Configuration file not found: #{path}"
  rescue StandardError => e
    raise ConfigurationLoadUnknownError, "Failed to load configuration from #{path}: #{e.message}"
  end

  # Persist the configuration as YAML. With an explicit path, the stored
  # config_filepath is updated to that path (and restored on failure).
  #
  # @raise [ConfigurationSaveError] when the file cannot be written
  def save(path: nil)
    if path.nil?
      path = @config.config_filepath
    else
      save_filepath = @config.config_filepath
      @config.config_filepath = path
    end

    FileUtils.mkdir_p(File.dirname(path))

    # BUGFIX: serialize the underlying hash rather than the OpenStruct —
    # OpenStruct#to_yaml emits a !ruby/object tag that YAML.safe_load_file
    # in .load refuses to parse. NOTE(review): resolved Model instances under
    # :models still serialize as ruby object tags; verify round-trip of that
    # subtree if save/load of models is required.
    File.write(path, @config.to_h.to_yaml)
  rescue StandardError => e
    @config.config_filepath = save_filepath unless save_filepath.nil?
    raise ConfigurationSaveError, "Failed to save configuration to #{path}: #{e.message}"
  end

  # SMELL: isn't this method more of a utility?

  # Parse a provider/model string into its components
  # Format: "provider/model" -> { provider: :provider, model: "model" }
  # Format: "model" -> { provider: nil, model: "model" } (RubyLLM determines provider)
  def parse_provider_model(provider_model_string)
    return { provider: nil, model: nil } if provider_model_string.nil? || provider_model_string.empty?

    parts = provider_model_string.split("/", 2)
    if parts.length == 2
      { provider: parts[0].to_sym, model: parts[1] }
    else
      # If no slash, let RubyLLM determine provider from model name
      { provider: nil, model: provider_model_string }
    end
  end

  # Resolve model with inheritance support.
  # Returns the configured model for a given task, falling back to the
  # text-generation default for unknown tasks.
  def resolve_model(task_type)
    case task_type
    when :embedding
      @config.models[:embedding]
    when :text, :summary, :keywords, :default
      @config.models[:text_generation][task_type] || @config.models[:text_generation][:default]
    else
      @config.models[:text_generation][:default]
    end
  end

  # Get provider credentials for a given provider (default provider when nil).
  def provider_credentials(provider = nil)
    provider ||= @config.llm_providers[:default_provider]
    @config.llm_providers[provider] || {}
  end

  # Resolve embedding model for a content type, falling back to :text.
  def embedding_model(content_type = :text)
    @config.models[:embedding][content_type] || @config.models[:embedding][:text]
  end

  # Get a prompt template by name.
  def prompt_template(template_name = :rag_enhancement)
    @config.prompt_templates[template_name]
  end

  # Enable method delegation to the internal OpenStruct.
  def method_missing(method_name, *args, &block)
    @config.send(method_name, *args, &block)
  end

  def respond_to_missing?(method_name, include_private = false)
    @config.respond_to?(method_name, include_private) || super
  end

  private

  # Recursively walk the config tree: call procs, and promote bare strings
  # under the :models subtree to Model instances.
  def resolve_procs(obj, path = [])
    case obj
    when Hash
      obj.each_with_object({}) { |(k, v), result| result[k] = resolve_procs(v, path + [k]) }
    when Proc
      obj.call
    when String
      # Convert strings to Model instances in the models configuration section
      if path.length >= 2 && path[0] == :models
        Model.new(obj)
      else
        obj
      end
    else
      obj
    end
  end

  # Recursive hash merge: nested hashes merge, everything else is overridden.
  def deep_merge(hash1, hash2)
    hash1.merge(hash2) do |_key, oldval, newval|
      oldval.is_a?(Hash) && newval.is_a?(Hash) ? deep_merge(oldval, newval) : newval
    end
  end
end
|
256
|
+
end
|
257
|
+
end
|