ragdoll 0.1.1 → 0.1.3
This diff shows the changes between publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/Rakefile +52 -1
- data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
- data/app/jobs/ragdoll/extract_text_job.rb +38 -0
- data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
- data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
- data/app/lib/ragdoll/metadata_schemas.rb +332 -0
- data/app/models/ragdoll/audio_content.rb +142 -0
- data/app/models/ragdoll/content.rb +95 -0
- data/app/models/ragdoll/document.rb +611 -0
- data/app/models/ragdoll/embedding.rb +176 -0
- data/app/models/ragdoll/image_content.rb +194 -0
- data/app/models/ragdoll/text_content.rb +137 -0
- data/app/services/ragdoll/configuration_service.rb +113 -0
- data/app/services/ragdoll/document_management.rb +108 -0
- data/app/services/ragdoll/document_processor.rb +342 -0
- data/app/services/ragdoll/embedding_service.rb +202 -0
- data/app/services/ragdoll/image_description_service.rb +230 -0
- data/app/services/ragdoll/metadata_generator.rb +329 -0
- data/app/services/ragdoll/model_resolver.rb +72 -0
- data/app/services/ragdoll/search_engine.rb +51 -0
- data/app/services/ragdoll/text_chunker.rb +208 -0
- data/app/services/ragdoll/text_generation_service.rb +355 -0
- data/lib/ragdoll/core/client.rb +32 -41
- data/lib/ragdoll/core/configuration.rb +140 -156
- data/lib/ragdoll/core/database.rb +1 -1
- data/lib/ragdoll/core/model.rb +45 -0
- data/lib/ragdoll/core/version.rb +1 -1
- data/lib/ragdoll/core.rb +35 -17
- data/lib/ragdoll.rb +1 -1
- data/lib/tasks/annotate.rake +1 -1
- data/lib/tasks/db.rake +2 -2
- metadata +24 -20
- data/lib/ragdoll/core/document_management.rb +0 -110
- data/lib/ragdoll/core/document_processor.rb +0 -344
- data/lib/ragdoll/core/embedding_service.rb +0 -183
- data/lib/ragdoll/core/jobs/extract_keywords.rb +0 -32
- data/lib/ragdoll/core/jobs/extract_text.rb +0 -42
- data/lib/ragdoll/core/jobs/generate_embeddings.rb +0 -32
- data/lib/ragdoll/core/jobs/generate_summary.rb +0 -29
- data/lib/ragdoll/core/metadata_schemas.rb +0 -334
- data/lib/ragdoll/core/models/audio_content.rb +0 -175
- data/lib/ragdoll/core/models/content.rb +0 -126
- data/lib/ragdoll/core/models/document.rb +0 -678
- data/lib/ragdoll/core/models/embedding.rb +0 -204
- data/lib/ragdoll/core/models/image_content.rb +0 -227
- data/lib/ragdoll/core/models/text_content.rb +0 -169
- data/lib/ragdoll/core/search_engine.rb +0 -50
- data/lib/ragdoll/core/services/image_description_service.rb +0 -230
- data/lib/ragdoll/core/services/metadata_generator.rb +0 -335
- data/lib/ragdoll/core/text_chunker.rb +0 -210
- data/lib/ragdoll/core/text_generation_service.rb +0 -360
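
Taken together, the file list describes a restructuring rather than a feature change: the models, services, jobs, and metadata schemas that previously lived under data/lib/ragdoll/core/ are removed and re-added under a Rails-style data/app/ tree. The added files reproduced below define their classes directly under the Ragdoll namespace; as a quick orientation (constants taken from the added file contents shown later in this diff):

    # Constants defined by the added files reproduced later in this diff:
    Ragdoll::ExtractTextJob        # data/app/jobs/ragdoll/extract_text_job.rb
    Ragdoll::ExtractKeywordsJob    # data/app/jobs/ragdoll/extract_keywords_job.rb
    Ragdoll::GenerateEmbeddingsJob # data/app/jobs/ragdoll/generate_embeddings_job.rb
    Ragdoll::GenerateSummaryJob    # data/app/jobs/ragdoll/generate_summary_job.rb
    Ragdoll::MetadataSchemas       # data/app/lib/ragdoll/metadata_schemas.rb
    Ragdoll::AudioContent          # data/app/models/ragdoll/audio_content.rb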
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2016536d66d295c1fe5054aedb77526271692d7562131df9de9e1ad756309459
+  data.tar.gz: 725a221ab132fd9ce77f623114c034d675c626428c9d5d8c72e45e275b08feea
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 221c7d3408a9ec1b4c2f735bf733ae40aab896fdc07858b69de0866acb684c1eb65a3fb054342a6d20cd8a6e0b4e3f0c866f1df3a5bd8e5a475d6c3d72062b1a
+  data.tar.gz: 3228762fd152ff2a2fd5c0f514ae39e11e483dba698b8139f6c0696437a70209fb0576d67fb271eed45c4c7a2c08247dcbd68a2eab8f19cda144c01d38c2299f
data/README.md
CHANGED
@@ -8,7 +8,7 @@
   <tr>
     <td width="50%">
       <a href="https://research.ibm.com/blog/retrieval-augmented-generation-RAG" target="_blank">
-        <img src="
+        <img src="ragdoll.png" alt="Ragdoll" width="800">
       </a>
     </td>
     <td width="50%" valign="top">
data/Rakefile
CHANGED
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-require
+require "simplecov"
 SimpleCov.start
 
 # Suppress bundler/rubygems warnings
@@ -9,12 +9,63 @@ $VERBOSE = nil
 require "bundler/gem_tasks"
 require "rake/testtask"
 
+def ci_environment?
+  ENV["CI"] == "true" || ENV["RAGDOLL_SKIP_DATABASE_TESTS"] == "true"
+end
+
+desc "Setup test database"
+task :setup_test_db do
+  require_relative "lib/ragdoll-core"
+
+  # Database configuration for tests
+  test_db_config = {
+    adapter: "postgresql",
+    database: "ragdoll_test",
+    username: ENV.fetch("RAGDOLL_POSTGRES_USER", "postgres"),
+    password: ENV.fetch("RAGDOLL_POSTGRES_PASSWORD", ""),
+    host: ENV.fetch("RAGDOLL_POSTGRES_HOST", "localhost"),
+    port: ENV.fetch("RAGDOLL_POSTGRES_PORT", 5432)
+  }
+
+  # Ensure database exists
+  begin
+    # Try to connect to the database
+    ActiveRecord::Base.establish_connection(test_db_config)
+    ActiveRecord::Base.connection.execute("SELECT 1")
+  rescue ActiveRecord::NoDatabaseError
+    # Database doesn't exist, create it
+    puts "Creating ragdoll_test database..."
+    admin_config = test_db_config.merge(database: "postgres")
+    ActiveRecord::Base.establish_connection(admin_config)
+    ActiveRecord::Base.connection.execute("CREATE DATABASE ragdoll_test")
+    ActiveRecord::Base.establish_connection(test_db_config)
+  rescue PG::ConnectionBad => e
+    puts "Error connecting to PostgreSQL: #{e.message}"
+    puts "Please ensure PostgreSQL is running and accessible"
+    exit 1
+  end
+
+  # Ensure pgvector extension is installed
+  begin
+    ActiveRecord::Base.connection.execute("CREATE EXTENSION IF NOT EXISTS vector")
+  rescue StandardError => e
+    puts "Warning: Could not install pgvector extension: #{e.message}"
+  end
+
+  # Run migrations
+  Ragdoll::Core::Database.setup(test_db_config.merge(auto_migrate: true, logger: nil))
+  puts "Test database setup complete"
+end
+
 Rake::TestTask.new(:test) do |t|
   t.libs << "test"
   t.libs << "lib"
   t.test_files = FileList["test/**/*_test.rb"]
 end
 
+# Make test task depend on database setup only if not skipping database tests
+task test: :setup_test_db unless ci_environment?
+
 # Load annotate tasks
 Dir.glob("lib/tasks/*.rake").each { |r| load r }
 
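
The new setup_test_db task reads its connection settings from the RAGDOLL_POSTGRES_* environment variables and is wired in as a prerequisite of the test task unless CI or RAGDOLL_SKIP_DATABASE_TESTS is "true". A minimal sketch of driving it from a Ruby session, assuming the gem checkout with its dependencies installed (this invocation is not part of the diff):

    # Assumed usage sketch, not from the diff: run the new task with the
    # environment variables it reads.
    ENV["RAGDOLL_POSTGRES_USER"] ||= "postgres"
    ENV["RAGDOLL_POSTGRES_HOST"] ||= "localhost"

    require "rake"
    load "Rakefile"                      # registers :setup_test_db and the :test dependency
    Rake::Task["setup_test_db"].invoke   # creates ragdoll_test, enables pgvector, runs migrations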
data/app/jobs/ragdoll/extract_keywords_job.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+require "active_job"
+
+module Ragdoll
+  class ExtractKeywordsJob < ActiveJob::Base
+    queue_as :default
+
+    def perform(document_id)
+      document = Ragdoll::Document.find(document_id)
+      return unless document.content.present?
+      return if document.keywords.present?
+
+      text_service = Ragdoll::TextGenerationService.new
+      keywords_array = text_service.extract_keywords(document.content)
+
+      if keywords_array.present?
+        keywords_string = keywords_array.join(", ")
+        document.update!(keywords: keywords_string)
+      end
+    rescue ActiveRecord::RecordNotFound
+      # Document was deleted, nothing to do
+    rescue StandardError => e
+      Rails.logger.error "Failed to generate keywords for document #{document_id}: #{e.message}" if defined?(Rails)
+      raise e
+    end
+  end
+end
data/app/jobs/ragdoll/extract_text_job.rb
ADDED
@@ -0,0 +1,38 @@
+# frozen_string_literal: true
+
+require "active_job"
+
+module Ragdoll
+  class ExtractTextJob < ActiveJob::Base
+    queue_as :default
+
+    def perform(document_id)
+      document = Ragdoll::Document.find(document_id)
+      return unless document.file_attached?
+      return if document.content.present?
+
+      document.update!(status: "processing")
+
+      extracted_content = document.extract_text_from_file
+
+      if extracted_content.present?
+        document.update!(
+          content: extracted_content,
+          status: "processed"
+        )
+
+        # Queue follow-up jobs
+        Ragdoll::GenerateSummaryJob.perform_later(document_id)
+        Ragdoll::ExtractKeywordsJob.perform_later(document_id)
+        Ragdoll::GenerateEmbeddingsJob.perform_later(document_id)
+      else
+        document.update!(status: "error")
+      end
+    rescue ActiveRecord::RecordNotFound
+      # Document was deleted, nothing to do
+    rescue StandardError => e
+      document&.update!(status: "error")
+      raise e
+    end
+  end
+end
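
As the perform method above shows, ExtractTextJob is the entry point of the ingestion pipeline: once extraction succeeds it fans out to the summary, keyword, and embedding jobs. A minimal sketch of kicking it off (document_id is a hypothetical Ragdoll::Document id; only the job classes come from this diff):

    # Hypothetical enqueue; document_id stands in for a persisted Ragdoll::Document id.
    Ragdoll::ExtractTextJob.perform_later(document_id)
    # On success, the job itself enqueues, in order:
    #   Ragdoll::GenerateSummaryJob.perform_later(document_id)
    #   Ragdoll::ExtractKeywordsJob.perform_later(document_id)
    #   Ragdoll::GenerateEmbeddingsJob.perform_later(document_id)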
data/app/jobs/ragdoll/generate_embeddings_job.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+require "active_job"
+
+module Ragdoll
+  class GenerateEmbeddingsJob < ActiveJob::Base
+    queue_as :default
+
+    def perform(document_id, chunk_size: nil, chunk_overlap: nil)
+      document = Ragdoll::Document.find(document_id)
+      return unless document.content.present?
+      return if document.all_embeddings.exists?
+
+      # Process all content records using their own generate_embeddings! methods
+      document.contents.each(&:generate_embeddings!)
+
+      # Update document status to processed
+      document.update!(status: "processed")
+    rescue ActiveRecord::RecordNotFound
+      # Document was deleted, nothing to do
+    rescue StandardError => e
+      if defined?(Rails)
+        Rails.logger.error "Failed to generate embeddings for document #{document_id}: #{e.message}"
+      end
+      raise e
+    end
+  end
+end
data/app/jobs/ragdoll/generate_summary_job.rb
ADDED
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+require "active_job"
+
+module Ragdoll
+  class GenerateSummaryJob < ActiveJob::Base
+    queue_as :default
+
+    def perform(document_id)
+      document = Ragdoll::Document.find(document_id)
+      return unless document.content.present?
+      return if document.summary.present?
+
+      text_service = Ragdoll::TextGenerationService.new
+      summary = text_service.generate_summary(document.content)
+
+      document.update!(summary: summary) if summary.present?
+    rescue ActiveRecord::RecordNotFound
+      # Document was deleted, nothing to do
+    rescue StandardError => e
+      Rails.logger.error "Failed to generate summary for document #{document_id}: #{e.message}" if defined?(Rails)
+      raise e
+    end
+  end
+end
data/app/lib/ragdoll/metadata_schemas.rb
ADDED
@@ -0,0 +1,332 @@
+# frozen_string_literal: true
+
+module Ragdoll
+  # Document metadata schemas for LLM structured output
+  # Each document type has a specific schema that guides LLM generation
+  module MetadataSchemas
+    # Text document metadata schema
+    TEXT_SCHEMA = {
+      type: "object",
+      properties: {
+        summary: {
+          type: "string",
+          description: "Concise summary of the text content (2-3 paragraphs)"
+        },
+        keywords: {
+          type: "array",
+          items: { type: "string" },
+          description: "Relevant keywords and phrases extracted from the text",
+          maxItems: 10
+        },
+        classification: {
+          type: "string",
+          enum: %w[research article blog documentation technical legal financial marketing other],
+          description: "Document classification category"
+        },
+        topics: {
+          type: "array",
+          items: { type: "string" },
+          description: "Main topics discussed in the document",
+          maxItems: 5
+        },
+        sentiment: {
+          type: "string",
+          enum: %w[positive negative neutral mixed],
+          description: "Overall sentiment of the text"
+        },
+        reading_time_minutes: {
+          type: "integer",
+          description: "Estimated reading time in minutes"
+        },
+        language: {
+          type: "string",
+          description: "Primary language of the text (ISO 639-1 code)"
+        },
+        complexity_level: {
+          type: "string",
+          enum: %w[beginner intermediate advanced expert],
+          description: "Complexity/difficulty level of the content"
+        },
+        tags: {
+          type: "array",
+          items: { type: "string" },
+          description: "User-defined or AI-suggested tags for organization"
+        }
+      },
+      required: %w[summary keywords classification]
+    }.freeze
+
+    # Image document metadata schema
+    IMAGE_SCHEMA = {
+      type: "object",
+      properties: {
+        description: {
+          type: "string",
+          description: "Detailed description of what is shown in the image"
+        },
+        summary: {
+          type: "string",
+          description: "Brief summary of the image content (1 paragraph)"
+        },
+        objects: {
+          type: "array",
+          items: { type: "string" },
+          description: "List of objects, people, or items visible in the image",
+          maxItems: 15
+        },
+        scene_type: {
+          type: "string",
+          enum: %w[indoor outdoor portrait landscape diagram chart screenshot artwork photo other],
+          description: "Type of scene or image category"
+        },
+        colors: {
+          type: "array",
+          items: { type: "string" },
+          description: "Dominant colors in the image",
+          maxItems: 5
+        },
+        style: {
+          type: "string",
+          enum: %w[photograph illustration diagram chart screenshot artwork technical drawing other],
+          description: "Visual style or format of the image"
+        },
+        mood: {
+          type: "string",
+          enum: %w[professional casual formal technical artistic dramatic serene energetic other],
+          description: "Overall mood or tone of the image"
+        },
+        text_content: {
+          type: "string",
+          description: "Any visible text in the image (OCR extracted)"
+        },
+        keywords: {
+          type: "array",
+          items: { type: "string" },
+          description: "Relevant keywords for image search and categorization",
+          maxItems: 10
+        },
+        classification: {
+          type: "string",
+          enum: %w[technical diagram photo artwork chart screenshot document other],
+          description: "Image classification category"
+        },
+        tags: {
+          type: "array",
+          items: { type: "string" },
+          description: "User-defined or AI-suggested tags for organization"
+        }
+      },
+      required: %w[description summary scene_type classification]
+    }.freeze
+
+    # Audio document metadata schema
+    AUDIO_SCHEMA = {
+      type: "object",
+      properties: {
+        summary: {
+          type: "string",
+          description: "Summary of audio content (speech transcript summary or music description)"
+        },
+        content_type: {
+          type: "string",
+          enum: %w[speech music podcast interview lecture presentation sound_effect meeting other],
+          description: "Type of audio content"
+        },
+        keywords: {
+          type: "array",
+          items: { type: "string" },
+          description: "Relevant keywords extracted from transcript or describing music",
+          maxItems: 10
+        },
+        classification: {
+          type: "string",
+          enum: %w[educational entertainment business technical musical interview podcast other],
+          description: "Audio content classification"
+        },
+        topics: {
+          type: "array",
+          items: { type: "string" },
+          description: "Main topics discussed (for speech) or musical elements (for music)",
+          maxItems: 5
+        },
+        language: {
+          type: "string",
+          description: "Language of speech content (ISO 639-1 code) or N/A for music"
+        },
+        speakers: {
+          type: "array",
+          items: { type: "string" },
+          description: "Number or names of speakers (for speech content)",
+          maxItems: 10
+        },
+        mood: {
+          type: "string",
+          enum: %w[formal casual energetic calm professional educational entertaining informative other],
+          description: "Overall mood or tone of the audio"
+        },
+        genre: {
+          type: "string",
+          description: "Music genre (for musical content) or speech type (for spoken content)"
+        },
+        key_quotes: {
+          type: "array",
+          items: { type: "string" },
+          description: "Important quotes or phrases from speech content",
+          maxItems: 3
+        },
+        tags: {
+          type: "array",
+          items: { type: "string" },
+          description: "User-defined or AI-suggested tags for organization"
+        }
+      },
+      required: %w[summary content_type classification]
+    }.freeze
+
+    # PDF document metadata schema (combines text analysis with document structure)
+    PDF_SCHEMA = {
+      type: "object",
+      properties: {
+        summary: {
+          type: "string",
+          description: "Summary of the PDF document content"
+        },
+        document_type: {
+          type: "string",
+          enum: %w[research_paper report manual presentation legal financial technical academic other],
+          description: "Type of PDF document"
+        },
+        keywords: {
+          type: "array",
+          items: { type: "string" },
+          description: "Keywords extracted from the document text",
+          maxItems: 15
+        },
+        classification: {
+          type: "string",
+          enum: %w[academic business legal technical manual report presentation other],
+          description: "Document classification category"
+        },
+        topics: {
+          type: "array",
+          items: { type: "string" },
+          description: "Main topics covered in the document",
+          maxItems: 8
+        },
+        structure: {
+          type: "object",
+          properties: {
+            has_table_of_contents: { type: "boolean" },
+            has_bibliography: { type: "boolean" },
+            has_figures: { type: "boolean" },
+            has_tables: { type: "boolean" },
+            estimated_pages: { type: "integer" }
+          }
+        },
+        reading_time_minutes: {
+          type: "integer",
+          description: "Estimated reading time in minutes"
+        },
+        complexity_level: {
+          type: "string",
+          enum: %w[beginner intermediate advanced expert],
+          description: "Complexity level of the content"
+        },
+        language: {
+          type: "string",
+          description: "Primary language of the document"
+        },
+        tags: {
+          type: "array",
+          items: { type: "string" },
+          description: "User-defined or AI-suggested tags for organization"
+        }
+      },
+      required: %w[summary document_type classification]
+    }.freeze
+
+    # Mixed/multi-modal document metadata schema
+    MIXED_SCHEMA = {
+      type: "object",
+      properties: {
+        summary: {
+          type: "string",
+          description: "Overall summary combining all content types in the document"
+        },
+        content_types: {
+          type: "array",
+          items: { type: "string", enum: %w[text image audio] },
+          description: "Types of content present in this multi-modal document"
+        },
+        primary_content_type: {
+          type: "string",
+          enum: %w[text image audio],
+          description: "The primary or dominant content type"
+        },
+        keywords: {
+          type: "array",
+          items: { type: "string" },
+          description: "Keywords extracted from all content types",
+          maxItems: 15
+        },
+        classification: {
+          type: "string",
+          enum: %w[multimedia_presentation research educational marketing technical training other],
+          description: "Multi-modal document classification"
+        },
+        topics: {
+          type: "array",
+          items: { type: "string" },
+          description: "Main topics across all content types",
+          maxItems: 8
+        },
+        cohesion_analysis: {
+          type: "string",
+          description: "How well the different content types work together"
+        },
+        tags: {
+          type: "array",
+          items: { type: "string" },
+          description: "User-defined or AI-suggested tags for organization"
+        }
+      },
+      required: %w[summary content_types primary_content_type classification]
+    }.freeze
+
+    # Get schema for document type
+    def self.schema_for(document_type)
+      case document_type.to_s.downcase
+      when "text", "markdown", "html"
+        TEXT_SCHEMA
+      when "image"
+        IMAGE_SCHEMA
+      when "audio"
+        AUDIO_SCHEMA
+      when "pdf", "docx"
+        PDF_SCHEMA
+      when "mixed"
+        MIXED_SCHEMA
+      else
+        TEXT_SCHEMA # fallback
+      end
+    end
+
+    # Get required fields for document type
+    def self.required_fields_for(document_type)
+      schema_for(document_type)[:required] || []
+    end
+
+    # Validate metadata against schema
+    def self.validate_metadata(document_type, metadata)
+      schema = schema_for(document_type)
+      required_fields = schema[:required] || []
+
+      errors = []
+      required_fields.each do |field|
+        errors << "Missing required field: #{field}" unless metadata.key?(field)
+      end
+
+      errors
+    end
+  end
+end
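
A short usage sketch of the helpers defined above (values are illustrative; note the required lists are built with %w[...], so validate_metadata expects string keys):

    Ragdoll::MetadataSchemas.schema_for("pdf")             # => PDF_SCHEMA (also used for "docx")
    Ragdoll::MetadataSchemas.required_fields_for("image")  # => ["description", "summary", "scene_type", "classification"]

    metadata = { "summary" => "Quarterly report", "document_type" => "report" }
    Ragdoll::MetadataSchemas.validate_metadata("pdf", metadata)
    # => ["Missing required field: classification"]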
data/app/models/ragdoll/audio_content.rb
ADDED
@@ -0,0 +1,142 @@
+# frozen_string_literal: true
+
+require "active_record"
+require_relative "content"
+
+module Ragdoll
+  class AudioContent < Content
+    validate :audio_data_or_transcript_present
+    validates :duration, numericality: { greater_than: 0 }, allow_nil: true
+    validates :sample_rate, numericality: { greater_than: 0 }, allow_nil: true
+
+    scope :recent, -> { order(created_at: :desc) }
+    scope :with_audio, -> { where.not(data: [nil, ""]) }
+    scope :with_transcripts, -> { where.not(content: [nil, ""]) }
+    scope :by_duration, lambda { |min_duration, max_duration = nil|
+      scope = where("duration >= ?", min_duration)
+      scope = scope.where("duration <= ?", max_duration) if max_duration
+      scope
+    }
+
+    # Audio content accessors - content field stores transcript for embedding
+    def transcript
+      content
+    end
+
+    def transcript=(value)
+      self.content = value
+    end
+
+    # Audio file data accessor
+    def audio_data
+      data
+    end
+
+    def audio_data=(value)
+      self.data = value
+    end
+
+    # Audio file technical properties (stored in content metadata - raw file data)
+    def audio_attached?
+      data.present?
+    end
+
+    def audio_size
+      metadata.dig("file_size") || 0
+    end
+
+    def audio_size=(value)
+      self.metadata = metadata.merge("file_size" => value)
+    end
+
+    def audio_content_type
+      metadata.dig("content_type")
+    end
+
+    def audio_content_type=(value)
+      self.metadata = metadata.merge("content_type" => value)
+    end
+
+    def audio_filename
+      metadata.dig("filename")
+    end
+
+    def audio_filename=(value)
+      self.metadata = metadata.merge("filename" => value)
+    end
+
+    # Audio format and technical details
+    def codec
+      metadata.dig("codec")
+    end
+
+    def codec=(value)
+      self.metadata = metadata.merge("codec" => value)
+    end
+
+    def bitrate
+      metadata.dig("bitrate")
+    end
+
+    def bitrate=(value)
+      self.metadata = metadata.merge("bitrate" => value)
+    end
+
+    def channels
+      metadata.dig("channels")
+    end
+
+    def channels=(value)
+      self.metadata = metadata.merge("channels" => value)
+    end
+
+    def duration_formatted
+      return "Unknown" unless duration
+
+      minutes = (duration / 60).floor
+      seconds = (duration % 60).round
+      "#{minutes}:#{seconds.to_s.rjust(2, '0')}"
+    end
+
+    # Override content for embedding to use transcript
+    def content_for_embedding
+      transcript.presence || "Audio content without transcript"
+    end
+
+    def generate_embeddings!
+      return unless should_generate_embeddings?
+
+      embedding_content = content_for_embedding
+      return if embedding_content.blank?
+
+      # Generate embeddings using the base class method
+      super
+    end
+
+    # Override should_generate_embeddings to check for transcript
+    def should_generate_embeddings?
+      content_for_embedding.present? && embeddings.empty?
+    end
+
+    def self.stats
+      {
+        total_audio_contents: count,
+        by_model: group(:embedding_model).count,
+        total_embeddings: joins(:embeddings).count,
+        with_audio: with_audio.count,
+        with_transcripts: with_transcripts.count,
+        total_duration: sum(:duration),
+        average_duration: average(:duration),
+        average_audio_size: joins(:audio_attachment).average("active_storage_blobs.byte_size")
+      }
+    end
+
+    private
+
+    def audio_data_or_transcript_present
+      return if audio_attached? || transcript.present?
+
+      errors.add(:base, "Must have either audio data or transcript")
+    end
+  end
+end
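
A brief sketch of the accessors above. The Content base class is not part of this diff, so the attribute names passed to new and the metadata default are assumptions (metadata is assumed to default to an empty hash):

    # Illustrative only; duration is assumed to be a writable attribute per the validation above.
    audio = Ragdoll::AudioContent.new(duration: 754)
    audio.transcript = "Welcome to the show..."  # stored in the content column
    audio.audio_filename = "episode-01.mp3"      # stored in the metadata JSON
    audio.codec = "mp3"

    audio.duration_formatted     # => "12:34"
    audio.content_for_embedding  # => the transcript, or a placeholder when blank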