ragdoll 0.1.0 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. checksums.yaml +4 -4
  2. data/README.md +318 -40
  3. data/Rakefile +66 -4
  4. data/app/jobs/ragdoll/extract_keywords_job.rb +28 -0
  5. data/app/jobs/ragdoll/extract_text_job.rb +38 -0
  6. data/app/jobs/ragdoll/generate_embeddings_job.rb +28 -0
  7. data/app/jobs/ragdoll/generate_summary_job.rb +25 -0
  8. data/app/lib/ragdoll/metadata_schemas.rb +332 -0
  9. data/app/models/ragdoll/audio_content.rb +142 -0
  10. data/app/models/ragdoll/content.rb +95 -0
  11. data/app/models/ragdoll/document.rb +606 -4
  12. data/app/models/ragdoll/embedding.rb +172 -5
  13. data/app/models/ragdoll/image_content.rb +194 -0
  14. data/app/models/ragdoll/text_content.rb +137 -0
  15. data/app/services/ragdoll/configuration_service.rb +113 -0
  16. data/app/services/ragdoll/document_management.rb +108 -0
  17. data/app/services/ragdoll/document_processor.rb +342 -0
  18. data/app/services/ragdoll/embedding_service.rb +202 -0
  19. data/app/services/ragdoll/image_description_service.rb +230 -0
  20. data/app/services/ragdoll/metadata_generator.rb +329 -0
  21. data/app/services/ragdoll/model_resolver.rb +72 -0
  22. data/app/services/ragdoll/search_engine.rb +51 -0
  23. data/app/services/ragdoll/text_chunker.rb +208 -0
  24. data/app/services/ragdoll/text_generation_service.rb +355 -0
  25. data/db/migrate/001_enable_postgresql_extensions.rb +23 -0
  26. data/db/migrate/004_create_ragdoll_documents.rb +70 -0
  27. data/db/migrate/005_create_ragdoll_embeddings.rb +41 -0
  28. data/db/migrate/006_create_ragdoll_contents.rb +47 -0
  29. data/lib/ragdoll/core/client.rb +306 -0
  30. data/lib/ragdoll/core/configuration.rb +257 -0
  31. data/lib/ragdoll/core/database.rb +141 -0
  32. data/lib/ragdoll/core/errors.rb +11 -0
  33. data/lib/ragdoll/core/model.rb +45 -0
  34. data/lib/ragdoll/core/shrine_config.rb +71 -0
  35. data/lib/ragdoll/core/version.rb +8 -0
  36. data/lib/ragdoll/core.rb +91 -0
  37. data/lib/ragdoll-core.rb +3 -0
  38. data/lib/ragdoll.rb +243 -6
  39. data/lib/tasks/annotate.rake +126 -0
  40. data/lib/tasks/db.rake +338 -0
  41. metadata +42 -35
  42. data/config/initializers/ragdoll.rb +0 -6
  43. data/config/routes.rb +0 -5
  44. data/db/migrate/20250218123456_create_documents.rb +0 -20
  45. data/lib/config/database.yml +0 -28
  46. data/lib/config/ragdoll.yml +0 -31
  47. data/lib/ragdoll/engine.rb +0 -16
  48. data/lib/ragdoll/import_job.rb +0 -15
  49. data/lib/ragdoll/ingestion.rb +0 -30
  50. data/lib/ragdoll/search.rb +0 -18
  51. data/lib/ragdoll/version.rb +0 -7
  52. data/lib/tasks/import_task.thor +0 -32
  53. data/lib/tasks/jobs_task.thor +0 -40
  54. data/lib/tasks/ragdoll_tasks.thor +0 -7
  55. data/lib/tasks/search_task.thor +0 -55
data/app/lib/ragdoll/metadata_schemas.rb
@@ -0,0 +1,332 @@
+ # frozen_string_literal: true
+
+ module Ragdoll
+   # Document metadata schemas for LLM structured output
+   # Each document type has a specific schema that guides LLM generation
+   module MetadataSchemas
+     # Text document metadata schema
+     TEXT_SCHEMA = {
+       type: "object",
+       properties: {
+         summary: {
+           type: "string",
+           description: "Concise summary of the text content (2-3 paragraphs)"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Relevant keywords and phrases extracted from the text",
+           maxItems: 10
+         },
+         classification: {
+           type: "string",
+           enum: %w[research article blog documentation technical legal financial marketing other],
+           description: "Document classification category"
+         },
+         topics: {
+           type: "array",
+           items: { type: "string" },
+           description: "Main topics discussed in the document",
+           maxItems: 5
+         },
+         sentiment: {
+           type: "string",
+           enum: %w[positive negative neutral mixed],
+           description: "Overall sentiment of the text"
+         },
+         reading_time_minutes: {
+           type: "integer",
+           description: "Estimated reading time in minutes"
+         },
+         language: {
+           type: "string",
+           description: "Primary language of the text (ISO 639-1 code)"
+         },
+         complexity_level: {
+           type: "string",
+           enum: %w[beginner intermediate advanced expert],
+           description: "Complexity/difficulty level of the content"
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[summary keywords classification]
+     }.freeze
+
+     # Image document metadata schema
+     IMAGE_SCHEMA = {
+       type: "object",
+       properties: {
+         description: {
+           type: "string",
+           description: "Detailed description of what is shown in the image"
+         },
+         summary: {
+           type: "string",
+           description: "Brief summary of the image content (1 paragraph)"
+         },
+         objects: {
+           type: "array",
+           items: { type: "string" },
+           description: "List of objects, people, or items visible in the image",
+           maxItems: 15
+         },
+         scene_type: {
+           type: "string",
+           enum: %w[indoor outdoor portrait landscape diagram chart screenshot artwork photo other],
+           description: "Type of scene or image category"
+         },
+         colors: {
+           type: "array",
+           items: { type: "string" },
+           description: "Dominant colors in the image",
+           maxItems: 5
+         },
+         style: {
+           type: "string",
+           enum: %w[photograph illustration diagram chart screenshot artwork technical drawing other],
+           description: "Visual style or format of the image"
+         },
+         mood: {
+           type: "string",
+           enum: %w[professional casual formal technical artistic dramatic serene energetic other],
+           description: "Overall mood or tone of the image"
+         },
+         text_content: {
+           type: "string",
+           description: "Any visible text in the image (OCR extracted)"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Relevant keywords for image search and categorization",
+           maxItems: 10
+         },
+         classification: {
+           type: "string",
+           enum: %w[technical diagram photo artwork chart screenshot document other],
+           description: "Image classification category"
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[description summary scene_type classification]
+     }.freeze
+
+     # Audio document metadata schema
+     AUDIO_SCHEMA = {
+       type: "object",
+       properties: {
+         summary: {
+           type: "string",
+           description: "Summary of audio content (speech transcript summary or music description)"
+         },
+         content_type: {
+           type: "string",
+           enum: %w[speech music podcast interview lecture presentation sound_effect meeting other],
+           description: "Type of audio content"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Relevant keywords extracted from transcript or describing music",
+           maxItems: 10
+         },
+         classification: {
+           type: "string",
+           enum: %w[educational entertainment business technical musical interview podcast other],
+           description: "Audio content classification"
+         },
+         topics: {
+           type: "array",
+           items: { type: "string" },
+           description: "Main topics discussed (for speech) or musical elements (for music)",
+           maxItems: 5
+         },
+         language: {
+           type: "string",
+           description: "Language of speech content (ISO 639-1 code) or N/A for music"
+         },
+         speakers: {
+           type: "array",
+           items: { type: "string" },
+           description: "Number or names of speakers (for speech content)",
+           maxItems: 10
+         },
+         mood: {
+           type: "string",
+           enum: %w[formal casual energetic calm professional educational entertaining informative other],
+           description: "Overall mood or tone of the audio"
+         },
+         genre: {
+           type: "string",
+           description: "Music genre (for musical content) or speech type (for spoken content)"
+         },
+         key_quotes: {
+           type: "array",
+           items: { type: "string" },
+           description: "Important quotes or phrases from speech content",
+           maxItems: 3
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[summary content_type classification]
+     }.freeze
+
+     # PDF document metadata schema (combines text analysis with document structure)
+     PDF_SCHEMA = {
+       type: "object",
+       properties: {
+         summary: {
+           type: "string",
+           description: "Summary of the PDF document content"
+         },
+         document_type: {
+           type: "string",
+           enum: %w[research_paper report manual presentation legal financial technical academic other],
+           description: "Type of PDF document"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Keywords extracted from the document text",
+           maxItems: 15
+         },
+         classification: {
+           type: "string",
+           enum: %w[academic business legal technical manual report presentation other],
+           description: "Document classification category"
+         },
+         topics: {
+           type: "array",
+           items: { type: "string" },
+           description: "Main topics covered in the document",
+           maxItems: 8
+         },
+         structure: {
+           type: "object",
+           properties: {
+             has_table_of_contents: { type: "boolean" },
+             has_bibliography: { type: "boolean" },
+             has_figures: { type: "boolean" },
+             has_tables: { type: "boolean" },
+             estimated_pages: { type: "integer" }
+           }
+         },
+         reading_time_minutes: {
+           type: "integer",
+           description: "Estimated reading time in minutes"
+         },
+         complexity_level: {
+           type: "string",
+           enum: %w[beginner intermediate advanced expert],
+           description: "Complexity level of the content"
+         },
+         language: {
+           type: "string",
+           description: "Primary language of the document"
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[summary document_type classification]
+     }.freeze
+
+     # Mixed/multi-modal document metadata schema
+     MIXED_SCHEMA = {
+       type: "object",
+       properties: {
+         summary: {
+           type: "string",
+           description: "Overall summary combining all content types in the document"
+         },
+         content_types: {
+           type: "array",
+           items: { type: "string", enum: %w[text image audio] },
+           description: "Types of content present in this multi-modal document"
+         },
+         primary_content_type: {
+           type: "string",
+           enum: %w[text image audio],
+           description: "The primary or dominant content type"
+         },
+         keywords: {
+           type: "array",
+           items: { type: "string" },
+           description: "Keywords extracted from all content types",
+           maxItems: 15
+         },
+         classification: {
+           type: "string",
+           enum: %w[multimedia_presentation research educational marketing technical training other],
+           description: "Multi-modal document classification"
+         },
+         topics: {
+           type: "array",
+           items: { type: "string" },
+           description: "Main topics across all content types",
+           maxItems: 8
+         },
+         cohesion_analysis: {
+           type: "string",
+           description: "How well the different content types work together"
+         },
+         tags: {
+           type: "array",
+           items: { type: "string" },
+           description: "User-defined or AI-suggested tags for organization"
+         }
+       },
+       required: %w[summary content_types primary_content_type classification]
+     }.freeze
+
+     # Get schema for document type
+     def self.schema_for(document_type)
+       case document_type.to_s.downcase
+       when "text", "markdown", "html"
+         TEXT_SCHEMA
+       when "image"
+         IMAGE_SCHEMA
+       when "audio"
+         AUDIO_SCHEMA
+       when "pdf", "docx"
+         PDF_SCHEMA
+       when "mixed"
+         MIXED_SCHEMA
+       else
+         TEXT_SCHEMA # fallback
+       end
+     end
+
+     # Get required fields for document type
+     def self.required_fields_for(document_type)
+       schema_for(document_type)[:required] || []
+     end
+
+     # Validate metadata against schema
+     def self.validate_metadata(document_type, metadata)
+       schema = schema_for(document_type)
+       required_fields = schema[:required] || []
+
+       errors = []
+       required_fields.each do |field|
+         errors << "Missing required field: #{field}" unless metadata.key?(field)
+       end
+
+       errors
+     end
+   end
+ end
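
A quick usage sketch of the new MetadataSchemas module, based only on the code above; the metadata hash keys and values are illustrative, not taken from the package docs:

    schema = Ragdoll::MetadataSchemas.schema_for("pdf")
    schema[:required]
    # => ["summary", "document_type", "classification"]

    # validate_metadata only checks key presence, so use string keys to match the %w[] field names.
    metadata = { "summary" => "Quarterly report", "document_type" => "report" }
    Ragdoll::MetadataSchemas.validate_metadata("pdf", metadata)
    # => ["Missing required field: classification"]
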
data/app/models/ragdoll/audio_content.rb
@@ -0,0 +1,142 @@
+ # frozen_string_literal: true
+
+ require "active_record"
+ require_relative "content"
+
+ module Ragdoll
+   class AudioContent < Content
+     validate :audio_data_or_transcript_present
+     validates :duration, numericality: { greater_than: 0 }, allow_nil: true
+     validates :sample_rate, numericality: { greater_than: 0 }, allow_nil: true
+
+     scope :recent, -> { order(created_at: :desc) }
+     scope :with_audio, -> { where.not(data: [nil, ""]) }
+     scope :with_transcripts, -> { where.not(content: [nil, ""]) }
+     scope :by_duration, lambda { |min_duration, max_duration = nil|
+       scope = where("duration >= ?", min_duration)
+       scope = scope.where("duration <= ?", max_duration) if max_duration
+       scope
+     }
+
+     # Audio content accessors - content field stores transcript for embedding
+     def transcript
+       content
+     end
+
+     def transcript=(value)
+       self.content = value
+     end
+
+     # Audio file data accessor
+     def audio_data
+       data
+     end
+
+     def audio_data=(value)
+       self.data = value
+     end
+
+     # Audio file technical properties (stored in content metadata - raw file data)
+     def audio_attached?
+       data.present?
+     end
+
+     def audio_size
+       metadata.dig("file_size") || 0
+     end
+
+     def audio_size=(value)
+       self.metadata = metadata.merge("file_size" => value)
+     end
+
+     def audio_content_type
+       metadata.dig("content_type")
+     end
+
+     def audio_content_type=(value)
+       self.metadata = metadata.merge("content_type" => value)
+     end
+
+     def audio_filename
+       metadata.dig("filename")
+     end
+
+     def audio_filename=(value)
+       self.metadata = metadata.merge("filename" => value)
+     end
+
+     # Audio format and technical details
+     def codec
+       metadata.dig("codec")
+     end
+
+     def codec=(value)
+       self.metadata = metadata.merge("codec" => value)
+     end
+
+     def bitrate
+       metadata.dig("bitrate")
+     end
+
+     def bitrate=(value)
+       self.metadata = metadata.merge("bitrate" => value)
+     end
+
+     def channels
+       metadata.dig("channels")
+     end
+
+     def channels=(value)
+       self.metadata = metadata.merge("channels" => value)
+     end
+
+     def duration_formatted
+       return "Unknown" unless duration
+
+       minutes = (duration / 60).floor
+       seconds = (duration % 60).round
+       "#{minutes}:#{seconds.to_s.rjust(2, '0')}"
+     end
+
+     # Override content for embedding to use transcript
+     def content_for_embedding
+       transcript.presence || "Audio content without transcript"
+     end
+
+     def generate_embeddings!
+       return unless should_generate_embeddings?
+
+       embedding_content = content_for_embedding
+       return if embedding_content.blank?
+
+       # Generate embeddings using the base class method
+       super
+     end
+
+     # Override should_generate_embeddings to check for transcript
+     def should_generate_embeddings?
+       content_for_embedding.present? && embeddings.empty?
+     end
+
+     def self.stats
+       {
+         total_audio_contents: count,
+         by_model: group(:embedding_model).count,
+         total_embeddings: joins(:embeddings).count,
+         with_audio: with_audio.count,
+         with_transcripts: with_transcripts.count,
+         total_duration: sum(:duration),
+         average_duration: average(:duration),
+         average_audio_size: joins(:audio_attachment).average("active_storage_blobs.byte_size")
+       }
+     end
+
+     private
+
+     def audio_data_or_transcript_present
+       return if audio_attached? || transcript.present?
+
+       errors.add(:base, "Must have either audio data or transcript")
+     end
+   end
+ end
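
A minimal sketch of how the transcript-backed AudioContent model might be exercised, assuming document is an existing Ragdoll::Document record and an embedding provider is configured; the attribute values and model name are illustrative:

    audio = Ragdoll::AudioContent.new(
      document: document,
      embedding_model: "text-embedding-3-small",  # assumption: any configured embedding model
      duration: 125.0,
      transcript: "Welcome to the ragdoll walkthrough..."
    )
    audio.duration_formatted     # => "2:05"
    audio.content_for_embedding  # => the transcript text
    audio.save!
    audio.generate_embeddings!   # chunks the transcript via the base Content pipeline
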
data/app/models/ragdoll/content.rb
@@ -0,0 +1,95 @@
+ # frozen_string_literal: true
+
+ require "active_record"
+
+ module Ragdoll
+   class Content < ActiveRecord::Base
+     self.table_name = "ragdoll_contents"
+
+     belongs_to :document,
+                class_name: "Ragdoll::Document",
+                foreign_key: "document_id"
+
+     has_many :embeddings,
+              class_name: "Ragdoll::Embedding",
+              as: :embeddable,
+              dependent: :destroy
+
+     validates :type, presence: true
+     validates :embedding_model, presence: true
+     validates :document_id, presence: true
+
+     # JSON columns are handled natively by PostgreSQL
+
+     scope :by_type, ->(content_type) { where(type: content_type) }
+     scope :with_embeddings, -> { joins(:embeddings).distinct }
+     scope :without_embeddings, -> { left_joins(:embeddings).where(embeddings: { id: nil }) }
+
+     # Generate embeddings for this content
+     def generate_embeddings!
+       return unless should_generate_embeddings?
+
+       embedding_content = content_for_embedding
+       return if embedding_content.blank?
+
+       # Clear existing embeddings
+       embeddings.destroy_all
+
+       # Use TextChunker to split content into chunks
+       chunks = Ragdoll::TextChunker.chunk(embedding_content)
+
+       # Generate embeddings for each chunk
+       embedding_service = Ragdoll::EmbeddingService.new
+
+       chunks.each_with_index do |chunk_text, index|
+         begin
+           vector = embedding_service.generate_embedding(chunk_text)
+
+           embeddings.create!(
+             content: chunk_text,
+             embedding_vector: vector,
+             chunk_index: index
+           )
+         rescue StandardError => e
+           puts "Failed to generate embedding for chunk #{index}: #{e.message}"
+         end
+       end
+
+       update!(metadata: metadata.merge("embeddings_generated_at" => Time.current))
+     end
+
+     # Content to use for embedding generation (overridden by subclasses)
+     def content_for_embedding
+       content
+     end
+
+     # Whether this content should generate embeddings
+     def should_generate_embeddings?
+       content_for_embedding.present? && embeddings.empty?
+     end
+
+     # Statistics
+     def word_count
+       return 0 unless content.present?
+       content.split(/\s+/).length
+     end
+
+     def character_count
+       content&.length || 0
+     end
+
+     def embedding_count
+       embeddings.count
+     end
+
+     # Search within this content type
+     def self.search_content(query, **options)
+       return none if query.blank?
+
+       where(
+         "to_tsvector('english', COALESCE(content, '')) @@ plainto_tsquery('english', ?)",
+         query
+       ).limit(options[:limit] || 20)
+     end
+   end
+ end
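
The base Content class centralizes the chunk-and-embed pipeline and a PostgreSQL full-text search helper shared by every content subtype. A short sketch of how the scopes and search might be used, assuming Ragdoll::TextContent (also added in this release) and a configured EmbeddingService; the query string is illustrative:

    # Backfill embeddings for any text content that has none yet.
    Ragdoll::TextContent.without_embeddings.find_each(&:generate_embeddings!)

    # Full-text search backed by to_tsvector/plainto_tsquery.
    results = Ragdoll::Content.search_content("vector embeddings", limit: 10)
    results.each { |c| puts "#{c.type}: #{c.word_count} words, #{c.embedding_count} embeddings" }
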