vectra-client 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +23 -3
- data/CHANGELOG.md +23 -0
- data/IMPLEMENTATION_GUIDE.md +686 -0
- data/NEW_FEATURES_v0.2.0.md +459 -0
- data/RELEASE_CHECKLIST_v0.2.0.md +383 -0
- data/Rakefile +12 -0
- data/USAGE_EXAMPLES.md +787 -0
- data/benchmarks/batch_operations_benchmark.rb +117 -0
- data/benchmarks/connection_pooling_benchmark.rb +93 -0
- data/examples/active_record_demo.rb +227 -0
- data/examples/instrumentation_demo.rb +157 -0
- data/lib/generators/vectra/install_generator.rb +115 -0
- data/lib/generators/vectra/templates/enable_pgvector_extension.rb +11 -0
- data/lib/generators/vectra/templates/vectra.rb +79 -0
- data/lib/vectra/active_record.rb +195 -0
- data/lib/vectra/client.rb +60 -22
- data/lib/vectra/configuration.rb +6 -1
- data/lib/vectra/instrumentation/datadog.rb +82 -0
- data/lib/vectra/instrumentation/new_relic.rb +70 -0
- data/lib/vectra/instrumentation.rb +143 -0
- data/lib/vectra/providers/pgvector/connection.rb +5 -1
- data/lib/vectra/retry.rb +156 -0
- data/lib/vectra/version.rb +1 -1
- data/lib/vectra.rb +11 -0
- metadata +45 -1
|
# frozen_string_literal: true

require "bundler/setup"
require "vectra"
require "benchmark"

# Benchmark for batch operations
#
# Usage:
#   ruby benchmarks/batch_operations_benchmark.rb

puts "=" * 80
puts "VECTRA BATCH OPERATIONS BENCHMARK"
puts "=" * 80
puts

# Setup: connection target, vector dimensionality, samples per measurement.
DB_URL = ENV.fetch("DATABASE_URL", "postgres://postgres:password@localhost/vectra_benchmark")
DIMENSION = 384
ITERATIONS = 5

client = Vectra.pgvector(connection_url: DB_URL)

# Start from a clean slate: drop any index left over from a previous run.
puts "Creating test index..."
begin
  client.provider.delete_index(name: "benchmark_test")
rescue Vectra::NotFoundError
  # Index doesn't exist, that's fine
end

client.provider.create_index(
  name: "benchmark_test",
  dimension: DIMENSION,
  metric: "cosine"
)

# Builds +count+ random vectors of the given dimension, each tagged with
# its numeric index and one of ten rotating categories.
def generate_vectors(count, dimension)
  Array.new(count) do |i|
    {
      id: "vec_#{i}",
      values: Array.new(dimension) { rand },
      metadata: { index: i, category: "cat_#{i % 10}" }
    }
  end
end

puts "\nRunning benchmarks (#{ITERATIONS} iterations each)..."
puts "-" * 80

# Measure upsert throughput across several corpus sizes...
[100, 500, 1000, 5000, 10_000].each do |count|
  puts "\n#{count} vectors:"

  vectors = generate_vectors(count, DIMENSION)

  # ...and, for each corpus size, across several batch sizes.
  [50, 100, 250, 500].each do |batch_size|
    next if batch_size > count

    client.config.batch_size = batch_size

    timings = Array.new(ITERATIONS) do
      Benchmark.realtime do
        client.upsert(index: "benchmark_test", vectors: vectors)
      end
    end

    avg_time = timings.sum / timings.size
    vectors_per_sec = count / avg_time
    batches = (count.to_f / batch_size).ceil

    puts " Batch size #{batch_size.to_s.rjust(3)}: " \
         "#{avg_time.round(3)}s avg " \
         "(#{vectors_per_sec.round(0)} vectors/sec, " \
         "#{batches} batches)"
  end
end

# Query benchmarks
puts "\n#{"=" * 80}"
puts "QUERY BENCHMARKS"
puts "=" * 80

query_vector = Array.new(DIMENSION) { rand }

puts "\nQuery performance (#{ITERATIONS} iterations):"
[10, 20, 50, 100].each do |top_k|
  timings = Array.new(ITERATIONS) do
    Benchmark.realtime do
      client.query(
        index: "benchmark_test",
        vector: query_vector,
        top_k: top_k
      )
    end
  end

  avg_time = timings.sum / timings.size
  queries_per_sec = 1 / avg_time

  puts " top_k=#{top_k.to_s.rjust(3)}: " \
       "#{(avg_time * 1000).round(1)}ms avg " \
       "(#{queries_per_sec.round(1)} queries/sec)"
end

# Cleanup
puts "\nCleaning up..."
client.provider.delete_index(name: "benchmark_test")
client.provider.shutdown!

puts "\n✅ Benchmark complete!"
# frozen_string_literal: true

require "bundler/setup"
require "vectra"
require "benchmark"

# Benchmark for connection pooling under concurrent load
#
# Usage:
#   ruby benchmarks/connection_pooling_benchmark.rb

puts "=" * 80
puts "VECTRA CONNECTION POOLING BENCHMARK"
puts "=" * 80
puts

DB_URL = ENV.fetch("DATABASE_URL", "postgres://postgres:password@localhost/vectra_benchmark")
DIMENSION = 384
THREAD_COUNTS = [1, 2, 5, 10, 20].freeze
OPERATIONS_PER_THREAD = 50

# Test different pool sizes
[5, 10, 20].each do |pool_size|
  puts "\n#{"=" * 80}"
  puts "Pool Size: #{pool_size}"
  puts "=" * 80

  client = Vectra.pgvector(
    connection_url: DB_URL,
    pool_size: pool_size,
    pool_timeout: 10
  )

  # Create the test index; a failure here just means it already exists.
  begin
    client.provider.create_index(name: "benchmark_pool", dimension: DIMENSION)
  rescue StandardError
    # Already exists
  end

  # Pre-populate some data
  vectors = Array.new(100) do |i|
    {
      id: "vec_#{i}",
      values: Array.new(DIMENSION) { rand },
      metadata: { index: i }
    }
  end
  client.upsert(index: "benchmark_pool", vectors: vectors)

  THREAD_COUNTS.each do |thread_count|
    # Skip heavily oversubscribed runs; waiting on checkouts would
    # dominate and likely time out.
    next if thread_count > pool_size + 5

    total_time = Benchmark.realtime do
      workers = Array.new(thread_count) do
        Thread.new do
          probe = Array.new(DIMENSION) { rand }

          OPERATIONS_PER_THREAD.times do
            client.query(
              index: "benchmark_pool",
              vector: probe,
              top_k: 10
            )
          end
        end
      end

      workers.each(&:join)
    end

    total_operations = thread_count * OPERATIONS_PER_THREAD
    ops_per_sec = total_operations / total_time

    # Get pool stats
    stats = client.provider.pool_stats

    puts " #{thread_count.to_s.rjust(2)} threads: " \
         "#{total_time.round(2)}s total " \
         "(#{ops_per_sec.round(1)} ops/sec) " \
         "Pool: #{stats[:available]}/#{stats[:size]} available"
  end

  # Cleanup
  client.provider.shutdown!
end

puts "\n✅ Benchmark complete!"
puts "\nKey takeaways:"
puts " • Pool size should match max concurrent threads"
puts " • More threads than pool size causes waiting/timeouts"
puts " • Monitor pool_stats in production for optimal sizing"
#!/usr/bin/env ruby
# frozen_string_literal: true

# Demo of Vectra ActiveRecord integration
#
# Usage: ruby examples/active_record_demo.rb

require "bundler/setup"
require "active_record"
require "digest"
require "vectra"

puts "=" * 80
puts "VECTRA ACTIVERECORD INTEGRATION DEMO"
puts "=" * 80
puts

# Setup database connection
ActiveRecord::Base.establish_connection(
  adapter: "postgresql",
  database: ENV.fetch("DATABASE_NAME", "vectra_demo"),
  host: ENV.fetch("DATABASE_HOST", "localhost"),
  username: ENV.fetch("DATABASE_USER", "postgres"),
  password: ENV.fetch("DATABASE_PASSWORD", "password")
)

# Configure Vectra
Vectra.configure do |config|
  config.provider = :pgvector
  config.host = ENV.fetch("DATABASE_URL", "postgres://postgres:password@localhost/vectra_demo")
end

# Create documents table if not exists
ActiveRecord::Schema.define do
  unless ActiveRecord::Base.connection.table_exists?("documents")
    enable_extension "vector"

    create_table :documents do |t|
      t.string :title
      t.text :content
      t.string :category
      t.string :status
      t.column :embedding, :vector, limit: 3 # 3-dimensional for demo
      t.timestamps
    end
  end
end

# Define Document model with vector search
class Document < ActiveRecord::Base
  include Vectra::ActiveRecord

  has_vector :embedding,
             dimension: 3,
             provider: :pgvector,
             index: "documents",
             auto_index: true,
             metadata_fields: [:title, :category, :status]

  # Generate embedding before validation
  # In production, use OpenAI/Cohere/etc.
  before_validation :generate_embedding, if: -> { content.present? && embedding.nil? }

  private

  # Simple deterministic embedding for demo.
  #
  # FIX: this previously used `content.hash`, but String#hash is seeded
  # randomly per Ruby process, so the "deterministic" embeddings changed
  # on every run and stored vectors could not be reproduced later.
  # A digest gives a value that is stable across processes.
  # In production: self.embedding = OpenAI.embed(content)
  def generate_embedding
    hash = Digest::MD5.hexdigest(content).to_i(16)
    self.embedding = [
      (hash % 1000) / 1000.0,
      ((hash / 1000) % 1000) / 1000.0,
      ((hash / 1_000_000) % 1000) / 1000.0
    ]
  end
end

# Create pgvector index
puts "Creating vector index..."
begin
  Vectra::Client.new.provider.create_index(
    name: "documents",
    dimension: 3,
    metric: "cosine"
  )
  puts "✅ Index created\n"
rescue StandardError => e
  puts "⚠️ Index might already exist: #{e.message}\n"
end

puts "\n#{"=" * 80}"
puts "TESTING ACTIVERECORD INTEGRATION"
puts "=" * 80
puts

# Clean up existing data
Document.delete_all

# Test 1: Create document (auto-indexes)
puts "1. Creating documents (auto-indexes on save)...\n"

doc1 = Document.create!(
  title: "Getting Started Guide",
  content: "This guide will help you get started with our platform.",
  category: "tutorial",
  status: "published"
)
puts " Created: #{doc1.title} (ID: #{doc1.id})"
puts " Embedding: #{doc1.embedding.map { |v| v.round(3) }}"
puts " ✅ Automatically indexed in Vectra\n\n"

doc2 = Document.create!(
  title: "Advanced Features",
  content: "Learn about advanced features and best practices.",
  category: "tutorial",
  status: "published"
)
puts " Created: #{doc2.title} (ID: #{doc2.id})\n\n"

doc3 = Document.create!(
  title: "API Reference",
  content: "Complete API documentation for developers.",
  category: "reference",
  status: "published"
)
puts " Created: #{doc3.title} (ID: #{doc3.id})\n\n"

sleep 0.5

# Test 2: Vector search
puts "2. Vector search (finds similar documents)...\n"

query_embedding = [0.5, 0.5, 0.5]
results = Document.vector_search(query_embedding, limit: 5)

puts " Query: #{query_embedding.inspect}"
puts " Found #{results.size} results:\n\n"

results.each_with_index do |doc, idx|
  puts " #{idx + 1}. #{doc.title}"
  puts " Score: #{doc.vector_score.round(3)}"
  puts " Category: #{doc.category}"
  puts
end

# Test 3: Search with filters
puts "3. Vector search with metadata filter...\n"

results = Document.vector_search(
  query_embedding,
  limit: 10,
  filter: { category: "tutorial" }
)

puts " Filter: category='tutorial'"
puts " Found #{results.size} results:\n"
results.each { |doc| puts " • #{doc.title}" }
puts

# Test 4: Find similar documents
puts "4. Find similar to specific document...\n"

similar = doc1.similar(limit: 2)

puts " Document: '#{doc1.title}'"
puts " Similar documents:\n"
similar.each do |doc|
  puts " • #{doc.title} (score: #{doc.vector_score.round(3)})"
end
puts

# Test 5: Update triggers re-indexing
puts "5. Update document (triggers re-indexing)...\n"

doc1.update!(content: "Updated content about getting started.")
puts " Updated: #{doc1.title}"
puts " New embedding: #{doc1.embedding.map { |v| v.round(3) }}"
puts " ✅ Automatically re-indexed\n\n"

# Test 6: Manual index control
puts "6. Manual index control...\n"

doc4 = Document.new(
  title: "Draft Article",
  content: "This is a draft article.",
  category: "blog",
  status: "draft"
)
doc4.save!(validate: false) # Skip auto-index

puts " Created without auto-index: #{doc4.title}"
puts " Manually indexing..."

doc4.index_vector!
puts " ✅ Manually indexed\n\n"

# Test 7: Delete removes from index
puts "7. Delete document (removes from index)...\n"

doc4.destroy!
puts " ✅ Deleted and removed from vector index\n\n"

# Cleanup
puts "=" * 80
puts "SUMMARY"
puts "=" * 80
puts

puts "ActiveRecord Integration Features:"
puts " ✅ Automatic indexing on create/update"
puts " ✅ Automatic removal on delete"
puts " ✅ Vector search with AR object loading"
puts " ✅ Metadata filtering"
puts " ✅ Find similar documents"
puts " ✅ Manual index control"
puts " ✅ Custom embedding generation"
puts

puts "Total documents in database: #{Document.count}"
puts "Total documents in vector index: (same, auto-synced)"
puts

puts "✅ Demo complete!"
puts "\nNext steps:"
puts " • Replace embedding generation with real model (OpenAI, Cohere, etc.)"
puts " • Add background job for async indexing"
puts " • Use higher dimensions (384, 768, 1536)"
puts " • Add score threshold filtering"
#!/usr/bin/env ruby
# frozen_string_literal: true

# Demo of Vectra instrumentation features
#
# Usage: ruby examples/instrumentation_demo.rb

require "bundler/setup"
require "vectra"

puts "=" * 80
puts "VECTRA INSTRUMENTATION DEMO"
puts "=" * 80
puts

# Configure Vectra with instrumentation
Vectra.configure do |config|
  config.provider = :pgvector
  config.host = ENV.fetch("DATABASE_URL", "postgres://postgres:password@localhost/vectra_demo")
  config.instrumentation = true # Enable instrumentation
  config.pool_size = 5
  config.batch_size = 100
  config.max_retries = 3
  config.retry_delay = 0.5
end

# Register custom instrumentation handler
puts "Registering custom instrumentation handler...\n"

Vectra.on_operation do |event|
  status = event.success? ? "✅ SUCCESS" : "❌ ERROR"
  # Slow operations (> 100ms) render red, everything else green.
  duration_color = if event.duration > 100
                     "\e[31m"
                   else
                     "\e[32m"
                   end
  reset_color = "\e[0m"

  puts "#{status} | #{event.operation.to_s.upcase.ljust(10)} | " \
       "#{event.provider}/#{event.index.ljust(15)} | " \
       "#{duration_color}#{event.duration.round(1)}ms#{reset_color}"

  puts " Metadata: #{event.metadata.inspect}" if event.metadata.any?

  puts " Error: #{event.error.class} - #{event.error.message}" if event.failure?

  puts
end

# Create client
client = Vectra::Client.new

puts "Creating test index...\n"

begin
  client.provider.delete_index(name: "demo_index")
rescue Vectra::NotFoundError
  # Doesn't exist, that's fine
end

client.provider.create_index(name: "demo_index", dimension: 3, metric: "cosine")
sleep 0.5 # Give it a moment

puts "\n#{"=" * 80}"
puts "TESTING OPERATIONS"
puts "=" * 80
puts

# Every call below fires the handler registered above.

# Test 1: Upsert
puts "1. UPSERT (3 vectors):"
client.upsert(
  index: "demo_index",
  vectors: [
    { id: "vec1", values: [0.1, 0.2, 0.3], metadata: { text: "Hello" } },
    { id: "vec2", values: [0.4, 0.5, 0.6], metadata: { text: "World" } },
    { id: "vec3", values: [0.7, 0.8, 0.9], metadata: { text: "Test" } }
  ]
)

sleep 0.5

# Test 2: Query
puts "2. QUERY (top_k=2):"
client.query(
  index: "demo_index",
  vector: [0.1, 0.2, 0.3],
  top_k: 2
)

sleep 0.5

# Test 3: Fetch
puts "3. FETCH (2 IDs):"
client.fetch(
  index: "demo_index",
  ids: ["vec1", "vec2"]
)

sleep 0.5

# Test 4: Update
puts "4. UPDATE (metadata):"
client.update(
  index: "demo_index",
  id: "vec1",
  metadata: { text: "Updated", processed: true }
)

sleep 0.5

# Test 5: Delete
puts "5. DELETE (1 ID):"
client.delete(
  index: "demo_index",
  ids: ["vec3"]
)

sleep 0.5

# Test 6: Bulk operations
puts "6. BULK UPSERT (100 vectors):"
bulk_vectors = Array.new(100) do |i|
  { id: "bulk_#{i}", values: [rand, rand, rand], metadata: { index: i } }
end

client.upsert(index: "demo_index", vectors: bulk_vectors)

sleep 0.5

# Test 7: Large query
puts "7. LARGE QUERY (top_k=50):"
client.query(
  index: "demo_index",
  vector: [rand, rand, rand],
  top_k: 50
)

# Cleanup
puts "\n#{"=" * 80}"
puts "CLEANUP"
puts "=" * 80
puts

puts "Deleting test index..."
client.provider.delete_index(name: "demo_index")

puts "\n✅ Demo complete!"
puts "\nYou can see:"
puts " • Operation names (UPSERT, QUERY, FETCH, UPDATE, DELETE)"
puts " • Provider and index"
puts " • Duration in milliseconds (color-coded)"
puts " • Metadata (vector counts, filters, etc.)"
puts " • Success/error status"
puts "\nThis data can be sent to:"
puts " • New Relic (require 'vectra/instrumentation/new_relic')"
puts " • Datadog (require 'vectra/instrumentation/datadog')"
puts " • Custom monitoring systems"
# frozen_string_literal: true

require "rails/generators/base"
require "rails/generators/migration"

module Vectra
  module Generators
    # Rails generator for installing Vectra.
    #
    # Creates config/initializers/vectra.rb and, for the pgvector
    # provider, a migration that enables the pgvector extension.
    #
    # @example
    #   rails generate vectra:install
    #   rails generate vectra:install --provider=pinecone
    #   rails generate vectra:install --provider=pgvector --database-url=postgres://localhost/mydb
    #
    class InstallGenerator < Rails::Generators::Base
      # FIX: #migration_template is not defined on Rails::Generators::Base;
      # it comes from this mixin (which in turn requires the
      # .next_migration_number class method defined below). Without it,
      # create_migration raised NoMethodError.
      include Rails::Generators::Migration

      source_root File.expand_path("templates", __dir__)

      class_option :provider, type: :string, default: "pgvector",
                              desc: "Vector database provider (pinecone, pgvector, qdrant, weaviate)"
      class_option :database_url, type: :string, default: nil,
                                  desc: "PostgreSQL connection URL (for pgvector)"
      class_option :api_key, type: :string, default: nil,
                             desc: "API key for the provider"
      class_option :instrumentation, type: :boolean, default: false,
                                     desc: "Enable instrumentation"

      # Timestamp provider required by Rails::Generators::Migration;
      # delegates to ActiveRecord's standard numbering.
      def self.next_migration_number(dirname)
        ActiveRecord::Generators::Base.next_migration_number(dirname)
      end

      # Copies the initializer template into the host application.
      def create_initializer_file
        template "vectra.rb", "config/initializers/vectra.rb"
      end

      # For the pgvector provider, adds the extension-enabling migration.
      #
      # FIX: previously this also invoked
      # `generate :migration, "EnablePgvectorExtension"` before calling
      # migration_template, which created the same migration twice;
      # migration_template alone is sufficient.
      def create_migration
        return unless options[:provider] == "pgvector"

        migration_template(
          "enable_pgvector_extension.rb",
          "db/migrate/enable_pgvector_extension.rb",
          migration_version: migration_version
        )
      end

      # Prints post-install guidance tailored to the chosen provider.
      def show_readme
        say "\n"
        say "Vectra has been installed!", :green
        say "\n"
        say "Next steps:", :yellow
        say " 1. Add your #{options[:provider]} credentials to Rails credentials:"
        say " $ rails credentials:edit", :cyan
        say "\n"

        case options[:provider]
        when "pinecone"
          show_pinecone_instructions
        when "pgvector"
          show_pgvector_instructions
        when "qdrant"
          show_qdrant_instructions
        when "weaviate"
          show_weaviate_instructions
        end

        return unless options[:instrumentation]

        say "\n"
        say " 📊 Instrumentation is enabled!", :green
        say " Add New Relic or Datadog setup to config/initializers/vectra.rb"
      end

      private

      def show_pinecone_instructions
        say " 2. Add to credentials:", :yellow
        say " pinecone:", :cyan
        say " api_key: your_api_key_here", :cyan
        say " environment: us-east-1", :cyan
        say "\n"
        say " 3. Create an index in Pinecone dashboard"
        say "\n"
        say " 4. Use in your app:", :yellow
        say " @client = Vectra::Client.new", :cyan
        say " @client.upsert(index: 'my-index', vectors: [...])", :cyan
      end

      def show_pgvector_instructions
        say " 2. Run migrations:", :yellow
        say " $ rails db:migrate", :cyan
        say "\n"
        say " 3. Create a vector index:", :yellow
        say " $ rails runner 'Vectra::Client.new.provider.create_index(name: \"documents\", dimension: 384)'", :cyan
        say "\n"
        say " 4. Use in your app:", :yellow
        say " @client = Vectra::Client.new", :cyan
        say " @client.upsert(index: 'documents', vectors: [...])", :cyan
      end

      def show_qdrant_instructions
        say " 2. Add to credentials:", :yellow
        say " qdrant:", :cyan
        say " api_key: your_api_key_here", :cyan
        say " host: https://your-cluster.qdrant.io", :cyan
      end

      def show_weaviate_instructions
        say " 2. Add to credentials:", :yellow
        say " weaviate:", :cyan
        say " api_key: your_api_key_here", :cyan
        say " host: https://your-cluster.weaviate.io", :cyan
      end

      # Version tag embedded in the generated migration class, e.g. "[7.1]".
      def migration_version
        "[#{Rails::VERSION::MAJOR}.#{Rails::VERSION::MINOR}]"
      end
    end
  end
end