vectra-client 0.3.4 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -94,6 +94,74 @@ module Vectra
  QueryResult.from_response(matches: matches, namespace: namespace)
  end

+ # Hybrid search combining vector similarity and PostgreSQL full-text search
+ #
+ # Combines pgvector similarity search with PostgreSQL's native full-text search.
+ # Requires a text search column (tsvector) in your table.
+ #
+ # @param index [String] table name
+ # @param vector [Array<Float>] query vector
+ # @param text [String] text query for full-text search
+ # @param alpha [Float] balance (0.0 = full-text, 1.0 = vector)
+ # @param top_k [Integer] number of results
+ # @param namespace [String, nil] optional namespace
+ # @param filter [Hash, nil] metadata filter
+ # @param include_values [Boolean] include vector values
+ # @param include_metadata [Boolean] include metadata
+ # @param text_column [String] column name for full-text search (default: 'content')
+ # @return [QueryResult] search results
+ #
+ # @note Your table should have a text column with a tsvector index:
+ #   CREATE INDEX idx_content_fts ON my_index USING gin(to_tsvector('english', content));
+ def hybrid_search(index:, vector:, text:, alpha:, top_k:, namespace: nil,
+                   filter: nil, include_values: false, include_metadata: true,
+                   text_column: "content")
+   ensure_table_exists!(index)
+
+   vector_literal = format_vector(vector)
+   distance_op = DISTANCE_FUNCTIONS[table_metric(index)]
+
+   # Build hybrid score: alpha * vector_similarity + (1-alpha) * text_rank
+   # Vector similarity: 1 - (distance / max_distance)
+   # Text rank: ts_rank from full-text search
+   select_cols = ["id"]
+   select_cols << "embedding" if include_values
+   select_cols << "metadata" if include_metadata
+
+   # Calculate hybrid score
+   # For vector: use cosine distance (1 - distance gives similarity)
+   # For text: use ts_rank
+   vector_score = "1.0 - (embedding #{distance_op} '#{vector_literal}'::vector)"
+   text_score = "ts_rank(to_tsvector('english', COALESCE(#{quote_ident(text_column)}, '')), " \
+                "plainto_tsquery('english', #{escape_literal(text)}))"
+
+   # Normalize scores to 0-1 range and combine with alpha
+   hybrid_score = "(#{alpha} * #{vector_score} + (1.0 - #{alpha}) * #{text_score})"
+
+   select_cols << "#{hybrid_score} AS score"
+   select_cols << "#{vector_score} AS vector_score"
+   select_cols << "#{text_score} AS text_score"
+
+   where_clauses = build_where_clauses(namespace, filter)
+   where_clauses << "to_tsvector('english', COALESCE(#{quote_ident(text_column)}, '')) @@ " \
+                    "plainto_tsquery('english', #{escape_literal(text)})"
+
+   sql = "SELECT #{select_cols.join(', ')} FROM #{quote_ident(index)}"
+   sql += " WHERE #{where_clauses.join(' AND ')}" if where_clauses.any?
+   sql += " ORDER BY score DESC"
+   sql += " LIMIT #{top_k.to_i}"
+
+   result = execute(sql)
+   matches = result.map { |row| build_match_from_row(row, include_values, include_metadata) }
+
+   log_debug("Hybrid search returned #{matches.size} results (alpha: #{alpha})")
+
+   QueryResult.from_response(
+     matches: matches,
+     namespace: namespace
+   )
+ end
+
  # @see Base#fetch
  def fetch(index:, ids:, namespace: nil)
  ensure_table_exists!(index)
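
For orientation, a minimal usage sketch of the new pgvector hybrid_search (not part of the diff). It assumes Vectra::Client delegates hybrid_search to the configured provider, that connection settings for the pgvector backend are already in place, that the documents table carries the tsvector index from the @note above, and that QueryResult exposes #matches.

  require "vectra"

  client = Vectra::Client.new(provider: :pgvector)   # hypothetical setup; connection options omitted

  embedding = Array.new(1536) { rand }                # stand-in for a real embedding

  # alpha: 0.7 weights vector similarity at 70% and ts_rank at 30%
  result = client.hybrid_search(
    index: "documents",
    vector: embedding,
    text: "postgres full-text search",
    alpha: 0.7,
    top_k: 5,
    text_column: "content"
  )

  result.matches.each { |match| puts match.inspect }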
@@ -67,6 +67,63 @@ module Vectra
  end
  end

+ # Hybrid search combining dense (vector) and sparse (keyword) search
+ #
+ # Pinecone supports hybrid search using sparse-dense vectors.
+ # For text-based keyword search, you need to provide sparse vectors.
+ #
+ # @param index [String] index name
+ # @param vector [Array<Float>] dense query vector
+ # @param text [String] text query (converted to sparse vector)
+ # @param alpha [Float] balance (0.0 = sparse, 1.0 = dense)
+ # @param top_k [Integer] number of results
+ # @param namespace [String, nil] optional namespace
+ # @param filter [Hash, nil] metadata filter
+ # @param include_values [Boolean] include vector values
+ # @param include_metadata [Boolean] include metadata
+ # @return [QueryResult] search results
+ #
+ # @note For proper hybrid search, you should generate sparse vectors
+ #   from text using a tokenizer (e.g., BM25). This method accepts text
+ #   but requires sparse vector generation externally.
+ def hybrid_search(index:, vector:, alpha:, top_k:, namespace: nil,
+                   filter: nil, include_values: false, include_metadata: true, text: nil)
+   # Pinecone hybrid search requires sparse vectors
+   # For now, we'll use dense vector only and log a warning
+   # In production, users should generate sparse vectors from text
+   if text
+     log_debug("Pinecone hybrid search: text parameter ignored. " \
+               "For true hybrid search, provide sparse vectors via sparse_values parameter.")
+   end
+
+   # Use dense vector search with alpha weighting
+   # Note: Pinecone's actual hybrid search requires sparse vectors
+   # This is a simplified implementation
+   body = {
+     vector: vector.map(&:to_f),
+     topK: top_k,
+     includeValues: include_values,
+     includeMetadata: include_metadata
+   }
+   body[:namespace] = namespace if namespace
+   body[:filter] = transform_filter(filter) if filter
+
+   # Alpha is used conceptually here - Pinecone's actual hybrid search
+   # requires sparse vectors in the query
+   response = data_connection(index).post("/query", body)
+
+   if response.success?
+     log_debug("Hybrid search returned #{response.body['matches']&.size || 0} results (alpha: #{alpha})")
+     QueryResult.from_response(
+       matches: transform_matches(response.body["matches"] || []),
+       namespace: response.body["namespace"],
+       usage: response.body["usage"]
+     )
+   else
+     handle_error(response)
+   end
+ end
+
  # @see Base#fetch
  def fetch(index:, ids:, namespace: nil)
  params = { ids: ids }
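
A caveat worth spelling out for the hunk above: this Pinecone implementation issues a dense-only /query request, so text is logged and dropped and alpha only appears in the debug log; results match a plain vector query until sparse vectors are generated externally (e.g., with BM25). A hedged sketch of the call as released, again assuming the client delegates hybrid_search to the provider:

  require "vectra"

  client = Vectra::Client.new(provider: :pinecone)   # hypothetical setup; API key/environment omitted

  result = client.hybrid_search(
    index: "articles",
    vector: Array.new(1536) { rand },
    alpha: 0.5,                        # accepted, but not sent to Pinecone in this version
    top_k: 10,
    text: "ruby vector databases"      # triggers the log warning and is otherwise ignored
  )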
@@ -83,6 +83,33 @@ module Vectra
  end
  end

+ # Hybrid search combining vector and text search
+ #
+ # Uses Qdrant's prefetch + rescore API for efficient hybrid search
+ #
+ # @param index [String] collection name
+ # @param vector [Array<Float>] query vector
+ # @param text [String] text query for keyword search
+ # @param alpha [Float] balance (0.0 = keyword, 1.0 = vector)
+ # @param top_k [Integer] number of results
+ # @param namespace [String, nil] optional namespace
+ # @param filter [Hash, nil] metadata filter
+ # @param include_values [Boolean] include vector values
+ # @param include_metadata [Boolean] include metadata
+ # @return [QueryResult] search results
+ def hybrid_search(index:, vector:, text:, alpha:, top_k:, namespace: nil,
+                   filter: nil, include_values: false, include_metadata: true)
+   qdrant_filter = build_filter(filter, namespace)
+   body = build_hybrid_search_body(vector, text, alpha, top_k, qdrant_filter,
+                                   include_values, include_metadata)
+
+   response = with_error_handling do
+     connection.post("/collections/#{index}/points/query", body)
+   end
+
+   handle_hybrid_search_response(response, alpha, namespace)
+ end
+
  # @see Base#fetch
  def fetch(index:, ids:, namespace: nil) # rubocop:disable Lint/UnusedMethodArgument
  point_ids = ids.map { |id| generate_point_id(id) }
@@ -280,6 +307,38 @@ module Vectra

  private

+ def build_hybrid_search_body(vector, text, alpha, top_k, filter, include_values, include_metadata)
+   body = {
+     prefetch: {
+       query: { text: text },
+       limit: top_k * 2
+     },
+     query: { vector: vector.map(&:to_f) },
+     limit: top_k,
+     params: { alpha: alpha },
+     with_vector: include_values,
+     with_payload: include_metadata
+   }
+
+   body[:prefetch][:filter] = filter if filter
+   body[:query][:filter] = filter if filter
+   body
+ end
+
+ def handle_hybrid_search_response(response, alpha, namespace)
+   if response.success?
+     matches = transform_search_results(response.body["result"] || [])
+     log_debug("Hybrid search returned #{matches.size} results (alpha: #{alpha})")
+
+     QueryResult.from_response(
+       matches: matches,
+       namespace: namespace
+     )
+   else
+     handle_error(response)
+   end
+ end
+
  def validate_config!
  super
  raise ConfigurationError, "Host must be configured for Qdrant" if config.host.nil? || config.host.empty?
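
To make the prefetch structure concrete, this is the request body build_hybrid_search_body produces for a three-dimensional query with top_k: 10, alpha: 0.6 and no filter (derived by hand from the method above; whether Qdrant's Query API honors a bare params: { alpha: ... } alongside a raw vector query is not verified here):

  body = {
    prefetch: {
      query: { text: "hybrid search" },
      limit: 20                        # top_k * 2
    },
    query: { vector: [0.1, 0.2, 0.3] },
    limit: 10,
    params: { alpha: 0.6 },
    with_vector: false,
    with_payload: true
  }
  # POSTed to /collections/<index>/points/query; when a filter is present it is
  # attached to both body[:prefetch][:filter] and body[:query][:filter].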
@@ -299,6 +358,37 @@ module Vectra
  handle_retriable_response(e)
  end

+ # Extract error message from Qdrant response format
+ # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+ def extract_error_message(body)
+   case body
+   when Hash
+     # Qdrant wraps errors in "status" key
+     status = body["status"] || body
+     msg = status["error"] || body["message"] || body["error_message"] || body.to_s
+
+     # Add details
+     details = status["details"] || status["error_details"]
+     if details
+       details_str = details.is_a?(Hash) ? details.to_json : details.to_s
+       msg += " (#{details_str})" unless msg.include?(details_str)
+     end
+
+     # Add field-specific errors
+     if status["errors"].is_a?(Array)
+       field_errors = status["errors"].map { |e| e.is_a?(Hash) ? e["field"] || e["message"] : e }.join(", ")
+       msg += " [Fields: #{field_errors}]" if field_errors && !msg.include?(field_errors)
+     end
+
+     msg
+   when String
+     body
+   else
+     "Unknown error"
+   end
+ end
+ # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+
  def auth_headers
  headers = {}
  headers["api-key"] = config.api_key if config.api_key && !config.api_key.empty?
@@ -102,6 +102,43 @@ module Vectra
  end
  end

+ # Hybrid search combining vector and BM25 text search
+ #
+ # Uses Weaviate's hybrid search API with alpha parameter
+ #
+ # @param index [String] class name
+ # @param vector [Array<Float>] query vector
+ # @param text [String] text query for BM25 search
+ # @param alpha [Float] balance (0.0 = BM25, 1.0 = vector)
+ # @param top_k [Integer] number of results
+ # @param namespace [String, nil] optional namespace (not used in Weaviate)
+ # @param filter [Hash, nil] metadata filter
+ # @param include_values [Boolean] include vector values
+ # @param include_metadata [Boolean] include metadata
+ # @return [QueryResult] search results
+ def hybrid_search(index:, vector:, text:, alpha:, top_k:, namespace: nil,
+                   filter: nil, include_values: false, include_metadata: true)
+   where_filter = build_where(filter, namespace)
+   graphql = build_hybrid_search_graphql(
+     index: index,
+     vector: vector,
+     text: text,
+     alpha: alpha,
+     top_k: top_k,
+     where_filter: where_filter,
+     include_values: include_values,
+     include_metadata: include_metadata
+   )
+   body = { "query" => graphql }
+
+   response = with_error_handling do
+     connection.post("#{API_BASE_PATH}/graphql", body)
+   end
+
+   handle_hybrid_search_response(response, index, alpha, namespace,
+                                 include_values, include_metadata)
+ end
+
  # rubocop:disable Metrics/PerceivedComplexity
  def fetch(index:, ids:, namespace: nil)
  body = {
@@ -294,6 +331,54 @@ module Vectra

  private

+ def build_hybrid_search_graphql(index:, vector:, text:, alpha:, top_k:,
+                                 where_filter:, include_values:, include_metadata:)
+   selection_block = build_selection_fields(include_values, include_metadata).join(" ")
+   build_graphql_query(index, top_k, text, alpha, vector, where_filter, selection_block)
+ end
+
+ def build_graphql_query(index, top_k, text, alpha, vector, where_filter, selection_block)
+   <<~GRAPHQL
+     {
+       Get {
+         #{index}(
+           limit: #{top_k}
+           hybrid: {
+             query: "#{text.gsub('"', '\\"')}"
+             alpha: #{alpha}
+           }
+           nearVector: { vector: [#{vector.map { |v| format('%.10f', v.to_f) }.join(', ')}] }
+           #{"where: #{JSON.generate(where_filter)}" if where_filter}
+         ) {
+           #{selection_block}
+         }
+       }
+     }
+   GRAPHQL
+ end
+
+ def build_selection_fields(include_values, include_metadata)
+   fields = ["_additional { id distance }"]
+   fields << "vector" if include_values
+   fields << "metadata" if include_metadata
+   fields
+ end
+
+ def handle_hybrid_search_response(response, index, alpha, namespace,
+                                   include_values, include_metadata)
+   if response.success?
+     matches = extract_query_matches(response.body, index, include_values, include_metadata)
+     log_debug("Hybrid search returned #{matches.size} results (alpha: #{alpha})")
+
+     QueryResult.from_response(
+       matches: matches,
+       namespace: namespace
+     )
+   else
+     handle_error(response)
+   end
+ end
+
  def validate_config!
  super
  raise ConfigurationError, "Host must be configured for Weaviate" if config.host.nil? || config.host.empty?
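
For readers unfamiliar with Weaviate's GraphQL hybrid syntax, the heredoc above expands to roughly the following for index "Article", top_k: 5, alpha: 0.5, a three-dimensional vector, no filter, and the default selection fields (illustrative output only; indentation and the empty where line are cosmetic):

  {
    Get {
      Article(
        limit: 5
        hybrid: {
          query: "open source vector databases"
          alpha: 0.5
        }
        nearVector: { vector: [0.1000000000, 0.2000000000, 0.3000000000] }
      ) {
        _additional { id distance } metadata
      }
    }
  }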
data/lib/vectra/vector.rb CHANGED
@@ -106,6 +106,62 @@ module Vectra
  Math.sqrt(values.zip(other_values).sum { |a, b| (a - b)**2 })
  end

+ # Normalize the vector in-place (mutates the vector)
+ #
+ # @param type [Symbol] normalization type: :l2 (default) or :l1
+ # @return [Vector] self (for method chaining)
+ #
+ # @example L2 normalization (unit vector)
+ #   vector = Vectra::Vector.new(id: 'v1', values: [3.0, 4.0])
+ #   vector.normalize!
+ #   vector.values # => [0.6, 0.8] (magnitude = 1.0)
+ #
+ # @example L1 normalization (sum = 1)
+ #   vector.normalize!(type: :l1)
+ #   vector.values.sum(&:abs) # => 1.0
+ def normalize!(type: :l2)
+   case type
+   when :l2
+     magnitude = Math.sqrt(values.sum { |v| v**2 })
+     if magnitude.zero?
+       # Zero vector - cannot normalize, return as-is
+       return self
+     end
+
+     @values = values.map { |v| v / magnitude }
+   when :l1
+     sum = values.sum(&:abs)
+     if sum.zero?
+       # Zero vector - cannot normalize, return as-is
+       return self
+     end
+
+     @values = values.map { |v| v / sum }
+   else
+     raise ArgumentError, "Unknown normalization type: #{type}. Use :l2 or :l1"
+   end
+   self
+ end
+
+ # Normalize a vector array without creating a Vector object
+ #
+ # @param vector [Array<Float>] vector values to normalize
+ # @param type [Symbol] normalization type: :l2 (default) or :l1
+ # @return [Array<Float>] normalized vector values
+ #
+ # @example Normalize OpenAI embedding
+ #   embedding = openai_response['data'][0]['embedding']
+ #   normalized = Vectra::Vector.normalize(embedding)
+ #   client.upsert(vectors: [{ id: '1', values: normalized }])
+ #
+ # @example L1 normalization
+ #   normalized = Vectra::Vector.normalize([1.0, 2.0, 3.0], type: :l1)
+ def self.normalize(vector, type: :l2)
+   temp_vector = new(id: "temp", values: vector.dup)
+   temp_vector.normalize!(type: type)
+   temp_vector.values
+ end
+
  # Check equality with another vector
  #
  # @param other [Vector] the other vector
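
A worked example of the two normalization modes added here (values follow the YARD examples above; the arithmetic is exact):

  require "vectra"

  v = Vectra::Vector.new(id: "v1", values: [3.0, 4.0])
  v.normalize!                # L2: magnitude = sqrt(3**2 + 4**2) = 5.0
  v.values                    # => [0.6, 0.8]

  Vectra::Vector.normalize([1.0, 2.0, 3.0], type: :l1)
  # => [0.1666..., 0.3333..., 0.5]  (each value divided by the absolute sum, 6.0)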
@@ -1,5 +1,5 @@
  # frozen_string_literal: true

  module Vectra
- VERSION = "0.3.4"
+ VERSION = "1.0.0"
  end
data/lib/vectra.rb CHANGED
@@ -23,6 +23,7 @@ require_relative "vectra/providers/pinecone"
  require_relative "vectra/providers/qdrant"
  require_relative "vectra/providers/weaviate"
  require_relative "vectra/providers/pgvector"
+ require_relative "vectra/providers/memory"
  require_relative "vectra/client"

  # Vectra - Unified Ruby client for vector databases
@@ -157,5 +158,24 @@ module Vectra
  **options
  )
  end
+
+ # Shortcut to create a Memory client (for testing)
+ #
+ # @param options [Hash] additional options
+ # @return [Client]
+ #
+ # @example In test environment
+ #   Vectra.configure do |config|
+ #     config.provider = :memory if Rails.env.test?
+ #   end
+ #
+ #   client = Vectra::Client.new
+ #
+ def memory(**options)
+   Client.new(
+     provider: :memory,
+     **options
+   )
+ end
  end
  end
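
A test-environment sketch of the new shortcut. It assumes the shortcut is exposed as Vectra.memory like the other provider helpers, and that the memory provider answers the same upsert/query surface as the real backends; treat the exact call signatures below as illustrative:

  require "vectra"

  client = Vectra.memory    # equivalent to Vectra::Client.new(provider: :memory)

  # Hypothetical round-trip against the in-memory store
  client.upsert(index: "docs", vectors: [{ id: "1", values: [0.1, 0.2, 0.3] }])
  result = client.query(index: "docs", vector: [0.1, 0.2, 0.3], top_k: 1)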
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: vectra-client
  version: !ruby/object:Gem::Version
- version: 0.3.4
+ version: 1.0.0
  platform: ruby
  authors:
  - Mijo Kristo
@@ -269,6 +269,7 @@ files:
  - docs/guides/security.md
  - docs/index.md
  - docs/providers/index.md
+ - docs/providers/memory.md
  - docs/providers/pgvector.md
  - docs/providers/pinecone.md
  - docs/providers/qdrant.md
@@ -303,6 +304,7 @@ files:
  - lib/vectra/logging.rb
  - lib/vectra/pool.rb
  - lib/vectra/providers/base.rb
+ - lib/vectra/providers/memory.rb
  - lib/vectra/providers/pgvector.rb
  - lib/vectra/providers/pgvector/connection.rb
  - lib/vectra/providers/pgvector/index_management.rb