noiseless 0.0.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +28 -0
  3. data/README.md +214 -0
  4. data/lib/application_search.rb +15 -0
  5. data/lib/noiseless/adapter.rb +339 -0
  6. data/lib/noiseless/adapters/cluster_api.rb +18 -0
  7. data/lib/noiseless/adapters/elasticsearch.rb +30 -0
  8. data/lib/noiseless/adapters/execution_modules/elasticsearch_execution.rb +68 -0
  9. data/lib/noiseless/adapters/execution_modules/es_compatible_execution.rb +83 -0
  10. data/lib/noiseless/adapters/execution_modules/http_transport.rb +83 -0
  11. data/lib/noiseless/adapters/execution_modules/opensearch_execution.rb +209 -0
  12. data/lib/noiseless/adapters/execution_modules/pgvector_support.rb +219 -0
  13. data/lib/noiseless/adapters/execution_modules/postgresql_execution.rb +461 -0
  14. data/lib/noiseless/adapters/execution_modules/typesense_execution.rb +425 -0
  15. data/lib/noiseless/adapters/indices_api.rb +26 -0
  16. data/lib/noiseless/adapters/open_search.rb +168 -0
  17. data/lib/noiseless/adapters/postgresql.rb +171 -0
  18. data/lib/noiseless/adapters/typesense.rb +36 -0
  19. data/lib/noiseless/adapters.rb +14 -0
  20. data/lib/noiseless/ast/aggregation.rb +56 -0
  21. data/lib/noiseless/ast/bool.rb +16 -0
  22. data/lib/noiseless/ast/bulk.rb +18 -0
  23. data/lib/noiseless/ast/collapse.rb +16 -0
  24. data/lib/noiseless/ast/combined_fields.rb +33 -0
  25. data/lib/noiseless/ast/conversation.rb +29 -0
  26. data/lib/noiseless/ast/field_value_node.rb +16 -0
  27. data/lib/noiseless/ast/filter.rb +8 -0
  28. data/lib/noiseless/ast/hybrid.rb +35 -0
  29. data/lib/noiseless/ast/image_query.rb +29 -0
  30. data/lib/noiseless/ast/join.rb +31 -0
  31. data/lib/noiseless/ast/match.rb +8 -0
  32. data/lib/noiseless/ast/multi_match.rb +24 -0
  33. data/lib/noiseless/ast/paginate.rb +15 -0
  34. data/lib/noiseless/ast/prefix.rb +8 -0
  35. data/lib/noiseless/ast/range.rb +18 -0
  36. data/lib/noiseless/ast/root.rb +69 -0
  37. data/lib/noiseless/ast/search_after.rb +14 -0
  38. data/lib/noiseless/ast/sort.rb +15 -0
  39. data/lib/noiseless/ast/vector.rb +27 -0
  40. data/lib/noiseless/ast/wildcard.rb +8 -0
  41. data/lib/noiseless/ast.rb +30 -0
  42. data/lib/noiseless/bulk_importer.rb +195 -0
  43. data/lib/noiseless/callbacks.rb +138 -0
  44. data/lib/noiseless/connection_manager.rb +26 -0
  45. data/lib/noiseless/document_manager.rb +137 -0
  46. data/lib/noiseless/dsl.rb +107 -0
  47. data/lib/noiseless/generators/application_search_generator.rb +24 -0
  48. data/lib/noiseless/instrumentation.rb +174 -0
  49. data/lib/noiseless/introspection/console.rb +228 -0
  50. data/lib/noiseless/introspection/query_visualizer.rb +533 -0
  51. data/lib/noiseless/introspection.rb +221 -0
  52. data/lib/noiseless/mapping.rb +253 -0
  53. data/lib/noiseless/mapping_definition_processor.rb +231 -0
  54. data/lib/noiseless/model.rb +111 -0
  55. data/lib/noiseless/model_registry.rb +77 -0
  56. data/lib/noiseless/multi_search.rb +244 -0
  57. data/lib/noiseless/pagination.rb +375 -0
  58. data/lib/noiseless/query_builder.rb +284 -0
  59. data/lib/noiseless/railtie.rb +35 -0
  60. data/lib/noiseless/response/aggregations.rb +46 -0
  61. data/lib/noiseless/response/empty.rb +20 -0
  62. data/lib/noiseless/response/records.rb +94 -0
  63. data/lib/noiseless/response/results.rb +110 -0
  64. data/lib/noiseless/response/suggestions.rb +55 -0
  65. data/lib/noiseless/response.rb +98 -0
  66. data/lib/noiseless/response_factory.rb +32 -0
  67. data/lib/noiseless/runtime_reset_middleware.rb +15 -0
  68. data/lib/noiseless/search_index_update_job.rb +84 -0
  69. data/lib/noiseless/test_case.rb +230 -0
  70. data/lib/noiseless/test_helper.rb +295 -0
  71. data/lib/noiseless/version.rb +2 -2
  72. data/lib/noiseless.rb +146 -2
  73. data/lib/tasks/benchmark.rake +35 -0
  74. data/lib/tasks/release.rake +22 -0
  75. data/lib/tasks/test.rake +11 -0
  76. metadata +265 -14
@@ -0,0 +1,68 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require_relative "es_compatible_execution"
5
+
6
module Noiseless
  module Adapters
    module ExecutionModules
      # Elasticsearch-specific request execution.
      #
      # Adds search, index creation, document indexing and cluster health on
      # top of the operations mixed in from EsCompatibleExecution. Each method
      # closes the HTTP response in an +ensure+ clause, whether or not the
      # request or the parsing raised.
      module ElasticsearchExecution
        include EsCompatibleExecution

        private

        # POSTs +query_hash+ as JSON to the +_search+ endpoint.
        #
        # @param query_hash [Hash] query body, serialized with JSON.generate
        # @param indexes [Array<String>] index names, joined with commas; when
        #   the list is empty the request goes to the bare +/_search+ path
        # @return whatever +parse_json_response!+ (defined outside this module) returns
        def execute_search(query_hash, indexes: [], **_opts)
          endpoint = indexes.any? ? "/#{indexes.join(',')}/_search" : "/_search"
          payload = JSON.generate(query_hash)

          reply = post_request(endpoint, payload)
          parse_json_response!(reply, error_class: Noiseless::SearchError, context: "search")
        ensure
          reply&.close
        end

        # PUTs an index definition to +/<index_name>+.
        #
        # Only truthy +mappings+ / +settings+ are included; when neither is
        # given the request is sent without a body.
        #
        # @param index_name [String] name of the index to create
        # @param mappings [Hash, nil] optional mappings section
        # @param settings [Hash, nil] optional settings section
        def execute_create_index(index_name, mappings: nil, settings: nil, **_opts)
          definition = { mappings: mappings, settings: settings }.select { |_key, value| value }
          payload = definition.empty? ? nil : JSON.generate(definition)

          reply = put_request("/#{index_name}", payload)
          parse_json_response!(reply, context: "create index #{index_name}")
        ensure
          reply&.close
        end

        # Indexes one document.
        #
        # With an +id+ the document is PUT to +/<index>/_doc/<id>+; without one
        # it is POSTed to +/<index>/_doc+.
        #
        # @param index [String] target index
        # @param id [String, nil] document id, or nil
        # @param document [Hash] document body, serialized with JSON.generate
        def execute_index_document(index, id, document, **_opts)
          path = "/#{index}/_doc"
          path = "#{path}/#{id}" if id
          payload = JSON.generate(document)

          reply = id ? put_request(path, payload) : post_request(path, payload)
          parse_json_response!(reply, context: "index document #{index}/#{id}")
        ensure
          reply&.close
        end

        # GETs +/_cluster/health+ and returns the parsed JSON body.
        #
        # Any StandardError (request or JSON parsing) is converted into a
        # string-keyed placeholder hash with status "red" and the error's
        # class name and message under "error", instead of being raised.
        #
        # @return [Hash]
        def execute_cluster_health(**_opts)
          reply = get_request("/_cluster/health")
          JSON.parse(reply.read)
        rescue StandardError => error
          {
            "cluster_name" => "unknown",
            "status" => "red",
            "timed_out" => false,
            "number_of_nodes" => 0,
            "number_of_data_nodes" => 0,
            "active_primary_shards" => 0,
            "active_shards" => 0,
            "error" => { "type" => error.class.name, "reason" => error.message }
          }
        ensure
          reply&.close
        end
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require_relative "http_transport"
5
+
6
module Noiseless
  module Adapters
    module ExecutionModules
      # Document and index operations shared by the wire-compatible
      # Elasticsearch and OpenSearch HTTP APIs.
      #
      # Requests go through the helpers mixed in from HttpTransport; responses
      # are handed to +parse_json_response!+ (defined outside this module) and
      # always closed in an +ensure+ clause.
      module EsCompatibleExecution
        include HttpTransport

        private

        # Sends +actions+ to +/_bulk+ as newline-delimited JSON.
        #
        # An action with an +:index+ entry becomes two lines: a metadata line
        # built from its +:_index+ / +:_id+ and a source line built from its
        # +:data+. Any other action is serialized as a single line, unchanged.
        #
        # @param actions [Array<Hash>] bulk actions
        def execute_bulk(actions, **_opts)
          lines = actions.flat_map do |action|
            entry = action[:index]
            if entry
              metadata = { index: { _index: entry[:_index], _id: entry[:_id] } }
              [JSON.generate(metadata), JSON.generate(entry[:data])]
            else
              [JSON.generate(action)]
            end
          end
          payload = lines.map { |line| "#{line}\n" }.join

          reply = post_request("/_bulk", payload, content_type: "application/x-ndjson")
          parse_json_response!(reply, context: "bulk")
        ensure
          reply&.close
        end

        # Issues DELETE +/<index_name>+.
        def execute_delete_index(index_name, **_opts)
          reply = delete_request("/#{index_name}")
          parse_json_response!(reply, context: "delete index #{index_name}")
        ensure
          reply&.close
        end

        # Issues a body-less POST to +/<index_name>/_refresh+.
        def execute_refresh_index(index_name)
          reply = post_request("/#{index_name}/_refresh", nil)
          parse_json_response!(reply, context: "refresh index #{index_name}")
        ensure
          reply&.close
        end

        # Issues HEAD +/<index_name>+ and reports the response's +success?+.
        #
        # @return [Boolean] false when any StandardError is raised
        def execute_index_exists?(index_name)
          reply = head_request("/#{index_name}")
          reply.success?
        rescue StandardError
          false
        ensure
          reply&.close
        end

        # POSTs +changes+ wrapped as +{ doc: changes }+ to
        # +/<index>/_update/<id>+.
        def execute_update_document(index, id, changes, **_opts)
          payload = JSON.generate(doc: changes)

          reply = post_request("/#{index}/_update/#{id}", payload)
          parse_json_response!(reply, context: "update document #{index}/#{id}")
        ensure
          reply&.close
        end

        # Issues DELETE +/<index>/_doc/<id>+.
        def execute_delete_document(index, id, **_opts)
          reply = delete_request("/#{index}/_doc/#{id}")
          parse_json_response!(reply, context: "delete document #{index}/#{id}")
        ensure
          reply&.close
        end

        # Issues HEAD +/<index>/_doc/<id>+ and reports the response's +success?+.
        #
        # @return [Boolean] false when any StandardError is raised
        def execute_document_exists?(index, id)
          reply = head_request("/#{index}/_doc/#{id}")
          reply.success?
        rescue StandardError
          false
        ensure
          reply&.close
        end
      end
    end
  end
end
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
module Noiseless
  module Adapters
    module ExecutionModules
      # Shared Async::HTTP connection handling for HTTP-based adapters.
      # Host classes must provide a private +default_port+ method.
      module HttpTransport
        # Builds one Async::HTTP::Client per host and forwards the resolved
        # host list to the next +initialize+ in the ancestor chain.
        #
        # @param hosts [Array<String>, String, nil] endpoint URLs; when nothing
        #   is given, +http://localhost:<default_port>+ is used
        # @param connection_params [Hash] stored and passed on to +super+
        def initialize(hosts: [], **connection_params)
          requested = Array(hosts)
          @hosts = if requested.empty?
                     ["http://localhost:#{default_port}"]
                   else
                     requested
                   end
          @connection_params = connection_params

          # One client per configured host, keyed by the host string.
          @clients = {}
          @hosts.each do |host|
            @clients[host] = Async::HTTP::Client.new(Async::HTTP::Endpoint.parse(host))
          end

          super(hosts: @hosts, **connection_params)
        end

        # Closes every client created by +initialize+ (no-op when none exist).
        def close
          @clients&.each_value(&:close)
        end

        private

        # GET +path+ with the default headers.
        def get_request(path)
          with_client { |client| client.get(path, default_headers) }
        end

        # POST +body+ to +path+. A content-type header is added only when a
        # body is present.
        def post_request(path, body, content_type: "application/json")
          headers = default_headers
          headers += [["content-type", content_type]] if body

          with_client { |client| client.post(path, headers, body) }
        end

        # PUT +body+ to +path+. A content-type header is added only when a
        # body is present.
        def put_request(path, body, content_type: "application/json")
          headers = default_headers
          headers += [["content-type", content_type]] if body

          with_client { |client| client.put(path, headers, body) }
        end

        # DELETE +path+ with the default headers.
        def delete_request(path)
          with_client { |client| client.delete(path, default_headers) }
        end

        # HEAD +path+ with the default headers.
        def head_request(path)
          with_client { |client| client.head(path, default_headers) }
        end

        # Yields the client of a randomly sampled host and returns the
        # block's value.
        def with_client
          yield(@clients[@hosts.sample])
        end

        # Header pairs sent with every request.
        def default_headers
          user_agent = "Noiseless/#{Noiseless::VERSION} (Ruby/#{RUBY_VERSION})"

          [
            ["accept", "application/json"],
            ["user-agent", user_agent]
          ]
        end
      end
    end
  end
end
@@ -0,0 +1,209 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require_relative "es_compatible_execution"
5
+
6
module Noiseless
  module Adapters
    module ExecutionModules
      # OpenSearch-specific request execution.
      #
      # Adds search, index creation, document indexing, cluster health,
      # point-in-time / template search, search pipelines and query rules on
      # top of the operations mixed in from EsCompatibleExecution.
      #
      # Two error styles coexist here:
      # * methods that call +parse_json_response!+ (defined outside this
      #   module) delegate error handling to it;
      # * methods with a +rescue StandardError+ return the parsed JSON body
      #   (string keys, as produced by JSON.parse) on success and a
      #   symbol-keyed fallback hash on failure.
      #
      # Path segments and query values are interpolated as given; nothing in
      # this module URL-escapes them.
      module OpensearchExecution
        include EsCompatibleExecution

        private

        # POSTs +query_hash+ as JSON to +/<indexes>/_search+.
        #
        # @param query_hash [Hash] query body
        # @param indexes [Array<String>] index names joined with commas;
        #   "_all" is used when the list is empty
        def execute_search(query_hash, indexes: [], **_opts)
          index_path = indexes.any? ? indexes.join(",") : "_all"
          path = "/#{index_path}/_search"
          body = JSON.generate(query_hash)

          response = post_request(path, body)
          parse_json_response!(response, error_class: Noiseless::SearchError, context: "search #{index_path}")
        ensure
          response&.close
        end

        # PUTs an index definition to +/<index_name>+.
        #
        # Unlike a plain mappings/settings body, every extra keyword in
        # +opts+ is copied into the request body as a top-level key.
        #
        # @param index_name [String] name of the index to create
        # @param mappings [Hash, nil] optional mappings section
        # @param settings [Hash, nil] optional settings section
        # @param opts [Hash] additional top-level body keys
        def execute_create_index(index_name, mappings: nil, settings: nil, **opts)
          body = opts.dup
          body[:mappings] = mappings if mappings
          body[:settings] = settings if settings

          response = put_request("/#{index_name}", body.any? ? JSON.generate(body) : nil)
          parse_json_response!(response, context: "create index #{index_name}")
        ensure
          response&.close
        end

        # Indexes one document.
        #
        # With an +id+ the document is PUT to +/<index>/_doc/<id>+. Without
        # one it is POSTed to +/<index>/_doc+, matching what
        # ElasticsearchExecution does, instead of issuing a PUT against the
        # id-less path +/<index>/_doc/+.
        #
        # @param index [String] target index
        # @param id [String, nil] document id, or nil
        # @param document [Hash] document body, serialized with JSON.generate
        def execute_index_document(index, id, document, **_opts)
          path = id ? "/#{index}/_doc/#{id}" : "/#{index}/_doc"
          body = JSON.generate(document)

          response = id ? put_request(path, body) : post_request(path, body)
          parse_json_response!(response, context: "index document #{index}/#{id}")
        ensure
          response&.close
        end

        # GETs +/_cluster/health+ and returns the parsed JSON body.
        #
        # On any StandardError a symbol-keyed placeholder with status "red"
        # and the error's class name and message is returned instead.
        #
        # @return [Hash]
        def execute_cluster_health(**_opts)
          response = get_request("/_cluster/health")
          JSON.parse(response.read)
        rescue StandardError => e
          {
            cluster_name: "unknown",
            status: "red",
            timed_out: false,
            number_of_nodes: 0,
            number_of_data_nodes: 0,
            active_primary_shards: 0,
            active_shards: 0,
            relocating_shards: 0,
            initializing_shards: 0,
            unassigned_shards: 0,
            error: { type: e.class.name, reason: e.message }
          }
        ensure
          response&.close
        end

        # OpenSearch-specific features

        # Point-in-time search for consistent pagination: merges
        # +pit: { id: pit_id }+ into +query_hash+ and POSTs it to +/_search+.
        #
        # @param query_hash [Hash] query body (not mutated; a merged copy is sent)
        # @param pit_id [String] point-in-time id
        def execute_point_in_time_search(query_hash, pit_id:, **_opts)
          enhanced_query = query_hash.merge(pit: { id: pit_id })
          body = JSON.generate(enhanced_query)

          response = post_request("/_search", body)
          parse_json_response!(response, error_class: Noiseless::SearchError, context: "point-in-time search")
        ensure
          response&.close
        end

        # Runs a stored search template by POSTing its id and params to
        # +/_search/template+.
        #
        # @param template_id [String] id of the stored template
        # @param params [Hash] template parameters
        def execute_search_template(template_id:, params: {}, **_opts)
          template_query = {
            id: template_id,
            params: params
          }
          body = JSON.generate(template_query)

          response = post_request("/_search/template", body)
          parse_json_response!(response, error_class: Noiseless::SearchError, context: "search template #{template_id}")
        ensure
          response&.close
        end

        # ============================================
        # Search Pipeline API (OpenSearch 3.x)
        # ============================================

        # PUTs a pipeline definition to +/_search/pipeline/<name>+.
        # Nil-valued sections are dropped from the body.
        #
        # @return [Hash] parsed response, or
        #   +{ acknowledged: false, error: {...} }+ on any StandardError
        def execute_create_pipeline(name, request_processors:, response_processors:, description: nil)
          body = {
            description: description,
            request_processors: request_processors,
            response_processors: response_processors
          }.compact

          response = put_request("/_search/pipeline/#{name}", JSON.generate(body))
          JSON.parse(response.read)
        rescue StandardError => e
          { acknowledged: false, error: { type: e.class.name, reason: e.message } }
        ensure
          response&.close
        end

        # GETs +/_search/pipeline/<name>+.
        #
        # @return [Hash] parsed response, or +{ error: {...} }+ on any StandardError
        def execute_get_pipeline(name)
          response = get_request("/_search/pipeline/#{name}")
          JSON.parse(response.read)
        rescue StandardError => e
          { error: { type: e.class.name, reason: e.message } }
        ensure
          response&.close
        end

        # GETs +/_search/pipeline+.
        #
        # @return [Hash] parsed response, or +{ error: {...} }+ on any StandardError
        def execute_list_pipelines
          response = get_request("/_search/pipeline")
          JSON.parse(response.read)
        rescue StandardError => e
          { error: { type: e.class.name, reason: e.message } }
        ensure
          response&.close
        end

        # Issues DELETE +/_search/pipeline/<name>+.
        #
        # @return [Hash] parsed response, or
        #   +{ acknowledged: false, error: {...} }+ on any StandardError
        def execute_delete_pipeline(name)
          response = delete_request("/_search/pipeline/#{name}")
          JSON.parse(response.read)
        rescue StandardError => e
          { acknowledged: false, error: { type: e.class.name, reason: e.message } }
        ensure
          response&.close
        end

        # Issues HEAD +/_search/pipeline/<name>+ and reports the response's
        # +success?+.
        #
        # @return [Boolean] false when any StandardError is raised
        def execute_pipeline_exists?(name)
          response = head_request("/_search/pipeline/#{name}")
          response.success?
        rescue StandardError
          false
        ensure
          response&.close
        end

        # ============================================
        # Query Rules API (OpenSearch 3.x)
        # ============================================

        # PUTs a rule to +/_rules/<feature_type>/<rule_id>+ with
        # +attributes+ nested under +match_criteria.query+.
        #
        # @return [Hash] parsed response, or
        #   +{ acknowledged: false, error: {...} }+ on any StandardError
        def execute_create_rule(feature_type, rule_id, attributes:, feature_value:)
          body = {
            match_criteria: {
              query: attributes
            },
            feature_value: feature_value
          }

          response = put_request("/_rules/#{feature_type}/#{rule_id}", JSON.generate(body))
          JSON.parse(response.read)
        rescue StandardError => e
          { acknowledged: false, error: { type: e.class.name, reason: e.message } }
        ensure
          response&.close
        end

        # GETs +/_rules/<feature_type>/<rule_id>+.
        #
        # @return [Hash] parsed response, or +{ error: {...} }+ on any StandardError
        def execute_get_rule(feature_type, rule_id)
          response = get_request("/_rules/#{feature_type}/#{rule_id}")
          JSON.parse(response.read)
        rescue StandardError => e
          { error: { type: e.class.name, reason: e.message } }
        ensure
          response&.close
        end

        # GETs +/_rules/<feature_type>+, optionally with a +search_after+
        # query parameter.
        #
        # NOTE: +search_after+ is appended verbatim; callers must pass a
        # value that is already safe to place in a query string.
        #
        # @return [Hash] parsed response, or +{ rules: [], error: {...} }+ on
        #   any StandardError
        def execute_list_rules(feature_type, search_after: nil)
          path = "/_rules/#{feature_type}"
          path += "?search_after=#{search_after}" if search_after

          response = get_request(path)
          JSON.parse(response.read)
        rescue StandardError => e
          { rules: [], error: { type: e.class.name, reason: e.message } }
        ensure
          response&.close
        end

        # Issues DELETE +/_rules/<feature_type>/<rule_id>+.
        #
        # @return [Hash] parsed response, or
        #   +{ acknowledged: false, error: {...} }+ on any StandardError
        def execute_delete_rule(feature_type, rule_id)
          response = delete_request("/_rules/#{feature_type}/#{rule_id}")
          JSON.parse(response.read)
        rescue StandardError => e
          { acknowledged: false, error: { type: e.class.name, reason: e.message } }
        ensure
          response&.close
        end

        # Issues HEAD +/_rules/<feature_type>/<rule_id>+ and reports the
        # response's +success?+.
        #
        # @return [Boolean] false when any StandardError is raised
        def execute_rule_exists?(feature_type, rule_id)
          response = head_request("/_rules/#{feature_type}/#{rule_id}")
          response.success?
        rescue StandardError
          false
        ensure
          response&.close
        end
      end
    end
  end
end
@@ -0,0 +1,219 @@
1
+ # frozen_string_literal: true
2
+
3
module Noiseless
  module Adapters
    module ExecutionModules
      # pgvector support for semantic/vector search in PostgreSQL
      # Provides similarity search using embeddings
      #
      # Required:
      #   CREATE EXTENSION IF NOT EXISTS vector;
      #
      # Table setup:
      #   ALTER TABLE your_table ADD COLUMN embedding vector(1536);
      #   CREATE INDEX ON your_table USING ivfflat (embedding vector_cosine_ops);
      #
      # The including class must provide +quoted_column+ and
      # +available_extensions+; neither is defined in this module.
      #
      # Embeddings end up inside SQL string literals, so every embedding is
      # passed through +pgvector_literal+, which coerces each component with
      # Float() and raises on anything that is not numeric.
      module PgvectorSupport
        # Perform semantic search using vector similarity
        #
        # @param scope [ActiveRecord::Relation] The base scope to search
        # @param embedding [Array<Float>, String] The query embedding vector
        # @param column [Symbol] The column containing embeddings (default: :embedding)
        # @param limit [Integer] Maximum results to return
        # @param distance_threshold [Float] Maximum distance threshold (optional)
        # @param distance_metric [Symbol] :cosine, :l2, or :inner_product
        # @return [ActiveRecord::Relation] Scope with vector similarity ordering;
        #   the unmodified +scope+ when pgvector is not available
        # @raise [ArgumentError, TypeError] when the embedding is empty or
        #   contains a non-numeric component
        #
        def vector_search(scope, embedding, column: :embedding, limit: 20, distance_threshold: nil,
                          distance_metric: :cosine)
          return scope unless pgvector_available?

          distance_sql =
            "#{quoted_column(column)} #{distance_operator(distance_metric)} '#{pgvector_literal(embedding)}'"

          # Expose the distance as a selectable attribute.
          scope = scope.select("#{scope.table_name}.*", "#{distance_sql} AS vector_distance")

          # Apply distance threshold if specified
          scope = scope.where("#{distance_sql} < ?", distance_threshold) if distance_threshold

          # Order by similarity (ascending distance = more similar)
          scope.order(Arel.sql(distance_sql)).limit(limit)
        end

        # Hybrid search combining text and vector search
        #
        # The text query is quoted with the connection's +quote+ and inlined
        # into the generated SQL. SELECT fragments cannot carry +?+ bind
        # placeholders, so nothing here relies on bind values.
        #
        # @param scope [ActiveRecord::Relation] Base scope
        # @param text_query [String] Text query for pg_trgm search
        # @param embedding [Array<Float>, String] Query embedding for vector search
        # @param text_fields [Array<Symbol>] Fields to search with text; when
        #   empty the text score is 0 and only the vector score contributes
        # @param vector_column [Symbol] Column containing embeddings
        # @param text_weight [Float] Weight for text similarity (0.0-1.0)
        # @param vector_weight [Float] Weight for vector similarity (0.0-1.0)
        # @param limit [Integer] Maximum results to return
        # @return [ActiveRecord::Relation] the unmodified +scope+ when pgvector
        #   is not available
        # @raise [ArgumentError, TypeError] when the embedding or a weight is
        #   not numeric
        #
        def hybrid_search(scope, text_query:, embedding:, text_fields:, vector_column: :embedding,
                          text_weight: 0.5, vector_weight: 0.5, limit: 20)
          return scope unless pgvector_available?

          fields = Array(text_fields)
          quoted_query = ActiveRecord::Base.connection.quote(text_query)
          text_sum = fields.map { |field| "similarity(#{quoted_column(field)}, #{quoted_query})" }.join(" + ")

          # Text similarity (0-1 per field, averaged)
          text_average = fields.empty? ? "0" : "((#{text_sum}) / #{fields.size})"
          text_match = fields.empty? ? "FALSE" : "(#{text_sum}) > 0"

          # Vector similarity (convert distance to similarity: 1 - distance for cosine)
          vector_column_sql = quoted_column(vector_column)
          vector_similarity = "(1 - (#{vector_column_sql} <=> '#{pgvector_literal(embedding)}'))"

          text_score = "#{text_average} * #{Float(text_weight)}"
          vector_score = "#{vector_similarity} * #{Float(vector_weight)}"

          scope.select(
            "#{scope.table_name}.*",
            "#{text_score} AS text_score",
            "#{vector_score} AS vector_score",
            "(#{text_score} + #{vector_score}) AS combined_score"
          ).where("#{text_match} OR #{vector_column_sql} IS NOT NULL")
               .order(Arel.sql("combined_score DESC"))
               .limit(limit)
        end

        # Execute a KNN (K-Nearest Neighbors) search using cosine distance
        #
        # Rows whose embedding column is NULL are excluded: they have no
        # distance, so no score could be computed for them.
        #
        # @param model [Class] The ActiveRecord model
        # @param embedding [Array<Float>, String] Query embedding
        # @param k [Integer] Number of nearest neighbors
        # @param column [Symbol] Embedding column
        # @param filters [Hash] Additional WHERE conditions
        # @return [Hash, Array] Elasticsearch-style response hash built by
        #   +format_knn_response+; an empty Array when pgvector is not available
        #
        def knn_search(model, embedding, k: 10, column: :embedding, filters: {})
          return [] unless pgvector_available?

          distance_sql = "#{quoted_column(column)} <=> '#{pgvector_literal(embedding)}'"

          scope = model.all.where.not(column => nil)
          scope = scope.where(filters) if filters.any?

          results = scope.select("#{model.table_name}.*", "#{distance_sql} AS distance")
                         .order(Arel.sql(distance_sql))
                         .limit(k)

          format_knn_response(results, model)
        end

        # Store an embedding for a record
        #
        # @param record [ActiveRecord::Base] The record to update
        # @param embedding [Array<Float>, String] The embedding vector
        # @param column [Symbol] The column to store the embedding
        # @return [Boolean] false when pgvector is not available, otherwise
        #   the result of +update_column+
        #
        def store_embedding(record, embedding, column: :embedding)
          return false unless pgvector_available?

          record.update_column(column, pgvector_literal(embedding))
        end

        # Batch store embeddings
        #
        # Issues a single UPDATE ... FROM (VALUES ...) statement. Ids are cast
        # with +::uuid+, so the statement as written targets uuid primary keys.
        # Any StandardError is logged through Rails.logger and reported as 0.
        #
        # @param model [Class] The ActiveRecord model
        # @param embeddings [Hash<String, Array<Float>>] Map of ID -> embedding
        # @param column [Symbol] The column to store embeddings
        # @return [Integer] number of embeddings submitted, or 0 on failure,
        #   empty input, or missing pgvector
        #
        def batch_store_embeddings(model, embeddings, column: :embedding)
          return 0 unless pgvector_available?
          # An empty VALUES list is not valid SQL; there is nothing to do.
          return 0 if embeddings.empty?

          connection = ActiveRecord::Base.connection

          # Use UPDATE FROM VALUES for efficient batch update
          values = embeddings.map do |id, emb|
            "(#{connection.quote(id)}, '#{pgvector_literal(emb)}'::vector)"
          end.join(",")

          sql = <<~SQL.squish
            UPDATE #{model.table_name}
            SET #{column} = v.embedding
            FROM (VALUES #{values}) AS v(id, embedding)
            WHERE #{model.table_name}.id = v.id::uuid
          SQL

          connection.execute(sql)
          embeddings.size
        rescue StandardError => e
          Rails.logger.error("Failed to batch store embeddings: #{e.message}")
          0
        end

        # Find similar records to a given record
        #
        # @param record [ActiveRecord::Base] The reference record
        # @param limit [Integer] Number of similar records
        # @param column [Symbol] Embedding column
        # @param exclude_self [Boolean] Exclude the reference record
        # @return [ActiveRecord::Relation] an empty relation when the record
        #   has no embedding or pgvector is not available
        #
        def find_similar(record, limit: 10, column: :embedding, exclude_self: true)
          embedding = record.send(column)
          return record.class.none unless embedding && pgvector_available?

          scope = record.class.where.not(column => nil)
          scope = scope.where.not(id: record.id) if exclude_self

          vector_search(scope, embedding, column: column, limit: limit)
        end

        # Check if pgvector is available
        #
        # Only a positive answer is memoized (+||=+); while the extension is
        # missing, +available_extensions+ is consulted on every call.
        def pgvector_available?
          @pgvector_available ||= available_extensions.include?("vector")
        end

        private

        # Maps a metric name to the pgvector distance operator; anything
        # unrecognised falls back to cosine distance.
        def distance_operator(metric)
          case metric
          when :l2, :euclidean
            "<->" # L2/Euclidean distance
          when :inner_product
            "<#>" # Negative inner product
          else
            "<=>" # Cosine distance (default)
          end
        end

        # Builds the +[x,y,...]+ text form of a vector from +embedding+.
        #
        # Accepts anything responding to +to_a+, or a String already in the
        # bracketed form (presumably what an untyped vector column returns —
        # TODO confirm against the model's attribute type). Each component
        # goes through Float(), so the result contains only numbers and is
        # safe to embed in a SQL string literal.
        #
        # @raise [ArgumentError] for an empty embedding or a non-numeric string
        # @raise [TypeError] for a component Float() cannot convert (e.g. nil)
        def pgvector_literal(embedding)
          components =
            case embedding
            when String then embedding.delete("[]").split(",")
            else embedding.to_a
            end
          raise ArgumentError, "embedding must contain at least one component" if components.empty?

          "[#{components.map { |component| Float(component) }.join(',')}]"
        end

        # Wraps KNN results in an Elasticsearch-style search response.
        # The score is 1 - distance; records are expected in ascending
        # distance order, so the first hit carries the maximum score.
        def format_knn_response(records, model)
          hits = records.map do |record|
            {
              "_index" => model.table_name,
              "_id" => record.id.to_s,
              "_score" => 1.0 - (record.respond_to?(:distance) ? record.distance : 0),
              "_source" => record.as_json(except: [:distance])
            }
          end

          {
            "took" => 0,
            "timed_out" => false,
            "_shards" => { "total" => 1, "successful" => 1, "skipped" => 0, "failed" => 0 },
            "hits" => {
              "total" => { "value" => hits.size, "relation" => "eq" },
              "max_score" => hits.first&.dig("_score"),
              "hits" => hits
            }
          }
        end
      end
    end
  end
end