noiseless 0.0.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE.txt +28 -0
  3. data/README.md +214 -0
  4. data/lib/application_search.rb +15 -0
  5. data/lib/noiseless/adapter.rb +339 -0
  6. data/lib/noiseless/adapters/cluster_api.rb +18 -0
  7. data/lib/noiseless/adapters/elasticsearch.rb +30 -0
  8. data/lib/noiseless/adapters/execution_modules/elasticsearch_execution.rb +68 -0
  9. data/lib/noiseless/adapters/execution_modules/es_compatible_execution.rb +83 -0
  10. data/lib/noiseless/adapters/execution_modules/http_transport.rb +83 -0
  11. data/lib/noiseless/adapters/execution_modules/opensearch_execution.rb +209 -0
  12. data/lib/noiseless/adapters/execution_modules/pgvector_support.rb +219 -0
  13. data/lib/noiseless/adapters/execution_modules/postgresql_execution.rb +461 -0
  14. data/lib/noiseless/adapters/execution_modules/typesense_execution.rb +425 -0
  15. data/lib/noiseless/adapters/indices_api.rb +26 -0
  16. data/lib/noiseless/adapters/open_search.rb +168 -0
  17. data/lib/noiseless/adapters/postgresql.rb +171 -0
  18. data/lib/noiseless/adapters/typesense.rb +36 -0
  19. data/lib/noiseless/adapters.rb +14 -0
  20. data/lib/noiseless/ast/aggregation.rb +56 -0
  21. data/lib/noiseless/ast/bool.rb +16 -0
  22. data/lib/noiseless/ast/bulk.rb +18 -0
  23. data/lib/noiseless/ast/collapse.rb +16 -0
  24. data/lib/noiseless/ast/combined_fields.rb +33 -0
  25. data/lib/noiseless/ast/conversation.rb +29 -0
  26. data/lib/noiseless/ast/field_value_node.rb +16 -0
  27. data/lib/noiseless/ast/filter.rb +8 -0
  28. data/lib/noiseless/ast/hybrid.rb +35 -0
  29. data/lib/noiseless/ast/image_query.rb +29 -0
  30. data/lib/noiseless/ast/join.rb +31 -0
  31. data/lib/noiseless/ast/match.rb +8 -0
  32. data/lib/noiseless/ast/multi_match.rb +24 -0
  33. data/lib/noiseless/ast/paginate.rb +15 -0
  34. data/lib/noiseless/ast/prefix.rb +8 -0
  35. data/lib/noiseless/ast/range.rb +18 -0
  36. data/lib/noiseless/ast/root.rb +69 -0
  37. data/lib/noiseless/ast/search_after.rb +14 -0
  38. data/lib/noiseless/ast/sort.rb +15 -0
  39. data/lib/noiseless/ast/vector.rb +27 -0
  40. data/lib/noiseless/ast/wildcard.rb +8 -0
  41. data/lib/noiseless/ast.rb +30 -0
  42. data/lib/noiseless/bulk_importer.rb +195 -0
  43. data/lib/noiseless/callbacks.rb +138 -0
  44. data/lib/noiseless/connection_manager.rb +26 -0
  45. data/lib/noiseless/document_manager.rb +137 -0
  46. data/lib/noiseless/dsl.rb +107 -0
  47. data/lib/noiseless/generators/application_search_generator.rb +24 -0
  48. data/lib/noiseless/instrumentation.rb +174 -0
  49. data/lib/noiseless/introspection/console.rb +228 -0
  50. data/lib/noiseless/introspection/query_visualizer.rb +533 -0
  51. data/lib/noiseless/introspection.rb +221 -0
  52. data/lib/noiseless/mapping.rb +253 -0
  53. data/lib/noiseless/mapping_definition_processor.rb +231 -0
  54. data/lib/noiseless/model.rb +111 -0
  55. data/lib/noiseless/model_registry.rb +77 -0
  56. data/lib/noiseless/multi_search.rb +244 -0
  57. data/lib/noiseless/pagination.rb +375 -0
  58. data/lib/noiseless/query_builder.rb +284 -0
  59. data/lib/noiseless/railtie.rb +35 -0
  60. data/lib/noiseless/response/aggregations.rb +46 -0
  61. data/lib/noiseless/response/empty.rb +20 -0
  62. data/lib/noiseless/response/records.rb +94 -0
  63. data/lib/noiseless/response/results.rb +110 -0
  64. data/lib/noiseless/response/suggestions.rb +55 -0
  65. data/lib/noiseless/response.rb +98 -0
  66. data/lib/noiseless/response_factory.rb +32 -0
  67. data/lib/noiseless/runtime_reset_middleware.rb +15 -0
  68. data/lib/noiseless/search_index_update_job.rb +84 -0
  69. data/lib/noiseless/test_case.rb +230 -0
  70. data/lib/noiseless/test_helper.rb +295 -0
  71. data/lib/noiseless/version.rb +2 -2
  72. data/lib/noiseless.rb +146 -2
  73. data/lib/tasks/benchmark.rake +35 -0
  74. data/lib/tasks/release.rake +22 -0
  75. data/lib/tasks/test.rake +11 -0
  76. metadata +265 -14
@@ -0,0 +1,461 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "pgvector_support"
4
+
5
+ module Noiseless
6
+ module Adapters
7
+ module ExecutionModules
8
+ # PostgreSQL execution module - translates noiseless AST to PostgreSQL queries
9
+ # Uses pg_trgm for fuzzy matching, unaccent for accent-insensitive search,
10
+ # and optionally pgvector for semantic search
11
+ module PostgresqlExecution
12
+ include PgvectorSupport
13
+
14
+ SIMILARITY_THRESHOLD = 0.3
15
+ DEFAULT_LIMIT = 20
16
+
17
+ private
18
+
19
# Runs the search described by +query_hash+ and returns a search-response hash.
#
# The model is +model_class+ when given, otherwise whatever #resolve_model
# derives from +query_hash[:indexes]+. A query carrying a +:vector+ entry is
# handed to #execute_vector_search. Any StandardError raised along the way
# is turned into #error_response instead of propagating.
def execute_search(query_hash, model_class: nil, **)
  target = resolve_model(query_hash[:indexes], model_class)
  return empty_response unless target

  if query_hash[:vector]
    # Vector queries take a separate execution path.
    execute_vector_search(target, query_hash)
  else
    matches = build_search_scope(target, query_hash).to_a
    format_as_search_response(matches, target)
  end
rescue StandardError => e
  error_response(e)
end
33
+
34
# Executes a vector (nearest-neighbour) query for +model+.
#
# Returns #empty_response when the query has no +:vector+ node or when
# #pgvector_available? is false. Filter clauses from +query_hash[:bool]+ are
# applied before the vector search. Any StandardError becomes #error_response.
def execute_vector_search(model, query_hash)
  node = query_hash[:vector]
  return empty_response if !node || !pgvector_available?

  # Narrow the candidate set with the filter clauses first.
  base = model.all
  filtered = apply_filter_clauses(base, query_hash[:bool]&.filter || [])

  embedding = node.embedding
  options = { column: node.field, limit: node.k, distance_metric: node.distance_metric }
  nearest = vector_search(filtered, embedding, **options)

  format_vector_response(nearest.to_a, model, node)
rescue StandardError => e
  error_response(e)
end
58
+
59
# Wraps vector-search +records+ in a search-response hash.
#
# Each hit's "_score" is +1.0 - distance+, where the distance is read from
# +record.vector_distance+ when the record responds to it and is 0 otherwise.
# "max_score" is the score of the first hit (nil when there are no hits).
# The third argument is accepted but not used.
def format_vector_response(records, model, _vector_node)
  hits = records.each_with_object([]) do |record, collected|
    distance = 0
    distance = record.vector_distance if record.respond_to?(:vector_distance)

    collected << {
      "_index" => model.table_name,
      "_id" => record.id.to_s,
      "_score" => 1.0 - distance, # Convert distance to similarity score
      "_source" => record.as_json(except: [:vector_distance])
    }
  end

  top_score = hits.empty? ? nil : hits.first["_score"]
  shards = { "total" => 1, "successful" => 1, "skipped" => 0, "failed" => 0 }

  {
    "took" => 0,
    "timed_out" => false,
    "_shards" => shards,
    "hits" => {
      "total" => { "value" => hits.size, "relation" => "eq" },
      "max_score" => top_score,
      "hits" => hits
    }
  }
end
81
+
82
# Processes each bulk action in order and returns
# +{ "items" => [...], "errors" => true/false }+.
#
# "errors" is true when any item failed. Items produced by
# #process_bulk_action nest the per-document result under an "index" or
# "delete" key, so the nested result's "error" entry is checked as well as
# the item's own top-level "error" (used for unknown action types).
def execute_bulk(actions, **)
  items = actions.map { |action| process_bulk_action(action) }

  failed = items.any? do |item|
    item["error"] ||
      item.each_value.any? { |outcome| outcome.is_a?(Hash) && outcome["error"] }
  end

  { "items" => items, "errors" => failed }
end
89
+
90
# Index creation does nothing in this adapter: the index name and any
# options are ignored and an acknowledgement hash is returned.
def execute_create_index(_index_name, **)
  acknowledgement = { "acknowledged" => true }
  acknowledgement
end
94
+
95
# Index deletion does nothing in this adapter (tables are never dropped
# from here): the arguments are ignored and an acknowledgement is returned.
def execute_delete_index(_index_name, **)
  acknowledgement = { "acknowledged" => true }
  acknowledgement
end
99
+
100
# Reports whether +index_name+ resolves to a model whose table exists.
# Any StandardError raised while checking is treated as "does not exist".
def execute_index_exists?(index_name)
  candidate = resolve_model([index_name])
  return false unless candidate.present?

  candidate.table_exists?
rescue StandardError
  false
end
106
+
107
# Creates or updates the row with primary key +id+ in the model resolved
# from +index+, assigning only the attributes of +document+ that match a
# column name.
#
# Document keys are stringified before filtering: +model.column_names+ holds
# strings, so a symbol-keyed document would otherwise be filtered down to
# nothing and saved without any of its attributes.
#
# Returns a result hash whose "result" is "created", "updated" or "error"
# (with the message under "error"); exceptions are not propagated.
def execute_index_document(index, id, document, **)
  model = resolve_model([index])
  return { "_id" => id, "result" => "error", "error" => "Model not found" } unless model

  attributes = document.transform_keys(&:to_s).slice(*model.column_names)

  record = model.find_or_initialize_by(id: id)
  record.assign_attributes(attributes)
  record.save!

  outcome = record.previously_new_record? ? "created" : "updated"
  { "_index" => index, "_id" => id, "result" => outcome }
rescue StandardError => e
  { "_index" => index, "_id" => id, "result" => "error", "error" => e.message }
end
119
+
120
# Applies +changes+ to the existing row +id+ of the model resolved from
# +index+, keeping only the entries that match a column name.
#
# Keys are stringified before filtering: +model.column_names+ holds strings,
# so symbol-keyed changes would otherwise be filtered out entirely and the
# update would silently do nothing.
#
# Returns a result hash whose "result" is "updated", "not_found" (the row
# does not exist) or "error"; exceptions are not propagated.
def execute_update_document(index, id, changes, **)
  model = resolve_model([index])
  return { "_id" => id, "result" => "error", "error" => "Model not found" } unless model

  permitted = changes.transform_keys(&:to_s).slice(*model.column_names)

  record = model.find(id)
  record.update!(permitted)

  { "_index" => index, "_id" => id, "result" => "updated" }
rescue ActiveRecord::RecordNotFound
  { "_index" => index, "_id" => id, "result" => "not_found" }
rescue StandardError => e
  { "_index" => index, "_id" => id, "result" => "error", "error" => e.message }
end
133
+
134
# Destroys row +id+ of the model resolved from +index+.
#
# Returns a result hash whose "result" is "deleted", "not_found" (when
# ActiveRecord::RecordNotFound is raised) or "error" (unresolvable model or
# any other StandardError); exceptions are not propagated.
def execute_delete_document(index, id, **)
  target = resolve_model([index])

  if target
    target.destroy(id)
    { "_index" => index, "_id" => id, "result" => "deleted" }
  else
    { "_id" => id, "result" => "error", "error" => "Model not found" }
  end
rescue ActiveRecord::RecordNotFound
  { "_index" => index, "_id" => id, "result" => "not_found" }
rescue StandardError => e
  { "_index" => index, "_id" => id, "result" => "error", "error" => e.message }
end
145
+
146
# Reports whether the model resolved from +index+ has a row with this +id+.
# Returns false when no model resolves or when any StandardError is raised.
def execute_document_exists?(index, id)
  target = resolve_model([index])
  return false unless target

  target.exists?(id: id) || false
rescue StandardError
  false
end
152
+
153
# Probes the database with "SELECT 1" and reports the outcome as a
# cluster-health style hash: status "green" on success, status "red" plus
# the exception message when the probe raises a StandardError.
def execute_cluster_health(**)
  report = { "cluster_name" => "postgresql" }

  # The probe's result is irrelevant; only whether it raises matters.
  ActiveRecord::Base.connection.execute("SELECT 1")

  report.merge("status" => "green", "number_of_nodes" => 1)
rescue StandardError => e
  { "cluster_name" => "postgresql", "status" => "red", "error" => e.message }
end
168
+
169
+ # Query building methods
170
+
171
# Builds the relation for a non-vector search by applying, in order: the
# bool "must" clauses, the bool "filter" clauses, sorting and pagination.
# A missing +:bool+ or +:sort+ entry is treated as an empty list.
def build_search_scope(model, query_hash)
  matched = apply_must_clauses(model.all, query_hash[:bool]&.must || [], model)
  filtered = apply_filter_clauses(matched, query_hash[:bool]&.filter || [])
  ordered = apply_sorting(filtered, query_hash[:sort] || [])

  apply_pagination(ordered, query_hash[:paginate])
end
186
+
187
# Folds every "must" node into +scope+, dispatching on the node's AST class.
# Node types without a handler here leave the scope unchanged.
def apply_must_clauses(scope, must_nodes, model)
  return scope if must_nodes.empty?

  must_nodes.reduce(scope) do |current, node|
    case node
    when AST::Match then apply_match(current, node, model)
    when AST::MultiMatch then apply_multi_match(current, node, model)
    when AST::Wildcard then apply_wildcard(current, node)
    when AST::Range then apply_range(current, node)
    when AST::Prefix then apply_prefix(current, node)
    else current
    end
  end
end
209
+
210
# Adds a match condition on +node.field+ for +node.value+.
#
# When pg_trgm is available and the field is a string/text column, the
# condition is "trigram-similar (%) OR contains (ILIKE)". Both operands are
# wrapped in unaccent() only when #unaccent_available? says the extension is
# installed; previously unaccent() was emitted unconditionally, producing a
# query that fails on databases with pg_trgm but without unaccent.
# In every other case the condition is a plain ILIKE "contains" match.
def apply_match(scope, node, model)
  field = node.field.to_s
  value = node.value.to_s
  column = quoted_column(field)
  contains = "%#{sanitize_like(value)}%"

  return scope.where("#{column} ILIKE ?", contains) unless trgm_available? && text_column?(model, field)

  if unaccent_available?
    scope.where(
      "unaccent(#{column}) % unaccent(?) OR unaccent(#{column}) ILIKE unaccent(?)",
      value,
      contains
    )
  else
    scope.where("#{column} % ? OR #{column} ILIKE ?", value, contains)
  end
end
227
+
228
# Adds one OR-combined condition matching +node.query+ against every field
# in +node.fields+.
#
# Per field: string/text columns use "trigram-similar (%) OR contains
# (ILIKE)" when pg_trgm is available, wrapped in unaccent() only when
# #unaccent_available? reports the extension (previously unaccent() was
# emitted unconditionally). Other fields use a plain ILIKE "contains" match.
# SQL fragments and bind values are built in a single pass so they cannot
# drift apart. With no fields the scope is returned unchanged.
def apply_multi_match(scope, node, model)
  query = node.query.to_s
  contains = "%#{sanitize_like(query)}%"

  conditions = []
  binds = []

  node.fields.map(&:to_s).each do |field|
    column = quoted_column(field)

    if trgm_available? && text_column?(model, field)
      lhs, rhs = unaccent_available? ? ["unaccent(#{column})", "unaccent(?)"] : [column, "?"]
      conditions << "(#{lhs} % #{rhs} OR #{lhs} ILIKE #{rhs})"
      binds.push(query, contains)
    else
      conditions << "#{column} ILIKE ?"
      binds << contains
    end
  end

  return scope if conditions.empty?

  scope.where(conditions.join(" OR "), *binds)
end
251
+
252
# Adds a case-insensitive wildcard condition on +node.field+.
#
# The search-engine style pattern in +node.value+ is translated to an ILIKE
# pattern: * becomes %, ? becomes _. Characters that are special to LIKE
# (%, _ and backslash) but literal in the incoming pattern are escaped, so a
# literal underscore no longer behaves as a single-character wildcard.
# A backslash followed by a character is treated as "take that character
# literally" — this assumes the incoming pattern uses backslash as its
# escape character; confirm against the query DSL in use.
def apply_wildcard(scope, node)
  pattern = node.value.to_s.gsub(/\\(.)|[*?%_\\]/m) do |token|
    literal = Regexp.last_match(1)

    if literal
      # Escaped character: keep it literal, re-escaping it for LIKE if needed.
      "%_\\".include?(literal) ? "\\#{literal}" : literal
    elsif token == "*"
      "%"
    elsif token == "?"
      "_"
    else
      "\\#{token}"
    end
  end

  scope.where("#{quoted_column(node.field.to_s)} ILIKE ?", pattern)
end
259
+
260
# Adds one comparison per bound set on the range node, in the order
# gte, lte, gt, lt. Bounds that are nil (or false) are skipped.
def apply_range(scope, node)
  column = quoted_column(node.field.to_s)

  bounds = [
    [">=", node.gte],
    ["<=", node.lte],
    [">", node.gt],
    ["<", node.lt]
  ]

  bounds.reduce(scope) do |current, (operator, bound)|
    bound ? current.where("#{column} #{operator} ?", bound) : current
  end
end
270
+
271
# Adds a case-insensitive "starts with" condition on +node.field+; LIKE
# metacharacters inside the prefix are escaped via #sanitize_like.
def apply_prefix(scope, node)
  column = quoted_column(node.field.to_s)
  pattern = "#{sanitize_like(node.value)}%"

  scope.where("#{column} ILIKE ?", pattern)
end
274
+
275
# Folds every filter node into +scope+. A node whose value is a Hash with a
# +:geo_distance+ entry goes through #apply_geo_filter; every other node
# becomes an equality-style +where(field => value)+ condition.
def apply_filter_clauses(scope, filter_nodes)
  return scope if filter_nodes.empty?

  filter_nodes.reduce(scope) do |current, node|
    criteria = node.value
    geo = criteria.is_a?(Hash) && criteria[:geo_distance]

    geo ? apply_geo_filter(current, node) : current.where(node.field => criteria)
  end
end
290
+
291
# Adds an ST_DWithin (PostGIS) radius condition for a geo_distance filter.
#
# +node.value[:geo_distance]+ is expected to hold +:distance+ plus one entry
# whose value is a Hash with +:lat+ and +:lon+; when no such entry exists
# the scope is returned unchanged. The column name is quoted through
# #quoted_column like every other query helper here, instead of being
# interpolated raw into the SQL string.
#
# Any StandardError raised while building the condition leaves the scope
# unfiltered (best effort, as before).
def apply_geo_filter(scope, node)
  geo_config = node.value[:geo_distance]

  # The point is stored under an arbitrary key; pick the first {lat, lon} hash.
  geo_point = geo_config.each_value.find { |v| v.is_a?(Hash) && v[:lat] && v[:lon] }
  return scope unless geo_point

  column = quoted_column(node.field.to_s)

  # ST_MakePoint takes (x, y), i.e. longitude first.
  scope.where(
    "ST_DWithin(#{column}::geography, ST_SetSRID(ST_MakePoint(?, ?), 4326)::geography, ?)",
    geo_point[:lon],
    geo_point[:lat],
    parse_distance(geo_config[:distance])
  )
rescue StandardError
  scope
end
312
+
313
# Orders +scope+ by every sort node. A direction is "DESC" only when it
# upcases to exactly "DESC"; anything else sorts "ASC". Column names are
# quoted and the final ORDER BY fragment is wrapped in Arel.sql.
def apply_sorting(scope, sort_nodes)
  return scope if sort_nodes.empty?

  fragments = sort_nodes.each_with_object([]) do |node, list|
    keyword = "ASC"
    keyword = "DESC" if node.direction.to_s.upcase == "DESC"
    list << "#{quoted_column(node.field.to_s)} #{keyword}"
  end

  scope.order(Arel.sql(fragments.join(", ")))
end
323
+
324
# Applies LIMIT/OFFSET to +scope+ from the paginate node.
#
# Defaults: page 1 and DEFAULT_LIMIT rows per page when the node (or either
# value) is missing. Values are coerced with +to_i+; the page is clamped to
# at least 1 and the page size to at least 0, so a page of 0 or below can no
# longer produce a negative OFFSET.
def apply_pagination(scope, paginate_node)
  page = [(paginate_node&.page || 1).to_i, 1].max
  per_page = [(paginate_node&.per_page || DEFAULT_LIMIT).to_i, 0].max

  scope.limit(per_page).offset((page - 1) * per_page)
end
332
+
333
+ # Response formatting
334
+
335
# Wraps +records+ in a search-response hash.
#
# Every hit gets a fixed "_score" of 1.0. The reported total is the number
# of records passed in (i.e. the size of this result set), and "max_score"
# is 1.0 whenever there is at least one hit.
def format_as_search_response(records, model)
  hit_count = records.size

  hits = records.each_with_object([]) do |record, collected|
    collected << {
      "_index" => model.table_name,
      "_id" => record.id.to_s,
      "_score" => 1.0,
      "_source" => record.as_json
    }
  end

  best_score = hits.empty? ? nil : 1.0
  shards = { "total" => 1, "successful" => 1, "skipped" => 0, "failed" => 0 }

  {
    "took" => 0,
    "timed_out" => false,
    "_shards" => shards,
    "hits" => {
      "total" => { "value" => hit_count, "relation" => "eq" },
      "max_score" => best_score,
      "hits" => hits
    }
  }
end
358
+
359
# Returns a successful search-response hash containing no hits.
def empty_response
  no_hits = {
    "total" => { "value" => 0, "relation" => "eq" },
    "max_score" => nil,
    "hits" => []
  }
  shards = { "total" => 1, "successful" => 1, "skipped" => 0, "failed" => 0 }

  { "took" => 0, "timed_out" => false, "_shards" => shards, "hits" => no_hits }
end
371
+
372
# Returns a search-response hash with no hits, one failed shard, and an
# "error" entry carrying the exception's class name and message.
def error_response(error)
  failure = { "type" => error.class.name, "reason" => error.message }
  no_hits = {
    "total" => { "value" => 0, "relation" => "eq" },
    "max_score" => nil,
    "hits" => []
  }
  shards = { "total" => 1, "successful" => 0, "skipped" => 0, "failed" => 1 }

  {
    "took" => 0,
    "timed_out" => false,
    "_shards" => shards,
    "hits" => no_hits,
    "error" => failure
  }
end
385
+
386
+ # Helper methods
387
+
388
# Picks the model class to query.
#
# An explicit +model_class+ wins. Otherwise the first entry of +indexes+ is
# looked up in @model_class_cache (when that cache has the key) and, failing
# that, turned into a constant via +classify.constantize+. Returns nil when
# there is no index name or when the lookup raises NameError.
def resolve_model(indexes, model_class = nil)
  return model_class if model_class

  name = indexes&.first
  return nil unless name

  cache = @model_class_cache
  if cache&.key?(name)
    cache[name]
  else
    # Infer the model from the index name.
    name.to_s.classify.constantize
  end
rescue NameError
  nil
end
403
+
404
# Whether the pg_trgm extension is listed by #available_extensions.
#
# The answer is memoized, including a negative one: with `||=` a false
# result was never stored, so the extension list was re-read on every call.
# Setting @trgm_available back to nil forces a fresh lookup.
def trgm_available?
  @trgm_available = available_extensions.include?("pg_trgm") if @trgm_available.nil?
  @trgm_available
end
407
+
408
# Whether the unaccent extension is listed by #available_extensions.
#
# The answer is memoized, including a negative one: with `||=` a false
# result was never stored, so the extension list was re-read on every call.
# Setting @unaccent_available back to nil forces a fresh lookup.
def unaccent_available?
  @unaccent_available = available_extensions.include?("unaccent") if @unaccent_available.nil?
  @unaccent_available
end
411
+
412
+ def text_column?(model, field)
413
+ column = model.columns_hash[field.to_s]
414
+ column && %i[string text].include?(column.type)
415
+ end
416
+
417
# Quotes +field+ as a column identifier using the ActiveRecord::Base
# connection, making it safe to interpolate into SQL fragments.
def quoted_column(field)
  connection = ActiveRecord::Base.connection
  connection.quote_column_name(field)
end
420
+
421
# Escapes the characters that are special inside a LIKE/ILIKE pattern
# (%, _ and backslash) by prefixing each with a backslash. The input is
# converted with +to_s+ first, so nil yields an empty string.
def sanitize_like(value)
  escapes = { "%" => "\\%", "_" => "\\_", "\\" => "\\\\" }
  value.to_s.gsub(/[%_\\]/, escapes)
end
425
+
426
# Ordered [pattern, metres-per-unit] pairs used by #parse_distance. Order
# matters because the patterns are unanchored prefixes: millimetres must be
# tried before miles ("millimeters" starts with "mi"), and bare "m" last.
DISTANCE_UNITS_IN_METERS = {
  "km|kilometers?" => 1000.0,
  "mm|millimeters?" => 0.001,
  "mi" => 1609.34,
  "nmi|nm|nauticalmiles" => 1852.0,
  "cm|centimeters?" => 0.01,
  "yd|yards?" => 0.9144,
  "ft|feet" => 0.3048,
  "in" => 0.0254,
  "m" => 1.0
}.map { |units, factor| [/(\d+(?:\.\d+)?)\s*(?:#{units})/i, factor] }.freeze

# Converts a distance such as "10km", "5mi" or "250 m" into metres (Float).
#
# Besides km, mi and m, the units mm, cm, yd, ft, in and nmi/NM are
# recognised; previously "10mm" was read as 10 metres and "10 millimeters"
# as 10 miles. Input without a recognised unit falls back to
# +distance.to_f+, i.e. it is taken as a number of metres.
def parse_distance(distance)
  text = distance.to_s

  DISTANCE_UNITS_IN_METERS.each do |pattern, meters_per_unit|
    match = pattern.match(text)
    return match[1].to_f * meters_per_unit if match
  end

  distance.to_f
end
439
+
440
# Executes a single bulk action hash.
#
# An +:index+ action indexes +:data+ under +:_index+/+:_id+; a +:delete+
# action deletes that document. The result is wrapped under the action name
# ("index" or "delete"). Any other action yields an "error" entry.
def process_bulk_action(action)
  if (payload = action[:index])
    { "index" => execute_index_document(payload[:_index], payload[:_id], payload[:data]) }
  elsif (payload = action[:delete])
    { "delete" => execute_delete_document(payload[:_index], payload[:_id]) }
  else
    { "error" => "Unknown action type" }
  end
end
458
+ end
459
+ end
460
+ end
461
+ end