search-engine-for-typesense 30.1.5 → 30.1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/search_engine/base/model_dsl.rb +190 -0
- data/lib/search_engine/config.rb +49 -1
- data/lib/search_engine/errors.rb +21 -0
- data/lib/search_engine/instrumentation.rb +3 -3
- data/lib/search_engine/logging_subscriber.rb +5 -2
- data/lib/search_engine/mapper.rb +61 -1
- data/lib/search_engine/observability.rb +25 -2
- data/lib/search_engine/otel.rb +18 -6
- data/lib/search_engine/relation/compiler.rb +151 -2
- data/lib/search_engine/relation/dsl/vectors.rb +324 -0
- data/lib/search_engine/relation/dsl.rb +14 -5
- data/lib/search_engine/relation/dx.rb +18 -0
- data/lib/search_engine/relation/state.rb +11 -2
- data/lib/search_engine/schema.rb +173 -24
- data/lib/search_engine/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 52454481281edd904b9d0a3b68e191b6c8499426194b37f81e47f9fdc1a62435
|
|
4
|
+
data.tar.gz: 6adbd869d14f77faabf6ea10c42bc9f141d247a713ea016e97ab4188ce31564c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '08c22e27bf1e8c039d453584c34af900c4e50b7b55efbad70a29854e8154dfa5353ccb24095f991aaa9ae38bb55cf9a799a5a183e07344a63f4b050c0cf5a23f'
|
|
7
|
+
data.tar.gz: b54156dde160147eb1463d1c108f38c15436a52fcf3958f7462f7acaa49b1b844525aa971f4c1bdd4de482962f21935463ff26c4cb332a6ecd1f90e2da43b105
|
|
data/lib/search_engine/base/model_dsl.rb
CHANGED
|
@@ -8,6 +8,8 @@ module SearchEngine
|
|
|
8
8
|
module ModelDsl
|
|
9
9
|
extend ActiveSupport::Concern
|
|
10
10
|
|
|
11
|
+
EMBEDDING_SUFFIX = '_embedding'
|
|
12
|
+
|
|
11
13
|
class_methods do
|
|
12
14
|
# Get or set the Typesense collection name for this model.
|
|
13
15
|
#
|
|
@@ -282,6 +284,183 @@ module SearchEngine
|
|
|
282
284
|
end
|
|
283
285
|
end
|
|
284
286
|
|
|
287
|
+
class_methods do
|
|
288
|
+
# Declare a vector embedding field with automatic name resolution, model
|
|
289
|
+
# inference, and source-field validation.
|
|
290
|
+
#
|
|
291
|
+
# @param name [Symbol, String, nil] field name (auto-derived when omitted)
|
|
292
|
+
# @param from [Array<Symbol>, nil] source attribute names to embed from
|
|
293
|
+
# @param suffix [Boolean] append `_embedding` to the field name (default: true)
|
|
294
|
+
# @param model [String, nil] embedding model override (per-field)
|
|
295
|
+
# @param api_key [String, nil] API key for remote embedding providers
|
|
296
|
+
# @param num_dim [Integer, nil] vector dimensions for external embeddings
|
|
297
|
+
# @param hnsw [Hash, nil] HNSW index tuning parameters
|
|
298
|
+
# @param model_config [Hash, nil] extra model_config overrides
|
|
299
|
+
# @return [void]
|
|
300
|
+
def embedding(name = nil, from: nil, suffix: true, model: nil,
|
|
301
|
+
api_key: nil, num_dim: nil, hnsw: nil, model_config: nil)
|
|
302
|
+
resolved_name = __se_resolve_embedding_name(name, from: from, suffix: suffix, num_dim: num_dim)
|
|
303
|
+
resolved_sym = resolved_name.to_sym
|
|
304
|
+
|
|
305
|
+
__se_check_embedding_duplicate!(resolved_sym)
|
|
306
|
+
|
|
307
|
+
external = from.nil? && num_dim
|
|
308
|
+
from_fields = external ? nil : __se_infer_embedding_from(resolved_name, from)
|
|
309
|
+
|
|
310
|
+
__se_validate_embedding_sources!(from_fields) if from_fields
|
|
311
|
+
|
|
312
|
+
vector_opts = {}
|
|
313
|
+
|
|
314
|
+
if external
|
|
315
|
+
vector_opts[:num_dim] = Integer(num_dim)
|
|
316
|
+
else
|
|
317
|
+
resolved_model = __se_resolve_embedding_model(model)
|
|
318
|
+
vector_opts[:embed] = __se_build_embed_hash(
|
|
319
|
+
from_fields, resolved_model,
|
|
320
|
+
api_key: api_key, model_config: model_config
|
|
321
|
+
)
|
|
322
|
+
end
|
|
323
|
+
|
|
324
|
+
vector_opts[:hnsw_params] = hnsw if hnsw
|
|
325
|
+
|
|
326
|
+
attribute(resolved_name, :vector)
|
|
327
|
+
@attribute_options ||= {}
|
|
328
|
+
@attribute_options[resolved_sym] = (@attribute_options[resolved_sym] || {}).merge(vector_opts)
|
|
329
|
+
|
|
330
|
+
__se_store_embedding_metadata!(
|
|
331
|
+
resolved_sym, from: from_fields,
|
|
332
|
+
model: external ? nil : resolved_model,
|
|
333
|
+
external: external ? true : false,
|
|
334
|
+
num_dim: num_dim
|
|
335
|
+
)
|
|
336
|
+
end
|
|
337
|
+
|
|
338
|
+
private
|
|
339
|
+
|
|
340
|
+
# Resolve the canonical field name for an embedding declaration.
|
|
341
|
+
# @return [String]
|
|
342
|
+
def __se_resolve_embedding_name(name, from:, suffix:, num_dim:)
|
|
343
|
+
if name.nil?
|
|
344
|
+
if from
|
|
345
|
+
"#{self.name.demodulize.underscore}#{EMBEDDING_SUFFIX}"
|
|
346
|
+
elsif num_dim
|
|
347
|
+
raise ArgumentError,
|
|
348
|
+
'External embedding (num_dim: without from:) requires an explicit field name'
|
|
349
|
+
else
|
|
350
|
+
raise ArgumentError,
|
|
351
|
+
'embedding requires at least one of: a field name, from: sources, or num_dim: for external vectors'
|
|
352
|
+
end
|
|
353
|
+
else
|
|
354
|
+
n = name.to_s
|
|
355
|
+
if suffix && !n.end_with?(EMBEDDING_SUFFIX)
|
|
356
|
+
"#{n}#{EMBEDDING_SUFFIX}"
|
|
357
|
+
else
|
|
358
|
+
n
|
|
359
|
+
end
|
|
360
|
+
end
|
|
361
|
+
end
|
|
362
|
+
|
|
363
|
+
# Infer `from:` source fields when not explicitly provided.
|
|
364
|
+
# @return [Array<Symbol>]
|
|
365
|
+
def __se_infer_embedding_from(resolved_name, from)
|
|
366
|
+
if from
|
|
367
|
+
Array(from).map(&:to_sym)
|
|
368
|
+
else
|
|
369
|
+
bare = resolved_name.to_s.delete_suffix(EMBEDDING_SUFFIX)
|
|
370
|
+
if bare.empty?
|
|
371
|
+
raise ArgumentError,
|
|
372
|
+
"Cannot infer from: for embedding '#{resolved_name}'; provide from: explicitly"
|
|
373
|
+
end
|
|
374
|
+
|
|
375
|
+
[bare.to_sym]
|
|
376
|
+
end
|
|
377
|
+
end
|
|
378
|
+
|
|
379
|
+
# Validate that all source fields exist and are string-typed.
|
|
380
|
+
# @raise [ArgumentError]
|
|
381
|
+
def __se_validate_embedding_sources!(from_fields)
|
|
382
|
+
attrs = @attributes || {}
|
|
383
|
+
from_fields.each do |field|
|
|
384
|
+
unless attrs.key?(field)
|
|
385
|
+
raise ArgumentError,
|
|
386
|
+
"embedding from: references undeclared attribute :#{field}. " \
|
|
387
|
+
'Declare it with `attribute` before the `embedding` call.'
|
|
388
|
+
end
|
|
389
|
+
|
|
390
|
+
ts_type = __se_typesense_type_for(attrs[field])
|
|
391
|
+
next if %w[string string[]].include?(ts_type)
|
|
392
|
+
|
|
393
|
+
raise ArgumentError,
|
|
394
|
+
"embedding from: field :#{field} must be string-typed " \
|
|
395
|
+
"(got :#{attrs[field]} -> \"#{ts_type}\"). " \
|
|
396
|
+
'Typesense only auto-embeds text fields.'
|
|
397
|
+
end
|
|
398
|
+
end
|
|
399
|
+
|
|
400
|
+
# Resolve the embedding model with fallback to global config.
|
|
401
|
+
# @return [String]
|
|
402
|
+
# @raise [SearchEngine::Errors::ConfigurationError]
|
|
403
|
+
def __se_resolve_embedding_model(per_field_model)
|
|
404
|
+
return per_field_model if per_field_model && !per_field_model.to_s.strip.empty?
|
|
405
|
+
|
|
406
|
+
global = SearchEngine.config.embedding.model
|
|
407
|
+
return global if global && !global.to_s.strip.empty?
|
|
408
|
+
|
|
409
|
+
raise SearchEngine::Errors::ConfigurationError.new(
|
|
410
|
+
'No embedding model configured. Set `model:` on the embedding declaration ' \
|
|
411
|
+
'or set `SearchEngine.config.embedding.model` globally.',
|
|
412
|
+
hint: "Add `config.embedding.model = 'ts/all-MiniLM-L12-v2'` to your SearchEngine initializer.",
|
|
413
|
+
doc: 'https://typesense.org/docs/30.1/api/vector-search.html#option-b-auto-embedding-generation-within-typesense'
|
|
414
|
+
)
|
|
415
|
+
end
|
|
416
|
+
|
|
417
|
+
# Build the Typesense `embed` hash for auto-embedding fields.
|
|
418
|
+
# @return [Hash]
|
|
419
|
+
def __se_build_embed_hash(from_fields, model_name, api_key: nil, model_config: nil)
|
|
420
|
+
mc = {}
|
|
421
|
+
|
|
422
|
+
global_mc = SearchEngine.config.embedding.model_config
|
|
423
|
+
mc.merge!(global_mc) if global_mc.is_a?(Hash)
|
|
424
|
+
mc.merge!(model_config) if model_config.is_a?(Hash)
|
|
425
|
+
mc[:model_name] = model_name
|
|
426
|
+
|
|
427
|
+
resolved_api_key = api_key || SearchEngine.config.embedding.api_key
|
|
428
|
+
mc[:api_key] = resolved_api_key if resolved_api_key && !resolved_api_key.to_s.strip.empty?
|
|
429
|
+
|
|
430
|
+
{ from: from_fields.map(&:to_s), model_config: mc }
|
|
431
|
+
end
|
|
432
|
+
|
|
433
|
+
# Raise on duplicate embedding field names.
|
|
434
|
+
def __se_check_embedding_duplicate!(resolved_sym)
|
|
435
|
+
return unless (@embeddings_config || {}).key?(resolved_sym)
|
|
436
|
+
|
|
437
|
+
raise ArgumentError, "Duplicate embedding field :#{resolved_sym} already declared"
|
|
438
|
+
end
|
|
439
|
+
|
|
440
|
+
# Store embedding metadata for downstream consumers (mapper, indexer, compiler).
|
|
441
|
+
def __se_store_embedding_metadata!(resolved_sym, from:, model:, external:, num_dim:)
|
|
442
|
+
@embeddings_config ||= {}
|
|
443
|
+
@embeddings_config[resolved_sym] = {
|
|
444
|
+
field_name: resolved_sym.to_s,
|
|
445
|
+
from: from,
|
|
446
|
+
model: model,
|
|
447
|
+
external: external,
|
|
448
|
+
num_dim: num_dim
|
|
449
|
+
}.compact
|
|
450
|
+
end
|
|
451
|
+
|
|
452
|
+
# Minimal type resolution for validation (mirrors Schema.typesense_type_for).
|
|
453
|
+
def __se_typesense_type_for(type_descriptor)
|
|
454
|
+
if type_descriptor.is_a?(Array) && type_descriptor.size == 1
|
|
455
|
+
inner = type_descriptor.first
|
|
456
|
+
mapped = SearchEngine::Schema::TYPE_MAPPING[inner.to_s.downcase.to_sym] || inner.to_s
|
|
457
|
+
return "#{mapped}[]"
|
|
458
|
+
end
|
|
459
|
+
|
|
460
|
+
SearchEngine::Schema::TYPE_MAPPING[type_descriptor.to_s.downcase.to_sym] || type_descriptor.to_s
|
|
461
|
+
end
|
|
462
|
+
end
|
|
463
|
+
|
|
285
464
|
class_methods do
|
|
286
465
|
# Validate whether an attribute name is a valid Ruby reader method name
|
|
287
466
|
# (skip dotted names and other invalid identifiers).
|
|
@@ -415,6 +594,14 @@ module SearchEngine
|
|
|
415
594
|
end
|
|
416
595
|
end
|
|
417
596
|
|
|
597
|
+
class_methods do
|
|
598
|
+
# Read-only view of declared embedding metadata for this class.
|
|
599
|
+
# @return [Hash{Symbol=>Hash}] frozen hash keyed by embedding field name
|
|
600
|
+
def embeddings_config
|
|
601
|
+
(@embeddings_config || {}).dup.freeze
|
|
602
|
+
end
|
|
603
|
+
end
|
|
604
|
+
|
|
418
605
|
class_methods do
|
|
419
606
|
# Configure schema retention policy for this collection.
|
|
420
607
|
# @param keep_last [Integer] how many previous physicals to keep after swap
|
|
@@ -444,6 +631,9 @@ module SearchEngine
|
|
|
444
631
|
parent_retention = @schema_retention || {}
|
|
445
632
|
subclass.instance_variable_set(:@schema_retention, parent_retention.dup)
|
|
446
633
|
|
|
634
|
+
parent_embeddings = @embeddings_config || {}
|
|
635
|
+
subclass.instance_variable_set(:@embeddings_config, parent_embeddings.dup)
|
|
636
|
+
|
|
447
637
|
parent_joins = @joins_config || {}
|
|
448
638
|
subclass.instance_variable_set(:@joins_config, parent_joins.dup.freeze)
|
|
449
639
|
|
data/lib/search_engine/config.rb
CHANGED
|
@@ -251,6 +251,10 @@ module SearchEngine
|
|
|
251
251
|
attr_accessor :include_error_messages
|
|
252
252
|
# @return [Boolean] also emit legacy event aliases where applicable
|
|
253
253
|
attr_accessor :emit_legacy_event_aliases
|
|
254
|
+
# @return [Boolean] when true (default), float arrays inside +vector_query+
|
|
255
|
+
# strings are replaced with +[<N dims>]+ in logs and telemetry payloads.
|
|
256
|
+
# Set to +false+ to see raw vectors for debugging.
|
|
257
|
+
attr_accessor :redact_vectors
|
|
254
258
|
|
|
255
259
|
def initialize
|
|
256
260
|
super()
|
|
@@ -260,6 +264,7 @@ module SearchEngine
|
|
|
260
264
|
@max_message_length = 200
|
|
261
265
|
@include_error_messages = false
|
|
262
266
|
@emit_legacy_event_aliases = true
|
|
267
|
+
@redact_vectors = true
|
|
263
268
|
end
|
|
264
269
|
end
|
|
265
270
|
|
|
@@ -321,6 +326,30 @@ module SearchEngine
|
|
|
321
326
|
end
|
|
322
327
|
end
|
|
323
328
|
|
|
329
|
+
# Lightweight nested configuration for global embedding defaults.
|
|
330
|
+
# Provides a default model, optional API key, and extra model_config
|
|
331
|
+
# used by the Schema DSL when compiling auto-embedding fields.
|
|
332
|
+
class EmbeddingConfig
|
|
333
|
+
# @return [String, nil] default embedding model name (e.g. "ts/all-MiniLM-L12-v2")
|
|
334
|
+
attr_accessor :model
|
|
335
|
+
# @return [String, nil] API key for remote embedding providers (e.g. OpenAI)
|
|
336
|
+
attr_accessor :api_key
|
|
337
|
+
# @return [Hash, nil] extra model_config passed to Typesense embed block
|
|
338
|
+
attr_accessor :model_config
|
|
339
|
+
# @return [Float] tolerance for vector search weights sum validation.
|
|
340
|
+
# Weights must sum to ~1.0 within this tolerance. With many small weights
|
|
341
|
+
# floating-point drift can exceed the default. Adjust via
|
|
342
|
+
# +config.embedding.weights_sum_tolerance = 0.05+.
|
|
343
|
+
attr_accessor :weights_sum_tolerance
|
|
344
|
+
|
|
345
|
+
def initialize
|
|
346
|
+
@model = nil
|
|
347
|
+
@api_key = nil
|
|
348
|
+
@model_config = nil
|
|
349
|
+
@weights_sum_tolerance = 0.01
|
|
350
|
+
end
|
|
351
|
+
end
|
|
352
|
+
|
|
324
353
|
# Create a new configuration with defaults, optionally hydrated from ENV.
|
|
325
354
|
#
|
|
326
355
|
# @param env [#[]] environment-like object (defaults to ::ENV)
|
|
@@ -360,6 +389,7 @@ module SearchEngine
|
|
|
360
389
|
@selection = SelectionConfig.new
|
|
361
390
|
@presets = PresetsConfig.new
|
|
362
391
|
@curation = CurationConfig.new
|
|
392
|
+
@embedding = EmbeddingConfig.new
|
|
363
393
|
@default_console_model = nil
|
|
364
394
|
# Path may be relative to Rails.root or absolute. Set nil/false to disable.
|
|
365
395
|
@search_engine_models = 'app/search_engine'
|
|
@@ -438,6 +468,12 @@ module SearchEngine
|
|
|
438
468
|
@curation ||= CurationConfig.new
|
|
439
469
|
end
|
|
440
470
|
|
|
471
|
+
# Expose global embedding configuration.
|
|
472
|
+
# @return [SearchEngine::Config::EmbeddingConfig]
|
|
473
|
+
def embedding
|
|
474
|
+
@embedding ||= EmbeddingConfig.new
|
|
475
|
+
end
|
|
476
|
+
|
|
441
477
|
# Expose observability/logging configuration.
|
|
442
478
|
# @return [SearchEngine::Config::ObservabilityConfig]
|
|
443
479
|
def observability
|
|
@@ -657,6 +693,7 @@ module SearchEngine
|
|
|
657
693
|
selection: selection_hash_for_to_h,
|
|
658
694
|
presets: presets_hash_for_to_h,
|
|
659
695
|
curation: curation_hash_for_to_h,
|
|
696
|
+
embedding: embedding_hash_for_to_h,
|
|
660
697
|
relation_print_materializes: relation_print_materializes ? true : false
|
|
661
698
|
}
|
|
662
699
|
end
|
|
@@ -666,6 +703,7 @@ module SearchEngine
|
|
|
666
703
|
def to_h_redacted
|
|
667
704
|
redacted = to_h.dup
|
|
668
705
|
redacted[:api_key] = '[REDACTED]' unless string_blank?(api_key)
|
|
706
|
+
redacted[:embedding] = redacted[:embedding].merge(api_key: '[REDACTED]') unless string_blank?(embedding.api_key)
|
|
669
707
|
redacted
|
|
670
708
|
end
|
|
671
709
|
|
|
@@ -726,7 +764,8 @@ module SearchEngine
|
|
|
726
764
|
log_format: observability.log_format,
|
|
727
765
|
max_message_length: observability.max_message_length,
|
|
728
766
|
include_error_messages: observability.include_error_messages ? true : false,
|
|
729
|
-
emit_legacy_event_aliases: observability.emit_legacy_event_aliases ? true : false
|
|
767
|
+
emit_legacy_event_aliases: observability.emit_legacy_event_aliases ? true : false,
|
|
768
|
+
redact_vectors: observability.redact_vectors ? true : false
|
|
730
769
|
}
|
|
731
770
|
end
|
|
732
771
|
|
|
@@ -752,6 +791,15 @@ module SearchEngine
|
|
|
752
791
|
}
|
|
753
792
|
end
|
|
754
793
|
|
|
794
|
+
def embedding_hash_for_to_h
|
|
795
|
+
{
|
|
796
|
+
model: embedding.model,
|
|
797
|
+
api_key: embedding.api_key,
|
|
798
|
+
model_config: embedding.model_config,
|
|
799
|
+
weights_sum_tolerance: embedding.weights_sum_tolerance
|
|
800
|
+
}
|
|
801
|
+
end
|
|
802
|
+
|
|
755
803
|
def default_strict_fields
|
|
756
804
|
if defined?(::Rails)
|
|
757
805
|
!::Rails.env.production?
|
data/lib/search_engine/errors.rb
CHANGED
|
@@ -259,6 +259,12 @@ module SearchEngine
|
|
|
259
259
|
# raise SearchEngine::Errors::InvalidOverrideTag, 'InvalidOverrideTag: "" is invalid. Use non-blank strings that match the allowed pattern.'
|
|
260
260
|
class InvalidOverrideTag < Error; end
|
|
261
261
|
|
|
262
|
+
# Raised when a required configuration value is missing or invalid.
|
|
263
|
+
#
|
|
264
|
+
# Typical cause: embedding model not set in either the per-field `model:`
|
|
265
|
+
# kwarg or the global `SearchEngine.config.embedding.model`.
|
|
266
|
+
class ConfigurationError < Error; end
|
|
267
|
+
|
|
262
268
|
# Raised when an option value is invalid or unsupported for a public API.
|
|
263
269
|
#
|
|
264
270
|
# Used by DSL methods to fail fast with actionable hints.
|
|
@@ -286,5 +292,20 @@ module SearchEngine
|
|
|
286
292
|
# details: { total_hits: 12_000, max: 10_000, collection: 'products' }
|
|
287
293
|
# )
|
|
288
294
|
class HitLimitExceeded < Error; end
|
|
295
|
+
|
|
296
|
+
# Raised when vector search DSL receives invalid inputs.
|
|
297
|
+
#
|
|
298
|
+
# Typical causes: unknown embedding field, mutually exclusive query modes,
|
|
299
|
+
# invalid alpha range, or malformed vector arrays.
|
|
300
|
+
#
|
|
301
|
+
# @example
|
|
302
|
+
# raise SearchEngine::Errors::InvalidVectorQuery.new(
|
|
303
|
+
# 'InvalidVectorQuery: query: and id: are mutually exclusive',
|
|
304
|
+
# hint: 'Provide only one of query:, id:, or queries:',
|
|
305
|
+
# doc: 'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/v30.1/vector-search'
|
|
306
|
+
# )
|
|
307
|
+
class InvalidVectorQuery < Error
|
|
308
|
+
DOC_URL = 'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/v30.1/vector-search'
|
|
309
|
+
end
|
|
289
310
|
end
|
|
290
311
|
end
|
|
data/lib/search_engine/instrumentation.rb
CHANGED
|
@@ -38,8 +38,8 @@ module SearchEngine
|
|
|
38
38
|
optional: %i[duration_ms]
|
|
39
39
|
},
|
|
40
40
|
'search_engine.grouping.compile' => {
|
|
41
|
-
required: %i[
|
|
42
|
-
optional: %i[collection
|
|
41
|
+
required: %i[group_by],
|
|
42
|
+
optional: %i[collection group_limit missing_values duration_ms]
|
|
43
43
|
},
|
|
44
44
|
'search_engine.joins.compile' => {
|
|
45
45
|
required: %i[collection],
|
|
@@ -116,7 +116,7 @@ module SearchEngine
|
|
|
116
116
|
},
|
|
117
117
|
'search_engine.vector.compile' => {
|
|
118
118
|
required: %i[],
|
|
119
|
-
optional: %i[collection query_vector_present dims hybrid_weight ann_params_present duration_ms]
|
|
119
|
+
optional: %i[collection field mode k query_vector_present dims hybrid_weight ann_params_present duration_ms]
|
|
120
120
|
},
|
|
121
121
|
'search_engine.hits.limit' => {
|
|
122
122
|
required: %i[],
|
|
data/lib/search_engine/logging_subscriber.rb
CHANGED
|
@@ -294,9 +294,12 @@ module SearchEngine
|
|
|
294
294
|
parts << "[#{short}]"
|
|
295
295
|
parts << "id=#{cid}"
|
|
296
296
|
parts << "coll=#{collection}" if collection
|
|
297
|
+
parts << "mode=#{p[:mode] || SearchEngine::Logging::FormatHelpers::DASH}"
|
|
298
|
+
parts << "field=#{p[:field] || SearchEngine::Logging::FormatHelpers::DASH}"
|
|
299
|
+
parts << "k=#{p[:k] || SearchEngine::Logging::FormatHelpers::DASH}"
|
|
297
300
|
parts << "qvec=#{SearchEngine::Logging::FormatHelpers.display_or_dash(p, :query_vector_present)}"
|
|
298
301
|
parts << "dims=#{p[:dims] || SearchEngine::Logging::FormatHelpers::DASH}"
|
|
299
|
-
parts << "
|
|
302
|
+
parts << "alpha=#{p[:hybrid_weight] || SearchEngine::Logging::FormatHelpers::DASH}"
|
|
300
303
|
parts << "ann=#{SearchEngine::Logging::FormatHelpers.display_or_dash(p, :ann_params_present)}"
|
|
301
304
|
parts << "dur=#{duration}ms"
|
|
302
305
|
parts.join(' ')
|
|
@@ -336,7 +339,7 @@ module SearchEngine
|
|
|
336
339
|
h['radius_bucket'] = p[:radius_bucket] if p.key?(:radius_bucket)
|
|
337
340
|
h
|
|
338
341
|
when 'search_engine.vector.compile'
|
|
339
|
-
keys = %i[query_vector_present dims hybrid_weight ann_params_present]
|
|
342
|
+
keys = %i[field mode k query_vector_present dims hybrid_weight ann_params_present]
|
|
340
343
|
keys.each_with_object({}) { |k, h| h[k.to_s] = p[k] if p.key?(k) }
|
|
341
344
|
when 'search_engine.hits.limit'
|
|
342
345
|
keys = %i[early_limit validate_max applied_strategy triggered total_hits]
|
data/lib/search_engine/mapper.rb
CHANGED
|
@@ -320,11 +320,15 @@ module SearchEngine
|
|
|
320
320
|
class Compiled
|
|
321
321
|
attr_reader :klass
|
|
322
322
|
|
|
323
|
-
def initialize(klass:, map_proc:, schema_fields:, types_by_field:,
|
|
323
|
+
def initialize(klass:, map_proc:, schema_fields:, types_by_field:,
|
|
324
|
+
auto_embedding_fields: Set.new, external_embedding_dims: {},
|
|
325
|
+
options: {})
|
|
324
326
|
@klass = klass
|
|
325
327
|
@map_proc = map_proc
|
|
326
328
|
@schema_fields = schema_fields.freeze # Array of field names (String)
|
|
327
329
|
@types_by_field = types_by_field.freeze # { "field" => "int64" }
|
|
330
|
+
@auto_embedding_fields = auto_embedding_fields.freeze # Set<String> — server-generated fields
|
|
331
|
+
@external_embedding_dims = external_embedding_dims.freeze # { "field" => num_dim }
|
|
328
332
|
# Allow all schema fields; treat required as schema fields minus optional attributes
|
|
329
333
|
@allowed_keys = @schema_fields.map(&:to_sym).to_set.freeze
|
|
330
334
|
@required_keys = compute_required_keys
|
|
@@ -360,6 +364,7 @@ module SearchEngine
|
|
|
360
364
|
hash[:id] = computed_id
|
|
361
365
|
hash[:doc_updated_at] = now_i
|
|
362
366
|
|
|
367
|
+
strip_auto_embedding_fields!(hash)
|
|
363
368
|
normalize_optional_blank_strings!(hash)
|
|
364
369
|
|
|
365
370
|
# Populate hidden flags
|
|
@@ -458,6 +463,17 @@ module SearchEngine
|
|
|
458
463
|
end
|
|
459
464
|
end
|
|
460
465
|
|
|
466
|
+
# Remove auto-embedding fields from the document payload.
|
|
467
|
+
# Typesense generates these server-side; including them would cause errors.
|
|
468
|
+
def strip_auto_embedding_fields!(doc)
|
|
469
|
+
return if @auto_embedding_fields.empty?
|
|
470
|
+
|
|
471
|
+
@auto_embedding_fields.each do |name|
|
|
472
|
+
doc.delete(name.to_sym)
|
|
473
|
+
doc.delete(name.to_s)
|
|
474
|
+
end
|
|
475
|
+
end
|
|
476
|
+
|
|
461
477
|
# Normalize empty-string values for optional fields to nil.
|
|
462
478
|
def normalize_optional_blank_strings!(doc)
|
|
463
479
|
return if @__optional_blank_targets__.empty?
|
|
@@ -554,6 +570,10 @@ module SearchEngine
|
|
|
554
570
|
|
|
555
571
|
required.delete(fname.to_sym)
|
|
556
572
|
end
|
|
573
|
+
|
|
574
|
+
# Auto-embedding fields are generated server-side; never require them in documents
|
|
575
|
+
@auto_embedding_fields.each { |name| required.delete(name.to_sym) }
|
|
576
|
+
|
|
557
577
|
required.freeze
|
|
558
578
|
end
|
|
559
579
|
|
|
@@ -593,6 +613,8 @@ module SearchEngine
|
|
|
593
613
|
return [true, nil, nil] if value.is_a?(Array) && value.all? { |v| v.is_a?(String) }
|
|
594
614
|
|
|
595
615
|
[false, nil, invalid_type_message(field, 'Array<String>', value)]
|
|
616
|
+
when 'float[]'
|
|
617
|
+
validate_float_array(value, field)
|
|
596
618
|
else
|
|
597
619
|
# Unknown/opaque type: accept
|
|
598
620
|
[true, nil, nil]
|
|
@@ -634,6 +656,20 @@ module SearchEngine
|
|
|
634
656
|
end
|
|
635
657
|
end
|
|
636
658
|
|
|
659
|
+
def validate_float_array(value, field)
|
|
660
|
+
unless value.is_a?(Array) && value.all? { |v| v.is_a?(Numeric) }
|
|
661
|
+
return [false, nil, invalid_type_message(field, 'Array<Float>', value)]
|
|
662
|
+
end
|
|
663
|
+
|
|
664
|
+
expected_dim = @external_embedding_dims[field]
|
|
665
|
+
if expected_dim && value.size != expected_dim
|
|
666
|
+
msg = "Dimension mismatch for field :#{field} (expected #{expected_dim}, got #{value.size})."
|
|
667
|
+
return [false, nil, msg]
|
|
668
|
+
end
|
|
669
|
+
|
|
670
|
+
[true, nil, nil]
|
|
671
|
+
end
|
|
672
|
+
|
|
637
673
|
def string_integer?(v)
|
|
638
674
|
v.is_a?(String) && v.match?(/^[-+]?\d+$/)
|
|
639
675
|
end
|
|
@@ -749,6 +785,8 @@ module SearchEngine
|
|
|
749
785
|
types_by_field[f[:name].to_s] = f[:type].to_s
|
|
750
786
|
end
|
|
751
787
|
|
|
788
|
+
auto_embedding_fields, external_embedding_dims = partition_embedding_fields(klass)
|
|
789
|
+
|
|
752
790
|
mapper_cfg = SearchEngine.config&.mapper
|
|
753
791
|
coercions_cfg = mapper_cfg&.coercions || {}
|
|
754
792
|
options = {
|
|
@@ -763,10 +801,32 @@ module SearchEngine
|
|
|
763
801
|
map_proc: dsl[:map],
|
|
764
802
|
schema_fields: fields,
|
|
765
803
|
types_by_field: types_by_field,
|
|
804
|
+
auto_embedding_fields: auto_embedding_fields,
|
|
805
|
+
external_embedding_dims: external_embedding_dims,
|
|
766
806
|
options: options
|
|
767
807
|
)
|
|
768
808
|
end
|
|
769
809
|
|
|
810
|
+
# Partition embeddings_config into auto-embedding field names and
|
|
811
|
+
# external embedding field-name-to-dimension mapping.
|
|
812
|
+
# @return [Array(Set<String>, Hash{String=>Integer})]
|
|
813
|
+
def partition_embedding_fields(klass)
|
|
814
|
+
embeddings = klass.respond_to?(:embeddings_config) ? klass.embeddings_config : {}
|
|
815
|
+
auto_fields = Set.new
|
|
816
|
+
external_dims = {}
|
|
817
|
+
|
|
818
|
+
embeddings.each do |sym, meta|
|
|
819
|
+
name = (meta[:field_name] || sym).to_s
|
|
820
|
+
if meta[:external]
|
|
821
|
+
external_dims[name] = meta[:num_dim].to_i if meta[:num_dim]
|
|
822
|
+
else
|
|
823
|
+
auto_fields << name
|
|
824
|
+
end
|
|
825
|
+
end
|
|
826
|
+
|
|
827
|
+
[auto_fields, external_dims]
|
|
828
|
+
end
|
|
829
|
+
|
|
770
830
|
def mapper_dsl_for(klass)
|
|
771
831
|
return unless klass.instance_variable_defined?(:@__mapper_dsl__)
|
|
772
832
|
|
|
data/lib/search_engine/observability.rb
CHANGED
|
@@ -12,9 +12,11 @@ module SearchEngine
|
|
|
12
12
|
|
|
13
13
|
# Whitelisted search parameter keys to include in payload excerpts.
|
|
14
14
|
PARAM_WHITELIST = %i[
|
|
15
|
-
q query_by include_fields exclude_fields per_page page infix filter_by
|
|
15
|
+
q query_by include_fields exclude_fields per_page page infix filter_by sort_by
|
|
16
|
+
group_by group_limit group_missing_values
|
|
16
17
|
facet_by max_facet_values facet_query
|
|
17
18
|
num_typos drop_tokens_threshold prioritize_exact_match query_by_weights
|
|
19
|
+
vector_query
|
|
18
20
|
].freeze
|
|
19
21
|
|
|
20
22
|
# Maximum length for `q` values before truncation.
|
|
@@ -58,6 +60,8 @@ module SearchEngine
|
|
|
58
60
|
result[:q] = truncate_q(val)
|
|
59
61
|
when :filter_by
|
|
60
62
|
result[:filter_by] = redact_filter_by(val)
|
|
63
|
+
when :vector_query
|
|
64
|
+
result[:vector_query] = redact_vector_query(val)
|
|
61
65
|
else
|
|
62
66
|
result[key] = redact_simple_value(val)
|
|
63
67
|
end
|
|
@@ -106,6 +110,25 @@ module SearchEngine
|
|
|
106
110
|
masked.gsub(/\b\d+(?:\.\d+)?\b/, '***')
|
|
107
111
|
end
|
|
108
112
|
|
|
113
|
+
# Internal: Redact raw float arrays in a vector_query string while
|
|
114
|
+
# preserving the structural tokens (field name, k, alpha, etc.).
|
|
115
|
+
# Replaces `[0.1,0.2,...]` with `[<N dims>]`.
|
|
116
|
+
#
|
|
117
|
+
# Disable via +config.observability.redact_vectors = false+ to see
|
|
118
|
+
# raw vectors (useful for debugging).
|
|
119
|
+
#
|
|
120
|
+
# @param vq [String]
|
|
121
|
+
# @return [String]
|
|
122
|
+
def self.redact_vector_query(vq)
|
|
123
|
+
return vq unless vq.is_a?(String)
|
|
124
|
+
return vq if SearchEngine.config.observability.redact_vectors == false
|
|
125
|
+
|
|
126
|
+
vq.gsub(/\[(-?\d+(?:\.\d+)?(?:\s*,\s*-?\d+(?:\.\d+?))*)\]/) do
|
|
127
|
+
dims = Regexp.last_match(1).split(',').size
|
|
128
|
+
"[<#{dims} dims>]"
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
|
|
109
132
|
# Build a filtered URL/common options hash for payloads.
|
|
110
133
|
# @param url_opts [Hash]
|
|
111
134
|
# @return [Hash]
|
|
@@ -157,6 +180,6 @@ module SearchEngine
|
|
|
157
180
|
end
|
|
158
181
|
|
|
159
182
|
private_class_method :redact_params_hash, :redact_simple_value, :truncate_q,
|
|
160
|
-
:redact_string, :redact_filter_by
|
|
183
|
+
:redact_string, :redact_filter_by, :redact_vector_query
|
|
161
184
|
end
|
|
162
185
|
end
|
data/lib/search_engine/otel.rb
CHANGED
|
@@ -130,8 +130,8 @@ module SearchEngine
|
|
|
130
130
|
assign_attr(span, 'se.node_count', p[:node_count]) if p.key?(:node_count)
|
|
131
131
|
assign_attr(span, 'se.join_count', p[:join_count]) if p.key?(:join_count)
|
|
132
132
|
assign_attr(span, 'se.groups_count', p[:groups_count]) if p.key?(:groups_count)
|
|
133
|
-
assign_attr(span, 'se.group_by', p[:
|
|
134
|
-
assign_attr(span, 'se.group_limit', p[:
|
|
133
|
+
assign_attr(span, 'se.group_by', p[:group_by]) if p.key?(:group_by)
|
|
134
|
+
assign_attr(span, 'se.group_limit', p[:group_limit]) if p.key?(:group_limit)
|
|
135
135
|
return unless p.key?(:missing_values) || p.key?(:group_missing_values)
|
|
136
136
|
|
|
137
137
|
assign_attr(span, 'se.group_missing_values', p[:missing_values] || p[:group_missing_values])
|
|
@@ -142,7 +142,14 @@ module SearchEngine
|
|
|
142
142
|
deleted_count searches_count fields_changed_count added_count removed_count in_sync].each do |k|
|
|
143
143
|
assign_attr(span, "se.#{k}", p[k]) if p.key?(k)
|
|
144
144
|
end
|
|
145
|
-
|
|
145
|
+
apply_feature_detail_attributes(span, p)
|
|
146
|
+
apply_vector_attributes(span, p)
|
|
147
|
+
%i[early_limit validate_max applied_strategy triggered total_hits].each do |k|
|
|
148
|
+
assign_attr(span, "se.#{k}", p[k]) if p.key?(k)
|
|
149
|
+
end
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
def apply_feature_detail_attributes(span, p)
|
|
146
153
|
%i[fields_count queries_count max_facet_values sort_flags conflicts].each do |k|
|
|
147
154
|
assign_attr(span, "se.#{k}", p[k]) if p.key?(k)
|
|
148
155
|
end
|
|
@@ -161,12 +168,17 @@ module SearchEngine
|
|
|
161
168
|
%i[sort_mode radius_bucket].each do |k|
|
|
162
169
|
assign_attr(span, "se.#{k}", p[k]) if p.key?(k)
|
|
163
170
|
end
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
def apply_vector_attributes(span, p)
|
|
174
|
+
return unless p.key?(:query_vector_present)
|
|
175
|
+
|
|
176
|
+
assign_attr(span, 'se.vector.field', p[:field]) if p.key?(:field)
|
|
177
|
+
assign_attr(span, 'se.vector.mode', p[:mode]&.to_s) if p.key?(:mode)
|
|
178
|
+
assign_attr(span, 'se.vector.k', p[:k]) if p.key?(:k)
|
|
164
179
|
%i[query_vector_present dims hybrid_weight ann_params_present].each do |k|
|
|
165
180
|
assign_attr(span, "se.#{k}", p[k]) if p.key?(k)
|
|
166
181
|
end
|
|
167
|
-
%i[early_limit validate_max applied_strategy triggered total_hits].each do |k|
|
|
168
|
-
assign_attr(span, "se.#{k}", p[k]) if p.key?(k)
|
|
169
|
-
end
|
|
170
182
|
end
|
|
171
183
|
|
|
172
184
|
def apply_params_preview(span, payload)
|