search-engine-for-typesense 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +148 -0
  4. data/app/search_engine/search_engine/app_info.rb +11 -0
  5. data/app/search_engine/search_engine/index_partition_job.rb +170 -0
  6. data/lib/generators/search_engine/install/install_generator.rb +20 -0
  7. data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
  8. data/lib/generators/search_engine/model/model_generator.rb +86 -0
  9. data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
  10. data/lib/search-engine-for-typesense.rb +12 -0
  11. data/lib/search_engine/active_record_syncable.rb +247 -0
  12. data/lib/search_engine/admin/stopwords.rb +125 -0
  13. data/lib/search_engine/admin/synonyms.rb +125 -0
  14. data/lib/search_engine/admin.rb +12 -0
  15. data/lib/search_engine/ast/and.rb +52 -0
  16. data/lib/search_engine/ast/binary_op.rb +75 -0
  17. data/lib/search_engine/ast/eq.rb +19 -0
  18. data/lib/search_engine/ast/group.rb +18 -0
  19. data/lib/search_engine/ast/gt.rb +12 -0
  20. data/lib/search_engine/ast/gte.rb +12 -0
  21. data/lib/search_engine/ast/in.rb +28 -0
  22. data/lib/search_engine/ast/lt.rb +12 -0
  23. data/lib/search_engine/ast/lte.rb +12 -0
  24. data/lib/search_engine/ast/matches.rb +55 -0
  25. data/lib/search_engine/ast/node.rb +176 -0
  26. data/lib/search_engine/ast/not_eq.rb +13 -0
  27. data/lib/search_engine/ast/not_in.rb +24 -0
  28. data/lib/search_engine/ast/or.rb +52 -0
  29. data/lib/search_engine/ast/prefix.rb +51 -0
  30. data/lib/search_engine/ast/raw.rb +41 -0
  31. data/lib/search_engine/ast/unary_op.rb +43 -0
  32. data/lib/search_engine/ast.rb +101 -0
  33. data/lib/search_engine/base/creation.rb +727 -0
  34. data/lib/search_engine/base/deletion.rb +80 -0
  35. data/lib/search_engine/base/display_coercions.rb +36 -0
  36. data/lib/search_engine/base/hydration.rb +312 -0
  37. data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
  38. data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
  39. data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
  40. data/lib/search_engine/base/index_maintenance.rb +459 -0
  41. data/lib/search_engine/base/indexing_dsl.rb +255 -0
  42. data/lib/search_engine/base/joins.rb +479 -0
  43. data/lib/search_engine/base/model_dsl.rb +472 -0
  44. data/lib/search_engine/base/presets.rb +43 -0
  45. data/lib/search_engine/base/pretty_printer.rb +315 -0
  46. data/lib/search_engine/base/relation_delegation.rb +42 -0
  47. data/lib/search_engine/base/scopes.rb +113 -0
  48. data/lib/search_engine/base/updating.rb +92 -0
  49. data/lib/search_engine/base.rb +38 -0
  50. data/lib/search_engine/bulk.rb +284 -0
  51. data/lib/search_engine/cache.rb +33 -0
  52. data/lib/search_engine/cascade.rb +531 -0
  53. data/lib/search_engine/cli/doctor.rb +631 -0
  54. data/lib/search_engine/cli/support.rb +217 -0
  55. data/lib/search_engine/cli.rb +222 -0
  56. data/lib/search_engine/client/http_adapter.rb +63 -0
  57. data/lib/search_engine/client/request_builder.rb +92 -0
  58. data/lib/search_engine/client/services/base.rb +74 -0
  59. data/lib/search_engine/client/services/collections.rb +161 -0
  60. data/lib/search_engine/client/services/documents.rb +214 -0
  61. data/lib/search_engine/client/services/operations.rb +152 -0
  62. data/lib/search_engine/client/services/search.rb +190 -0
  63. data/lib/search_engine/client/services.rb +29 -0
  64. data/lib/search_engine/client.rb +765 -0
  65. data/lib/search_engine/client_options.rb +20 -0
  66. data/lib/search_engine/collection_resolver.rb +191 -0
  67. data/lib/search_engine/collections_graph.rb +330 -0
  68. data/lib/search_engine/compiled_params.rb +143 -0
  69. data/lib/search_engine/compiler.rb +383 -0
  70. data/lib/search_engine/config/observability.rb +27 -0
  71. data/lib/search_engine/config/presets.rb +92 -0
  72. data/lib/search_engine/config/selection.rb +16 -0
  73. data/lib/search_engine/config/typesense.rb +48 -0
  74. data/lib/search_engine/config/validators.rb +97 -0
  75. data/lib/search_engine/config.rb +917 -0
  76. data/lib/search_engine/console_helpers.rb +130 -0
  77. data/lib/search_engine/deletion.rb +103 -0
  78. data/lib/search_engine/dispatcher.rb +125 -0
  79. data/lib/search_engine/dsl/parser.rb +582 -0
  80. data/lib/search_engine/engine.rb +167 -0
  81. data/lib/search_engine/errors.rb +290 -0
  82. data/lib/search_engine/filters/sanitizer.rb +189 -0
  83. data/lib/search_engine/hydration/materializers.rb +808 -0
  84. data/lib/search_engine/hydration/selection_context.rb +96 -0
  85. data/lib/search_engine/indexer/batch_planner.rb +76 -0
  86. data/lib/search_engine/indexer/bulk_import.rb +626 -0
  87. data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
  88. data/lib/search_engine/indexer/retry_policy.rb +103 -0
  89. data/lib/search_engine/indexer.rb +747 -0
  90. data/lib/search_engine/instrumentation.rb +308 -0
  91. data/lib/search_engine/joins/guard.rb +202 -0
  92. data/lib/search_engine/joins/resolver.rb +95 -0
  93. data/lib/search_engine/logging/color.rb +78 -0
  94. data/lib/search_engine/logging/format_helpers.rb +92 -0
  95. data/lib/search_engine/logging/partition_progress.rb +53 -0
  96. data/lib/search_engine/logging_subscriber.rb +388 -0
  97. data/lib/search_engine/mapper.rb +785 -0
  98. data/lib/search_engine/multi.rb +286 -0
  99. data/lib/search_engine/multi_result.rb +186 -0
  100. data/lib/search_engine/notifications/compact_logger.rb +675 -0
  101. data/lib/search_engine/observability.rb +162 -0
  102. data/lib/search_engine/operations.rb +58 -0
  103. data/lib/search_engine/otel.rb +227 -0
  104. data/lib/search_engine/partitioner.rb +128 -0
  105. data/lib/search_engine/ranking_plan.rb +118 -0
  106. data/lib/search_engine/registry.rb +158 -0
  107. data/lib/search_engine/relation/compiler.rb +711 -0
  108. data/lib/search_engine/relation/deletion.rb +37 -0
  109. data/lib/search_engine/relation/dsl/filters.rb +624 -0
  110. data/lib/search_engine/relation/dsl/selection.rb +240 -0
  111. data/lib/search_engine/relation/dsl.rb +903 -0
  112. data/lib/search_engine/relation/dx/dry_run.rb +59 -0
  113. data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
  114. data/lib/search_engine/relation/dx.rb +231 -0
  115. data/lib/search_engine/relation/materializers.rb +118 -0
  116. data/lib/search_engine/relation/options.rb +138 -0
  117. data/lib/search_engine/relation/state.rb +274 -0
  118. data/lib/search_engine/relation/updating.rb +44 -0
  119. data/lib/search_engine/relation.rb +623 -0
  120. data/lib/search_engine/result.rb +664 -0
  121. data/lib/search_engine/schema.rb +1083 -0
  122. data/lib/search_engine/sources/active_record_source.rb +185 -0
  123. data/lib/search_engine/sources/base.rb +62 -0
  124. data/lib/search_engine/sources/lambda_source.rb +55 -0
  125. data/lib/search_engine/sources/sql_source.rb +196 -0
  126. data/lib/search_engine/sources.rb +71 -0
  127. data/lib/search_engine/stale_rules.rb +160 -0
  128. data/lib/search_engine/test/minitest_assertions.rb +57 -0
  129. data/lib/search_engine/test/offline_client.rb +134 -0
  130. data/lib/search_engine/test/rspec_matchers.rb +77 -0
  131. data/lib/search_engine/test/stub_client.rb +201 -0
  132. data/lib/search_engine/test.rb +66 -0
  133. data/lib/search_engine/test_autoload.rb +8 -0
  134. data/lib/search_engine/update.rb +35 -0
  135. data/lib/search_engine/version.rb +7 -0
  136. data/lib/search_engine.rb +332 -0
  137. data/lib/tasks/search_engine.rake +501 -0
  138. data/lib/tasks/search_engine_doctor.rake +16 -0
  139. metadata +225 -0
@@ -0,0 +1,162 @@
1
+ # frozen_string_literal: true
2
+
3
module SearchEngine
  # Lightweight utilities for observability concerns (redaction, excerpts).
  #
  # Provides a single public entry point {.redact} that produces compact,
  # redacted payloads which avoid leaking secrets while keeping useful
  # context, plus a few small helpers for hashing and message truncation.
  module Observability
    # Keys that are considered sensitive and must be redacted whenever present.
    # (Not referenced inside this module; kept for external users of the constant.)
    SENSITIVE_KEY_PATTERN = /key|token|secret|password/i

    # Whitelisted search parameter keys to include in payload excerpts.
    PARAM_WHITELIST = %i[
      q query_by include_fields exclude_fields per_page page infix filter_by group_by group_limit group_missing_values
      facet_by max_facet_values facet_query
      num_typos drop_tokens_threshold prioritize_exact_match query_by_weights
    ].freeze

    # Maximum length for `q` values before truncation.
    MAX_Q_LENGTH = 128

    # Replacement token used for every masked literal.
    MASK = '***'
    # Single- or double-quoted literal (quotes included).
    QUOTED_LITERAL = /"[^"]*"|'[^']*'/
    # Stand-alone integer or decimal literal.
    NUMERIC_LITERAL = /\b\d+(?:\.\d+)?\b/
    # Filter operator (up to three characters, e.g. `:`, `:=`, `:>=`, `:!=`)
    # followed by the value token that runs until whitespace, `)`, `&` or `|`.
    FILTER_COMPARISON = /([!:><=]{1,3})\s*([^\s)&|]+)/
    private_constant :MASK, :QUOTED_LITERAL, :NUMERIC_LITERAL, :FILTER_COMPARISON

    # Redact a value producing a new structure without mutating the input.
    #
    # - Hash: treated as search params; returns an excerpt containing only
    #   whitelisted keys, with `q` truncated and `filter_by` literals masked.
    # - Array: each element is redacted recursively with the same rules.
    # - String: quoted and numeric literals are masked.
    # - Anything else is returned unchanged.
    #
    # @param value [Object]
    # @return [Object]
    def self.redact(value)
      case value
      when Hash then redact_params_hash(value)
      when Array then value.map { |element| redact(element) }
      when String then redact_string(value)
      else value
      end
    end

    # Internal: Redact a Hash presumed to be Typesense search params.
    # Only Symbol keys listed in PARAM_WHITELIST are copied (in whitelist
    # order); every other key, including sensitive ones, is dropped.
    def self.redact_params_hash(params)
      PARAM_WHITELIST.each_with_object({}) do |key, excerpt|
        next unless params.key?(key)

        raw = params[key]
        excerpt[key] =
          case key
          when :q then truncate_q(raw)
          when :filter_by then redact_filter_by(raw)
          else redact_simple_value(raw)
          end
      end
    end

    # Internal: Best-effort redaction for simple scalar values.
    # Non-String values pass through untouched.
    def self.redact_simple_value(value)
      return value unless value.is_a?(String)

      redact_string(value)
    end

    # Internal: Truncate overly long query strings to MAX_Q_LENGTH characters
    # followed by an ellipsis. Non-String values pass through untouched.
    def self.truncate_q(query)
      return query unless query.is_a?(String)

      query.length > MAX_Q_LENGTH ? "#{query[0, MAX_Q_LENGTH]}..." : query
    end

    # Internal: Mask quoted literals, then numeric literals, in a string.
    def self.redact_string(str)
      return str unless str.is_a?(String)

      str.gsub(QUOTED_LITERAL, MASK).gsub(NUMERIC_LITERAL, MASK)
    end

    # Internal: Mask literal values in Typesense filter expressions while
    # preserving attribute/operator structure. Best-effort and lightweight.
    # Examples:
    #   "category_id:=123" => "category_id:=***"
    #   "price:>10 && brand:='Acme'" => "price:>*** && brand:=***"
    #   "brand:='Acme Corp' && price:>=10" => "brand:=*** && price:>=***"
    def self.redact_filter_by(filter)
      return filter unless filter.is_a?(String)

      # Quoted literals MUST be masked first: the comparison pattern stops at
      # whitespace, so masking it first would cut a literal such as
      # 'Acme Corp' in half and leave the tail (` Corp'`) in the output.
      masked = filter.gsub(QUOTED_LITERAL, MASK)
      masked = masked.gsub(FILTER_COMPARISON, '\1***')
      # Catch remaining bare numbers (e.g. later items of a spaced list).
      masked.gsub(NUMERIC_LITERAL, MASK)
    end

    # Build a filtered URL/common options hash for payloads.
    # Only `:use_cache` and `:cache_ttl` are carried over.
    # @param url_opts [Hash]
    # @return [Hash] empty when +url_opts+ is not a Hash
    def self.filtered_url_opts(url_opts)
      return {} unless url_opts.is_a?(Hash)

      {
        use_cache: url_opts[:use_cache],
        cache_ttl: url_opts[:cache_ttl]
      }
    end

    # Compute a SHA1 hex digest for a value.
    # @param value [#to_s]
    # @return [String]
    def self.sha_1(value)
      require 'digest' # loaded on first use
      Digest::SHA1.hexdigest(value.to_s)
    end

    # Return a shortened hash prefix for display/logging.
    # @param hexdigest [String]
    # @param length [Integer]
    # @return [String]
    def self.short_hash(hexdigest, length = 8)
      hexdigest.to_s[0, length]
    end

    # Truncate and normalize a free-text message to a single line.
    # Runs of whitespace (including newlines) collapse to one space.
    # @param message [String]
    # @param max [Integer]
    # @return [String]
    def self.truncate_message(message, max = 200)
      message.to_s.gsub(/\s+/, ' ').strip[0, max]
    end

    # Compute partition helpers used in logs: prefer raw numeric; hash strings.
    # @param partition [Object]
    # @return [Hash] { partition: <raw>, partition_hash: <String,nil> }
    def self.partition_fields(partition)
      return { partition: partition, partition_hash: nil } if partition.is_a?(Numeric)

      { partition: partition, partition_hash: short_hash(sha_1(partition)) }
    end

    private_class_method :redact_params_hash, :redact_simple_value, :truncate_q,
                         :redact_string, :redact_filter_by
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module SearchEngine
  # Convenience namespace for the operational endpoints (metrics, stats,
  # health).
  #
  # Each method wraps the matching client call in an instrumentation event
  # and accepts an optional client for dependency injection; when none is
  # given, `SearchEngine.client` is used.
  module Operations
    class << self
      # Retrieve raw server metrics.
      #
      # @param client [SearchEngine::Client, nil] optional injected client
      # @return [Hash] raw payload from Typesense `/metrics.json`
      # @see SearchEngine::Client#metrics
      def metrics(client: nil)
        instrumented('search_engine.operations.metrics', client) { |ts_client| ts_client.metrics }
      end

      # Retrieve raw server stats.
      #
      # @param client [SearchEngine::Client, nil] optional injected client
      # @return [Hash] raw payload from Typesense `/stats.json`
      # @see SearchEngine::Client#stats
      def stats(client: nil)
        instrumented('search_engine.operations.stats', client) { |ts_client| ts_client.stats }
      end

      # Retrieve server health.
      #
      # @param client [SearchEngine::Client, nil] optional injected client
      # @return [Hash] Typesense health response
      # @see SearchEngine::Client#health
      def health(client: nil)
        instrumented('search_engine.operations.health', client) { |ts_client| ts_client.health }
      end

      private

      # Run the given block inside an instrumentation event with an empty
      # payload. The client is resolved inside the instrumented section, so a
      # failure while building the default client is observed by the event too.
      def instrumented(event_name, injected_client)
        SearchEngine::Instrumentation.instrument(event_name, {}) do
          yield(injected_client || SearchEngine.client)
        end
      end

      # Client exposed by the configuration object, if it offers one.
      # Returns nil when the config has no `client` reader or reading it raises.
      def configured_client
        config = SearchEngine.config
        config.respond_to?(:client) ? config.client : nil
      rescue StandardError
        nil
      end
    end
  end
end
@@ -0,0 +1,227 @@
1
+ # frozen_string_literal: true
2
+
3
module SearchEngine
  # Optional OpenTelemetry adapter that translates unified instrumentation
  # events into OpenTelemetry spans. Activation is gated by the presence of the
  # OpenTelemetry SDK and by `SearchEngine.config.opentelemetry.enabled`.
  #
  # @since M8
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/observability#opentelemetry
  #
  # Public API:
  # - .installed? => Boolean
  # - .enabled? => Boolean (config + SDK present)
  # - .start! => idempotently subscribes to events
  # - .stop! => unsubscribes
  module Otel
    # Payload keys copied verbatim to `se.<key>` span attributes, emitted
    # before the `shapes` attributes.
    ATTRIBUTE_KEYS_BEFORE_SHAPES = %i[
      into partition partition_hash batch_index docs_count success_count failure_count attempts bytes_sent
      deleted_count searches_count fields_changed_count added_count removed_count in_sync
      fields_count queries_count max_facet_values sort_flags conflicts
      full_fields_count affix_tokens snippet_threshold tag_kind
      use_synonyms use_stopwords source
    ].freeze

    # Keys of the `shapes` payload hash exposed as `se.shapes.<kind>`.
    SHAPE_KINDS = %i[point rect circle].freeze

    # Payload keys copied verbatim to `se.<key>` span attributes, emitted
    # after the `shapes` attributes.
    ATTRIBUTE_KEYS_AFTER_SHAPES = %i[
      sort_mode radius_bucket
      query_vector_present dims hybrid_weight ann_params_present
      early_limit validate_max applied_strategy triggered total_hits
    ].freeze

    private_constant :ATTRIBUTE_KEYS_BEFORE_SHAPES, :SHAPE_KINDS, :ATTRIBUTE_KEYS_AFTER_SHAPES

    class << self
      # @return [Boolean] whether the OpenTelemetry SDK is available
      # @since M8
      # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/observability#opentelemetry
      def installed?
        # `defined?` yields a String or nil; normalize to a strict Boolean.
        defined?(::OpenTelemetry::SDK) ? true : false
      end

      # @return [Boolean] whether the adapter should be active
      # @since M8
      # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/observability#opentelemetry
      def enabled?
        return false unless installed? && SearchEngine.respond_to?(:config)

        SearchEngine.config&.opentelemetry&.enabled ? true : false
      end

      # Start the adapter (idempotent: any previous subscription is dropped
      # first). No-ops when disabled, when the SDK is unavailable, or when
      # ActiveSupport::Notifications is not loaded. Never raises.
      # @return [Object, nil] subscription handle or nil when not installed/enabled
      # @since M8
      # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/observability#opentelemetry
      def start!
        stop!
        return nil unless enabled?
        return nil unless defined?(ActiveSupport::Notifications)

        @service_name = begin
          SearchEngine.config.opentelemetry.service_name
        rescue StandardError
          'search_engine'
        end
        @tracer = tracer_provider&.tracer('search_engine', SearchEngine::VERSION)

        @handle = ActiveSupport::Notifications.subscribe(/^search_engine\./) do |*args|
          # An Event is built for every matching notification.
          handle_event(ActiveSupport::Notifications::Event.new(*args))
        end
      rescue StandardError
        # Never raise from adapter startup
        nil
      end

      # Stop the adapter if previously started.
      # @return [Boolean] true when a subscription was removed
      # @since M8
      # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/observability#opentelemetry
      def stop!
        return false unless defined?(ActiveSupport::Notifications)
        return false unless @handle

        ActiveSupport::Notifications.unsubscribe(@handle)
        @handle = nil
        true
      end

      private

      # Global tracer provider, or nil when the SDK is missing or lookup fails.
      def tracer_provider
        return nil unless installed?

        ::OpenTelemetry.tracer_provider
      rescue StandardError
        nil
      end

      attr_reader :tracer

      # Translate one instrumentation event into a span. The span is opened
      # when the event is received and decorated from the event payload.
      # Never raises.
      def handle_event(event)
        return unless tracer

        payload = event.payload || {}
        duration = compute_duration(event, payload)

        tracer.in_span(event.name) do |span|
          apply_common_attributes(span, event, payload, duration)
          apply_url_attributes(span, payload)
          apply_feature_attributes(span, payload)
          apply_indexer_schema_attributes(span, payload)
          apply_params_preview(span, payload)
          apply_status(span, payload)
        end
      rescue StandardError
        # Never raise from subscriber
        nil
      end

      # Duration in milliseconds rounded to one decimal; falls back to
      # `payload[:duration_ms]` when the event reports zero or has no duration.
      def compute_duration(event, payload)
        measured = (event.respond_to?(:duration) ? event.duration : payload[:duration_ms]).to_f
        measured = payload[:duration_ms].to_f if measured.zero? && payload[:duration_ms]
        measured.round(1)
      end

      def apply_common_attributes(span, event, payload, duration)
        assign_attr(span, 'service.name', @service_name)
        assign_attr(span, 'se.event', event.name)
        assign_attr(span, 'se.cid', payload[:correlation_id]) if payload.key?(:correlation_id)
        assign_attr(span, 'http.status_code', payload[:http_status]) if payload.key?(:http_status)
        assign_attr(span, 'se.duration_ms', duration) if duration.positive?

        collection = payload[:collection] || payload[:logical]
        assign_attr(span, 'se.collection', collection) if collection
      end

      def apply_url_attributes(span, payload)
        url_opts = payload[:url_opts]
        return unless url_opts.is_a?(Hash)

        assign_attr(span, 'se.url_use_cache', url_opts[:use_cache]) if url_opts.key?(:use_cache)
        assign_attr(span, 'se.url_cache_ttl', url_opts[:cache_ttl]) if url_opts.key?(:cache_ttl)
      end

      def apply_feature_attributes(span, payload)
        assign_attr(span, 'se.labels_count', Array(payload[:labels]).size) if payload.key?(:labels)
        %i[searches_count node_count join_count groups_count].each do |key|
          assign_attr(span, "se.#{key}", payload[key]) if payload.key?(key)
        end
        if payload.key?(:field) || payload.key?(:group_by)
          assign_attr(span, 'se.group_by', payload[:field] || payload[:group_by])
        end
        if payload.key?(:limit) || payload.key?(:group_limit)
          assign_attr(span, 'se.group_limit', payload[:limit] || payload[:group_limit])
        end
        return unless payload.key?(:missing_values) || payload.key?(:group_missing_values)

        # Boolean flag: fall back only on nil so an explicit `false` is kept
        # (`false || nil` would silently drop the attribute).
        missing_values = payload[:missing_values]
        missing_values = payload[:group_missing_values] if missing_values.nil?
        assign_attr(span, 'se.group_missing_values', missing_values)
      end

      def apply_indexer_schema_attributes(span, payload)
        copy_payload_attributes(span, payload, ATTRIBUTE_KEYS_BEFORE_SHAPES)

        shapes = payload[:shapes]
        # Ignore malformed (non hash-like) shapes instead of aborting the span.
        if shapes.respond_to?(:key?)
          SHAPE_KINDS.each do |kind|
            assign_attr(span, "se.shapes.#{kind}", shapes[kind]) if shapes.key?(kind)
          end
        end

        copy_payload_attributes(span, payload, ATTRIBUTE_KEYS_AFTER_SHAPES)
      end

      # Copy each present payload key to a `se.<key>` span attribute.
      def copy_payload_attributes(span, payload, keys)
        keys.each do |key|
          assign_attr(span, "se.#{key}", payload[key]) if payload.key?(key)
        end
      end

      # Expose only the number of keys of the redacted params preview.
      def apply_params_preview(span, payload)
        return unless payload.key?(:params_preview)

        redacted = SearchEngine::Instrumentation.redact(payload[:params_preview])
        assign_attr(span, 'se.params_preview_keys', redacted.keys.size) if redacted.is_a?(Hash)
      rescue StandardError
        nil
      end

      # Mark the span as failed when the payload reports an error status, an
      # HTTP status >= 400 or an error class; otherwise mark it OK.
      def apply_status(span, payload)
        http = payload[:http_status]
        err_class = payload[:error_class]
        err_msg = payload[:error_message]

        # `to_s` instead of `to_sym`: statuses that cannot be symbolized
        # (e.g. Integers) must not raise and abort the span decoration.
        failed = payload[:status].to_s == 'error' || (http && http.to_i >= 400) || err_class
        return span_status_ok(span) unless failed

        # Record a lightweight exception event with sanitized message
        if err_class || err_msg
          message = SearchEngine::Observability.truncate_message(err_msg || err_class.to_s, 200)
          span.add_event(
            'exception',
            attributes: {
              'exception.type' => (err_class || 'Error').to_s,
              'exception.message' => message
            }
          )
        end
        span_status_error(span, err_msg || err_class)
      end

      def span_status_error(span, description = nil)
        span.status = ::OpenTelemetry::Trace::Status.error(description.to_s) if defined?(::OpenTelemetry::Trace::Status)
      rescue StandardError
        nil
      end

      def span_status_ok(span)
        span.status = ::OpenTelemetry::Trace::Status.ok if defined?(::OpenTelemetry::Trace::Status)
      rescue StandardError
        nil
      end

      # Set a span attribute, skipping nil values and swallowing SDK errors.
      def assign_attr(span, key, value)
        return if value.nil?

        span.set_attribute(key, value)
      rescue StandardError
        nil
      end
    end
  end
end
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
module SearchEngine
  # Compiles and validates partitioning directives captured by the index DSL.
  #
  # {.for} reads the directives stored on a model class and returns a frozen
  # {Compiled} object exposing:
  # - #partitions -> Enumerable of partition keys
  # - #partition_fetch_enum(partition) -> Enumerator of batches (Arrays)
  # - readers for the raw before/after hook procs and the parallelism limit
  class Partitioner
    # Frozen holder for the compiled directives of one model class.
    class Compiled
      attr_reader :klass, :partitions_proc, :partition_fetch_proc, :before_hook_proc, :after_hook_proc, :max_parallel

      # Hooks are arity-checked before being stored; `max_parallel` is coerced
      # to a positive Integer and falls back to 1 when that is not possible.
      #
      # @raise [SearchEngine::Errors::InvalidParams] when a hook has an unsupported arity
      def initialize(klass:, partitions_proc:, partition_fetch_proc:, before_hook_proc:, after_hook_proc:,
                     max_parallel: 1)
        @klass = klass
        @partitions_proc = partitions_proc
        @partition_fetch_proc = partition_fetch_proc

        { 'before_partition' => before_hook_proc, 'after_partition' => after_hook_proc }.each do |label, hook|
          validate_hook_arity!(hook, name: label) if hook
        end
        @before_hook_proc = before_hook_proc
        @after_hook_proc = after_hook_proc

        @max_parallel = normalize_parallelism(max_parallel)
        freeze
      end

      # Enumerate partition keys. Validates the return value shape.
      # @return [Enumerable] list/Enumerable of opaque partition tokens (empty Array when no block was declared)
      # @raise [SearchEngine::Errors::InvalidParams] when the block does not return an Enumerable
      # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning
      def partitions
        return [] unless @partitions_proc

        keys = @partitions_proc.call
        return keys if keys.respond_to?(:each)

        raise SearchEngine::Errors::InvalidParams,
              'partitions block must return an Enumerable of partition keys (Array acceptable). ' \
              'See https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning.'
      end

      # Return an Enumerator for batches for the given partition. Each element
      # is validated (and converted with `to_a` when needed) as it is consumed.
      # @param partition [Object]
      # @return [Enumerable<Array>] enumerator yielding Arrays of records
      # @raise [ArgumentError] when partition_fetch is not defined
      # @raise [SearchEngine::Errors::InvalidParams] when the block returns a non-enumerable or yields non-arrays
      # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning
      def partition_fetch_enum(partition)
        raise ArgumentError, 'partition_fetch not defined' unless @partition_fetch_proc

        batches = @partition_fetch_proc.call(partition)
        unless batches.respond_to?(:each)
          raise SearchEngine::Errors::InvalidParams,
                'partition_fetch must return an Enumerable yielding Arrays of records. ' \
                'See https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning.'
        end

        Enumerator.new do |yielder|
          # Manual counter: the source is only required to respond to #each.
          position = 0
          batches.each do |batch|
            yielder << coerce_batch(batch, position)
            position += 1
          end
        end
      end

      private

      # Positive Integer parallelism; anything else becomes 1.
      def normalize_parallelism(value)
        count = Integer(value)
        count.positive? ? count : 1
      rescue StandardError
        1
      end

      # Arrays pass through; other `to_a`-capable objects are converted.
      def coerce_batch(batch, position)
        return batch if batch.is_a?(Array)
        return batch.to_a if batch.respond_to?(:to_a)

        raise SearchEngine::Errors::InvalidParams,
              "partition_fetch must yield Arrays of records; got #{batch.class} at index #{position}."
      end

      # Accepts arity 1 and any negative (optional/splat) arity.
      def validate_hook_arity!(proc_obj, name:)
        arity = proc_obj.arity
        acceptable = arity == 1 || arity.negative?
        return if acceptable

        raise SearchEngine::Errors::InvalidParams, "#{name} block must accept exactly 1 parameter (partition)."
      end
    end

    class << self
      # Resolve a compiled partitioner for a model class, or nil if directives are absent.
      # The compiled object is memoized per class.
      # @param klass [Class]
      # @return [SearchEngine::Partitioner::Compiled, nil]
      # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning
      def for(klass)
        dsl = mapper_dsl_for(klass)
        return nil unless dsl

        declared = %i[partitions partition_fetch before_partition after_partition].any? do |directive|
          dsl[directive]
        end
        return nil unless declared

        cache[klass] ||= compile(klass, dsl)
      end

      private

      def cache
        @cache ||= {}
      end

      def compile(klass, dsl)
        directives = {
          partitions_proc: dsl[:partitions],
          partition_fetch_proc: dsl[:partition_fetch],
          before_hook_proc: dsl[:before_partition],
          after_hook_proc: dsl[:after_partition],
          max_parallel: dsl[:partition_max_parallel]
        }
        Compiled.new(klass: klass, **directives)
      end

      # DSL state stored on the model class, or nil when none was declared.
      def mapper_dsl_for(klass)
        klass.instance_variable_get(:@__mapper_dsl__) if klass.instance_variable_defined?(:@__mapper_dsl__)
      end
    end
  end
end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
module SearchEngine
  # Pure, deterministic normalizer for ranking/typo/prefix tuning.
  # Accepts relation context, effective query_by, and raw ranking state,
  # validates and emits authoritative Typesense params.
  #
  # Usage: RankingPlan.new(relation: rel, query_by: "name,description", ranking: {...}).params
  class RankingPlan
    # Ranking options copied to the params as-is when present and non-nil.
    PASSTHROUGH_KEYS = %i[num_typos drop_tokens_threshold prioritize_exact_match].freeze
    private_constant :PASSTHROUGH_KEYS

    # @return [Hash]
    attr_reader :params

    # Compiles the params eagerly and freezes the plan.
    #
    # @param relation [SearchEngine::Relation]
    # @param query_by [String, nil] comma-separated field list
    # @param ranking [Hash] raw ranking state (Symbol keys)
    # @raise [SearchEngine::Errors::InvalidOption] when `query_by_weights` cannot be applied
    def initialize(relation:, query_by:, ranking: {})
      @relation = relation
      @raw_query_by = query_by
      @raw = ranking || {}
      @params = compile!
      freeze
    end

    # Return effective query_by fields as an Array<String> (trimmed, non-blank)
    def effective_query_by_fields
      resolve_query_by(@raw_query_by)
    end

    private

    def compile!
      out = {}

      PASSTHROUGH_KEYS.each do |key|
        out[key] = @raw[key] if @raw.key?(key) && !@raw[key].nil?
      end

      weights = @raw[:query_by_weights]
      out[:query_by_weights] = compile_weights(weights) if weights

      out
    end

    # Build the comma-separated weight list aligned with the query_by fields.
    def compile_weights(weights)
      fields = effective_query_by_fields
      if fields.empty?
        raise SearchEngine::Errors::InvalidOption.new(
          'InvalidOption: query_by is empty; cannot apply query_by_weights',
          hint: 'Set SearchEngine.config.default_query_by or pass options(query_by: ...)',
          doc: 'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/ranking#weights'
        )
      end

      normalized_weights = build_weight_vector!(fields, weights)
      if normalized_weights.all?(&:zero?)
        raise SearchEngine::Errors::InvalidOption.new(
          'InvalidOption: at least one weighted field must have weight > 0',
          doc: 'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/ranking#weights'
        )
      end

      normalized_weights.join(',')
    end

    def resolve_query_by(query_by)
      query_by.to_s.split(',').map(&:strip).reject(&:empty?)
    end

    # Return one Integer weight per field, in field order (default 1).
    #
    # @param fields [Array<String>] effective query_by fields
    # @param weight_map [Hash] field name (String or Symbol) => weight
    # @raise [SearchEngine::Errors::InvalidOption] on unknown fields or non-integer weights
    def build_weight_vector!(fields, weight_map)
      # Normalize keys once so validation and lookup agree: Symbol keys pass
      # the subset check below and must therefore be honored by the lookup too.
      weights_by_field = weight_map.keys.each_with_object({}) do |key, acc|
        acc[key.to_s] = weight_map[key]
      end

      # Validate that provided keys are subset of effective query_by
      unknown = weights_by_field.keys - fields
      raise unknown_field_error(unknown.first, fields) unless unknown.empty?

      # Rescue only the coercion, so other errors are never re-labelled.
      begin
        fields.map { |field| Integer(weights_by_field.fetch(field, 1)) }
      rescue ArgumentError, TypeError
        raise SearchEngine::Errors::InvalidOption.new(
          'InvalidOption: query_by_weights must compile to integers',
          doc: 'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/ranking#weights'
        )
      end
    end

    # Build the error for a weight given to a field outside query_by,
    # including up to three "did you mean" suggestions.
    def unknown_field_error(field, known)
      suggestions = suggest_for(field, known)
      suffix = if suggestions.empty?
                 ''
               elsif suggestions.length == 1
                 " (did you mean #{suggestions.first.inspect}?)"
               else
                 others = suggestions[0..-2].map(&:inspect).join(', ')
                 last = suggestions.last.inspect
                 " (did you mean #{others}, or #{last}?)"
               end

      SearchEngine::Errors::InvalidOption.new(
        "InvalidOption: weight specified for unknown field #{field.inspect}#{suffix}",
        doc: 'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/relation-reference#selection',
        details: { unknown: field, allowed: known }
      )
    end

    # Up to three candidates within Levenshtein distance 2 of +input+,
    # closest first. Returns [] when did_you_mean cannot be loaded.
    def suggest_for(input, candidates)
      return [] if candidates.empty?

      begin
        require 'did_you_mean'
        require 'did_you_mean/levenshtein'
      rescue LoadError, StandardError
        # LoadError is a ScriptError, not a StandardError, so it has to be
        # named explicitly for this fallback to work.
        return []
      end

      distances = candidates.each_with_object({}) do |cand, acc|
        acc[cand] = DidYouMean::Levenshtein.distance(input.to_s, cand.to_s)
      end
      distances.sort_by { |(_c, d)| d }.take(3).select { |(_c, d)| d <= 2 }.map(&:first)
    end
  end
end