search-engine-for-typesense 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +148 -0
  4. data/app/search_engine/search_engine/app_info.rb +11 -0
  5. data/app/search_engine/search_engine/index_partition_job.rb +170 -0
  6. data/lib/generators/search_engine/install/install_generator.rb +20 -0
  7. data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
  8. data/lib/generators/search_engine/model/model_generator.rb +86 -0
  9. data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
  10. data/lib/search-engine-for-typesense.rb +12 -0
  11. data/lib/search_engine/active_record_syncable.rb +247 -0
  12. data/lib/search_engine/admin/stopwords.rb +125 -0
  13. data/lib/search_engine/admin/synonyms.rb +125 -0
  14. data/lib/search_engine/admin.rb +12 -0
  15. data/lib/search_engine/ast/and.rb +52 -0
  16. data/lib/search_engine/ast/binary_op.rb +75 -0
  17. data/lib/search_engine/ast/eq.rb +19 -0
  18. data/lib/search_engine/ast/group.rb +18 -0
  19. data/lib/search_engine/ast/gt.rb +12 -0
  20. data/lib/search_engine/ast/gte.rb +12 -0
  21. data/lib/search_engine/ast/in.rb +28 -0
  22. data/lib/search_engine/ast/lt.rb +12 -0
  23. data/lib/search_engine/ast/lte.rb +12 -0
  24. data/lib/search_engine/ast/matches.rb +55 -0
  25. data/lib/search_engine/ast/node.rb +176 -0
  26. data/lib/search_engine/ast/not_eq.rb +13 -0
  27. data/lib/search_engine/ast/not_in.rb +24 -0
  28. data/lib/search_engine/ast/or.rb +52 -0
  29. data/lib/search_engine/ast/prefix.rb +51 -0
  30. data/lib/search_engine/ast/raw.rb +41 -0
  31. data/lib/search_engine/ast/unary_op.rb +43 -0
  32. data/lib/search_engine/ast.rb +101 -0
  33. data/lib/search_engine/base/creation.rb +727 -0
  34. data/lib/search_engine/base/deletion.rb +80 -0
  35. data/lib/search_engine/base/display_coercions.rb +36 -0
  36. data/lib/search_engine/base/hydration.rb +312 -0
  37. data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
  38. data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
  39. data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
  40. data/lib/search_engine/base/index_maintenance.rb +459 -0
  41. data/lib/search_engine/base/indexing_dsl.rb +255 -0
  42. data/lib/search_engine/base/joins.rb +479 -0
  43. data/lib/search_engine/base/model_dsl.rb +472 -0
  44. data/lib/search_engine/base/presets.rb +43 -0
  45. data/lib/search_engine/base/pretty_printer.rb +315 -0
  46. data/lib/search_engine/base/relation_delegation.rb +42 -0
  47. data/lib/search_engine/base/scopes.rb +113 -0
  48. data/lib/search_engine/base/updating.rb +92 -0
  49. data/lib/search_engine/base.rb +38 -0
  50. data/lib/search_engine/bulk.rb +284 -0
  51. data/lib/search_engine/cache.rb +33 -0
  52. data/lib/search_engine/cascade.rb +531 -0
  53. data/lib/search_engine/cli/doctor.rb +631 -0
  54. data/lib/search_engine/cli/support.rb +217 -0
  55. data/lib/search_engine/cli.rb +222 -0
  56. data/lib/search_engine/client/http_adapter.rb +63 -0
  57. data/lib/search_engine/client/request_builder.rb +92 -0
  58. data/lib/search_engine/client/services/base.rb +74 -0
  59. data/lib/search_engine/client/services/collections.rb +161 -0
  60. data/lib/search_engine/client/services/documents.rb +214 -0
  61. data/lib/search_engine/client/services/operations.rb +152 -0
  62. data/lib/search_engine/client/services/search.rb +190 -0
  63. data/lib/search_engine/client/services.rb +29 -0
  64. data/lib/search_engine/client.rb +765 -0
  65. data/lib/search_engine/client_options.rb +20 -0
  66. data/lib/search_engine/collection_resolver.rb +191 -0
  67. data/lib/search_engine/collections_graph.rb +330 -0
  68. data/lib/search_engine/compiled_params.rb +143 -0
  69. data/lib/search_engine/compiler.rb +383 -0
  70. data/lib/search_engine/config/observability.rb +27 -0
  71. data/lib/search_engine/config/presets.rb +92 -0
  72. data/lib/search_engine/config/selection.rb +16 -0
  73. data/lib/search_engine/config/typesense.rb +48 -0
  74. data/lib/search_engine/config/validators.rb +97 -0
  75. data/lib/search_engine/config.rb +917 -0
  76. data/lib/search_engine/console_helpers.rb +130 -0
  77. data/lib/search_engine/deletion.rb +103 -0
  78. data/lib/search_engine/dispatcher.rb +125 -0
  79. data/lib/search_engine/dsl/parser.rb +582 -0
  80. data/lib/search_engine/engine.rb +167 -0
  81. data/lib/search_engine/errors.rb +290 -0
  82. data/lib/search_engine/filters/sanitizer.rb +189 -0
  83. data/lib/search_engine/hydration/materializers.rb +808 -0
  84. data/lib/search_engine/hydration/selection_context.rb +96 -0
  85. data/lib/search_engine/indexer/batch_planner.rb +76 -0
  86. data/lib/search_engine/indexer/bulk_import.rb +626 -0
  87. data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
  88. data/lib/search_engine/indexer/retry_policy.rb +103 -0
  89. data/lib/search_engine/indexer.rb +747 -0
  90. data/lib/search_engine/instrumentation.rb +308 -0
  91. data/lib/search_engine/joins/guard.rb +202 -0
  92. data/lib/search_engine/joins/resolver.rb +95 -0
  93. data/lib/search_engine/logging/color.rb +78 -0
  94. data/lib/search_engine/logging/format_helpers.rb +92 -0
  95. data/lib/search_engine/logging/partition_progress.rb +53 -0
  96. data/lib/search_engine/logging_subscriber.rb +388 -0
  97. data/lib/search_engine/mapper.rb +785 -0
  98. data/lib/search_engine/multi.rb +286 -0
  99. data/lib/search_engine/multi_result.rb +186 -0
  100. data/lib/search_engine/notifications/compact_logger.rb +675 -0
  101. data/lib/search_engine/observability.rb +162 -0
  102. data/lib/search_engine/operations.rb +58 -0
  103. data/lib/search_engine/otel.rb +227 -0
  104. data/lib/search_engine/partitioner.rb +128 -0
  105. data/lib/search_engine/ranking_plan.rb +118 -0
  106. data/lib/search_engine/registry.rb +158 -0
  107. data/lib/search_engine/relation/compiler.rb +711 -0
  108. data/lib/search_engine/relation/deletion.rb +37 -0
  109. data/lib/search_engine/relation/dsl/filters.rb +624 -0
  110. data/lib/search_engine/relation/dsl/selection.rb +240 -0
  111. data/lib/search_engine/relation/dsl.rb +903 -0
  112. data/lib/search_engine/relation/dx/dry_run.rb +59 -0
  113. data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
  114. data/lib/search_engine/relation/dx.rb +231 -0
  115. data/lib/search_engine/relation/materializers.rb +118 -0
  116. data/lib/search_engine/relation/options.rb +138 -0
  117. data/lib/search_engine/relation/state.rb +274 -0
  118. data/lib/search_engine/relation/updating.rb +44 -0
  119. data/lib/search_engine/relation.rb +623 -0
  120. data/lib/search_engine/result.rb +664 -0
  121. data/lib/search_engine/schema.rb +1083 -0
  122. data/lib/search_engine/sources/active_record_source.rb +185 -0
  123. data/lib/search_engine/sources/base.rb +62 -0
  124. data/lib/search_engine/sources/lambda_source.rb +55 -0
  125. data/lib/search_engine/sources/sql_source.rb +196 -0
  126. data/lib/search_engine/sources.rb +71 -0
  127. data/lib/search_engine/stale_rules.rb +160 -0
  128. data/lib/search_engine/test/minitest_assertions.rb +57 -0
  129. data/lib/search_engine/test/offline_client.rb +134 -0
  130. data/lib/search_engine/test/rspec_matchers.rb +77 -0
  131. data/lib/search_engine/test/stub_client.rb +201 -0
  132. data/lib/search_engine/test.rb +66 -0
  133. data/lib/search_engine/test_autoload.rb +8 -0
  134. data/lib/search_engine/update.rb +35 -0
  135. data/lib/search_engine/version.rb +7 -0
  136. data/lib/search_engine.rb +332 -0
  137. data/lib/tasks/search_engine.rake +501 -0
  138. data/lib/tasks/search_engine_doctor.rake +16 -0
  139. metadata +225 -0
@@ -0,0 +1,785 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'set'
4
+
5
+ module SearchEngine
6
+ # Mapper compiles a per-collection mapping function and validates
7
+ # mapped documents against the compiled schema.
8
+ #
9
+ # Public API:
10
+ # - {SearchEngine::Mapper.for(klass)} -> compiled mapper or nil when undefined
11
+ # - {SearchEngine::Mapper::Compiled#map_batch!(rows, batch_index:)} -> [Array<Hash>, Hash]
12
+ module Mapper
13
# Simple DSL holder used by Base#index to capture source and map block.
#
# Describes where data is fetched from and how records are transformed into
# Typesense documents. Compiled by {SearchEngine::Mapper.for}.
#
# @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer
class Dsl
  # Thread-local slot exposing the partition currently handled by a hook.
  CURRENT_PARTITION_KEY = :__se_current_partition__
  private_constant :CURRENT_PARTITION_KEY

  # @return [Hash, nil] original source definition captured from DSL
  attr_reader :source_def
  # @return [Proc, nil] mapping proc captured from DSL
  attr_reader :map_proc

  # @param klass [Class] model class the DSL is evaluated for
  def initialize(klass)
    @klass = klass
    @source_def = nil
    @map_proc = nil
    @partitions_proc = nil
    @partition_fetch_proc = nil
    @before_partition_proc = nil
    @after_partition_proc = nil
    # Initialized explicitly so every piece of DSL state is declared in one place.
    @partition_max_parallel = nil
    @max_parallel = 1
    @stale_entries = []
  end

  # Declare a source adapter for this collection. Compatible with
  # SearchEngine::Sources.build signature. Stored for compatibility; the
  # mapper only requires the `map`.
  # @param type [Symbol, String]
  # @param options [Hash]
  # @yield for :lambda sources
  # @return [void]
  # @raise [ArgumentError] when type is nil/blank or cannot be converted to a Symbol
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer
  def source(type, **options, &block)
    # nil does not respond to #to_sym, so this guard covers nil, blank strings and
    # arbitrary objects, and reports all of them as the documented ArgumentError.
    unless type.respond_to?(:to_sym) && !type.to_s.strip.empty?
      raise ArgumentError, 'source requires a non-blank type'
    end

    @source_def = { type: type.to_sym, options: options, block: block }
    nil
  end

  # Define the mapping block.
  # @yield [record] yields each source record to the block
  # @yieldparam record [Object]
  # @yieldreturn [Hash, #to_h, #as_json] a document-like object
  # @return [void]
  # @raise [ArgumentError] when no block is given
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer
  def map(&block)
    raise ArgumentError, 'map requires a block' unless block

    @map_proc = block
    nil
  end

  # Delete documents by filter before/after a partition import or ad-hoc.
  # Delegates to SearchEngine::Deletion.delete_by, passing either a raw filter
  # string or the remaining keyword arguments as a filter hash.
  #
  # Examples:
  #   delete_by filter_by: "store_id:=#{store_id}"
  #   delete_by store_id: store_id
  #
  # @param filter_or_str [String, nil]
  # @param filter_by [String, nil] used when no positional filter is given
  # @param into [String, nil]
  # @param partition [Object, nil] defaults to the partition of the hook currently running
  # @param timeout_ms [Integer, nil]
  # @param hash [Hash] remaining keyword arguments treated as filter hash
  # @return [Object] whatever SearchEngine::Deletion.delete_by returns
  def delete_by(filter_or_str = nil, into: nil, partition: nil, timeout_ms: nil, filter_by: nil, **hash)
    effective_partition = partition || Thread.current[CURRENT_PARTITION_KEY] ||
                          instance_variable_get(:@__current_partition__)
    SearchEngine::Deletion.delete_by(
      klass: @klass,
      filter: filter_or_str || filter_by,
      hash: (hash.empty? ? nil : hash),
      into: into,
      partition: effective_partition,
      timeout_ms: timeout_ms
    )
  end

  # Register a stale-cleanup rule.
  #
  # Accepts one of the following forms:
  # - `stale { where(active: false) }` — block
  # - `stale scope: :inactive` — named scope
  # - `stale :archived` or `stale attribute: :archived, value: true` — attribute equality
  # - `stale(filter: 'status:=archived')` — raw filter fragment
  # - `stale(product_state: 'archived')` or `stale({ product_state: 'archived' })` — Hash
  #
  # @param target [Symbol, String, Hash, SearchEngine::Relation, nil]
  # @param scope [Symbol, nil]
  # @param attribute [Symbol, nil]
  # @param value [Object] value used with attribute form (defaults to +true+)
  # @param filter [String, nil]
  # @param conditions [Hash] extra keywords, treated as the Hash form when nothing else is given
  # @yield block stored as a stale rule
  # @return [void]
  # @raise [ArgumentError] when no usable form is given or extra keywords are mixed with another form
  def stale(target = nil, scope: nil, attribute: nil, value: true, filter: nil, **conditions, &block)
    unless conditions.empty?
      # Extra keywords only make sense as the standalone Hash form; combined with
      # another form they stay an ArgumentError, as an unknown keyword was before.
      if target || scope || attribute || filter || block
        raise ArgumentError, "unknown keywords for stale: #{conditions.keys.map(&:inspect).join(', ')}"
      end

      target = conditions
    end

    entry = build_stale_entry(
      target,
      scope: scope,
      attribute: attribute,
      value: value,
      filter: filter,
      block: block
    )
    @stale_entries << entry.freeze
    nil
  end

  # @return [Array<Hash>] copy of the registered stale entries
  def stale_entries
    @stale_entries.dup
  end

  # Partitioning: declare how to enumerate partitions for full rebuilds.
  # @yieldreturn [Enumerable] a list/Enumerable of opaque partition keys
  # @return [void]
  # @raise [ArgumentError] when no block is given
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning
  def partitions(&block)
    raise ArgumentError, 'partitions requires a block' unless block

    @partitions_proc = block
    nil
  end

  # Configure maximum parallel threads for partitioned indexation.
  # @param max_parallel [Integer]
  # @return [void]
  # @raise [SearchEngine::Errors::InvalidOption] when max_parallel is not a positive Integer
  def partition_max_parallel(max_parallel)
    unless max_parallel.is_a?(Integer) && max_parallel.positive?
      raise SearchEngine::Errors::InvalidOption,
            'partition_max_parallel must be a positive Integer (> 0)'
    end

    @partition_max_parallel = max_parallel
    nil
  end

  # Configure maximum parallel threads for batch processing within a partition
  # (or non-partitioned import).
  # @param max_parallel [Integer]
  # @return [void]
  # @raise [SearchEngine::Errors::InvalidOption] when max_parallel is not a positive Integer
  def max_parallel(max_parallel)
    unless max_parallel.is_a?(Integer) && max_parallel.positive?
      raise SearchEngine::Errors::InvalidOption,
            'max_parallel must be a positive Integer (> 0)'
    end

    @max_parallel = max_parallel
    nil
  end

  # Partitioning: provide a per-partition batch enumerator.
  # @yieldparam partition [Object]
  # @yieldreturn [Enumerable<Array>] yields Arrays of records per batch
  # @return [void]
  # @raise [ArgumentError] when no block is given
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning
  def partition_fetch(&block)
    raise ArgumentError, 'partition_fetch requires a block' unless block

    @partition_fetch_proc = block
    nil
  end

  # Hook executed before importing a partition.
  # The block receives the partition key.
  # @yieldparam partition [Object]
  # @return [void]
  # @raise [ArgumentError] when no block is given
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning
  def before_partition(&block)
    raise ArgumentError, 'before_partition requires a block' unless block

    @before_partition_proc = wrap_partition_hook(block)
    nil
  end

  # Hook executed after importing a partition.
  # The block receives the partition key.
  # @yieldparam partition [Object]
  # @return [void]
  # @raise [ArgumentError] when no block is given
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning
  def after_partition(&block)
    raise ArgumentError, 'after_partition requires a block' unless block

    @after_partition_proc = wrap_partition_hook(block)
    nil
  end

  # Return a frozen definition Hash describing everything captured by the DSL.
  # @return [Hash]
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer
  def to_definition
    {
      source: @source_def,
      map: @map_proc,
      partitions: @partitions_proc,
      partition_fetch: @partition_fetch_proc,
      before_partition: @before_partition_proc,
      after_partition: @after_partition_proc,
      partition_max_parallel: @partition_max_parallel,
      max_parallel: @max_parallel,
      stale: @stale_entries.dup.freeze
    }.freeze
  end

  private

  # Wrap a partition hook so the current partition is visible to helpers such as
  # #delete_by while the hook runs, and is reset afterwards.
  def wrap_partition_hook(block)
    lambda do |partition|
      prev = Thread.current[CURRENT_PARTITION_KEY]
      instance_variable_set(:@__current_partition__, partition)
      Thread.current[CURRENT_PARTITION_KEY] = partition
      if block.arity == 1
        block.call(partition)
      else
        instance_exec(partition, &block)
      end
    ensure
      Thread.current[CURRENT_PARTITION_KEY] = prev
      # Reset rather than remove: a "defined? then remove" pair is not atomic and
      # can raise NameError when hooks for several partitions run on parallel threads.
      instance_variable_set(:@__current_partition__, nil)
    end
  end

  # Translate the arguments of #stale into a tagged Hash entry. Precedence:
  # block, scope, attribute, hash, filter, relation.
  def build_stale_entry(target, scope:, attribute:, value:, filter:, block:)
    if block
      { type: :block, block: block }
    elsif scope
      ensure_symbol!(:scope, scope)
      { type: :scope, name: scope.to_sym }
    elsif attribute || attribute_target?(target)
      attr_name = (attribute || target).to_sym
      { type: :attribute, name: attr_name, value: value }
    elsif hash_target?(target)
      normalized_hash = normalize_stale_hash(target)
      { type: :hash, hash: normalized_hash }
    elsif filter || filter_target?(target)
      str = filter || target
      ensure_string!(:filter, str)
      { type: :filter, value: str.to_s }
    elsif relation_target?(target)
      { type: :relation, relation: target }
    else
      raise ArgumentError,
            'stale requires a block, scope:, attribute:, filter:, relation, or hash input'
    end
  end

  def attribute_target?(target)
    target.is_a?(Symbol)
  end

  def hash_target?(target)
    target.is_a?(Hash)
  end

  def relation_target?(target)
    defined?(SearchEngine::Relation) && target.is_a?(SearchEngine::Relation)
  end

  def filter_target?(target)
    target.is_a?(String)
  end

  # Symbolize keys where possible and freeze the resulting Hash.
  def normalize_stale_hash(hash)
    hash.each_with_object({}) do |(key, value), acc|
      sym_key = key.respond_to?(:to_sym) ? key.to_sym : key
      acc[sym_key] = value
    end.freeze
  end

  def ensure_symbol!(name, value)
    return if value.is_a?(Symbol)

    raise ArgumentError, "#{name} must be a Symbol"
  end

  # Every object answers #to_s, so the meaningful check is that the converted
  # value is not blank; a blank filter fragment cannot describe stale documents.
  def ensure_string!(name, value)
    return unless value.nil? || value.to_s.strip.empty?

    raise ArgumentError, "#{name} must be convertible to a non-blank String"
  end
end
313
+
314
+ # Immutable compiled mapper for a specific collection class.
315
+ #
316
+ # Validates mapped documents against the compiled schema, sets hidden flags
317
+ # for array/optional fields and emits instrumentation.
318
+ #
319
+ # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer
320
+ class Compiled
321
+ attr_reader :klass
322
+
323
# Capture the collection class, mapping proc and schema description, derive
# the key sets used during validation, then freeze the instance.
# @param klass [Class] collection class
# @param map_proc [Proc] mapping proc invoked once per source row
# @param schema_fields [Array<String>] field names
# @param types_by_field [Hash{String=>String}] e.g. { "field" => "int64" }
# @param options [Hash, nil] overrides merged over the default options
def initialize(klass:, map_proc:, schema_fields:, types_by_field:, options: {})
  @klass = klass
  @map_proc = map_proc
  @schema_fields = schema_fields.freeze
  @types_by_field = types_by_field.freeze

  # Every schema field is allowed; required keys are schema fields minus optional attributes.
  @allowed_keys = @schema_fields.map(&:to_sym).to_set.freeze
  @required_keys = compute_required_keys

  overrides = options || {}
  @options = default_options.merge(overrides)

  # Both target lists consult @required_keys, so they are computed after it.
  @__empty_filtering_targets__ = compute_empty_filtering_targets
  @__optional_blank_targets__ = compute_optional_blank_targets
  freeze
end
336
+
337
# Map and validate a batch of rows.
#
# Each row is passed through the map proc, normalized to a Hash, given a computed
# :id and a :doc_updated_at epoch timestamp (both overwrite mapped values), has its
# hidden flags populated, and is then type-checked.
#
# @param rows [Array<Object>] source records to map
# @param batch_index [Integer, nil] optional index for instrumentation
# @return [Array<Array<Hash>, Hash>] [documents, report]
# @raise [SearchEngine::Errors::InvalidParams] on missing required fields or invalid document shape
# @raise [SearchEngine::Errors::InvalidField] when strict_unknown_keys is enabled and extras are present
# @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#troubleshooting
def map_batch!(rows, batch_index: nil)
  start_ms = monotonic_ms
  docs = []
  stats = init_stats

  rows.each do |row|
    hash = normalize_document(@map_proc.call(row))
    # Ignore any provided id from map; always inject computed document id
    hash.delete(:id)
    hash.delete('id')
    begin
      computed_id = @klass.compute_document_id(row)
    rescue NoMethodError => e
      # Fall back only when compute_document_id itself is missing. A NoMethodError
      # raised inside it must surface instead of silently producing a different id.
      raise unless e.name == :compute_document_id

      rid = row.respond_to?(:id) ? row.id : nil
      computed_id = rid.is_a?(String) ? rid : rid.to_s
    end
    hash[:id] = computed_id
    # Force system timestamp field on every document; developers cannot override.
    now_i = if defined?(Time) && defined?(Time.zone) && Time.zone
              Time.zone.now.to_i
            else
              Time.now.to_i
            end
    hash[:doc_updated_at] = now_i

    normalize_optional_blank_strings!(hash)

    # Populate hidden flags
    set_hidden_empty_flags!(hash)
    set_hidden_blank_flags!(hash)

    update_stats_for_doc!(stats, hash)
    validate_and_coerce_types!(stats, hash)
    docs << hash
  end

  ensure_required_present!(stats)
  ensure_no_unknowns!(stats)

  duration = monotonic_ms - start_ms
  instrument_batch_mapped(
    batch_index: batch_index,
    docs_count: docs.size,
    duration_ms: duration,
    missing_required_count: stats[:missing_required].size,
    extra_keys_count: stats[:extras_samples].size,
    invalid_type_count: stats[:invalid_type_samples].size,
    coerced_count: stats[:coerced_count]
  )

  report = build_report(stats, docs.size, batch_index, duration)
  [docs, report]
end
399
+
400
+ private
401
+
402
# Baseline mapper options; a fresh Hash is built on every call.
# @return [Hash]
def default_options
  defaults = { strict_unknown_keys: false, coercions_enabled: false }
  defaults[:coercion_rules] = {}
  defaults[:max_error_samples] = 5
  defaults
end
410
+
411
# Fresh per-batch accumulator: three sample lists and three counters.
# @return [Hash]
def init_stats
  stats = {}
  %i[missing_required extras_samples invalid_type_samples].each { |key| stats[key] = [] }
  %i[coerced_count total_keys nil_id].each { |key| stats[key] = 0 }
  stats
end
421
+
422
# Fold one mapped document into the batch stats: key count, nil ids,
# required keys that are absent and keys not in the allowed set.
# @param stats [Hash] accumulator from #init_stats (mutated)
# @param hash [Hash] mapped document
def update_stats_for_doc!(stats, hash)
  stats[:total_keys] += hash.size

  has_id_key = hash.key?(:id) || hash.key?('id')
  stats[:nil_id] += 1 if has_id_key && (hash[:id] || hash['id']).nil?

  doc_keys = hash.keys.map(&:to_sym)

  absent = @required_keys - doc_keys
  stats[:missing_required] |= absent.to_a unless absent.empty?

  unknown = doc_keys.to_set - @allowed_keys
  stats[:extras_samples] |= unknown.to_a unless unknown.empty?
end
436
+
437
# Check every document value whose field has a declared type and apply coercions.
#
# Coerced values replace the original in place and are counted; invalid values
# contribute an error sample until max_error_samples is reached.
#
# @param stats [Hash] accumulator (mutated: :coerced_count, :invalid_type_samples)
# @param hash [Hash] mapped document (mutated when a value is coerced)
def validate_and_coerce_types!(stats, hash)
  hash.each do |key, value|
    fname = key.to_s
    expected = @types_by_field[fname]
    next unless expected

    # Allow nil for attributes declared as optional in the model DSL
    next if value.nil? && @__optional_blank_targets__.include?(fname)

    valid, coerced, err = validate_value(expected, value, field: fname)
    # validate_value signals "no coercion" with nil. Testing truthiness instead
    # would discard a legitimately coerced boolean `false` (e.g. from "false"/"0").
    if !coerced.nil?
      stats[:coerced_count] += 1
      hash[key] = coerced
    elsif !valid && stats[:invalid_type_samples].size < @options[:max_error_samples]
      stats[:invalid_type_samples] << err
    end
  end
end
455
+
456
# Write a hidden `<field>_empty` flag for every empty-filtering target. The flag
# is true when the value (looked up by Symbol, then String key) is nil or an
# empty Array.
# @param doc [Hash] mapped document (mutated)
def set_hidden_empty_flags!(doc)
  targets = @__empty_filtering_targets__
  return if targets.empty?

  targets.each do |field|
    current = doc[field.to_sym]
    current = doc[field.to_s] if current.nil?
    blank_array = current.is_a?(Array) && current.empty?
    doc["#{field}_empty".to_sym] = current.nil? || blank_array
  end
end
468
+
469
# Replace empty-String values of optional fields with nil. The Symbol key is
# preferred; the String key is only considered when the Symbol key is absent.
# @param doc [Hash] mapped document (mutated)
def normalize_optional_blank_strings!(doc)
  return if @__optional_blank_targets__.empty?

  @__optional_blank_targets__.each do |field|
    key = [field.to_sym, field].find { |candidate| doc.key?(candidate) }
    next if key.nil?

    current = doc[key]
    doc[key] = nil if current.is_a?(String) && current.empty?
  end
end
487
+
488
# Write a hidden `<field>_blank` flag for every optional target. The flag is true
# when the value (looked up by Symbol, then String key) is nil.
# @param doc [Hash] mapped document (mutated)
def set_hidden_blank_flags!(doc)
  targets = @__optional_blank_targets__
  return if targets.empty?

  targets.each do |field|
    current = doc[field.to_sym]
    current = doc[field.to_s] if current.nil?
    doc["#{field}_blank".to_sym] = current.nil?
  end
end
500
+
501
# Raise when the batch stats recorded any missing required field; the error is
# reported through instrument_error first.
# @param stats [Hash]
# @raise [SearchEngine::Errors::InvalidParams]
def ensure_required_present!(stats)
  absent = stats[:missing_required]
  return if absent.empty?

  message = "Missing required fields: #{absent.sort.inspect} for #{klass.name} mapper."
  instrument_error(error_class: 'SearchEngine::Errors::InvalidParams', message: message)

  error = SearchEngine::Errors::InvalidParams.new(
    message,
    doc: 'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#troubleshooting',
    details: { missing_required: absent.sort }
  )
  raise error
end
512
+
513
# Raise when strict_unknown_keys is enabled and the batch stats recorded keys
# outside the allowed set; the error is reported through instrument_error first.
# @param stats [Hash]
# @raise [SearchEngine::Errors::InvalidField]
def ensure_no_unknowns!(stats)
  extras = stats[:extras_samples]
  return if !@options[:strict_unknown_keys] || extras.empty?

  message = "Unknown fields detected: #{extras.sort.inspect} (set mapper.strict_unknown_keys)."
  instrument_error(error_class: 'SearchEngine::Errors::InvalidField', message: message)

  error = SearchEngine::Errors::InvalidField.new(
    message,
    doc: 'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#troubleshooting',
    details: { extras: extras.sort }
  )
  raise error
end
527
+
528
# Assemble the per-batch report Hash from the accumulated stats. Sample lists
# are truncated to max_error_samples and the duration is rounded to one decimal.
# @param stats [Hash]
# @param docs_size [Integer]
# @param batch_index [Integer, nil]
# @param duration [Numeric]
# @return [Hash]
def build_report(stats, docs_size, batch_index, duration)
  limit = @options[:max_error_samples]
  collection_name = klass.respond_to?(:collection) ? klass.collection : nil

  report = { collection: collection_name, batch_index: batch_index, docs_count: docs_size }
  report[:missing_required] = stats[:missing_required].sort
  report[:extras_sample] = stats[:extras_samples].sort[0, limit]
  report[:invalid_type_sample] = stats[:invalid_type_samples][0, limit]
  %i[coerced_count total_keys nil_id].each { |counter| report[counter] = stats[counter] }
  report[:duration_ms] = duration.round(1)
  report
end
542
+
543
# Required keys are all schema fields except:
# - dotted names such as "retail_prices.current_price", which are excluded from
#   presence checks;
# - attributes whose options Hash has a truthy :optional entry.
# Any error raised while reading attribute_options is treated as "no options".
# @return [Set<Symbol>] frozen set
def compute_required_keys
  attr_opts =
    begin
      @klass.respond_to?(:attribute_options) ? (@klass.attribute_options || {}) : {}
    rescue StandardError
      {}
    end

  required = @schema_fields.map(&:to_sym).reject { |name| name.to_s.include?('.') }.to_set
  attr_opts.each do |name, cfg|
    required.delete(name.to_sym) if cfg.is_a?(Hash) && cfg[:optional]
  end
  required.freeze
end
567
+
568
# Turn the value returned by the map block into a Hash: a Hash is returned
# as-is, otherwise #to_h is preferred over #as_json.
# @param obj [Object]
# @return [Hash]
# @raise [SearchEngine::Errors::InvalidParams] when none of the conversions is available
def normalize_document(obj)
  if obj.is_a?(Hash)
    obj
  elsif obj.respond_to?(:to_h)
    obj.to_h
  elsif obj.respond_to?(:as_json)
    obj.as_json
  else
    raise SearchEngine::Errors::InvalidParams,
          'Mapper map block must return a Hash-like document (Hash/#to_h/#as_json)'
  end
end
576
+
577
# Validate one value against a declared schema type.
#
# Returns a triple `[valid, coerced, info]`: `coerced` is nil when the value is
# kept as-is; `info` is `true` for Time/Date coercions, an error message for the
# 'string' and 'string[]' failure paths, and nil otherwise. Types not listed
# here are accepted without inspection.
#
# @param expected_type [String]
# @param value [Object]
# @param field [String] field name used in error messages
# @return [Array]
def validate_value(expected_type, value, field:)
  case expected_type
  when 'int64', 'int32'
    # Accept Time universally by coercing to epoch seconds
    value.is_a?(Time) ? [true, value.to_i, true] : validate_integer(value, field)
  when 'float' then validate_float(value, field)
  when 'bool' then validate_bool(value, field)
  when 'string'
    # Accept Time/Date/DateTime universally by coercing to ISO8601
    stamp =
      if value.is_a?(Time)
        value.iso8601
      elsif (defined?(DateTime) && value.is_a?(DateTime)) || (defined?(Date) && value.is_a?(Date))
        value.to_time.utc.iso8601
      end

    if stamp
      [true, stamp, true]
    else
      [value.is_a?(String), nil, invalid_type_message(field, 'String', value)]
    end
  when 'string[]'
    all_strings = value.is_a?(Array) && value.all? { |item| item.is_a?(String) }
    all_strings ? [true, nil, nil] : [false, nil, invalid_type_message(field, 'Array<String>', value)]
  else
    # Unknown/opaque type: accept
    [true, nil, nil]
  end
end
609
+
610
# Validate an integer value, optionally coercing decimal strings.
#
# @param value [Object]
# @param field [String] field name used in the error message
# @return [Array] `[true, nil, nil]` for Integers, `[true, Integer, true]` for
#   coerced strings (only when coercions are enabled), otherwise
#   `[false, nil, message]`
def validate_integer(value, field)
  if value.is_a?(Integer)
    [true, nil, nil]
  elsif @options[:coercions_enabled] && string_integer?(value)
    # Force base 10: without a radix, Integer() treats a leading zero as octal,
    # so "08" raises ArgumentError and "010" silently becomes 8.
    [true, Integer(value, 10), true]
  else
    [false, nil, invalid_type_message(field, 'Integer', value)]
  end
end
619
+
620
# Validate a float value, optionally coercing numeric strings.
#
# @param value [Object]
# @param field [String] field name used in the error message
# @return [Array] `[true, nil, nil]` for finite Numerics, `[true, Float, true]`
#   for coerced strings (only when coercions are enabled), otherwise
#   `[false, nil, message]`
def validate_float(value, field)
  return [true, nil, nil] if value.is_a?(Numeric) && finite_number?(value)

  if @options[:coercions_enabled] && string_float?(value)
    parsed =
      begin
        Float(value)
      rescue StandardError
        nil
      end
    return [true, parsed, true] if parsed && finite_number?(parsed)
  end

  [false, nil, invalid_type_message(field, 'Float', value)]
end
634
+
635
# Validate a boolean value, optionally coercing "true"/"false"/"1"/"0"
# (compared case-insensitively on the value's String form).
#
# @param value [Object]
# @param field [String] field name used in the error message
# @return [Array] `[true, nil, nil]` for true/false, `[true, Boolean, true]` for
#   coerced values (only when coercions are enabled), otherwise
#   `[false, nil, message]`
def validate_bool(value, field)
  case value
  when true, false
    return [true, nil, nil]
  end

  if @options[:coercions_enabled]
    token = value.to_s.downcase
    return [true, %w[true 1].include?(token), true] if %w[true false 1 0].include?(token)
  end

  [false, nil, invalid_type_message(field, 'Boolean', value)]
end
644
+
645
# True when +v+ is a String consisting solely of an optionally signed run of
# decimal digits. Anchored with \A/\z so the whole string must match; ^/$ anchor
# per line and would accept multi-line input such as "1\nabc".
# @param v [Object]
# @return [Boolean]
def string_integer?(v)
  v.is_a?(String) && v.match?(/\A[-+]?\d+\z/)
end
648
+
649
# Whether +v+ is a String holding an optionally signed decimal number.
#
# Accepted shapes are "12", "12.5" and ".5" (each with an optional leading sign).
# At least one digit is required, so "", "+" and "-" are rejected, and the
# pattern is anchored with \A and \z so multi-line input cannot match on a
# single line.
#
# @param v [Object]
# @return [Boolean]
def string_float?(v)
  v.is_a?(String) && v.match?(/\A[-+]?(?:\d+(?:\.\d+)?|\.\d+)\z/)
end
652
+
653
# Whether +v+ is usable as a finite number.
#
# Only Float instances are inspected (NaN and the infinities are rejected);
# every other value is reported as finite.
#
# @param v [Object]
# @return [Boolean]
def finite_number?(v)
  !v.is_a?(Float) || v.finite?
end
658
+
659
# Build the error message for a value whose type does not match its field.
#
# The message names the field, the expected type, the class of the offending
# value and a preview of the value limited to its first 50 characters.
#
# @param field [String, Symbol] field name
# @param expected [String] human-readable expected type
# @param got [Object] the offending value
# @return [String]
def invalid_type_message(field, expected, got)
  got_class = got.nil? ? 'NilClass' : got.class.name
  text = got.is_a?(String) ? got : got.to_s
  got_preview = text[0, 50]
  "Invalid type for field :#{field} (expected #{expected}, got #{got_class}: \"#{got_preview}\")."
end
664
+
665
# Collect the declared attributes that have +:empty_filtering+ enabled.
#
# Attribute options are read from +@klass.attribute_options+ when the class
# responds to it; any StandardError raised while reading them is treated as
# "no options". An attribute is kept only when its "<name>_empty" companion is a
# key of +@types_by_field+ or, as a Symbol, a member of +@required_keys+.
#
# @return [Array<String>] frozen list of base field names
def compute_empty_filtering_targets
  declared =
    begin
      (@klass.attribute_options if @klass.respond_to?(:attribute_options)) || {}
    rescue StandardError
      {}
    end

  selected = declared.each_with_object([]) do |(attr_name, attr_opts), names|
    next unless attr_opts.is_a?(Hash) && attr_opts[:empty_filtering]

    companion = "#{attr_name}_empty"
    next unless @types_by_field.key?(companion) || @required_keys.include?(companion.to_sym)

    names << attr_name.to_s
  end
  selected.freeze
end
682
+
683
# Collect the declared attributes that have +:optional+ enabled.
#
# Attribute options are read from +@klass.attribute_options+ when the class
# responds to it; any StandardError raised while reading them is treated as
# "no options". An attribute is kept only when its "<name>_blank" companion is a
# key of +@types_by_field+ or, as a Symbol, a member of +@required_keys+.
#
# @return [Array<String>] frozen list of base field names
def compute_optional_blank_targets
  declared =
    begin
      (@klass.attribute_options if @klass.respond_to?(:attribute_options)) || {}
    rescue StandardError
      {}
    end

  selected = declared.each_with_object([]) do |(attr_name, attr_opts), names|
    next unless attr_opts.is_a?(Hash) && attr_opts[:optional]

    companion = "#{attr_name}_blank"
    next unless @types_by_field.key?(companion) || @required_keys.include?(companion.to_sym)

    names << attr_name.to_s
  end
  selected.freeze
end
700
+
701
# Emit the 'search_engine.mapper.batch_mapped' event with per-batch counters.
#
# Does nothing (returns nil) when ActiveSupport::Notifications is not defined.
# The payload carries the collection reported by +klass+ (nil when +klass+ does
# not respond to +collection+) and the duration rounded to one decimal place.
#
# @param batch_index [Integer]
# @param docs_count [Integer]
# @param duration_ms [Numeric]
# @param missing_required_count [Integer]
# @param extra_keys_count [Integer]
# @param invalid_type_count [Integer]
# @param coerced_count [Integer]
# @return [Object, nil] whatever the instrumentation call returns
def instrument_batch_mapped(batch_index:, docs_count:, duration_ms:,
                            missing_required_count:, extra_keys_count:,
                            invalid_type_count:, coerced_count:)
  return unless defined?(ActiveSupport::Notifications)

  collection_name = klass.collection if klass.respond_to?(:collection)
  event_payload = {
    collection: collection_name,
    batch_index: batch_index,
    docs_count: docs_count,
    duration_ms: duration_ms.round(1),
    missing_required_count: missing_required_count,
    extra_keys_count: extra_keys_count,
    invalid_type_count: invalid_type_count,
    coerced_count: coerced_count
  }
  SearchEngine::Instrumentation.instrument('search_engine.mapper.batch_mapped', event_payload) {}
end
718
+
719
# Emit the 'search_engine.mapper.error' event.
#
# Does nothing (returns nil) when ActiveSupport::Notifications is not defined.
# The payload carries the collection reported by +klass+ (nil when +klass+ does
# not respond to +collection+), the error class, and the message converted to a
# String and limited to its first 200 characters.
#
# @param error_class [Object] error class (or its name) to report
# @param message [Object] error message
# @return [Object, nil] whatever the instrumentation call returns
def instrument_error(error_class:, message:)
  return unless defined?(ActiveSupport::Notifications)

  collection_name = klass.collection if klass.respond_to?(:collection)
  event_payload = {
    collection: collection_name,
    error_class: error_class,
    message: message.to_s.slice(0, 200)
  }
  SearchEngine::Instrumentation.instrument('search_engine.mapper.error', event_payload) {}
end
729
+
730
# Current reading of the instrumentation clock.
#
# Thin delegate to +SearchEngine::Instrumentation.monotonic_ms+, kept as a
# local method so callers inside this class need not name the module.
#
# @return [Object] the value returned by the instrumentation module
def monotonic_ms
  SearchEngine::Instrumentation.monotonic_ms
end
733
+ end
734
+
735
class << self
  # Look up (and memoize) the compiled mapper for a model class.
  #
  # Returns nil when the class carries no mapper DSL or when the DSL's +:map+
  # entry does not respond to +call+.
  #
  # @param klass [Class]
  # @return [SearchEngine::Mapper::Compiled, nil]
  def for(klass)
    dsl = mapper_dsl_for(klass)
    return unless dsl
    return unless dsl[:map].respond_to?(:call)

    cache[klass] ||= compile(klass, dsl)
  end

  private

  # Memo of compiled mappers keyed by model class.
  def cache
    @cache ||= {}
  end

  # Build a Compiled mapper from the class schema, the DSL map proc and the
  # mapper section of the global configuration.
  #
  # Missing configuration falls back to: unknown keys not strict, coercions
  # disabled, no coercion rules, and 5 error samples.
  def compile(klass, dsl)
    field_defs = Array(SearchEngine::Schema.compile(klass)[:fields])
    field_names = field_defs.map { |f| f[:name].to_s }
    field_types = field_defs.each_with_object({}) do |f, acc|
      acc[f[:name].to_s] = f[:type].to_s
    end

    mapper_cfg = SearchEngine.config&.mapper
    coercions = mapper_cfg&.coercions || {}

    strict_keys = mapper_cfg&.strict_unknown_keys ? true : false
    coerce = coercions[:enabled] ? true : false
    rules = coercions[:rules]
    rules = {} unless rules.is_a?(Hash)
    # A missing, zero or negative setting falls back to the default of 5.
    sample_cap = mapper_cfg ? mapper_cfg.max_error_samples.to_i : 0
    sample_cap = 5 unless sample_cap.positive?

    Compiled.new(
      klass: klass,
      map_proc: dsl[:map],
      schema_fields: field_names,
      types_by_field: field_types,
      options: {
        strict_unknown_keys: strict_keys,
        coercions_enabled: coerce,
        coercion_rules: rules,
        max_error_samples: sample_cap
      }
    )
  end

  # Read the mapper DSL stored on the class in +@__mapper_dsl__+, or nil when
  # that instance variable has never been set.
  def mapper_dsl_for(klass)
    klass.instance_variable_get(:@__mapper_dsl__) if klass.instance_variable_defined?(:@__mapper_dsl__)
  end
end
784
+ end
785
+ end