search-engine-for-typesense 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +148 -0
  4. data/app/search_engine/search_engine/app_info.rb +11 -0
  5. data/app/search_engine/search_engine/index_partition_job.rb +170 -0
  6. data/lib/generators/search_engine/install/install_generator.rb +20 -0
  7. data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
  8. data/lib/generators/search_engine/model/model_generator.rb +86 -0
  9. data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
  10. data/lib/search-engine-for-typesense.rb +12 -0
  11. data/lib/search_engine/active_record_syncable.rb +247 -0
  12. data/lib/search_engine/admin/stopwords.rb +125 -0
  13. data/lib/search_engine/admin/synonyms.rb +125 -0
  14. data/lib/search_engine/admin.rb +12 -0
  15. data/lib/search_engine/ast/and.rb +52 -0
  16. data/lib/search_engine/ast/binary_op.rb +75 -0
  17. data/lib/search_engine/ast/eq.rb +19 -0
  18. data/lib/search_engine/ast/group.rb +18 -0
  19. data/lib/search_engine/ast/gt.rb +12 -0
  20. data/lib/search_engine/ast/gte.rb +12 -0
  21. data/lib/search_engine/ast/in.rb +28 -0
  22. data/lib/search_engine/ast/lt.rb +12 -0
  23. data/lib/search_engine/ast/lte.rb +12 -0
  24. data/lib/search_engine/ast/matches.rb +55 -0
  25. data/lib/search_engine/ast/node.rb +176 -0
  26. data/lib/search_engine/ast/not_eq.rb +13 -0
  27. data/lib/search_engine/ast/not_in.rb +24 -0
  28. data/lib/search_engine/ast/or.rb +52 -0
  29. data/lib/search_engine/ast/prefix.rb +51 -0
  30. data/lib/search_engine/ast/raw.rb +41 -0
  31. data/lib/search_engine/ast/unary_op.rb +43 -0
  32. data/lib/search_engine/ast.rb +101 -0
  33. data/lib/search_engine/base/creation.rb +727 -0
  34. data/lib/search_engine/base/deletion.rb +80 -0
  35. data/lib/search_engine/base/display_coercions.rb +36 -0
  36. data/lib/search_engine/base/hydration.rb +312 -0
  37. data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
  38. data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
  39. data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
  40. data/lib/search_engine/base/index_maintenance.rb +459 -0
  41. data/lib/search_engine/base/indexing_dsl.rb +255 -0
  42. data/lib/search_engine/base/joins.rb +479 -0
  43. data/lib/search_engine/base/model_dsl.rb +472 -0
  44. data/lib/search_engine/base/presets.rb +43 -0
  45. data/lib/search_engine/base/pretty_printer.rb +315 -0
  46. data/lib/search_engine/base/relation_delegation.rb +42 -0
  47. data/lib/search_engine/base/scopes.rb +113 -0
  48. data/lib/search_engine/base/updating.rb +92 -0
  49. data/lib/search_engine/base.rb +38 -0
  50. data/lib/search_engine/bulk.rb +284 -0
  51. data/lib/search_engine/cache.rb +33 -0
  52. data/lib/search_engine/cascade.rb +531 -0
  53. data/lib/search_engine/cli/doctor.rb +631 -0
  54. data/lib/search_engine/cli/support.rb +217 -0
  55. data/lib/search_engine/cli.rb +222 -0
  56. data/lib/search_engine/client/http_adapter.rb +63 -0
  57. data/lib/search_engine/client/request_builder.rb +92 -0
  58. data/lib/search_engine/client/services/base.rb +74 -0
  59. data/lib/search_engine/client/services/collections.rb +161 -0
  60. data/lib/search_engine/client/services/documents.rb +214 -0
  61. data/lib/search_engine/client/services/operations.rb +152 -0
  62. data/lib/search_engine/client/services/search.rb +190 -0
  63. data/lib/search_engine/client/services.rb +29 -0
  64. data/lib/search_engine/client.rb +765 -0
  65. data/lib/search_engine/client_options.rb +20 -0
  66. data/lib/search_engine/collection_resolver.rb +191 -0
  67. data/lib/search_engine/collections_graph.rb +330 -0
  68. data/lib/search_engine/compiled_params.rb +143 -0
  69. data/lib/search_engine/compiler.rb +383 -0
  70. data/lib/search_engine/config/observability.rb +27 -0
  71. data/lib/search_engine/config/presets.rb +92 -0
  72. data/lib/search_engine/config/selection.rb +16 -0
  73. data/lib/search_engine/config/typesense.rb +48 -0
  74. data/lib/search_engine/config/validators.rb +97 -0
  75. data/lib/search_engine/config.rb +917 -0
  76. data/lib/search_engine/console_helpers.rb +130 -0
  77. data/lib/search_engine/deletion.rb +103 -0
  78. data/lib/search_engine/dispatcher.rb +125 -0
  79. data/lib/search_engine/dsl/parser.rb +582 -0
  80. data/lib/search_engine/engine.rb +167 -0
  81. data/lib/search_engine/errors.rb +290 -0
  82. data/lib/search_engine/filters/sanitizer.rb +189 -0
  83. data/lib/search_engine/hydration/materializers.rb +808 -0
  84. data/lib/search_engine/hydration/selection_context.rb +96 -0
  85. data/lib/search_engine/indexer/batch_planner.rb +76 -0
  86. data/lib/search_engine/indexer/bulk_import.rb +626 -0
  87. data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
  88. data/lib/search_engine/indexer/retry_policy.rb +103 -0
  89. data/lib/search_engine/indexer.rb +747 -0
  90. data/lib/search_engine/instrumentation.rb +308 -0
  91. data/lib/search_engine/joins/guard.rb +202 -0
  92. data/lib/search_engine/joins/resolver.rb +95 -0
  93. data/lib/search_engine/logging/color.rb +78 -0
  94. data/lib/search_engine/logging/format_helpers.rb +92 -0
  95. data/lib/search_engine/logging/partition_progress.rb +53 -0
  96. data/lib/search_engine/logging_subscriber.rb +388 -0
  97. data/lib/search_engine/mapper.rb +785 -0
  98. data/lib/search_engine/multi.rb +286 -0
  99. data/lib/search_engine/multi_result.rb +186 -0
  100. data/lib/search_engine/notifications/compact_logger.rb +675 -0
  101. data/lib/search_engine/observability.rb +162 -0
  102. data/lib/search_engine/operations.rb +58 -0
  103. data/lib/search_engine/otel.rb +227 -0
  104. data/lib/search_engine/partitioner.rb +128 -0
  105. data/lib/search_engine/ranking_plan.rb +118 -0
  106. data/lib/search_engine/registry.rb +158 -0
  107. data/lib/search_engine/relation/compiler.rb +711 -0
  108. data/lib/search_engine/relation/deletion.rb +37 -0
  109. data/lib/search_engine/relation/dsl/filters.rb +624 -0
  110. data/lib/search_engine/relation/dsl/selection.rb +240 -0
  111. data/lib/search_engine/relation/dsl.rb +903 -0
  112. data/lib/search_engine/relation/dx/dry_run.rb +59 -0
  113. data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
  114. data/lib/search_engine/relation/dx.rb +231 -0
  115. data/lib/search_engine/relation/materializers.rb +118 -0
  116. data/lib/search_engine/relation/options.rb +138 -0
  117. data/lib/search_engine/relation/state.rb +274 -0
  118. data/lib/search_engine/relation/updating.rb +44 -0
  119. data/lib/search_engine/relation.rb +623 -0
  120. data/lib/search_engine/result.rb +664 -0
  121. data/lib/search_engine/schema.rb +1083 -0
  122. data/lib/search_engine/sources/active_record_source.rb +185 -0
  123. data/lib/search_engine/sources/base.rb +62 -0
  124. data/lib/search_engine/sources/lambda_source.rb +55 -0
  125. data/lib/search_engine/sources/sql_source.rb +196 -0
  126. data/lib/search_engine/sources.rb +71 -0
  127. data/lib/search_engine/stale_rules.rb +160 -0
  128. data/lib/search_engine/test/minitest_assertions.rb +57 -0
  129. data/lib/search_engine/test/offline_client.rb +134 -0
  130. data/lib/search_engine/test/rspec_matchers.rb +77 -0
  131. data/lib/search_engine/test/stub_client.rb +201 -0
  132. data/lib/search_engine/test.rb +66 -0
  133. data/lib/search_engine/test_autoload.rb +8 -0
  134. data/lib/search_engine/update.rb +35 -0
  135. data/lib/search_engine/version.rb +7 -0
  136. data/lib/search_engine.rb +332 -0
  137. data/lib/tasks/search_engine.rake +501 -0
  138. data/lib/tasks/search_engine_doctor.rake +16 -0
  139. metadata +225 -0
@@ -0,0 +1,198 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module SearchEngine
6
+ class Indexer
7
+ # Orchestrates a single batch import attempt and retry loop.
8
+ #
9
+ # Emits the same instrumentation as the monolithic Indexer, preserves
10
+ # :upsert default action, and supports dry-run mode (serialize only).
11
+ #
12
+ # @since M8
13
class ImportDispatcher
  # Import one JSONL batch, retrying according to +retry_policy+.
  #
  # Each attempt is delegated to +perform_attempt+. When an attempt raises,
  # the policy decides whether to sleep and try again or to re-raise the
  # error unchanged (so callers can still map e.g. HTTP 413 to a batch split).
  # In dry-run mode no client call is made; a synthetic all-success result
  # is returned instead.
  #
  # @param client [SearchEngine::Client]
  # @param collection [String] physical collection name to import into
  # @param action [Symbol, String] one of :upsert, :create, :update
  # @param jsonl [String] JSONL body
  # @param docs_count [Integer] number of docs encoded in jsonl
  # @param bytes_sent [Integer] payload size in bytes
  # @param batch_index [Integer, nil] batch sequence number
  # @param retry_policy [SearchEngine::Indexer::RetryPolicy]
  # @param dry_run [Boolean] when true, do not perform network call
  # @return [Hash] stats payload: { index:, docs_count:, success_count:, failure_count:, attempts:, http_status:, duration_ms:, bytes_sent:, errors_sample: [] }
  # @raise [SearchEngine::Errors::Api] when the underlying client raises an API error (propagated)
  # @raise [StandardError] any other error the retry policy declines to retry (propagated)
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer
  def self.import_batch(client:, collection:, action:, jsonl:, docs_count:, bytes_sent:, batch_index:,
                        retry_policy:, dry_run: false)
    return dry_run_stats(collection, batch_index, docs_count, bytes_sent) if dry_run

    attempt = 1
    # begin/retry is used instead of Kernel#loop on purpose: Kernel#loop
    # silently rescues StopIteration, which would turn such an error into a
    # bogus return value instead of propagating it to the caller.
    begin
      perform_attempt(client, collection, action, jsonl, docs_count, bytes_sent, batch_index, attempt)
    rescue StandardError => error
      # Non-retryable (or attempts exhausted): re-raise the original error.
      raise unless retry_policy.retry?(attempt, error)

      delay = retry_policy.next_delay(attempt, error)
      sleep(delay) if delay.positive?
      attempt += 1
      retry
    end
  end

  class << self
    private

    # Build the dry-run result: every document counts as imported, one
    # attempt, HTTP 200, zero duration. Emits the batch_import event when
    # ActiveSupport::Notifications is loaded so subscribers see parity.
    def dry_run_stats(collection, idx, docs_count, bytes_sent)
      http_status = 200
      if defined?(ActiveSupport::Notifications)
        payload = base_payload(collection, idx, docs_count, bytes_sent)
        SearchEngine::Instrumentation.instrument('search_engine.indexer.batch_import', payload) do |ctx|
          ctx[:success_count] = docs_count
          ctx[:failure_count] = 0
          ctx[:http_status] = http_status
        end
      end
      build_stats(idx, docs_count, docs_count, 0, 1, http_status, 0.0, bytes_sent, [])
    end

    # Run a single import call and summarize the response.
    #
    # The client call is wrapped in the batch_import instrumentation event
    # when ActiveSupport::Notifications is loaded; otherwise it is invoked
    # directly. Errors raised by the client propagate to the caller.
    #
    # @return [Hash] stats payload (see build_stats)
    def perform_attempt(client, collection, action, jsonl, docs_count, bytes_sent, idx, attempt)
      start = monotonic_ms
      success_count = 0
      failure_count = 0
      # A call that returns (rather than raises) is reported as HTTP 200.
      http_status = 200
      error_sample = []

      if defined?(ActiveSupport::Notifications)
        se_payload = base_payload(collection, idx, docs_count, bytes_sent).merge(attempts: attempt,
                                                                                 http_status: nil)
        SearchEngine::Instrumentation.instrument('search_engine.indexer.batch_import', se_payload) do |ctx|
          raw = client.import_documents(collection: collection, jsonl: jsonl, action: action)
          success_count, failure_count, error_sample = parse_import_response(raw)
          ctx[:success_count] = success_count
          ctx[:failure_count] = failure_count
          ctx[:http_status] = http_status
        end
      else
        raw = client.import_documents(collection: collection, jsonl: jsonl, action: action)
        success_count, failure_count, error_sample = parse_import_response(raw)
      end

      duration = monotonic_ms - start
      build_stats(idx, docs_count, success_count, failure_count, attempt, http_status, duration, bytes_sent,
                  error_sample)
    end

    # Initial instrumentation payload for one batch; outcome fields start
    # as nil and are filled in by the instrumented block.
    def base_payload(collection, idx, docs_count, bytes_sent)
      {
        collection: SearchEngine::Instrumentation.context[:collection] || collection,
        into: collection,
        batch_index: idx,
        docs_count: docs_count,
        success_count: nil,
        failure_count: nil,
        attempts: nil,
        http_status: nil,
        bytes_sent: bytes_sent,
        transient_retry: false,
        retry_after_s: nil,
        error_sample: nil
      }
    end

    # Assemble the stats Hash returned to callers. Duration is rounded to
    # one decimal and at most five error samples are kept.
    def build_stats(idx, docs_count, success_count, failure_count, attempts, http_status, duration_ms, bytes_sent,
                    error_sample)
      {
        index: idx,
        docs_count: docs_count,
        success_count: success_count,
        failure_count: failure_count,
        attempts: attempts,
        http_status: http_status,
        duration_ms: duration_ms.round(1),
        bytes_sent: bytes_sent,
        errors_sample: Array(error_sample)[0, 5]
      }
    end

    # Summarize a raw import response.
    #
    # @param raw [String, Array, Object] JSONL text, or an Array of entries
    # @return [Array(Integer, Integer, Array<String>)] success count,
    #   failure count, and up to five error messages; any other response
    #   type yields [0, 0, []]
    def parse_import_response(raw)
      return parse_from_string(raw) if raw.is_a?(String)
      return parse_from_array(raw) if raw.is_a?(Array)

      [0, 0, []]
    end

    # Count outcomes in a JSONL response, one entry per non-blank line.
    # Lines that fail to parse (or parse to null/false) are failures with
    # the sample 'invalid-json-line'. Lines that parse to a non-object
    # value are failures without a sample, matching parse_from_array.
    def parse_from_string(str)
      success = 0
      failure = 0
      samples = []
      str.each_line do |line|
        line = line.strip
        next if line.empty?

        entry = safe_parse_json(line)
        unless entry
          failure += 1
          samples << 'invalid-json-line'
          next
        end

        if successful_entry?(entry)
          success += 1
        else
          failure += 1
          collect_error_sample(entry, samples)
        end
      end
      [success, failure, samples[0, 5]]
    end

    # Count outcomes in an already-decoded response. Non-Hash elements are
    # counted as failures without an error sample.
    def parse_from_array(arr)
      success = 0
      failure = 0
      samples = []
      arr.each do |entry|
        if successful_entry?(entry)
          success += 1
        else
          failure += 1
          collect_error_sample(entry, samples)
        end
      end
      [success, failure, samples[0, 5]]
    end

    # True when +entry+ is a Hash whose "success" value (String or Symbol
    # key) is truthy.
    def successful_entry?(entry)
      entry.is_a?(Hash) && truthy?(entry['success'] || entry[:success])
    end

    # Append the entry's error (or message) text, capped at 200 characters,
    # to +samples+. Does nothing for non-Hash entries or when neither key
    # is present.
    def collect_error_sample(entry, samples)
      return unless entry.is_a?(Hash)

      msg = entry['error'] || entry[:error] || entry['message'] || entry[:message]
      samples << msg.to_s[0, 200] if msg
    end

    # Parse one JSON line; returns nil instead of raising on any error
    # (deliberate best-effort: a bad line is counted, not fatal).
    def safe_parse_json(line)
      JSON.parse(line)
    rescue StandardError
      nil
    end

    # Accepts boolean true as well as any value whose string form is
    # "true" (case-insensitive).
    def truthy?(val)
      val == true || val.to_s.downcase == 'true'
    end

    # Clock reading in milliseconds, delegated to Instrumentation.
    def monotonic_ms
      SearchEngine::Instrumentation.monotonic_ms
    end
  end
end
197
+ end
198
+ end
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SearchEngine
4
+ class Indexer
5
+ # Encapsulates retryability rules and exponential backoff with jitter
6
+ # for import attempts. Pure and deterministic except for jitter.
7
+ #
8
+ # Usage:
9
+ # policy = SearchEngine::Indexer::RetryPolicy.from_config(SearchEngine.config.indexer.retries)
10
+ # attempt = 1
11
+ # begin
12
+ # # perform
13
+ # rescue => error
14
+ # if policy.retry?(attempt, error)
15
+ # sleep(policy.next_delay(attempt, error))
16
+ # attempt += 1
17
+ # retry
18
+ # else
19
+ # raise
20
+ # end
21
+ # end
22
+ #
23
+ # @since M8
24
class RetryPolicy
  # @return [Integer] maximum number of attempts (including the first)
  attr_reader :attempts

  # @param attempts [Integer] maximum number of attempts (including the first)
  # @param base [Float] delay in seconds before the second attempt
  # @param max [Float] cap in seconds applied to the exponential delay (before jitter)
  # @param jitter_fraction [Float] jitter amplitude as a fraction of the capped delay
  # @raise [ArgumentError, TypeError] when +attempts+ cannot be converted by Integer()
  def initialize(attempts:, base:, max:, jitter_fraction:)
    @attempts = Integer(attempts)
    @base = base.to_f
    @max = max.to_f
    @jitter_fraction = jitter_fraction.to_f
  end

  # Build a policy from a config-like Hash.
  #
  # Missing or non-positive :attempts, :base and :max fall back to 3, 0.5
  # and 5.0. A missing or negative :jitter_fraction falls back to 0.2; an
  # explicit 0 is honored and disables jitter.
  #
  # @param cfg [Hash, nil]
  # @return [RetryPolicy]
  # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer
  def self.from_config(cfg)
    c = cfg || {}
    jitter = c[:jitter_fraction]
    new(
      attempts: (c[:attempts]&.to_i&.positive? ? c[:attempts].to_i : 3),
      base: (c[:base]&.to_f&.positive? ? c[:base].to_f : 0.5),
      max: (c[:max]&.to_f&.positive? ? c[:max].to_f : 5.0),
      # nil.to_f is 0.0, so nil must be checked explicitly or the 0.2
      # default would never apply to a missing key.
      jitter_fraction: (!jitter.nil? && jitter.to_f >= 0 ? jitter.to_f : 0.2)
    )
  end

  # Whether we should retry for a given attempt and error.
  #
  # Timeouts and connection errors are always retryable; API errors are
  # retryable only for HTTP 429 and 5xx. Nothing is retried once +attempt+
  # has reached the configured number of attempts.
  #
  # @param attempt [Integer] 1-based attempt index
  # @param error [Exception]
  # @return [Boolean]
  def retry?(attempt, error)
    return false if attempt >= @attempts

    case error
    when SearchEngine::Errors::Timeout, SearchEngine::Errors::Connection
      true
    when SearchEngine::Errors::Api
      transient_status?(error.status.to_i)
    else
      false
    end
  end

  # Compute the delay in seconds before the next attempt.
  #
  # The delay is base * 2^(attempt - 1), capped at max, then shifted by a
  # random amount within +/- (capped delay * jitter_fraction). The result
  # is never negative.
  #
  # @param attempt [Integer] 1-based attempt index
  # @param _error [Exception] the error that triggered the retry (unused)
  # @return [Float]
  def next_delay(attempt, _error)
    # Exponential backoff with bounded jitter
    exp = [@base * (2**(attempt - 1)), @max].min
    jitter = exp * @jitter_fraction
    delta = random_in_range(-jitter..jitter)
    sleep_time = exp + delta
    sleep_time.positive? ? sleep_time : 0.0
  end

  private

  # HTTP statuses considered transient: 429 and the 5xx range.
  def transient_status?(code)
    return true if code == 429
    return true if code >= 500 && code <= 599

    false
  end

  # Random Float between the range's begin and end, drawn from an RNG
  # cached on the current thread.
  def random_in_range(range)
    # Use a thread-local RNG for low contention and testability
    rng = (Thread.current[:__se_retry_rng__] ||= Random.new)
    min = range.begin.to_f
    max = range.end.to_f
    # Only the endpoints are used, so inclusive and exclusive ranges are
    # treated alike.
    span = max - min
    min + (rng.rand * span)
  end
end
102
+ end
103
+ end