search-engine-for-typesense 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +148 -0
  4. data/app/search_engine/search_engine/app_info.rb +11 -0
  5. data/app/search_engine/search_engine/index_partition_job.rb +170 -0
  6. data/lib/generators/search_engine/install/install_generator.rb +20 -0
  7. data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
  8. data/lib/generators/search_engine/model/model_generator.rb +86 -0
  9. data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
  10. data/lib/search-engine-for-typesense.rb +12 -0
  11. data/lib/search_engine/active_record_syncable.rb +247 -0
  12. data/lib/search_engine/admin/stopwords.rb +125 -0
  13. data/lib/search_engine/admin/synonyms.rb +125 -0
  14. data/lib/search_engine/admin.rb +12 -0
  15. data/lib/search_engine/ast/and.rb +52 -0
  16. data/lib/search_engine/ast/binary_op.rb +75 -0
  17. data/lib/search_engine/ast/eq.rb +19 -0
  18. data/lib/search_engine/ast/group.rb +18 -0
  19. data/lib/search_engine/ast/gt.rb +12 -0
  20. data/lib/search_engine/ast/gte.rb +12 -0
  21. data/lib/search_engine/ast/in.rb +28 -0
  22. data/lib/search_engine/ast/lt.rb +12 -0
  23. data/lib/search_engine/ast/lte.rb +12 -0
  24. data/lib/search_engine/ast/matches.rb +55 -0
  25. data/lib/search_engine/ast/node.rb +176 -0
  26. data/lib/search_engine/ast/not_eq.rb +13 -0
  27. data/lib/search_engine/ast/not_in.rb +24 -0
  28. data/lib/search_engine/ast/or.rb +52 -0
  29. data/lib/search_engine/ast/prefix.rb +51 -0
  30. data/lib/search_engine/ast/raw.rb +41 -0
  31. data/lib/search_engine/ast/unary_op.rb +43 -0
  32. data/lib/search_engine/ast.rb +101 -0
  33. data/lib/search_engine/base/creation.rb +727 -0
  34. data/lib/search_engine/base/deletion.rb +80 -0
  35. data/lib/search_engine/base/display_coercions.rb +36 -0
  36. data/lib/search_engine/base/hydration.rb +312 -0
  37. data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
  38. data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
  39. data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
  40. data/lib/search_engine/base/index_maintenance.rb +459 -0
  41. data/lib/search_engine/base/indexing_dsl.rb +255 -0
  42. data/lib/search_engine/base/joins.rb +479 -0
  43. data/lib/search_engine/base/model_dsl.rb +472 -0
  44. data/lib/search_engine/base/presets.rb +43 -0
  45. data/lib/search_engine/base/pretty_printer.rb +315 -0
  46. data/lib/search_engine/base/relation_delegation.rb +42 -0
  47. data/lib/search_engine/base/scopes.rb +113 -0
  48. data/lib/search_engine/base/updating.rb +92 -0
  49. data/lib/search_engine/base.rb +38 -0
  50. data/lib/search_engine/bulk.rb +284 -0
  51. data/lib/search_engine/cache.rb +33 -0
  52. data/lib/search_engine/cascade.rb +531 -0
  53. data/lib/search_engine/cli/doctor.rb +631 -0
  54. data/lib/search_engine/cli/support.rb +217 -0
  55. data/lib/search_engine/cli.rb +222 -0
  56. data/lib/search_engine/client/http_adapter.rb +63 -0
  57. data/lib/search_engine/client/request_builder.rb +92 -0
  58. data/lib/search_engine/client/services/base.rb +74 -0
  59. data/lib/search_engine/client/services/collections.rb +161 -0
  60. data/lib/search_engine/client/services/documents.rb +214 -0
  61. data/lib/search_engine/client/services/operations.rb +152 -0
  62. data/lib/search_engine/client/services/search.rb +190 -0
  63. data/lib/search_engine/client/services.rb +29 -0
  64. data/lib/search_engine/client.rb +765 -0
  65. data/lib/search_engine/client_options.rb +20 -0
  66. data/lib/search_engine/collection_resolver.rb +191 -0
  67. data/lib/search_engine/collections_graph.rb +330 -0
  68. data/lib/search_engine/compiled_params.rb +143 -0
  69. data/lib/search_engine/compiler.rb +383 -0
  70. data/lib/search_engine/config/observability.rb +27 -0
  71. data/lib/search_engine/config/presets.rb +92 -0
  72. data/lib/search_engine/config/selection.rb +16 -0
  73. data/lib/search_engine/config/typesense.rb +48 -0
  74. data/lib/search_engine/config/validators.rb +97 -0
  75. data/lib/search_engine/config.rb +917 -0
  76. data/lib/search_engine/console_helpers.rb +130 -0
  77. data/lib/search_engine/deletion.rb +103 -0
  78. data/lib/search_engine/dispatcher.rb +125 -0
  79. data/lib/search_engine/dsl/parser.rb +582 -0
  80. data/lib/search_engine/engine.rb +167 -0
  81. data/lib/search_engine/errors.rb +290 -0
  82. data/lib/search_engine/filters/sanitizer.rb +189 -0
  83. data/lib/search_engine/hydration/materializers.rb +808 -0
  84. data/lib/search_engine/hydration/selection_context.rb +96 -0
  85. data/lib/search_engine/indexer/batch_planner.rb +76 -0
  86. data/lib/search_engine/indexer/bulk_import.rb +626 -0
  87. data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
  88. data/lib/search_engine/indexer/retry_policy.rb +103 -0
  89. data/lib/search_engine/indexer.rb +747 -0
  90. data/lib/search_engine/instrumentation.rb +308 -0
  91. data/lib/search_engine/joins/guard.rb +202 -0
  92. data/lib/search_engine/joins/resolver.rb +95 -0
  93. data/lib/search_engine/logging/color.rb +78 -0
  94. data/lib/search_engine/logging/format_helpers.rb +92 -0
  95. data/lib/search_engine/logging/partition_progress.rb +53 -0
  96. data/lib/search_engine/logging_subscriber.rb +388 -0
  97. data/lib/search_engine/mapper.rb +785 -0
  98. data/lib/search_engine/multi.rb +286 -0
  99. data/lib/search_engine/multi_result.rb +186 -0
  100. data/lib/search_engine/notifications/compact_logger.rb +675 -0
  101. data/lib/search_engine/observability.rb +162 -0
  102. data/lib/search_engine/operations.rb +58 -0
  103. data/lib/search_engine/otel.rb +227 -0
  104. data/lib/search_engine/partitioner.rb +128 -0
  105. data/lib/search_engine/ranking_plan.rb +118 -0
  106. data/lib/search_engine/registry.rb +158 -0
  107. data/lib/search_engine/relation/compiler.rb +711 -0
  108. data/lib/search_engine/relation/deletion.rb +37 -0
  109. data/lib/search_engine/relation/dsl/filters.rb +624 -0
  110. data/lib/search_engine/relation/dsl/selection.rb +240 -0
  111. data/lib/search_engine/relation/dsl.rb +903 -0
  112. data/lib/search_engine/relation/dx/dry_run.rb +59 -0
  113. data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
  114. data/lib/search_engine/relation/dx.rb +231 -0
  115. data/lib/search_engine/relation/materializers.rb +118 -0
  116. data/lib/search_engine/relation/options.rb +138 -0
  117. data/lib/search_engine/relation/state.rb +274 -0
  118. data/lib/search_engine/relation/updating.rb +44 -0
  119. data/lib/search_engine/relation.rb +623 -0
  120. data/lib/search_engine/result.rb +664 -0
  121. data/lib/search_engine/schema.rb +1083 -0
  122. data/lib/search_engine/sources/active_record_source.rb +185 -0
  123. data/lib/search_engine/sources/base.rb +62 -0
  124. data/lib/search_engine/sources/lambda_source.rb +55 -0
  125. data/lib/search_engine/sources/sql_source.rb +196 -0
  126. data/lib/search_engine/sources.rb +71 -0
  127. data/lib/search_engine/stale_rules.rb +160 -0
  128. data/lib/search_engine/test/minitest_assertions.rb +57 -0
  129. data/lib/search_engine/test/offline_client.rb +134 -0
  130. data/lib/search_engine/test/rspec_matchers.rb +77 -0
  131. data/lib/search_engine/test/stub_client.rb +201 -0
  132. data/lib/search_engine/test.rb +66 -0
  133. data/lib/search_engine/test_autoload.rb +8 -0
  134. data/lib/search_engine/update.rb +35 -0
  135. data/lib/search_engine/version.rb +7 -0
  136. data/lib/search_engine.rb +332 -0
  137. data/lib/tasks/search_engine.rake +501 -0
  138. data/lib/tasks/search_engine_doctor.rake +16 -0
  139. metadata +225 -0
@@ -0,0 +1,185 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SearchEngine
4
+ module Sources
5
+ # ActiveRecord-backed source adapter that yields arrays of records in batches.
6
+ #
7
+ # Uses ORM batch APIs (`in_batches`/`find_in_batches`) honoring batch_size and an
8
+ # optional scope proc. Ensures stable memory by disabling query cache and using
9
+ # readonly relations. Does not accumulate results across batches.
10
+ #
11
+ # @example
12
+ # src = SearchEngine::Sources::ActiveRecordSource.new(model: ::Product, scope: -> { where(active: true) }, batch_size: 2000)
13
+ # src.each_batch(partition: nil, cursor: nil) { |rows| ... }
14
+ #
15
+ # @note Emits "search_engine.source.batch_fetched" and "search_engine.source.error".
16
+ class ActiveRecordSource
17
+ include Base
18
+
19
+ # @param model [Class] ActiveRecord model class
20
+ # @param scope [Proc, nil] optional proc evaluated in the context of model.all (returns a Relation)
21
+ # @param batch_size [Integer, nil] override batch size (defaults from config)
22
+ # @param use_transaction [Boolean, nil] wrap in read-only transaction (best-effort)
23
+ # @param readonly [Boolean, nil] mark relations readonly (default true)
24
def initialize(model:, scope: nil, batch_size: nil, use_transaction: nil, readonly: nil)
  defaults = SearchEngine.config.sources.active_record

  @model = model
  @scope_proc = scope
  @batch_size = (batch_size || defaults.batch_size).to_i

  # Explicit arguments win; the configured default is consulted only when the argument is nil.
  txn_setting = use_transaction.nil? ? defaults.use_transaction : use_transaction
  readonly_setting = readonly.nil? ? defaults.readonly : readonly
  @use_transaction = truthy?(txn_setting)
  @readonly = truthy?(readonly_setting)
end
32
+
33
+ # Iterate over batches of records.
34
+ #
35
+ # @param partition [Object, nil] optional opaque partition (e.g., id range)
36
+ # @param cursor [Object, nil] optional opaque cursor (e.g., last id)
37
+ # @yieldparam rows [Array<Object>] array of model instances
38
+ # @return [Enumerator] when no block is given
39
def each_batch(partition: nil, cursor: nil, &block)
  unless block_given?
    return enum_for(:each_batch, partition: partition, cursor: cursor)
  end

  scoped = base_relation
  scoped = apply_partition(scoped, partition) if partition
  scoped = apply_cursor(scoped, cursor) if cursor

  # The clock starts before the connection is acquired, so the first
  # batch's duration includes the connection/transaction setup.
  clock_start = monotonic_ms

  run_with_connection do
    run_in_readonly_txn_if_needed do
      without_query_cache { dispatch_batches(scoped, clock_start, 0, partition, cursor, &block) }
    end
  end
rescue StandardError => e
  # Report the failure, then let the caller see the original exception.
  instrument_error(
    source: 'active_record', error: e, partition: partition, cursor: cursor,
    adapter_options: { batch_size: @batch_size }
  )
  raise
end
61
+
62
+ private
63
+
64
# Strict boolean coercion: only a value equal to +true+ counts as enabled.
# Strings such as "true", non-zero integers, and nil are all treated as false.
def truthy?(val)
  val == true ? true : false
end
67
+
68
# Walk +relation+ in batches of +@batch_size+, emitting a
# "batch_fetched" instrumentation event and yielding each batch as an Array.
#
# Strategy, in order of preference:
#   1. +in_batches+      - each batch relation is marked readonly and loaded
#   2. +find_in_batches+ - arrays are yielded as delivered
#   3. +to_a+ + +each_slice+ - last resort; materializes everything first
#
# @param relation [#in_batches, #find_in_batches, #to_a] source of rows
# @param started [Numeric] monotonic timestamp (ms) the first batch is timed from
# @param idx [Integer] index reported for the first batch
# @param partition [Object, nil] passed through to instrumentation
# @param cursor [Object, nil] passed through to instrumentation
# @yieldparam rows [Array] one batch of rows
def dispatch_batches(relation, started, idx, partition, cursor)
  # Shared per-batch bookkeeping for all three strategies. `yield` inside the
  # lambda targets the block given to dispatch_batches itself.
  deliver = lambda do |rows|
    instrument_batch_fetched(
      source: 'active_record', batch_index: idx, rows_count: rows.size,
      duration_ms: monotonic_ms - started, partition: partition, cursor: cursor,
      adapter_options: { batch_size: @batch_size }
    )
    yield(rows)
    idx += 1
    # Restart the clock so the next duration covers only the next fetch.
    started = monotonic_ms
  end

  if relation.respond_to?(:in_batches)
    relation.in_batches(of: @batch_size) do |batch_scope|
      deliver.call(mark_readonly(batch_scope).to_a)
    end
  elsif relation.respond_to?(:find_in_batches)
    relation.find_in_batches(batch_size: @batch_size) { |rows| deliver.call(rows) }
  else
    relation.to_a.each_slice(@batch_size) { |rows| deliver.call(rows) }
  end
end
109
+
110
# Starting relation: every record of the model, narrowed by the optional
# scope proc (evaluated with the relation as +self+), then passed through
# +mark_readonly+.
def base_relation
  everything = @model.all
  scoped = @scope_proc.respond_to?(:call) ? everything.instance_exec(&@scope_proc) : everything
  mark_readonly(scoped)
end
115
+
116
# Return a readonly version of +rel+ when readonly mode is on and the object
# supports it; otherwise hand +rel+ back untouched.
def mark_readonly(rel)
  return rel unless @readonly && rel.respond_to?(:readonly)

  rel.readonly(true)
end
123
+
124
# Narrow +rel+ to a partition.
#
# - Range: filters the model's primary key by the range
# - Hash:  passed to +where+ as-is
# - anything else: relation returned unchanged
def apply_partition(rel, partition)
  return rel.where(rel.klass.primary_key => partition) if partition.is_a?(Range)
  return rel.where(partition) if partition.is_a?(Hash)

  rel
end
135
+
136
# Resume +rel+ from a cursor.
#
# A Hash cursor is used directly as +where+ conditions. Any other value is
# treated as the last seen primary key: rows with a greater key are selected
# and ordered ascending by that key.
def apply_cursor(rel, cursor)
  return rel unless cursor

  key = rel.klass.primary_key
  return rel.where(cursor) if cursor.is_a?(Hash)

  beyond_cursor = arel_table(rel)[key].gt(cursor)
  rel.where(beyond_cursor).order(key => :asc)
end
146
+
147
# Arel table of the model class behind +rel+.
def arel_table(rel)
  model_class = rel.klass
  model_class.arel_table
end
150
+
151
# Run the block inside +connection_pool.with_connection+ when ActiveRecord is
# loaded; otherwise just run the block.
def run_with_connection(&block)
  return yield unless defined?(ActiveRecord::Base)

  ActiveRecord::Base.connection_pool.with_connection(&block)
end
158
+
159
# Run the block inside a new (nested-safe) transaction when transactions are
# enabled and ActiveRecord is loaded; otherwise run it directly.
#
# Inside the transaction a READ ONLY hint is attempted; a failure of that
# statement is deliberately ignored (best-effort).
def run_in_readonly_txn_if_needed
  return yield unless @use_transaction && defined?(ActiveRecord::Base)

  ActiveRecord::Base.connection.transaction(requires_new: true) do
    if ActiveRecord::Base.connection.respond_to?(:execute)
      begin
        ActiveRecord::Base.connection.execute('SET TRANSACTION READ ONLY')
      rescue StandardError
        # best-effort; not every adapter understands this statement
        nil
      end
    end
    yield
  end
end
175
+
176
# Run the block with the connection's query cache bypassed (+uncached+) when
# ActiveRecord is available; otherwise run it directly.
def without_query_cache(&block)
  ar_available = defined?(ActiveRecord::Base) && ActiveRecord::Base.respond_to?(:connection)
  return yield unless ar_available

  ActiveRecord::Base.connection.uncached(&block)
end
183
+ end
184
+ end
185
+ end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SearchEngine
4
+ module Sources
5
+ # Internal helpers shared by source adapters.
6
+ #
7
+ # Provides small utilities for instrumentation and for returning
8
+ # an Enumerator when a block is not supplied.
9
+ module Base
10
+ private
11
+
12
# Current timestamp as reported by SearchEngine::Instrumentation.monotonic_ms;
# used by the adapters to compute per-batch durations.
def monotonic_ms
  clock = SearchEngine::Instrumentation
  clock.monotonic_ms
end
15
+
16
# Emit a "search_engine.source.batch_fetched" event.
#
# No-op when ActiveSupport::Notifications is not loaded. +partition+ and
# +cursor+ are included only when non-nil; +adapter_options+ is included
# (after redaction) only when SearchEngine::Observability is loaded.
def instrument_batch_fetched(source:, batch_index:, rows_count:, duration_ms:, partition: nil, cursor: nil,
                             adapter_options: {})
  return unless defined?(ActiveSupport::Notifications)

  payload = { source: source, batch_index: batch_index, rows_count: rows_count, duration_ms: duration_ms }
  # Only the optional keys are compacted; the required ones may legitimately be nil.
  payload.merge!({ partition: partition, cursor: cursor }.compact)
  if defined?(SearchEngine::Observability)
    payload[:adapter_options] = SearchEngine::Observability.redact(adapter_options)
  end

  SearchEngine::Instrumentation.instrument('search_engine.source.batch_fetched', payload) {}
end
33
+
34
# Emit a "search_engine.source.error" event describing +error+.
#
# No-op when ActiveSupport::Notifications is not loaded. The message is
# truncated to 200 characters. When the error responds to +to_h+, its
# +:hint+ and +:doc+ entries (if truthy) are copied into the payload as
# +:error_hint+ and +:error_doc+.
def instrument_error(source:, error:, partition: nil, cursor: nil, adapter_options: {})
  return unless defined?(ActiveSupport::Notifications)

  payload = { source: source, error_class: error.class.name, message: error.message.to_s[0, 200] }
  payload.merge!({ partition: partition, cursor: cursor }.compact)
  if defined?(SearchEngine::Observability)
    payload[:adapter_options] = SearchEngine::Observability.redact(adapter_options)
  end

  if error.respond_to?(:to_h)
    details = error.to_h
    payload[:error_hint] = details[:hint] if details[:hint]
    payload[:error_doc] = details[:doc] if details[:doc]
  end

  SearchEngine::Instrumentation.instrument('search_engine.source.error', payload) {}
end
54
+
55
# With a block: yield once (no arguments) and return the block's value.
# Without a block: return an Enumerator over +each_batch+ bound to the given
# partition and cursor.
def enum_for_each_batch(partition:, cursor:)
  if block_given?
    yield
  else
    to_enum(:each_batch, partition: partition, cursor: cursor)
  end
end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SearchEngine
4
+ module Sources
5
+ # Adapter that delegates batch enumeration to a provided callable.
6
+ #
7
+ # The callable is expected to implement `call(cursor:, partition:)` and return either
8
+ # an Enumerator or yield arrays of rows. Shapes are application-defined.
9
+ #
10
+ # @example
11
+ # src = SearchEngine::Sources::LambdaSource.new(->(cursor:, partition:) { [[row1, row2]] })
12
+ # src.each_batch { |rows| ... }
13
+ #
14
+ # @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer`
15
+ class LambdaSource
16
+ include Base
17
+
18
+ # @param callable [#call] object responding to call(cursor:, partition:)
19
+ # @raise [ArgumentError] when callable does not respond to :call
20
def initialize(callable)
  # Fail fast at construction time rather than on the first each_batch call.
  unless callable.respond_to?(:call)
    raise ArgumentError, 'callable must respond to :call(cursor:, partition:)'
  end

  @callable = callable
end
25
+
26
+ # Enumerate batches produced by the callable.
27
+ # @param partition [Object, nil]
28
+ # @param cursor [Object, nil]
29
+ # @yieldparam rows [Array]
30
+ # @return [Enumerator]
31
def each_batch(partition: nil, cursor: nil)
  return enum_for(:each_batch, partition: partition, cursor: cursor) unless block_given?

  started = monotonic_ms
  begin
    produced = @callable.call(cursor: cursor, partition: partition)
    # Iterate the callable's result lazily. Wrapping an Enumerator in Array()
    # would drain it completely before the first batch is yielded, which
    # defeats streaming and attributes all fetch time to the first batch.
    # Array() is kept only for results that cannot be iterated (nil, scalars).
    batches = produced.respond_to?(:each) ? produced : Array(produced)
    batches.each do |rows|
      duration = monotonic_ms - started
      instrument_batch_fetched(source: 'lambda', batch_index: nil, rows_count: Array(rows).size,
                               duration_ms: duration, partition: partition, cursor: cursor,
                               adapter_options: { callable: @callable.class.name }
      )
      yield(rows)
      # Restart the clock so the next duration covers only the next fetch.
      started = monotonic_ms
    end
  rescue StandardError => error
    # Report, then re-raise so the caller still sees the original failure.
    instrument_error(source: 'lambda', error: error, partition: partition, cursor: cursor,
                     adapter_options: { callable: @callable.class.name }
    )
    raise
  end
end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,196 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SearchEngine
4
+ module Sources
5
+ # Stream rows from SQL using ActiveRecord connection, with PG cursor support when available.
6
+ #
7
+ # Supports large result sets via server-side cursors on PostgreSQL connections and falls
8
+ # back to paginated SELECT for other adapters. Yields arrays of rows in batches.
9
+ #
10
+ # @example
11
+ # src = SearchEngine::Sources::SqlSource.new(sql: "SELECT id, name FROM users", fetch_size: 2000)
12
+ # src.each_batch { |rows| ... }
13
+ #
14
+ # @note Emits "search_engine.source.batch_fetched" and "search_engine.source.error".
15
+ # @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer`
16
+ class SqlSource
17
+ include Base
18
+
19
+ # @param sql [String] SQL statement to execute
20
+ # @param binds [Array, Hash, nil] optional parameters for the SQL
21
+ # @param fetch_size [Integer, nil] batch size hint
22
+ # @param row_shape [Symbol, nil] :hash, :array or :auto (PG only)
23
+ # @param statement_timeout_ms [Integer, nil] optional statement timeout (PG only)
24
def initialize(sql:, binds: nil, fetch_size: nil, row_shape: nil, statement_timeout_ms: nil)
  defaults = SearchEngine.config.sources.sql

  @sql = sql.to_s
  @binds = binds
  # An explicit fetch_size wins; otherwise the configured default is used.
  @fetch_size = (fetch_size || defaults.fetch_size).to_i
  @row_shape = row_shape || :auto
  @statement_timeout_ms = statement_timeout_ms
end
32
+
33
+ # Iterate over batches produced by the SQL query.
34
+ # @param partition [Object, nil]
35
+ # @param cursor [Object, nil]
36
+ # @yieldparam rows [Array<Hash>, Array<Array>]
37
+ # @return [Enumerator]
38
def each_batch(partition: nil, cursor: nil, &block)
  return enum_for(:each_batch, partition: partition, cursor: cursor) unless block_given?

  # Use the raw connection handed over by run_with_connection rather than
  # asking ActiveRecord::Base for a connection again: the yielded handle is
  # the one the surrounding with_connection block checked out, and reusing
  # it avoids taking a second lease from inside that block.
  run_with_connection do |raw|
    if postgres_connection?(raw)
      stream_postgres(raw, partition: partition, cursor: cursor, &block)
    else
      # The generic path looks up the ActiveRecord connection on its own.
      stream_generic(nil, partition: partition, cursor: cursor, &block)
    end
  end
rescue StandardError => error
  # Report, then re-raise so the caller still sees the original failure.
  instrument_error(source: 'sql', error: error, partition: partition, cursor: cursor,
                   adapter_options: { fetch_size: @fetch_size, row_shape: @row_shape }
  )
  raise
end
54
+
55
+ private
56
+
57
# Check out a pooled ActiveRecord connection and yield its raw driver
# connection to the block.
#
# @raise [SearchEngine::Errors::InvalidParams] when ActiveRecord is not loaded
def run_with_connection
  unless defined?(ActiveRecord::Base)
    raise SearchEngine::Errors::InvalidParams, 'SqlSource requires ActiveRecord connection'
  end

  ActiveRecord::Base.connection_pool.with_connection { |leased| yield raw_connection(leased) }
end
67
+
68
# Unwrap the driver-level connection when the adapter exposes one; otherwise
# return the given object itself.
def raw_connection(ar_conn)
  ar_conn.respond_to?(:raw_connection) ? ar_conn.raw_connection : ar_conn
end
75
+
76
# Whether +conn+ looks like a PostgreSQL driver connection.
#
# True when the pg gem is loaded and +conn+ is a PG::Connection, or, as a
# fallback heuristic, when the connection's class name contains "PG".
#
# @param conn [Object] raw driver connection
# @return [Boolean]
def postgres_connection?(conn)
  return true if defined?(PG) && conn.is_a?(PG::Connection)

  # Anonymous classes have a nil name; to_s keeps the heuristic from raising.
  conn.class.name.to_s.include?('PG')
end
79
+
80
# Stream the query through a server-side cursor on a raw PG connection.
#
# Sequence: optional statement timeout, BEGIN READ ONLY, DECLARE cursor,
# then FETCH FORWARD +@fetch_size+ rows at a time until an empty result.
# Each non-empty fetch is instrumented and yielded. Cleanup (CLOSE, COMMIT,
# optional timeout reset) always runs; failures during cleanup are ignored.
def stream_postgres(conn, partition:, cursor:)
  pg_cursor = "se_cursor_#{object_id}"
  statement, bind_values = build_sql_and_params(partition: partition, cursor: cursor)
  tick = monotonic_ms
  begin
    set_statement_timeout(conn, @statement_timeout_ms) if @statement_timeout_ms
    # Use unnamed prepared statement + DECLARE CURSOR for streaming
    conn.exec('BEGIN READ ONLY')
    conn.exec_params("DECLARE #{pg_cursor} NO SCROLL CURSOR FOR #{statement}", bind_values)

    batch_no = 0
    loop do
      result = conn.exec("FETCH FORWARD #{@fetch_size} FROM #{pg_cursor}")
      break if result.ntuples.zero?

      rows = rows_from_pg_result(result)
      instrument_batch_fetched(
        source: 'sql', batch_index: batch_no, rows_count: rows.size, duration_ms: monotonic_ms - tick,
        partition: partition, cursor: cursor,
        adapter_options: { fetch_size: @fetch_size, row_shape: @row_shape }
      )
      yield rows
      batch_no += 1
      tick = monotonic_ms
    end
  ensure
    # Each cleanup statement is attempted independently so a failing CLOSE
    # (e.g. in an aborted transaction) does not prevent the COMMIT.
    ["CLOSE #{pg_cursor}", 'COMMIT'].each do |cleanup_sql|
      begin
        conn.exec(cleanup_sql)
      rescue StandardError
        nil
      end
    end
    reset_statement_timeout(conn) if @statement_timeout_ms
  end
end
118
+
119
# Convert a PG result into rows: +to_a+ for the :hash and :auto shapes,
# +values+ for any other shape.
def rows_from_pg_result(res)
  %i[hash auto].include?(@row_shape) ? res.to_a : res.values
end
126
+
127
# Fallback streaming for non-PostgreSQL adapters: run the query page by page
# (LIMIT/OFFSET via +sql_with_limit+) through ActiveRecord's +exec_query+
# until a page comes back empty. Each non-empty page is instrumented and
# yielded as an array of rows. The first argument is unused.
def stream_generic(_conn, partition:, cursor:)
  connection = ActiveRecord::Base.connection
  statement, bind_values = build_sql_and_params(partition: partition, cursor: cursor)
  page = 0
  tick = monotonic_ms

  loop do
    page_sql = sql_with_limit(statement, @fetch_size, page)
    result = connection.exec_query(page_sql, 'SqlSource', params_for_ar(bind_values))
    rows = result.to_a
    break if rows.empty?

    instrument_batch_fetched(
      source: 'sql', batch_index: page, rows_count: rows.size, duration_ms: monotonic_ms - tick,
      partition: partition, cursor: cursor,
      adapter_options: { fetch_size: @fetch_size, row_shape: :hash }
    )
    yield rows
    page += 1
    tick = monotonic_ms
  end
end
148
+
149
# Wrap +base_sql+ as a subquery and append LIMIT/OFFSET for the given
# zero-based page. Both numbers go through Integer() so only integral values
# can be interpolated into the SQL text.
def sql_with_limit(base_sql, fetch_size, page_idx)
  skip = page_idx * fetch_size
  limit_value = Integer(fetch_size)
  offset_value = Integer(skip)
  "SELECT * FROM (#{base_sql}) se_sub LIMIT #{limit_value} OFFSET #{offset_value}"
end
153
+
154
# Binds to hand to +exec_query+: the given Array itself, or an empty Array
# for nil and any non-Array value.
def params_for_ar(params)
  params.is_a?(Array) ? params : []
end
161
+
162
# Produce the SQL text and positional binds to execute.
#
# Returns copies so callers can never mutate the configured SQL or binds.
# Keyword arguments (partition/cursor) are accepted but unused: their
# semantics are adapter/domain-specific, so callers are expected to
# incorporate placeholders into the SQL themselves.
#
# @return [Array(String, Array)] SQL and positional bind values
# @raise [SearchEngine::Errors::InvalidParams] for Hash or other non-Array binds
def build_sql_and_params(**)
  positional =
    case @binds
    when nil
      []
    when Array
      @binds.dup
    when Hash
      raise SearchEngine::Errors::InvalidParams,
            'SqlSource with Hash binds is not supported; use positional binds (Array)'
    else
      raise SearchEngine::Errors::InvalidParams, 'SqlSource binds must be an Array or nil'
    end

  [@sql.dup, positional]
end
182
+
183
# Best-effort: apply a statement timeout (milliseconds) to the connection.
#
# PostgreSQL's SET does not accept bind parameters, so the value is validated
# with Integer() and interpolated as a literal. A session-level SET is used
# (not SET LOCAL) because this runs before the streaming transaction is
# opened; SET LOCAL has no effect outside a transaction block. The caller
# undoes it with reset_statement_timeout.
#
# Any failure (invalid value, unsupported statement) is swallowed on purpose.
#
# @param conn [#exec] raw PG connection
# @param ms [Integer, String] timeout in milliseconds
def set_statement_timeout(conn, ms)
  conn.exec("SET statement_timeout = #{Integer(ms)}")
rescue StandardError
  # ignore
end
188
+
189
# Best-effort: issue RESET statement_timeout on the connection; any failure
# is swallowed.
def reset_statement_timeout(conn)
  begin
    conn.exec('RESET statement_timeout')
  rescue StandardError
    # ignore
    nil
  end
end
194
+ end
195
+ end
196
+ end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SearchEngine
4
+ # Factory and DSL for building data source adapters that yield batches.
5
+ #
6
+ # Usage via symbol and options:
7
+ # SearchEngine::Sources.build(:active_record, model: ::Product, scope: -> { where(active: true) }, batch_size: 2000)
8
+ # SearchEngine::Sources.build(:sql, sql: "SELECT * FROM products WHERE active = TRUE", fetch_size: 2000)
9
+ #
10
+ # Usage via block (lambda source):
11
+ # SearchEngine::Sources.build(:lambda) do |cursor:, partition:|
12
+ # Enumerator.new { |y| external_api.each_page(cursor) { |rows| y << rows } }
13
+ # end
14
+ #
15
+ # All adapters implement `each_batch(partition:, cursor:)` and return an Enumerator
16
+ # when no block is provided.
17
+ module Sources
18
+ # Build a source adapter from a symbol and options or from a block.
19
+ #
20
+ # @param type [Symbol] :active_record, :sql, or :lambda
21
+ # @param options [Hash] adapter-specific options
22
+ # @yield for :lambda sources, a block taking (cursor:, partition:) and returning an Enumerator
23
+ # @return [Object] adapter responding to `each_batch(partition:, cursor:)`
24
def self.build(type, **options, &block)
  # Normalize without assuming the input can be symbolized: nil, integers and
  # the like fall through to the "unknown source type" error below instead
  # of failing with NoMethodError on to_sym.
  kind = type.respond_to?(:to_sym) ? type.to_sym : type

  case kind
  when :active_record
    model = options[:model]
    unless model.is_a?(Class)
      raise SearchEngine::Errors::InvalidParams,
            'active_record source requires :model (ActiveRecord class). See ' \
            'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer.'
    end

    ActiveRecordSource.new(
      model: model,
      scope: options[:scope],
      batch_size: options[:batch_size],
      use_transaction: options[:use_transaction],
      readonly: options[:readonly]
    )
  when :sql
    sql = options[:sql]
    # Blank or whitespace-only SQL is rejected up front.
    unless sql.is_a?(String) && !sql.strip.empty?
      raise SearchEngine::Errors::InvalidParams,
            'sql source requires :sql (String). See ' \
            'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer.'
    end

    SqlSource.new(
      sql: sql,
      binds: options[:binds],
      fetch_size: options[:fetch_size],
      row_shape: options[:row_shape],
      statement_timeout_ms: options[:statement_timeout_ms]
    )
  when :lambda
    # A block takes precedence over the :callable option.
    callable = block || options[:callable]
    unless callable
      raise SearchEngine::Errors::InvalidParams,
            'lambda source requires a block or :callable. See ' \
            'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer.'
    end

    LambdaSource.new(callable)
  else
    raise SearchEngine::Errors::InvalidParams,
          "unknown source type: #{type.inspect}. Supported: :active_record, :sql, :lambda"
  end
end
70
+ end
71
+ end