search-engine-for-typesense 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +148 -0
- data/app/search_engine/search_engine/app_info.rb +11 -0
- data/app/search_engine/search_engine/index_partition_job.rb +170 -0
- data/lib/generators/search_engine/install/install_generator.rb +20 -0
- data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
- data/lib/generators/search_engine/model/model_generator.rb +86 -0
- data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
- data/lib/search-engine-for-typesense.rb +12 -0
- data/lib/search_engine/active_record_syncable.rb +247 -0
- data/lib/search_engine/admin/stopwords.rb +125 -0
- data/lib/search_engine/admin/synonyms.rb +125 -0
- data/lib/search_engine/admin.rb +12 -0
- data/lib/search_engine/ast/and.rb +52 -0
- data/lib/search_engine/ast/binary_op.rb +75 -0
- data/lib/search_engine/ast/eq.rb +19 -0
- data/lib/search_engine/ast/group.rb +18 -0
- data/lib/search_engine/ast/gt.rb +12 -0
- data/lib/search_engine/ast/gte.rb +12 -0
- data/lib/search_engine/ast/in.rb +28 -0
- data/lib/search_engine/ast/lt.rb +12 -0
- data/lib/search_engine/ast/lte.rb +12 -0
- data/lib/search_engine/ast/matches.rb +55 -0
- data/lib/search_engine/ast/node.rb +176 -0
- data/lib/search_engine/ast/not_eq.rb +13 -0
- data/lib/search_engine/ast/not_in.rb +24 -0
- data/lib/search_engine/ast/or.rb +52 -0
- data/lib/search_engine/ast/prefix.rb +51 -0
- data/lib/search_engine/ast/raw.rb +41 -0
- data/lib/search_engine/ast/unary_op.rb +43 -0
- data/lib/search_engine/ast.rb +101 -0
- data/lib/search_engine/base/creation.rb +727 -0
- data/lib/search_engine/base/deletion.rb +80 -0
- data/lib/search_engine/base/display_coercions.rb +36 -0
- data/lib/search_engine/base/hydration.rb +312 -0
- data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
- data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
- data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
- data/lib/search_engine/base/index_maintenance.rb +459 -0
- data/lib/search_engine/base/indexing_dsl.rb +255 -0
- data/lib/search_engine/base/joins.rb +479 -0
- data/lib/search_engine/base/model_dsl.rb +472 -0
- data/lib/search_engine/base/presets.rb +43 -0
- data/lib/search_engine/base/pretty_printer.rb +315 -0
- data/lib/search_engine/base/relation_delegation.rb +42 -0
- data/lib/search_engine/base/scopes.rb +113 -0
- data/lib/search_engine/base/updating.rb +92 -0
- data/lib/search_engine/base.rb +38 -0
- data/lib/search_engine/bulk.rb +284 -0
- data/lib/search_engine/cache.rb +33 -0
- data/lib/search_engine/cascade.rb +531 -0
- data/lib/search_engine/cli/doctor.rb +631 -0
- data/lib/search_engine/cli/support.rb +217 -0
- data/lib/search_engine/cli.rb +222 -0
- data/lib/search_engine/client/http_adapter.rb +63 -0
- data/lib/search_engine/client/request_builder.rb +92 -0
- data/lib/search_engine/client/services/base.rb +74 -0
- data/lib/search_engine/client/services/collections.rb +161 -0
- data/lib/search_engine/client/services/documents.rb +214 -0
- data/lib/search_engine/client/services/operations.rb +152 -0
- data/lib/search_engine/client/services/search.rb +190 -0
- data/lib/search_engine/client/services.rb +29 -0
- data/lib/search_engine/client.rb +765 -0
- data/lib/search_engine/client_options.rb +20 -0
- data/lib/search_engine/collection_resolver.rb +191 -0
- data/lib/search_engine/collections_graph.rb +330 -0
- data/lib/search_engine/compiled_params.rb +143 -0
- data/lib/search_engine/compiler.rb +383 -0
- data/lib/search_engine/config/observability.rb +27 -0
- data/lib/search_engine/config/presets.rb +92 -0
- data/lib/search_engine/config/selection.rb +16 -0
- data/lib/search_engine/config/typesense.rb +48 -0
- data/lib/search_engine/config/validators.rb +97 -0
- data/lib/search_engine/config.rb +917 -0
- data/lib/search_engine/console_helpers.rb +130 -0
- data/lib/search_engine/deletion.rb +103 -0
- data/lib/search_engine/dispatcher.rb +125 -0
- data/lib/search_engine/dsl/parser.rb +582 -0
- data/lib/search_engine/engine.rb +167 -0
- data/lib/search_engine/errors.rb +290 -0
- data/lib/search_engine/filters/sanitizer.rb +189 -0
- data/lib/search_engine/hydration/materializers.rb +808 -0
- data/lib/search_engine/hydration/selection_context.rb +96 -0
- data/lib/search_engine/indexer/batch_planner.rb +76 -0
- data/lib/search_engine/indexer/bulk_import.rb +626 -0
- data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
- data/lib/search_engine/indexer/retry_policy.rb +103 -0
- data/lib/search_engine/indexer.rb +747 -0
- data/lib/search_engine/instrumentation.rb +308 -0
- data/lib/search_engine/joins/guard.rb +202 -0
- data/lib/search_engine/joins/resolver.rb +95 -0
- data/lib/search_engine/logging/color.rb +78 -0
- data/lib/search_engine/logging/format_helpers.rb +92 -0
- data/lib/search_engine/logging/partition_progress.rb +53 -0
- data/lib/search_engine/logging_subscriber.rb +388 -0
- data/lib/search_engine/mapper.rb +785 -0
- data/lib/search_engine/multi.rb +286 -0
- data/lib/search_engine/multi_result.rb +186 -0
- data/lib/search_engine/notifications/compact_logger.rb +675 -0
- data/lib/search_engine/observability.rb +162 -0
- data/lib/search_engine/operations.rb +58 -0
- data/lib/search_engine/otel.rb +227 -0
- data/lib/search_engine/partitioner.rb +128 -0
- data/lib/search_engine/ranking_plan.rb +118 -0
- data/lib/search_engine/registry.rb +158 -0
- data/lib/search_engine/relation/compiler.rb +711 -0
- data/lib/search_engine/relation/deletion.rb +37 -0
- data/lib/search_engine/relation/dsl/filters.rb +624 -0
- data/lib/search_engine/relation/dsl/selection.rb +240 -0
- data/lib/search_engine/relation/dsl.rb +903 -0
- data/lib/search_engine/relation/dx/dry_run.rb +59 -0
- data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
- data/lib/search_engine/relation/dx.rb +231 -0
- data/lib/search_engine/relation/materializers.rb +118 -0
- data/lib/search_engine/relation/options.rb +138 -0
- data/lib/search_engine/relation/state.rb +274 -0
- data/lib/search_engine/relation/updating.rb +44 -0
- data/lib/search_engine/relation.rb +623 -0
- data/lib/search_engine/result.rb +664 -0
- data/lib/search_engine/schema.rb +1083 -0
- data/lib/search_engine/sources/active_record_source.rb +185 -0
- data/lib/search_engine/sources/base.rb +62 -0
- data/lib/search_engine/sources/lambda_source.rb +55 -0
- data/lib/search_engine/sources/sql_source.rb +196 -0
- data/lib/search_engine/sources.rb +71 -0
- data/lib/search_engine/stale_rules.rb +160 -0
- data/lib/search_engine/test/minitest_assertions.rb +57 -0
- data/lib/search_engine/test/offline_client.rb +134 -0
- data/lib/search_engine/test/rspec_matchers.rb +77 -0
- data/lib/search_engine/test/stub_client.rb +201 -0
- data/lib/search_engine/test.rb +66 -0
- data/lib/search_engine/test_autoload.rb +8 -0
- data/lib/search_engine/update.rb +35 -0
- data/lib/search_engine/version.rb +7 -0
- data/lib/search_engine.rb +332 -0
- data/lib/tasks/search_engine.rake +501 -0
- data/lib/tasks/search_engine_doctor.rake +16 -0
- metadata +225 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SearchEngine
|
|
4
|
+
module Sources
|
|
5
|
+
# ActiveRecord-backed source adapter that yields arrays of records in batches.
|
|
6
|
+
#
|
|
7
|
+
# Uses ORM batch APIs (`in_batches`/`find_in_batches`) honoring batch_size and an
|
|
8
|
+
# optional scope proc. Ensures stable memory by disabling query cache and using
|
|
9
|
+
# readonly relations. Does not accumulate results across batches.
|
|
10
|
+
#
|
|
11
|
+
# @example
|
|
12
|
+
# src = SearchEngine::Sources::ActiveRecordSource.new(model: ::Product, scope: -> { where(active: true) }, batch_size: 2000)
|
|
13
|
+
# src.each_batch(partition: nil, cursor: nil) { |rows| ... }
|
|
14
|
+
#
|
|
15
|
+
# @note Emits "search_engine.source.batch_fetched" and "search_engine.source.error".
|
|
16
|
+
class ActiveRecordSource
|
|
17
|
+
include Base
|
|
18
|
+
|
|
19
|
+
# @param model [Class] ActiveRecord model class
|
|
20
|
+
# @param scope [Proc, nil] optional proc evaluated in the context of model.all (returns a Relation)
|
|
21
|
+
# @param batch_size [Integer, nil] override batch size (defaults from config)
|
|
22
|
+
# @param use_transaction [Boolean, nil] wrap in read-only transaction (best-effort)
|
|
23
|
+
# @param readonly [Boolean, nil] mark relations readonly (default true)
|
|
24
|
+
def initialize(model:, scope: nil, batch_size: nil, use_transaction: nil, readonly: nil)
  @model = model
  @scope_proc = scope
  # Configured defaults; explicit keyword arguments take precedence below.
  defaults = SearchEngine.config.sources.active_record
  @batch_size = (batch_size || defaults.batch_size).to_i
  # nil means "not specified": only then is the configured value consulted.
  @use_transaction = truthy?(use_transaction.nil? ? defaults.use_transaction : use_transaction)
  @readonly = truthy?(readonly.nil? ? defaults.readonly : readonly)
end
|
|
32
|
+
|
|
33
|
+
# Iterate over batches of records.
|
|
34
|
+
#
|
|
35
|
+
# @param partition [Object, nil] optional opaque partition (e.g., id range)
|
|
36
|
+
# @param cursor [Object, nil] optional opaque cursor (e.g., last id)
|
|
37
|
+
# @yieldparam rows [Array<Object>] array of model instances
|
|
38
|
+
# @return [Enumerator] when no block is given
|
|
39
|
+
def each_batch(partition: nil, cursor: nil, &block)
  return enum_for(:each_batch, partition: partition, cursor: cursor) unless block_given?

  scoped = base_relation
  scoped = apply_partition(scoped, partition) if partition
  scoped = apply_cursor(scoped, cursor) if cursor

  first_index = 0
  clock_start = monotonic_ms
  # Nesting order: connection checkout -> optional read-only transaction -> query cache off.
  run_with_connection do
    run_in_readonly_txn_if_needed do
      without_query_cache { dispatch_batches(scoped, clock_start, first_index, partition, cursor, &block) }
    end
  end
rescue StandardError => error
  # Report the failure, then re-raise it unchanged for the caller.
  failure = { source: 'active_record', error: error, partition: partition, cursor: cursor,
              adapter_options: { batch_size: @batch_size } }
  instrument_error(**failure)
  raise
end
|
|
61
|
+
|
|
62
|
+
private
|
|
63
|
+
|
|
64
|
+
# Strict boolean coercion: the result is whatever +val == true+ evaluates to,
# so values such as 1 or "true" do not count as enabled.
#
# @param val [Object]
# @return [Boolean]
def truthy?(val)
  strictly_true = (val == true)
  strictly_true
end
|
|
67
|
+
|
|
68
|
+
# Feed +relation+ to the caller's block in batches of +@batch_size+.
#
# Strategy, in order of preference: +in_batches+, then +find_in_batches+, and as a
# last resort full materialisation via +to_a+ followed by slicing.
#
# @param relation [Object] relation (or relation-like object) to iterate
# @param started [Numeric] timer reading taken before iteration began
# @param idx [Integer] index reported for the first batch
# @param partition [Object, nil] forwarded to instrumentation only
# @param cursor [Object, nil] forwarded to instrumentation only
# @yieldparam rows [Array] one batch of records
def dispatch_batches(relation, started, idx, partition, cursor)
  # Shared per-batch step: report the fetch, hand the rows over, then advance the
  # index and restart the timer so every reported duration covers a single batch.
  deliver = lambda do |rows|
    elapsed = monotonic_ms - started
    instrument_batch_fetched(source: 'active_record', batch_index: idx, rows_count: rows.size,
                             duration_ms: elapsed, partition: partition, cursor: cursor,
                             adapter_options: { batch_size: @batch_size })
    yield(rows)
    idx += 1
    started = monotonic_ms
  end

  if relation.respond_to?(:in_batches)
    relation.in_batches(of: @batch_size) { |batch_scope| deliver.call(mark_readonly(batch_scope).to_a) }
  elsif relation.respond_to?(:find_in_batches)
    # The batch is copied before being handed out, as the adapter has always done.
    relation.find_in_batches(batch_size: @batch_size) { |batch| deliver.call(batch.map(&:itself)) }
  else
    # Last resort: materialize and slice
    relation.to_a.each_slice(@batch_size) { |slice| deliver.call(slice) }
  end
end
|
|
109
|
+
|
|
110
|
+
# Starting relation for iteration: +@model.all+, narrowed by the optional scope
# proc (evaluated with the relation as +self+), then passed through +mark_readonly+.
#
# @return [Object] the relation to batch over
def base_relation
  everything = @model.all
  return mark_readonly(everything) unless @scope_proc.respond_to?(:call)

  mark_readonly(everything.instance_exec(&@scope_proc))
end
|
|
115
|
+
|
|
116
|
+
# Return a readonly version of +rel+ when readonly mode is enabled and the object
# supports it; otherwise return +rel+ untouched.
#
# @param rel [Object] relation or relation-like object
# @return [Object]
def mark_readonly(rel)
  return rel unless @readonly && rel.respond_to?(:readonly)

  rel.readonly(true)
end
|
|
123
|
+
|
|
124
|
+
# Narrow +rel+ to a partition.
#
# A Range is matched against the model's primary key, a Hash is passed to +where+
# as-is, and any other value leaves the relation unchanged.
#
# @param rel [Object] relation responding to +where+ and +klass+
# @param partition [Range, Hash, Object]
# @return [Object]
def apply_partition(rel, partition)
  if partition.is_a?(Range)
    rel.where(rel.klass.primary_key => partition)
  elsif partition.is_a?(Hash)
    rel.where(partition)
  else
    rel
  end
end
|
|
135
|
+
|
|
136
|
+
# Restrict +rel+ to the rows that come after +cursor+.
#
# A Hash cursor is passed to +where+ as-is; any other value is treated as the last
# seen primary key, and only rows with a greater key are kept.
#
# No explicit ORDER BY is added. The relation is consumed through ActiveRecord's
# batch APIs, which impose their own primary-key order and warn (or raise, when
# ignored-order errors are enabled) if the relation already carries a scoped order.
#
# @param rel [Object] relation responding to +where+ and +klass+
# @param cursor [Hash, Object, nil]
# @return [Object] the narrowed relation, or +rel+ itself when cursor is nil/false
def apply_cursor(rel, cursor)
  return rel unless cursor
  return rel.where(cursor) if cursor.is_a?(Hash)

  pk = rel.klass.primary_key
  rel.where(arel_table(rel)[pk].gt(cursor))
end
|
|
146
|
+
|
|
147
|
+
# Arel table of the model class behind +rel+.
#
# @param rel [Object] relation responding to +klass+
# @return [Object] whatever +rel.klass.arel_table+ returns
def arel_table(rel)
  model_class = rel.klass
  model_class.arel_table
end
|
|
150
|
+
|
|
151
|
+
# Run the block with a database connection checked out for its duration.
#
# The connection is taken from the model's own pool when the model exposes one, so
# that models bound to a non-primary database check out the connection their
# queries will actually use. +ActiveRecord::Base+'s pool is the fallback. Without
# ActiveRecord loaded the block simply runs.
#
# @yield the work to perform while the connection is held
# @return [Object] the block's result
def run_with_connection(&block)
  return yield unless defined?(ActiveRecord::Base)

  pool_owner = @model.respond_to?(:connection_pool) ? @model : ActiveRecord::Base
  pool_owner.connection_pool.with_connection(&block)
end
|
|
158
|
+
|
|
159
|
+
# Run the block inside a (best-effort) read-only transaction when +@use_transaction+
# is enabled and ActiveRecord is loaded; otherwise just run the block.
#
# The transaction is opened on the model's own connection when the model exposes
# one, so it covers the connection the batch queries run on. +ActiveRecord::Base+
# is the fallback.
#
# @yield the work to perform
# @return [Object] the block's result
def run_in_readonly_txn_if_needed
  return yield unless @use_transaction && defined?(ActiveRecord::Base)

  owner = @model.respond_to?(:connection) ? @model : ActiveRecord::Base
  conn = owner.connection
  conn.transaction(requires_new: true) do
    if conn.respond_to?(:execute)
      begin
        conn.execute('SET TRANSACTION READ ONLY')
      rescue StandardError
        # best-effort; ignore if not supported
      end
    end
    yield
  end
end
|
|
175
|
+
|
|
176
|
+
# Run the block with the query cache disabled on the connection used for batching.
#
# The model's own connection is preferred, so the cache is switched off where the
# batch queries execute. +ActiveRecord::Base+ is the fallback. When neither is
# available the block simply runs.
#
# @yield the work to perform
# @return [Object] the block's result
def without_query_cache(&block)
  return yield unless defined?(ActiveRecord::Base)

  owner = @model.respond_to?(:connection) ? @model : ActiveRecord::Base
  return yield unless owner.respond_to?(:connection)

  owner.connection.uncached(&block)
end
|
|
183
|
+
end
|
|
184
|
+
end
|
|
185
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SearchEngine
|
|
4
|
+
module Sources
|
|
5
|
+
# Internal helpers shared by source adapters.
|
|
6
|
+
#
|
|
7
|
+
# Provides small utilities for instrumentation and for returning
|
|
8
|
+
# an Enumerator when a block is not supplied.
|
|
9
|
+
module Base
|
|
10
|
+
private
|
|
11
|
+
|
|
12
|
+
# Current reading of the library's monotonic clock (presumably milliseconds, per
# the name); delegates to SearchEngine::Instrumentation.
#
# @return [Numeric]
def monotonic_ms
  clock = SearchEngine::Instrumentation
  clock.monotonic_ms
end
|
|
15
|
+
|
|
16
|
+
# Emit a "search_engine.source.batch_fetched" event describing one fetched batch.
# Does nothing when ActiveSupport::Notifications is not loaded.
#
# @param source [String] adapter label
# @param batch_index [Integer, nil]
# @param rows_count [Integer]
# @param duration_ms [Numeric]
# @param partition [Object, nil] included in the payload only when not nil
# @param cursor [Object, nil] included in the payload only when not nil
# @param adapter_options [Hash] redacted via SearchEngine::Observability when available
def instrument_batch_fetched(source:, batch_index:, rows_count:, duration_ms:, partition: nil, cursor: nil,
                             adapter_options: {})
  return unless defined?(ActiveSupport::Notifications)

  payload = { source: source, batch_index: batch_index, rows_count: rows_count, duration_ms: duration_ms }
  # Optional context: nil entries are left out of the payload entirely.
  payload.merge!({ partition: partition, cursor: cursor }.compact)
  if defined?(SearchEngine::Observability)
    payload[:adapter_options] = SearchEngine::Observability.redact(adapter_options)
  end
  SearchEngine::Instrumentation.instrument('search_engine.source.batch_fetched', payload) {}
end
|
|
33
|
+
|
|
34
|
+
# Emit a "search_engine.source.error" event for a failure raised while fetching.
# Does nothing when ActiveSupport::Notifications is not loaded.
#
# @param source [String] adapter label
# @param error [Exception] the error; its message is truncated to 200 characters
# @param partition [Object, nil] included in the payload only when not nil
# @param cursor [Object, nil] included in the payload only when not nil
# @param adapter_options [Hash] redacted via SearchEngine::Observability when available
def instrument_error(source:, error:, partition: nil, cursor: nil, adapter_options: {})
  return unless defined?(ActiveSupport::Notifications)

  payload = { source: source, error_class: error.class.name, message: error.message.to_s[0, 200] }
  payload.merge!({ partition: partition, cursor: cursor }.compact)
  if defined?(SearchEngine::Observability)
    payload[:adapter_options] = SearchEngine::Observability.redact(adapter_options)
  end
  if error.respond_to?(:to_h)
    # Errors exposing +to_h+ may carry a :hint and a :doc entry; copy the truthy ones.
    details = error.to_h
    { error_hint: :hint, error_doc: :doc }.each do |payload_key, detail_key|
      detail = details[detail_key]
      payload[payload_key] = detail if detail
    end
  end
  SearchEngine::Instrumentation.instrument('search_engine.source.error', payload) {}
end
|
|
54
|
+
|
|
55
|
+
# With a block: yield to it and return its result. Without a block: return an
# Enumerator over +each_batch+ for the given partition and cursor.
#
# @param partition [Object, nil]
# @param cursor [Object, nil]
# @return [Enumerator, Object]
def enum_for_each_batch(partition:, cursor:)
  if block_given?
    yield
  else
    to_enum(:each_batch, partition: partition, cursor: cursor)
  end
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SearchEngine
|
|
4
|
+
module Sources
|
|
5
|
+
# Adapter that delegates batch enumeration to a provided callable.
|
|
6
|
+
#
|
|
7
|
+
# The callable is expected to implement `call(cursor:, partition:)` and return either
|
|
8
|
+
# an Enumerator or another enumerable of row arrays (it is called without a block). Shapes are application-defined.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# src = SearchEngine::Sources::LambdaSource.new(->(cursor:, partition:) { [[row1, row2]] })
|
|
12
|
+
# src.each_batch { |rows| ... }
|
|
13
|
+
#
|
|
14
|
+
# @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer`
|
|
15
|
+
class LambdaSource
|
|
16
|
+
include Base
|
|
17
|
+
|
|
18
|
+
# @param callable [#call] object responding to call(cursor:, partition:)
|
|
19
|
+
# @raise [ArgumentError] when callable does not respond to :call
|
|
20
|
+
def initialize(callable)
  # Fail fast at construction time rather than on the first each_batch call.
  unless callable.respond_to?(:call)
    raise ArgumentError, 'callable must respond to :call(cursor:, partition:)'
  end

  @callable = callable
end
|
|
25
|
+
|
|
26
|
+
# Enumerate batches produced by the callable.
|
|
27
|
+
# @param partition [Object, nil]
|
|
28
|
+
# @param cursor [Object, nil]
|
|
29
|
+
# @yieldparam rows [Array]
|
|
30
|
+
# @return [Enumerator]
|
|
31
|
+
def each_batch(partition: nil, cursor: nil)
  return enum_for(:each_batch, partition: partition, cursor: cursor) unless block_given?

  adapter_options = { callable: @callable.class.name }
  started = monotonic_ms
  begin
    produced = @callable.call(cursor: cursor, partition: partition)
    # Iterate enumerables directly so lazy sources (e.g. an Enumerator paging an
    # external API) are consumed batch by batch instead of being materialised up
    # front. Array() still covers nil and non-enumerable return values.
    batches = produced.respond_to?(:each) ? produced : Array(produced)
    # Report a running batch index, like the other source adapters do.
    idx = 0
    batches.each do |rows|
      duration = monotonic_ms - started
      instrument_batch_fetched(source: 'lambda', batch_index: idx, rows_count: Array(rows).size,
                               duration_ms: duration, partition: partition, cursor: cursor,
                               adapter_options: adapter_options)
      yield(rows)
      idx += 1
      # Restart the timer so each duration covers only the next batch.
      started = monotonic_ms
    end
  rescue StandardError => error
    # Report, then re-raise unchanged for the caller.
    instrument_error(source: 'lambda', error: error, partition: partition, cursor: cursor,
                     adapter_options: adapter_options)
    raise
  end
end
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SearchEngine
|
|
4
|
+
module Sources
|
|
5
|
+
# Stream rows from SQL using ActiveRecord connection, with PG cursor support when available.
|
|
6
|
+
#
|
|
7
|
+
# Supports large result sets via server-side cursors on PostgreSQL connections and falls
|
|
8
|
+
# back to paginated SELECT for other adapters. Yields arrays of rows in batches.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# src = SearchEngine::Sources::SqlSource.new(sql: "SELECT id, name FROM users", fetch_size: 2000)
|
|
12
|
+
# src.each_batch { |rows| ... }
|
|
13
|
+
#
|
|
14
|
+
# @note Emits "search_engine.source.batch_fetched" and "search_engine.source.error".
|
|
15
|
+
# @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer`
|
|
16
|
+
class SqlSource
|
|
17
|
+
include Base
|
|
18
|
+
|
|
19
|
+
# @param sql [String] SQL statement to execute
|
|
20
|
+
# @param binds [Array, nil] optional positional parameters for the SQL (Hash binds are rejected with InvalidParams)
|
|
21
|
+
# @param fetch_size [Integer, nil] batch size hint
|
|
22
|
+
# @param row_shape [Symbol, nil] :hash, :array or :auto (PG only)
|
|
23
|
+
# @param statement_timeout_ms [Integer, nil] optional statement timeout (PG only)
|
|
24
|
+
def initialize(sql:, binds: nil, fetch_size: nil, row_shape: nil, statement_timeout_ms: nil)
  @sql = sql.to_s
  @binds = binds
  # Configured defaults for SQL sources; an explicit fetch_size wins.
  source_defaults = SearchEngine.config.sources.sql
  chosen_fetch_size = fetch_size || source_defaults.fetch_size
  @fetch_size = chosen_fetch_size.to_i
  @row_shape = row_shape ? row_shape : :auto
  @statement_timeout_ms = statement_timeout_ms
end
|
|
32
|
+
|
|
33
|
+
# Iterate over batches produced by the SQL query.
|
|
34
|
+
# @param partition [Object, nil]
|
|
35
|
+
# @param cursor [Object, nil]
|
|
36
|
+
# @yieldparam rows [Array<Hash>, Array<Array>]
|
|
37
|
+
# @return [Enumerator]
|
|
38
|
+
def each_batch(partition: nil, cursor: nil, &block)
  return enum_for(:each_batch, partition: partition, cursor: cursor) unless block_given?

  # Use the raw connection that run_with_connection checked out and yields, instead
  # of looking ActiveRecord::Base.connection up again inside the block.
  run_with_connection do |raw|
    if postgres_connection?(raw)
      stream_postgres(raw, partition: partition, cursor: cursor, &block)
    else
      # The generic path obtains its own ActiveRecord connection; no raw handle needed.
      stream_generic(nil, partition: partition, cursor: cursor, &block)
    end
  end
rescue StandardError => error
  # Report, then re-raise unchanged for the caller.
  instrument_error(source: 'sql', error: error, partition: partition, cursor: cursor,
                   adapter_options: { fetch_size: @fetch_size, row_shape: @row_shape })
  raise
end
|
|
54
|
+
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
# Check a connection out of ActiveRecord::Base's pool and yield its raw driver
# handle (as resolved by +raw_connection+) to the block.
#
# @yieldparam raw [Object] raw connection
# @raise [SearchEngine::Errors::InvalidParams] when ActiveRecord is not loaded
def run_with_connection
  if defined?(ActiveRecord::Base)
    ActiveRecord::Base.connection_pool.with_connection { |ar_conn| yield raw_connection(ar_conn) }
  else
    raise SearchEngine::Errors::InvalidParams, 'SqlSource requires ActiveRecord connection'
  end
end
|
|
67
|
+
|
|
68
|
+
# Unwrap an adapter to its underlying driver connection when it exposes one;
# otherwise return the object unchanged.
#
# @param ar_conn [Object]
# @return [Object]
def raw_connection(ar_conn)
  ar_conn.respond_to?(:raw_connection) ? ar_conn.raw_connection : ar_conn
end
|
|
75
|
+
|
|
76
|
+
# Whether +conn+ looks like a PostgreSQL driver connection: either an instance of
# PG::Connection (when the pg gem is loaded) or an object whose class name
# contains "PG".
#
# @param conn [Object]
# @return [Boolean]
def postgres_connection?(conn)
  return true if defined?(PG) && conn.is_a?(PG::Connection)

  # Anonymous classes have a nil name; treat them as non-PostgreSQL instead of raising.
  conn.class.name.to_s.include?('PG')
end
|
|
79
|
+
|
|
80
|
+
# Stream the query through a server-side cursor on a raw PostgreSQL connection,
# yielding one array of rows per FETCH.
#
# Opens a read-only transaction, declares a NO SCROLL cursor for the SQL, fetches
# +@fetch_size+ rows at a time until a fetch returns nothing, and always closes the
# cursor, commits, and resets the statement timeout afterwards (each best-effort).
#
# @param conn [Object] raw connection responding to +exec+ and +exec_params+
# @param partition [Object, nil] forwarded to SQL building and instrumentation
# @param cursor [Object, nil] forwarded to SQL building and instrumentation
# @yieldparam rows [Array] one fetched batch
def stream_postgres(conn, partition:, cursor:)
  cursor_name = "se_cursor_#{object_id}"
  sql, params = build_sql_and_params(partition: partition, cursor: cursor)
  started = monotonic_ms
  begin
    set_statement_timeout(conn, @statement_timeout_ms) if @statement_timeout_ms
    # Use unnamed prepared statement + DECLARE CURSOR for streaming
    conn.exec('BEGIN READ ONLY')
    conn.exec_params("DECLARE #{cursor_name} NO SCROLL CURSOR FOR #{sql}", params)
    idx = 0
    loop do
      res = conn.exec("FETCH FORWARD #{@fetch_size} FROM #{cursor_name}")
      begin
        break if res.ntuples.zero?

        rows = rows_from_pg_result(res)
      ensure
        # Rows are copied out above; release the driver-side result right away so
        # a long stream does not keep every fetched page alive until GC.
        res.clear if res.respond_to?(:clear)
      end

      duration = monotonic_ms - started
      instrument_batch_fetched(source: 'sql', batch_index: idx, rows_count: rows.size, duration_ms: duration,
                               partition: partition, cursor: cursor,
                               adapter_options: { fetch_size: @fetch_size, row_shape: @row_shape })
      yield rows
      idx += 1
      # Restart the timer so each duration covers only the next fetch.
      started = monotonic_ms
    end
  ensure
    begin
      conn.exec("CLOSE #{cursor_name}")
    rescue StandardError
      # ignore
    end
    begin
      conn.exec('COMMIT')
    rescue StandardError
      # ignore
    end
    reset_statement_timeout(conn) if @statement_timeout_ms
  end
end
|
|
118
|
+
|
|
119
|
+
# Convert a driver result into rows: +to_a+ for the :hash and :auto shapes,
# +values+ for any other shape.
#
# @param res [Object] result responding to +to_a+ and +values+
# @return [Array]
def rows_from_pg_result(res)
  hash_rows = %i[hash auto].include?(@row_shape)
  hash_rows ? res.to_a : res.values
end
|
|
126
|
+
|
|
127
|
+
# Fallback for non-PostgreSQL connections: page through the query with
# LIMIT/OFFSET via ActiveRecord's +exec_query+, yielding one array of rows per page.
#
# NOTE: the wrapper adds no ORDER BY of its own, so pages are only stable when the
# supplied SQL has a deterministic order.
#
# @param _conn [Object, nil] unused; the ActiveRecord connection is looked up here
# @param partition [Object, nil] forwarded to SQL building and instrumentation
# @param cursor [Object, nil] forwarded to SQL building and instrumentation
# @yieldparam rows [Array<Hash>] one page of rows
def stream_generic(_conn, partition:, cursor:)
  ar_conn = ActiveRecord::Base.connection
  sql, params = build_sql_and_params(partition: partition, cursor: cursor)
  idx = 0
  started = monotonic_ms
  loop do
    chunked_sql = sql_with_limit(sql, @fetch_size, idx)
    rows = ar_conn.exec_query(chunked_sql, 'SqlSource', params_for_ar(params)).to_a
    break if rows.empty?

    duration = monotonic_ms - started
    instrument_batch_fetched(source: 'sql', batch_index: idx, rows_count: rows.size, duration_ms: duration,
                             partition: partition, cursor: cursor,
                             adapter_options: { fetch_size: @fetch_size, row_shape: :hash })
    yield rows
    # A short page means the result set is exhausted; skip the extra empty query.
    break if rows.size < @fetch_size

    idx += 1
    # Restart the timer so each duration covers only the next page.
    started = monotonic_ms
  end
end
|
|
148
|
+
|
|
149
|
+
# Wrap +base_sql+ in a subquery and append LIMIT/OFFSET for the requested page.
# Both numbers go through Integer() before being placed in the SQL text.
#
# @param base_sql [String]
# @param fetch_size [Integer] page size
# @param page_idx [Integer] zero-based page number
# @return [String]
def sql_with_limit(base_sql, fetch_size, page_idx)
  skipped = page_idx * fetch_size
  format('SELECT * FROM (%s) se_sub LIMIT %d OFFSET %d', base_sql, Integer(fetch_size), Integer(skipped))
end
|
|
153
|
+
|
|
154
|
+
# Bind list passed to +exec_query+: an Array is passed through untouched, anything
# else (including nil) becomes an empty list.
#
# @param params [Array, Object, nil]
# @return [Array]
def params_for_ar(params)
  params.is_a?(Array) ? params : []
end
|
|
161
|
+
|
|
162
|
+
# Return a copy of the configured SQL together with its positional binds.
#
# Keyword arguments (partition/cursor) are accepted but not used here: partition
# and cursor semantics are adapter/domain-specific, so callers are expected to
# incorporate them into the SQL and binds themselves.
#
# @return [Array(String, Array)] the SQL copy and the bind list
# @raise [SearchEngine::Errors::InvalidParams] when binds is a Hash or any other non-Array value
def build_sql_and_params(**)
  # Work on a copy so the original SQL string is never mutated.
  sql = @sql.dup
  positional =
    if @binds.is_a?(Array)
      @binds.dup
    elsif @binds.is_a?(Hash)
      raise SearchEngine::Errors::InvalidParams,
            'SqlSource with Hash binds is not supported; use positional binds (Array)'
    elsif @binds.nil?
      []
    else
      raise SearchEngine::Errors::InvalidParams, 'SqlSource binds must be an Array or nil'
    end

  [sql, positional]
end
|
|
182
|
+
|
|
183
|
+
# Apply a statement timeout (milliseconds) to the connection, best-effort.
#
# PostgreSQL's SET accepts no bind parameters, and SET LOCAL has no effect outside a
# transaction block, so the value is validated with Integer() and sent as a plain
# session-level SET. +reset_statement_timeout+ restores the default afterwards.
#
# @param conn [Object] raw connection responding to +exec+
# @param ms [Integer, String] timeout in milliseconds
# @return [Object, nil] the driver result, or nil when the value is invalid or the
#   statement fails
def set_statement_timeout(conn, ms)
  conn.exec("SET statement_timeout = #{Integer(ms)}")
rescue StandardError
  # best-effort: failing to set the timeout must not abort the stream
end
|
|
188
|
+
|
|
189
|
+
# Restore the connection's default statement timeout, best-effort: any
# StandardError raised by the driver is swallowed.
#
# @param conn [Object] raw connection responding to +exec+
# @return [Object, nil] the driver result, or nil when the statement fails
def reset_statement_timeout(conn)
  begin
    conn.exec('RESET statement_timeout')
  rescue StandardError
    nil
  end
end
|
|
194
|
+
end
|
|
195
|
+
end
|
|
196
|
+
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SearchEngine
|
|
4
|
+
# Factory and DSL for building data source adapters that yield batches.
|
|
5
|
+
#
|
|
6
|
+
# Usage via symbol and options:
|
|
7
|
+
# SearchEngine::Sources.build(:active_record, model: ::Product, scope: -> { where(active: true) }, batch_size: 2000)
|
|
8
|
+
# SearchEngine::Sources.build(:sql, sql: "SELECT * FROM products WHERE active = TRUE", fetch_size: 2000)
|
|
9
|
+
#
|
|
10
|
+
# Usage via block (lambda source):
|
|
11
|
+
# SearchEngine::Sources.build(:lambda) do |cursor:, partition:|
|
|
12
|
+
# Enumerator.new { |y| external_api.each_page(cursor) { |rows| y << rows } }
|
|
13
|
+
# end
|
|
14
|
+
#
|
|
15
|
+
# All adapters implement `each_batch(partition:, cursor:)` and return an Enumerator
|
|
16
|
+
# when no block is provided.
|
|
17
|
+
module Sources
|
|
18
|
+
# Build a source adapter from a symbol and options or from a block.
|
|
19
|
+
#
|
|
20
|
+
# @param type [Symbol] :active_record, :sql, or :lambda
|
|
21
|
+
# @param options [Hash] adapter-specific options
|
|
22
|
+
# @yield for :lambda sources, a block taking (cursor:, partition:) and returning an Enumerator
|
|
23
|
+
# @return [Object] adapter responding to `each_batch(partition:, cursor:)`
|
|
24
|
+
def self.build(type, **options, &block)
  # nil and other non-symbolizable types fall through to the "unknown source type"
  # error below instead of failing with NoMethodError on to_sym.
  kind = type.respond_to?(:to_sym) ? type.to_sym : nil
  case kind
  when :active_record
    model = options[:model]
    unless model.is_a?(Class)
      raise SearchEngine::Errors::InvalidParams,
            'active_record source requires :model (ActiveRecord class). See ' \
            'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer.'
    end

    # Missing options are passed as nil; the adapter falls back to configured defaults.
    ActiveRecordSource.new(model: model, scope: options[:scope], batch_size: options[:batch_size],
                           use_transaction: options[:use_transaction], readonly: options[:readonly])
  when :sql
    sql = options[:sql]
    unless sql.is_a?(String) && !sql.strip.empty?
      raise SearchEngine::Errors::InvalidParams,
            'sql source requires :sql (String). See ' \
            'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer.'
    end

    SqlSource.new(sql: sql, binds: options[:binds], fetch_size: options[:fetch_size],
                  row_shape: options[:row_shape], statement_timeout_ms: options[:statement_timeout_ms])
  when :lambda
    # A block takes precedence over an explicit :callable option.
    callable = block || options[:callable]
    unless callable
      raise SearchEngine::Errors::InvalidParams,
            'lambda source requires a block or :callable. See ' \
            'https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer.'
    end

    LambdaSource.new(callable)
  else
    raise SearchEngine::Errors::InvalidParams,
          "unknown source type: #{type.inspect}. Supported: :active_record, :sql, :lambda"
  end
end
|
|
70
|
+
end
|
|
71
|
+
end
|