search-engine-for-typesense 30.1.8.2 → 30.1.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,241 @@
# frozen_string_literal: true

require 'set'

module SearchEngine
  # Public dependency-ordering helpers for Typesense collection reference graphs.
  #
  # Every helper works on a "reverse graph": a Hash keyed by the referenced
  # (target) collection name whose values are arrays of edges shaped like
  # +{ referrer:, local_key:, foreign_key: }+ (all Strings after normalization).
  # Callers may hand in a prebuilt graph through +reverse_graph:+; otherwise a
  # graph is built from the requested +source+.
  module DependencyPlanner
    class << self
      # Build a normalized reverse dependency graph.
      # @param source [Symbol] :registry, :typesense, or :auto
      # @param client [SearchEngine::Client, nil] falls back to SearchEngine.client
      #   for the :typesense and :auto sources
      # @return [Hash{String=>Array<Hash>}] target collection => reference edges
      # @raise [ArgumentError] when +source+ is not one of the supported values
      def reverse_graph(source: :registry, client: nil)
        # Guard the coercion so that nil (or any non-symbolizable value) reaches
        # the ArgumentError below instead of failing with NoMethodError.
        kind = source.respond_to?(:to_sym) ? source.to_sym : source

        graph = case kind
                when :registry
                  SearchEngine::Cascade.send(:build_from_registry)
                when :typesense
                  SearchEngine::Cascade.send(:build_from_typesense, client || SearchEngine.client)
                when :auto
                  SearchEngine::Cascade.build_reverse_graph(client: client || SearchEngine.client)
                else
                  raise ArgumentError, 'source must be :registry, :typesense, or :auto'
                end

        normalize_reverse_graph(graph)
      end

      # Order collection names so referenced collections precede referrers.
      # @param collections [Array<Symbol, String, Class>] collections or model classes
      # @param source [Symbol] :registry, :typesense, or :auto
      # @param client [SearchEngine::Client, nil]
      # @param reverse_graph [Hash, nil] prebuilt reverse graph
      # @return [Array<String>]
      def order_collections(collections, source: :registry, client: nil, reverse_graph: nil)
        graph = graph_or_build(reverse_graph, source: source, client: client)
        topo_sort_subset(graph, normalize_collections(collections))
      end

      # Order events by their collection dependency order while preserving per-collection event order.
      # Events whose collection cannot be determined are appended after all ordered events.
      # @param events [Array<Object, Hash>]
      # @param collection_method [Symbol, String] event reader or hash key for collection name
      # @param source [Symbol] :registry, :typesense, or :auto
      # @param client [SearchEngine::Client, nil]
      # @param reverse_graph [Hash, nil] prebuilt reverse graph
      # @return [Array<Object, Hash>]
      def order_events(events, collection_method: :collection, source: :registry, client: nil, reverse_graph: nil)
        graph = graph_or_build(reverse_graph, source: source, client: client)
        grouped = Hash.new { |h, k| h[k] = [] }
        without_collection = []

        Array(events).each do |event|
          collection = collection_from_event(event, collection_method)
          if collection.nil? || collection.empty?
            without_collection << event
          else
            grouped[collection] << event
          end
        end

        ordered_collections = topo_sort_subset(graph, grouped.keys)
        ordered_collections.flat_map { |collection| grouped[collection] } + without_collection
      end

      # Return collections that directly reference the given collection.
      # @param collection [Symbol, String, Class]
      # @param source [Symbol] :registry, :typesense, or :auto
      # @param client [SearchEngine::Client, nil]
      # @param reverse_graph [Hash, nil] prebuilt reverse graph
      # @return [Array<String>]
      def referencers_for(collection, source: :registry, client: nil, reverse_graph: nil)
        graph = graph_or_build(reverse_graph, source: source, client: client)
        name = normalize_collection(collection)
        # Normalization turns a missing referrer into "", so blank names have to
        # be rejected explicitly; a nil check would never filter anything.
        graph.fetch(name, []).map { |edge| edge[:referrer].to_s }.reject(&:empty?).uniq
      end

      # Return collections directly referenced by the given collection.
      # @param collection [Symbol, String, Class]
      # @param source [Symbol] :registry, :typesense, or :auto
      # @param client [SearchEngine::Client, nil]
      # @param reverse_graph [Hash, nil] prebuilt reverse graph
      # @return [Array<String>]
      def dependencies_for(collection, source: :registry, client: nil, reverse_graph: nil)
        graph = graph_or_build(reverse_graph, source: source, client: client)
        name = normalize_collection(collection)
        deps = []
        graph.each do |target, edges|
          deps << target if Array(edges).any? { |edge| edge[:referrer] == name }
        end
        deps.uniq
      end

      # Build the two Bulk stages from collection dependencies.
      #
      # +:stage_1+ holds the inputs that do not reference another input;
      # +:cascade+ holds every direct referencer of an input, dependency-ordered.
      # @param collections [Array<Symbol, String, Class>] collections or model classes
      # @param source [Symbol] :registry, :typesense, or :auto
      # @param client [SearchEngine::Client, nil]
      # @param reverse_graph [Hash, nil] prebuilt reverse graph
      # @return [Hash{Symbol=>Array<String>}] :stage_1 and :cascade collection names
      def bulk_stages(collections, source: :auto, client: nil, reverse_graph: nil)
        inputs = normalize_collections(collections)
        graph = graph_or_build(reverse_graph, source: source, client: client)
        input_set = inputs.to_h { |name| [name, true] }
        internal_referrers = internal_referrers_within_inputs(graph, input_set)
        cascade_candidates = unique_referencers_of_inputs(graph, inputs)

        {
          stage_1: inputs.reject { |name| internal_referrers.include?(name) },
          cascade: topo_sort_subset(graph, cascade_candidates)
        }
      end

      private

      # Normalize a caller-supplied graph, or build one from +source+ when none is given.
      def graph_or_build(graph, source:, client:)
        graph ? normalize_reverse_graph(graph) : reverse_graph(source: source, client: client)
      end

      # Copy +graph+ into a fresh Hash with String keys and normalized edges.
      # Targets with a blank name are dropped; targets without edges get no entry.
      def normalize_reverse_graph(graph)
        normalized = Hash.new { |h, k| h[k] = [] }
        Hash(graph).each do |target, edges|
          target_name = target.to_s
          next if target_name.empty?

          Array(edges).each do |edge|
            normalized[target_name] << normalize_edge(edge)
          end
        end
        normalized
      end

      # Reduce an edge to its three String attributes (missing values become "").
      def normalize_edge(edge)
        {
          referrer: edge_value(edge, :referrer).to_s,
          local_key: edge_value(edge, :local_key).to_s,
          foreign_key: edge_value(edge, :foreign_key).to_s
        }
      end

      # Read +key+ from an edge through a reader method, or a Symbol/String hash key.
      def edge_value(edge, key)
        return edge.public_send(key) if edge.respond_to?(key)

        edge[key] || edge[key.to_s]
      end

      # Flatten the input, drop nils and blank names, and de-duplicate.
      def normalize_collections(collections)
        Array(collections).flatten.compact.filter_map do |collection|
          name = normalize_collection(collection)
          name unless name.empty?
        end.uniq
      end

      # Resolve a collection name: classes answer via +collection+ when they
      # respond to it (else their class name); anything else is stringified.
      def normalize_collection(collection)
        if collection.is_a?(Class)
          collection.respond_to?(:collection) ? collection.collection.to_s : collection.name.to_s
        else
          collection.to_s
        end
      end

      # Extract the collection name from an event object or hash-like event.
      # Returns "" when the event offers neither a reader nor +[]+.
      def collection_from_event(event, collection_method)
        if event.respond_to?(collection_method)
          event.public_send(collection_method).to_s
        elsif event.respond_to?(:[])
          (event[collection_method.to_sym] || event[collection_method.to_s]).to_s
        else
          ''
        end
      end

      # Inputs that reference another input (both ends of the edge are inputs).
      def internal_referrers_within_inputs(reverse_graph, input_set)
        refs = Set.new
        reverse_graph.each do |target, edges|
          next unless input_set[target]

          Array(edges).each do |edge|
            referrer = edge[:referrer].to_s
            refs.add(referrer) if input_set[referrer]
          end
        end
        refs
      end

      # Distinct, non-blank referrers of any input, in first-seen order.
      def unique_referencers_of_inputs(reverse_graph, inputs)
        seen = Set.new
        inputs.each do |name|
          # fetch avoids adding empty entries through the graph's default proc.
          Array(reverse_graph.fetch(name, [])).each do |edge|
            referrer = edge[:referrer].to_s
            seen.add(referrer) unless referrer.empty?
          end
        end
        seen.to_a
      end

      # Topologically sort +subset+ (Kahn's algorithm) so that targets precede
      # their referrers. Ties are broken alphabetically for a deterministic
      # result. Nodes that cannot be ordered because of a cycle are reported
      # via +instrument_cycle!+ and appended in alphabetical order.
      def topo_sort_subset(reverse_graph, subset)
        nodes = Array(subset).uniq
        node_set = nodes.to_h { |name| [name, true] }
        adj = Hash.new { |h, k| h[k] = Set.new }
        indeg = Hash.new(0)

        nodes.each { |name| indeg[name] = 0 }

        reverse_graph.each do |target, edges|
          Array(edges).each do |edge|
            referrer = edge[:referrer].to_s
            next unless node_set[referrer] && node_set[target]
            # A self-reference places no ordering constraint on the collection.
            # Counting it would leave the in-degree permanently above zero and
            # misreport the node (and all of its dependents) as a cycle.
            next if referrer == target
            # add? returns nil for an already-known edge, so duplicates are
            # counted only once.
            next unless adj[target].add?(referrer)

            indeg[referrer] += 1
          end
        end

        queue = nodes.select { |name| indeg[name].to_i <= 0 }.sort
        order = []
        until queue.empty?
          name = queue.shift
          order << name
          adj[name].each do |dependent|
            indeg[dependent] -= 1
            queue << dependent if indeg[dependent] <= 0
          end
          queue.sort!
        end

        remaining = nodes - order
        instrument_cycle!(remaining) if remaining.any?
        order + remaining.sort
      end

      # Best-effort cycle reporting: emits an instrumentation event and a logger
      # warning. Any failure while reporting is swallowed so that ordering
      # still returns a result.
      def instrument_cycle!(collections)
        payload = { collections: collections.sort }
        SearchEngine::Instrumentation.instrument('search_engine.dependency_planner.cycle', payload) {}
        SearchEngine.config.logger&.warn(
          "search_engine dependency planner cycle detected: #{payload[:collections].join(', ')}"
        )
      rescue StandardError
        nil
      end
    end
  end
end
@@ -66,42 +66,25 @@ module SearchEngine
66
66
  client = SearchEngine.client
67
67
  buffer = +''
68
68
  next_index = sequence_generator
69
-
70
- batches = []
71
- docs_total = 0
72
- success_total = 0
73
- failed_total = 0
74
- failed_batches_total = 0
75
- batches_total = 0
76
- source_batches_done = 0
69
+ import_context = {
70
+ client: client,
71
+ collection: into,
72
+ action: action,
73
+ retry_policy: retry_policy,
74
+ buffer: buffer,
75
+ next_index: next_index
76
+ }
77
+ state = initialize_sequential_state
77
78
  started_at = monotonic_ms
78
79
 
79
80
  docs_enum.each do |raw_batch|
80
- stats_list = import_batch_with_handling(
81
- client: client,
82
- collection: into,
81
+ process_single_batch_sequential(
82
+ import_context,
83
83
  raw_batch: raw_batch,
84
- action: action,
85
- retry_policy: retry_policy,
86
- buffer: buffer,
87
- next_index: next_index
88
- )
89
-
90
- stats_list.each do |stats|
91
- docs_total += stats[:docs_count].to_i
92
- success_total += stats[:success_count].to_i
93
- failed_total += stats[:failure_count].to_i
94
- failed_batches_total += 1 if stats[:failure_count].to_i.positive?
95
- batches_total += 1
96
- batches << stats
97
- validate_soft_batch_size!(batch_size, stats[:docs_count])
98
- log_batch(stats, batches_total) if log_batches
99
- end
100
-
101
- source_batches_done += 1
102
- on_batch&.call(
103
- batches_done: source_batches_done, docs_total: docs_total,
104
- success_total: success_total, failed_total: failed_total
84
+ batch_size: batch_size,
85
+ log_batches: log_batches,
86
+ on_batch: on_batch,
87
+ state: state
105
88
  )
106
89
  end
107
90
 
@@ -110,17 +93,56 @@ module SearchEngine
110
93
 
111
94
  Summary.new(
112
95
  collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
113
- status: status_from_counts(success_total, failed_total),
114
- batches_total: batches_total,
115
- docs_total: docs_total,
116
- success_total: success_total,
117
- failed_total: failed_total,
118
- failed_batches_total: failed_batches_total,
96
+ status: status_from_counts(state[:success_total], state[:failed_total]),
97
+ batches_total: state[:batches_total],
98
+ docs_total: state[:docs_total],
99
+ success_total: state[:success_total],
100
+ failed_total: state[:failed_total],
101
+ failed_batches_total: state[:failed_batches_total],
119
102
  duration_ms_total: total_duration_ms,
120
- batches: batches
103
+ source_duration_ms_total: state[:source_duration_ms_total].round(1),
104
+ map_duration_ms_total: state[:map_duration_ms_total].round(1),
105
+ jsonl_duration_ms_total: state[:jsonl_duration_ms_total].round(1),
106
+ import_duration_ms_total: state[:import_duration_ms_total].round(1),
107
+ batches: state[:batches]
121
108
  )
122
109
  end
123
110
 
111
# Build the zeroed accumulator that the sequential import loop threads
# through each batch. Integer counters start at 0, per-stage duration totals
# start at 0.0, and :batches starts as a fresh empty Array.
# @return [Hash] sequential import counters using the same keys as parallel aggregation
def initialize_sequential_state
  state = { batches: [] }

  %i[docs_total success_total failed_total failed_batches_total batches_total].each do |counter|
    state[counter] = 0
  end

  %i[
    source_duration_ms_total
    map_duration_ms_total
    jsonl_duration_ms_total
    import_duration_ms_total
  ].each do |duration|
    state[duration] = 0.0
  end

  state[:source_batches_done] = 0
  state
end
127
+
128
# Import one source batch on the sequential path and fold its results into +state+.
#
# Reads any stage metrics attached to +raw_batch+, imports the batch using the
# settings held in +import_context+, aggregates the resulting stats and stage
# metrics into +state+, bumps :source_batches_done and finally reports a
# progress snapshot to +on_batch+ when a callback was given.
# @param import_context [Hash] must contain :client, :collection, :action,
#   :retry_policy, :buffer and :next_index
# @param raw_batch [Object] batch forwarded to the importer
# @param batch_size [Object] forwarded to aggregate_stats
# @param log_batches [Object] forwarded to aggregate_stats
# @param on_batch [#call, nil] optional progress callback, invoked with keyword arguments
# @param state [Hash] mutable accumulator for the sequential run
# @return [Object, nil] the callback's return value, or nil without a callback
# @raise [KeyError] when +import_context+ lacks a required key
def process_single_batch_sequential(import_context, raw_batch:, batch_size:, log_batches:, on_batch:, state:)
  stage_metrics = stage_metrics_for(raw_batch)

  client, collection, action, retry_policy, buffer, next_index = import_context.fetch_values(
    :client, :collection, :action, :retry_policy, :buffer, :next_index
  )

  stats_list = import_batch_with_handling(
    client: client,
    collection: collection,
    raw_batch: raw_batch,
    action: action,
    retry_policy: retry_policy,
    buffer: buffer,
    next_index: next_index
  )

  aggregate_stats(stats_list, state, batch_size, log_batches)
  aggregate_stage_metrics(stage_metrics, stats_list, state)
  state[:source_batches_done] += 1

  # Without a callback the snapshot is never computed.
  return if on_batch.nil?

  on_batch.call(**progress_snapshot(state))
end
145
+
124
146
  # Process batches in parallel using a thread pool.
125
147
  #
126
148
  # Materializes all batches upfront and processes them concurrently using
@@ -202,6 +224,10 @@ module SearchEngine
202
224
  failed_total: 0,
203
225
  failed_batches_total: 0,
204
226
  batches_total: 0,
227
+ source_duration_ms_total: 0.0,
228
+ map_duration_ms_total: 0.0,
229
+ jsonl_duration_ms_total: 0.0,
230
+ import_duration_ms_total: 0.0,
205
231
  source_batches_done: 0,
206
232
  idx_counter: -1,
207
233
  started_at: monotonic_ms,
@@ -325,6 +351,7 @@ module SearchEngine
325
351
  thread_client = SearchEngine.client
326
352
  thread_buffer = +''
327
353
  thread_idx = shared_state[:mtx].synchronize { shared_state[:idx_counter] += 1 }
354
+ stage_metrics = stage_metrics_for(raw_batch)
328
355
 
329
356
  snapshot = begin
330
357
  stats_list = import_batch_with_handling(
@@ -339,6 +366,7 @@ module SearchEngine
339
366
 
340
367
  shared_state[:mtx].synchronize do
341
368
  aggregate_stats(stats_list, shared_state, batch_size, log_batches)
369
+ aggregate_stage_metrics(stage_metrics, stats_list, shared_state)
342
370
  shared_state[:source_batches_done] += 1
343
371
  progress_snapshot(shared_state)
344
372
  end
@@ -355,6 +383,7 @@ module SearchEngine
355
383
  err_msg = " batch_index=#{thread_idx} → error=#{error.class}: #{error.message.to_s[0, 200]}"
356
384
  warn(SearchEngine::Logging::Color.apply(err_msg, :red))
357
385
  aggregate_stats([failure_stat], shared_state, batch_size, log_batches)
386
+ aggregate_stage_metrics(stage_metrics, [failure_stat], shared_state)
358
387
  shared_state[:source_batches_done] += 1
359
388
  progress_snapshot(shared_state)
360
389
  end
@@ -420,6 +449,10 @@ module SearchEngine
420
449
  failed_total: shared_state[:failed_total],
421
450
  failed_batches_total: shared_state[:failed_batches_total],
422
451
  duration_ms_total: total_duration_ms,
452
+ source_duration_ms_total: shared_state[:source_duration_ms_total].round(1),
453
+ map_duration_ms_total: shared_state[:map_duration_ms_total].round(1),
454
+ jsonl_duration_ms_total: shared_state[:jsonl_duration_ms_total].round(1),
455
+ import_duration_ms_total: shared_state[:import_duration_ms_total].round(1),
423
456
  batches: shared_state[:batches]
424
457
  )
425
458
  end
@@ -586,8 +619,10 @@ module SearchEngine
586
619
  docs = BatchPlanner.to_array(raw_batch)
587
620
  return [] if docs.empty?
588
621
 
622
+ jsonl_started_at = monotonic_ms
589
623
  docs_count, bytes_sent = BatchPlanner.encode_jsonl!(docs, buffer)
590
624
  jsonl = buffer.dup
625
+ jsonl_duration_ms = (monotonic_ms - jsonl_started_at).round(1)
591
626
  # Use provided batch_index if available (for recursive splits), otherwise compute from next_index
592
627
  idx = batch_index || (next_index.is_a?(Proc) ? next_index.call : next_index)
593
628
 
@@ -605,6 +640,8 @@ module SearchEngine
605
640
  dry_run: false
606
641
  )
607
642
  stats[:duration_ms] = (monotonic_ms - started_at).round(1)
643
+ stats[:jsonl_duration_ms] = jsonl_duration_ms
644
+ stats[:import_duration_ms] = stats[:duration_ms]
608
645
  stats[:index] = idx
609
646
  [stats]
610
647
  rescue Errors::Api => error
@@ -646,11 +683,29 @@ module SearchEngine
646
683
  attempts: 1,
647
684
  http_status: error&.status.to_i,
648
685
  duration_ms: 0.0,
686
+ jsonl_duration_ms: 0.0,
687
+ import_duration_ms: 0.0,
649
688
  bytes_sent: bytes_sent,
650
689
  errors_sample: [safe_error_excerpt(error)]
651
690
  }
652
691
  end
653
692
 
693
# Read the stage metrics Hash stored on +raw_batch+ under the
# @__search_engine_stage_metrics__ instance variable.
# @param raw_batch [Object] batch object that may carry attached metrics
# @return [Hash] the attached metrics, or an empty Hash when nothing usable
#   is attached or reading the variable raises
def stage_metrics_for(raw_batch)
  attached = raw_batch.instance_variable_get(:@__search_engine_stage_metrics__)
  attached.is_a?(Hash) ? attached : {}
rescue StandardError
  {}
end
701
+
702
# Add one batch's stage timings to the running duration totals in +shared_state+.
#
# Source and map durations come from +stage_metrics+; JSONL and import
# durations are summed over every entry in +stats_list+. Missing values count
# as 0.0.
# @param stage_metrics [Hash] may hold :source_duration_ms and :map_duration_ms
# @param stats_list [Array<Hash>] may hold :jsonl_duration_ms and :import_duration_ms
# @param shared_state [Hash] accumulator holding the four *_duration_ms_total keys
# @return [Float] the updated :import_duration_ms_total
def aggregate_stage_metrics(stage_metrics, stats_list, shared_state)
  summed = ->(field) { stats_list.sum { |entry| entry[field].to_f } }

  shared_state[:source_duration_ms_total] += stage_metrics[:source_duration_ms].to_f
  shared_state[:map_duration_ms_total] += stage_metrics[:map_duration_ms].to_f
  shared_state[:jsonl_duration_ms_total] += summed.call(:jsonl_duration_ms)
  shared_state[:import_duration_ms_total] += summed.call(:import_duration_ms)
end
708
+
654
709
  def safe_error_excerpt(error)
655
710
  cls = error&.class&.name
656
711
  msg = error&.message.to_s
@@ -26,6 +26,10 @@ module SearchEngine
26
26
  :failed_total,
27
27
  :failed_batches_total,
28
28
  :duration_ms_total,
29
+ :source_duration_ms_total,
30
+ :map_duration_ms_total,
31
+ :jsonl_duration_ms_total,
32
+ :import_duration_ms_total,
29
33
  :batches,
30
34
  keyword_init: true
31
35
  )
@@ -489,6 +493,7 @@ module SearchEngine
489
493
  end
490
494
 
491
495
  def instrument_partition_finish(klass, target_into, pfields, summary, started_at)
496
+ duration_ms = (monotonic_ms - started_at).round(1)
492
497
  SearchEngine::Instrumentation.instrument(
493
498
  'search_engine.indexer.partition_finish',
494
499
  {
@@ -501,8 +506,12 @@ module SearchEngine
501
506
  success_total: summary.success_total,
502
507
  failed_total: summary.failed_total,
503
508
  status: summary.status,
504
- duration_ms: (monotonic_ms - started_at).round(1)
505
- }
509
+ duration_ms: duration_ms,
510
+ source_duration_ms_total: summary_metric(summary, :source_duration_ms_total),
511
+ map_duration_ms_total: summary_metric(summary, :map_duration_ms_total),
512
+ jsonl_duration_ms_total: summary_metric(summary, :jsonl_duration_ms_total),
513
+ import_duration_ms_total: summary_metric(summary, :import_duration_ms_total)
514
+ }.compact
506
515
  ) {}
507
516
  end
508
517
 
@@ -510,12 +519,37 @@ module SearchEngine
510
519
  Enumerator.new do |y|
511
520
  idx = 0
512
521
  rows_enum.each do |rows|
522
+ source_started_at = monotonic_ms
523
+ rows = BatchPlanner.to_array(rows)
524
+ source_duration_ms = (monotonic_ms - source_started_at).round(1)
525
+
526
+ map_started_at = monotonic_ms
513
527
  docs, _report = mapper.map_batch!(rows, batch_index: idx)
528
+ map_duration_ms = (monotonic_ms - map_started_at).round(1)
529
+ attach_stage_metrics!(
530
+ docs,
531
+ source_duration_ms: source_duration_ms,
532
+ map_duration_ms: map_duration_ms,
533
+ source_rows_count: rows.size
534
+ )
514
535
  y << docs
515
536
  idx += 1
516
537
  end
517
538
  end
518
539
  end
540
+
541
# Store +metrics+ on +docs+ under the @__search_engine_stage_metrics__
# instance variable. Best-effort: if the variable cannot be set (for example
# on a frozen object) the error is swallowed.
# @param docs [Object] object that should carry the metrics
# @param metrics [Hash] stage timing data to attach
# @return [Hash, nil] +metrics+ on success, nil when attaching failed
def attach_stage_metrics!(docs, metrics)
  begin
    docs.instance_variable_set(:@__search_engine_stage_metrics__, metrics)
  rescue StandardError
    nil
  end
end
546
+
547
# Read an optional numeric metric from +summary+, rounded to one decimal.
# @param summary [Object] object that may expose a reader named +key+
# @param key [Symbol] reader name
# @return [Float, nil] the rounded value; nil when +summary+ has no such
#   reader or the reader returns nil
def summary_metric(summary, key)
  return nil unless summary.respond_to?(key)

  raw = summary.public_send(key)
  return nil if raw.nil?

  raw.to_f&.round(1)
end
519
553
  end
520
554
  end
521
555
  end
@@ -172,6 +172,11 @@ module SearchEngine
172
172
  entry[:success_total] = summary_value(summary, :success_total).to_i
173
173
  entry[:failed_total] = summary_value(summary, :failed_total).to_i
174
174
  entry[:sample_error] = summary_value(summary, :sample_error)
175
+ entry[:duration_ms_total] = summary_value(summary, :duration_ms_total)
176
+ entry[:source_duration_ms_total] = summary_value(summary, :source_duration_ms_total)
177
+ entry[:map_duration_ms_total] = summary_value(summary, :map_duration_ms_total)
178
+ entry[:jsonl_duration_ms_total] = summary_value(summary, :jsonl_duration_ms_total)
179
+ entry[:import_duration_ms_total] = summary_value(summary, :import_duration_ms_total)
175
180
  end
176
181
 
177
182
  def summary_value(summary, key)