connectors_service 8.6.0.4.pre.20221114T233727Z → 8.6.0.4.pre.20221116T024501Z

Files changed (33)
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +4 -4
  3. data/lib/app/app.rb +4 -0
  4. data/lib/app/dispatcher.rb +30 -17
  5. data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
  6. data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
  7. data/lib/connectors/base/connector.rb +27 -5
  8. data/lib/connectors/example/connector.rb +3 -12
  9. data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
  10. data/lib/connectors/gitlab/connector.rb +3 -12
  11. data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
  12. data/lib/connectors/mongodb/connector.rb +9 -24
  13. data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
  14. data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
  15. data/lib/connectors/sync_status.rb +6 -1
  16. data/lib/connectors/tolerable_error_helper.rb +43 -0
  17. data/lib/core/connector_job.rb +96 -23
  18. data/lib/core/connector_settings.rb +29 -6
  19. data/lib/core/elastic_connector_actions.rb +77 -55
  20. data/lib/core/filtering/validation_job_runner.rb +1 -1
  21. data/lib/core/ingestion/es_sink.rb +68 -9
  22. data/lib/core/ingestion.rb +0 -1
  23. data/lib/core/jobs/consumer.rb +114 -0
  24. data/lib/core/jobs/producer.rb +26 -0
  25. data/lib/core/single_scheduler.rb +1 -1
  26. data/lib/core/sync_job_runner.rb +20 -12
  27. data/lib/core.rb +2 -0
  28. data/lib/utility/error_monitor.rb +108 -0
  29. data/lib/utility/errors.rb +0 -12
  30. data/lib/utility/logger.rb +0 -1
  31. data/lib/utility.rb +6 -0
  32. metadata +12 -3
  33. data/lib/core/ingestion/ingester.rb +0 -90
data/lib/core/elastic_connector_actions.rb CHANGED
@@ -132,11 +132,35 @@ module Core
         update_connector_fields(connector_id, { :filtering => filtering })
       end
 
-      def claim_job(connector_id)
+      def update_connector_sync_now(connector_id, sync_now)
+        doc = connector_with_concurrency_control(connector_id)
+
+        body = { sync_now: sync_now, last_synced: Time.now }
+
+        update_connector_fields(
+          connector_id,
+          body,
+          doc[:seq_no],
+          doc[:primary_term]
+        )
+      end
+
+      def update_connector_last_sync_status(connector_id, last_sync_status)
+        doc = connector_with_concurrency_control(connector_id)
+
+        update_connector_fields(
+          connector_id,
+          { last_sync_status: last_sync_status },
+          doc[:seq_no],
+          doc[:primary_term]
+        )
+      end
+
+      def connector_with_concurrency_control(connector_id)
         seq_no = nil
         primary_term = nil
-        sync_in_progress = false
-        connector_record = client.get(
+
+        doc = client.get(
           :index => Utility::Constants::CONNECTORS_INDEX,
           :id => connector_id,
           :ignore => 404,
@@ -144,42 +168,31 @@ module Core
         ).tap do |response|
           seq_no = response['_seq_no']
           primary_term = response['_primary_term']
-          sync_in_progress = response.dig('_source', 'last_sync_status') == Connectors::SyncStatus::IN_PROGRESS
-        end
-        if sync_in_progress
-          raise JobAlreadyRunningError.new(connector_id)
         end
-        update_connector_fields(
-          connector_id,
-          { :sync_now => false,
-            :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
-            :last_synced => Time.now },
-          seq_no,
-          primary_term
-        )
 
+        { doc: doc, seq_no: seq_no, primary_term: primary_term }
+      end
+
+      def create_job(connector_settings:)
         body = {
-          :status => Connectors::SyncStatus::IN_PROGRESS,
-          :worker_hostname => Socket.gethostname,
-          :created_at => Time.now,
-          :started_at => Time.now,
-          :last_seen => Time.now,
-          :connector => {
-            :id => connector_id,
-            :filtering => convert_connector_filtering_to_job_filtering(connector_record.dig('_source', 'filtering'))
+          status: Connectors::SyncStatus::PENDING,
+          created_at: Time.now,
+          last_seen: Time.now,
+          connector: {
+            id: connector_settings.id,
+            filtering: convert_connector_filtering_to_job_filtering(connector_settings.filtering),
+            index_name: connector_settings.index_name,
+            language: connector_settings[:language],
+            pipeline: connector_settings[:pipeline],
+            service_type: connector_settings.service_type
           }
         }
 
-        index_response = client.index(:index => Utility::Constants::JOB_INDEX, :body => body, :refresh => true)
-        if index_response['result'] == 'created'
-          # TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
-          return client.get(
-            :index => Utility::Constants::JOB_INDEX,
-            :id => index_response['_id'],
-            :ignore => 404
-          ).with_indifferent_access
-        end
-        raise JobNotCreatedError.new(connector_id, index_response)
+        index_response = client.index(index: Utility::Constants::JOB_INDEX, body: body, refresh: true)
+
+        return index_response if index_response['result'] == 'created'
+
+        raise JobNotCreatedError.new(connector_settings.id, index_response)
       end
 
       def convert_connector_filtering_to_job_filtering(connector_filtering)
@@ -507,31 +520,15 @@ module Core
       end
 
       def update_connector_fields(connector_id, doc = {}, seq_no = nil, primary_term = nil)
-        return if doc.empty?
-        update_args = {
-          :index => Utility::Constants::CONNECTORS_INDEX,
-          :id => connector_id,
-          :body => { :doc => doc },
-          :refresh => true,
-          :retry_on_conflict => 3
-        }
-        # seq_no and primary_term are used for optimistic concurrency control
-        # see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
-        if seq_no && primary_term
-          update_args[:if_seq_no] = seq_no
-          update_args[:if_primary_term] = primary_term
-          update_args.delete(:retry_on_conflict)
-        end
-        begin
-          client.update(update_args)
-        rescue Elastic::Transport::Transport::Errors::Conflict
-          # VersionConflictException
-          # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
-          raise ConnectorVersionChangedError.new(connector_id, seq_no, primary_term)
-        end
+        update_doc_fields(Utility::Constants::CONNECTORS_INDEX, connector_id, doc, seq_no, primary_term)
+      end
+
+      def update_job_fields(job_id, doc = {}, seq_no = nil, primary_term = nil)
+        update_doc_fields(Utility::Constants::JOB_INDEX, job_id, doc, seq_no, primary_term)
       end
 
       def document_count(index_name)
+        client.indices.refresh(:index => index_name)
         client.count(:index => index_name)['count']
       end
 
@@ -563,6 +560,31 @@ module Core
           filter.deep_merge!(new_validation_state)
         end
       end
+
+      def update_doc_fields(index, id, doc = {}, seq_no = nil, primary_term = nil)
+        return if doc.empty?
+        update_args = {
+          :index => index,
+          :id => id,
+          :body => { :doc => doc },
+          :refresh => true,
+          :retry_on_conflict => 3
+        }
+
+        if seq_no && primary_term
+          update_args[:if_seq_no] = seq_no
+          update_args[:if_primary_term] = primary_term
+          update_args.delete(:retry_on_conflict)
+        end
+
+        begin
+          client.update(update_args)
+        rescue Elastic::Transport::Transport::Errors::Conflict
+          # VersionConflictException
+          # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
+          raise ConnectorVersionChangedError.new(id, seq_no, primary_term)
+        end
+      end
     end
   end
 end
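
The refactor above funnels every connector and job document write through update_doc_fields, which applies Elasticsearch optimistic concurrency control: a caller first reads the document through connector_with_concurrency_control, then writes back with the _seq_no and _primary_term it observed, so a concurrent writer produces a version conflict instead of a lost update. A minimal standalone sketch of the same pattern against a bare Elasticsearch Ruby client (the URL, index name, and document id are illustrative, not values from this gem):

require 'elasticsearch'

client = Elasticsearch::Client.new(url: 'http://localhost:9200') # assumed local cluster

# Read the current document along with its concurrency-control metadata.
doc = client.get(index: 'connectors-index', id: 'connector-123', ignore: 404)

begin
  # The write only succeeds if nobody updated the document since our read.
  client.update(
    index: 'connectors-index',
    id: 'connector-123',
    body: { doc: { last_sync_status: 'in_progress' } },
    if_seq_no: doc['_seq_no'],
    if_primary_term: doc['_primary_term']
  )
rescue Elastic::Transport::Transport::Errors::Conflict
  # Another writer won the race; re-read the document and decide whether to retry.
end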
data/lib/core/filtering/validation_job_runner.rb CHANGED
@@ -24,7 +24,7 @@ module Core
     def execute
       Utility::Logger.info("Starting a validation job for connector #{@connector_settings.id}.")
 
-      validation_result = @connector_class.validate_filtering(@connector_settings.filtering)
+      validation_result = @connector_class.validate_filtering(@connector_settings.filtering[:draft])
 
       # currently only used for connectors -> DEFAULT domain can be assumed (will be changed with the integration of crawler)
       ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_result })
data/lib/core/ingestion/es_sink.rb CHANGED
@@ -11,19 +11,54 @@ require 'utility/bulk_queue'
 require 'utility/es_client'
 require 'utility/logger'
 require 'elasticsearch/api'
-
+#
+# This class is responsible for sending the data to the data storage.
+# While we don't actually allow to output our data anywhere except
+# Elasticsearch, we still want to be able to do so sometime in future.
+#
+# This class should stay simple and any change to the class should be careful
+# with the thought of introducing other sinks in future.
 module Core
   module Ingestion
     class EsSink
-      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new)
+      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new, max_allowed_document_size = 5 * 1024 * 1024)
         @client = Utility::EsClient.new(App::Config[:elasticsearch])
         @index_name = index_name
         @request_pipeline = request_pipeline
         @operation_queue = bulk_queue
+
+        @max_allowed_document_size = max_allowed_document_size
+
+        @queued = {
+          :indexed_document_count => 0,
+          :deleted_document_count => 0,
+          :indexed_document_volume => 0
+        }
+
+        @completed = {
+          :indexed_document_count => 0,
+          :deleted_document_count => 0,
+          :indexed_document_volume => 0
+        }
       end
 
-      def ingest(id, serialized_document)
-        index_op = serialize({ 'index' => { '_index' => index_name, '_id' => id } })
+      def ingest(document)
+        if document.nil? || document.empty?
+          Utility::Logger.warn('Connector attempted to ingest an empty document, skipping')
+          return
+        end
+
+        id = document['id']
+        serialized_document = serialize(document)
+
+        document_size = serialized_document.bytesize
+
+        if @max_allowed_document_size > 0 && document_size > @max_allowed_document_size
+          Utility::Logger.warn("Connector attempted to ingest too large document with id=#{document['id']} [#{document_size}/#{@max_allowed_document_size}], skipping the document.")
+          return
+        end
+
+        index_op = serialize({ 'index' => { '_index' => @index_name, '_id' => id } })
 
         flush unless @operation_queue.will_fit?(index_op, serialized_document)
 
@@ -31,13 +66,27 @@ module Core
           index_op,
           serialized_document
         )
+
+        @queued[:indexed_document_count] += 1
+        @queued[:indexed_document_volume] += document_size
+      end
+
+      def ingest_multiple(documents)
+        documents.each { |doc| ingest(doc) }
       end
 
-      def delete(doc_id)
-        delete_op = serialize({ 'delete' => { '_index' => index_name, '_id' => doc_id } })
+      def delete(id)
+        return if id.nil?
+
+        delete_op = serialize({ 'delete' => { '_index' => @index_name, '_id' => id } })
         flush unless @operation_queue.will_fit?(delete_op)
 
         @operation_queue.add(delete_op)
+        @queued[:deleted_document_count] += 1
+      end
+
+      def delete_multiple(ids)
+        ids.each { |id| delete(id) }
       end
 
       def flush
@@ -45,15 +94,25 @@ module Core
         return if data.empty?
 
         @client.bulk(:body => data, :pipeline => @request_pipeline)
+
+        @completed[:indexed_document_count] += @queued[:indexed_document_count]
+        @completed[:deleted_document_count] += @queued[:deleted_document_count]
+        @completed[:indexed_document_volume] += @queued[:indexed_document_volume]
+
+        @queued[:indexed_document_count] = 0
+        @queued[:deleted_document_count] = 0
+        @queued[:indexed_document_volume] = 0
       end
 
-      def serialize(obj)
-        Elasticsearch::API.serializer.dump(obj)
+      def ingestion_stats
+        @completed.dup
       end
 
       private
 
-      attr_accessor :index_name
+      def serialize(document)
+        Elasticsearch::API.serializer.dump(document)
+      end
     end
   end
 end
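
In short, EsSink now buffers index and delete operations in a bulk queue, flushes them as a single _bulk request, and keeps separate queued and completed counters so that ingestion_stats only reports operations that have actually been flushed to Elasticsearch. A rough usage sketch (the index name, pipeline name, and documents are illustrative):

sink = Core::Ingestion::EsSink.new('search-example-index', 'example-pipeline')

# Documents are buffered, not sent immediately; empty or oversized ones are skipped.
sink.ingest_multiple([
  { 'id' => '1', 'title' => 'Hello' },
  { 'id' => '2', 'title' => 'World' }
])
sink.delete('stale-document-id')

sink.flush # issues one _bulk request and moves the queued counters to completed

sink.ingestion_stats
# => { :indexed_document_count => 2, :deleted_document_count => 1, :indexed_document_volume => ... }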
data/lib/core/ingestion.rb CHANGED
@@ -6,5 +6,4 @@
 
 # frozen_string_literal: true
 
-require 'core/ingestion/ingester'
 require 'core/ingestion/es_sink'
data/lib/core/jobs/consumer.rb ADDED
@@ -0,0 +1,114 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+module Core
+  module Jobs
+    class Consumer
+      def initialize(scheduler:, poll_interval: 3, termination_timeout: 60, min_threads: 1, max_threads: 5, max_queue: 100, idle_time: 5)
+        @scheduler = scheduler
+        @poll_interval = poll_interval
+        @termination_timeout = termination_timeout
+        @min_threads = min_threads
+        @max_threads = max_threads
+        @max_queue = max_queue
+        @idle_time = idle_time
+
+        @running = Concurrent::AtomicBoolean.new(false)
+      end
+
+      def subscribe!(index_name:)
+        @index_name = index_name
+
+        start_loop!
+      end
+
+      def running?
+        # @TODO check if a loop thread is alive
+        pool.running? && @running.true?
+      end
+
+      def shutdown!
+        Utility::Logger.info("Shutting down consumer for #{@index_name} index")
+        @running.make_false
+        pool.shutdown
+        pool.wait_for_termination(@termination_timeout)
+        # reset pool
+        @pool = nil
+      end
+
+      private
+
+      def start_loop!
+        Utility::Logger.info("Starting a new consumer for #{@index_name} index")
+
+        Thread.new do
+          # assign a name to the thread
+          # see @TODO in #self.running?
+          Thread.current[:name] = "consumer-group-#{@index_name}"
+
+          loop do
+            if @running.false?
+              Utility::Logger.info('Shutting down the loop')
+              break
+            end
+
+            sleep(@poll_interval)
+            Utility::Logger.debug('Getting registered connectors')
+
+            connectors = ready_for_sync_connectors
+            next unless connectors.any?
+
+            Utility::Logger.debug("Number of available connectors: #{connectors.size}")
+
+            # @TODO It is assumed that @index_name is used to retrive pending jobs.
+            # This will be discussed after 8.6 release
+            pending_jobs = Core::ConnectorJob.pending_jobs(connectors_ids: connectors.keys)
+            Utility::Logger.info("Number of pending jobs: #{pending_jobs.size}")
+
+            pending_jobs.each do |job|
+              connector_settings = connectors[job.connector_id]
+
+              pool.post do
+                Utility::Logger.info("Connector #{connector_settings.formatted} picked up the job #{job.id}")
+                Core::ElasticConnectorActions.ensure_content_index_exists(connector_settings.index_name)
+                job_runner = Core::SyncJobRunner.new(connector_settings, job)
+                job_runner.execute
+              rescue Core::JobAlreadyRunningError
+                Utility::Logger.info("Sync job for #{connector_settings.formatted} is already running, skipping.")
+              rescue Core::ConnectorVersionChangedError => e
+                Utility::Logger.info("Could not start the job because #{connector_settings.formatted} has been updated externally. Message: #{e.message}")
+              rescue StandardError => e
+                Utility::ExceptionTracking.log_exception(e, "Sync job for #{connector_settings.formatted} failed due to unexpected error.")
+              end
+            end
+          rescue StandardError => e
+            Utility::ExceptionTracking.log_exception(e, 'The consumer group failed')
+          end
+        end
+
+        @running.make_true
+      end
+
+      def pool
+        @pool ||= Concurrent::ThreadPoolExecutor.new(
+          min_threads: @min_threads,
+          max_threads: @max_threads,
+          max_queue: @max_queue,
+          fallback_policy: :abort,
+          idletime: @idle_time
+        )
+      end
+
+      def ready_for_sync_connectors
+        @scheduler.connector_settings
+                  .select(&:ready_for_sync?)
+                  .inject({}) { |memo, cs| memo.merge(cs.id => cs) }
+      end
+    end
+  end
+end
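
The consumer's loop thread wakes every poll_interval seconds, asks the scheduler for connectors that are ready_for_sync?, fetches their PENDING jobs, and hands each job to a bounded Concurrent::ThreadPoolExecutor. A hedged wiring sketch (the scheduler object and index name here are assumptions, not values taken from this gem):

# scheduler is assumed to respond to #connector_settings, as the consumer expects.
consumer = Core::Jobs::Consumer.new(
  scheduler: scheduler,
  poll_interval: 3,
  max_threads: 5
)

consumer.subscribe!(index_name: 'example-jobs-index') # starts the polling thread
consumer.running? # => true once the loop and pool are up

# On shutdown, stop the loop and give in-flight jobs up to
# termination_timeout seconds to drain.
consumer.shutdown!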
data/lib/core/jobs/producer.rb ADDED
@@ -0,0 +1,26 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+module Core
+  module Jobs
+    class Producer
+      JOB_TYPES = %i(sync).freeze
+
+      class << self
+        def enqueue_job(job_type:, connector_settings:)
+          raise UnsupportedJobType unless JOB_TYPES.include?(job_type)
+          raise ArgumentError unless connector_settings.kind_of?(ConnectorSettings)
+
+          ElasticConnectorActions.create_job(connector_settings: connector_settings)
+        end
+      end
+    end
+
+    class UnsupportedJobType < StandardError; end
+  end
+end
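
Producer is the enqueue half of the new job queue: it checks the job type against JOB_TYPES, verifies the settings object, and delegates to ElasticConnectorActions.create_job, which indexes a PENDING job document. For example (the connector id is hypothetical):

connector_settings = Core::ConnectorSettings.fetch_by_id('connector-123')

Core::Jobs::Producer.enqueue_job(
  job_type: :sync,
  connector_settings: connector_settings
)
# Raises Core::Jobs::UnsupportedJobType for any type other than :sync,
# and ArgumentError if connector_settings is not a Core::ConnectorSettings.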
data/lib/core/single_scheduler.rb CHANGED
@@ -20,7 +20,7 @@ module Core
 
     def connector_settings
       connector_settings = Core::ConnectorSettings.fetch_by_id(@connector_id)
-      [connector_settings]
+      [connector_settings].compact
     rescue *Utility::AUTHORIZATION_ERRORS => e
       # should be handled by the general scheduler
       raise e
data/lib/core/sync_job_runner.rb CHANGED
@@ -23,9 +23,9 @@ module Core
   class SyncJobRunner
     JOB_REPORTING_INTERVAL = 10
 
-    def initialize(connector_settings)
+    def initialize(connector_settings, job)
       @connector_settings = connector_settings
-      @ingester = Core::Ingestion::Ingester.new(Core::Ingestion::EsSink.new(connector_settings.index_name, @connector_settings.request_pipeline))
+      @sink = Core::Ingestion::EsSink.new(connector_settings.index_name, @connector_settings.request_pipeline)
       @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
       @sync_finished = false
       @sync_error = nil
@@ -35,6 +35,7 @@ module Core
         :indexed_document_volume => 0,
         :error => nil
       }
+      @job = job
     end
 
     def execute
@@ -47,9 +48,16 @@ module Core
     def do_sync!
       Utility::Logger.info("Claiming a sync job for connector #{@connector_settings.id}.")
 
-      job_record = ElasticConnectorActions.claim_job(@connector_settings.id)
-      job_description = job_record['_source']
-      job_id = job_record['_id']
+      # connector service doesn't support multiple jobs running simultaneously
+      raise Core::JobAlreadyRunningError.new(@connector_settings.id) if @connector_settings.running?
+
+      Core::ElasticConnectorActions.update_connector_last_sync_status(@connector_settings.id, Connectors::SyncStatus::IN_PROGRESS)
+
+      # claim the job
+      @job.make_running!
+
+      job_description = @job.es_source
+      job_id = @job.id
       job_description['_id'] = job_id
 
       unless job_id.present?
@@ -80,12 +88,12 @@ module Core
           document = add_ingest_metadata(document)
           post_process_result = post_processing_engine.process(document)
           if post_process_result.is_include?
-            @ingester.ingest(document)
+            @sink.ingest(document)
             incoming_ids << document['id']
           end
 
           if Time.now - reporting_cycle_start >= JOB_REPORTING_INTERVAL
-            ElasticConnectorActions.update_sync(job_id, @ingester.ingestion_stats.merge(:metadata => connector_instance.metadata))
+            ElasticConnectorActions.update_sync(job_id, @sink.ingestion_stats.merge(:metadata => connector_instance.metadata))
             reporting_cycle_start = Time.now
           end
         end
@@ -95,15 +103,15 @@ module Core
       Utility::Logger.info("Deleting #{ids_to_delete.size} documents from index #{@connector_settings.index_name}.")
 
       ids_to_delete.each do |id|
-        @ingester.delete(id)
+        @sink.delete(id)
 
         if Time.now - reporting_cycle_start >= JOB_REPORTING_INTERVAL
-          ElasticConnectorActions.update_sync(job_id, @ingester.ingestion_stats.merge(:metadata => connector_instance.metadata))
+          ElasticConnectorActions.update_sync(job_id, @sink.ingestion_stats.merge(:metadata => connector_instance.metadata))
          reporting_cycle_start = Time.now
        end
      end
 
-      @ingester.flush
+      @sink.flush
 
       # We use this mechanism for checking, whether an interrupt (or something else lead to the thread not finishing)
       # occurred as most of the time the main execution thread is interrupted and we miss this Signal/Exception here
@@ -112,7 +120,7 @@ module Core
       @sync_error = e.message
       Utility::ExceptionTracking.log_exception(e)
     ensure
-      stats = @ingester.ingestion_stats
+      stats = @sink.ingestion_stats
 
       Utility::Logger.debug("Sync stats are: #{stats}")
 
@@ -129,7 +137,7 @@ module Core
       end
 
       unless connector_instance.nil?
-        metadata = @ingester.ingestion_stats.merge(:metadata => connector_instance.metadata)
+        metadata = @sink.ingestion_stats.merge(:metadata => connector_instance.metadata)
         metadata[:total_document_count] = ElasticConnectorActions.document_count(@connector_settings.index_name)
       end
 
data/lib/core.rb CHANGED
@@ -16,3 +16,5 @@ require 'core/scheduler'
 require 'core/single_scheduler'
 require 'core/native_scheduler'
 require 'core/sync_job_runner'
+require 'core/jobs/producer'
+require 'core/jobs/consumer'
data/lib/utility/error_monitor.rb ADDED
@@ -0,0 +1,108 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+require 'time'
+require 'utility/errors'
+require 'utility/exception_tracking'
+
+module Utility
+  class ErrorMonitor
+    class MonitoringError < StandardError
+      attr_accessor :tripped_by
+
+      def initialize(message = nil, tripped_by: nil)
+        super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
+        @tripped_by = tripped_by
+      end
+    end
+
+    class MaxSuccessiveErrorsExceededError < MonitoringError; end
+    class MaxErrorsExceededError < MonitoringError; end
+    class MaxErrorsInWindowExceededError < MonitoringError; end
+
+    attr_reader :total_error_count, :success_count, :consecutive_error_count, :error_queue
+
+    def initialize(
+      max_errors: 1000,
+      max_consecutive_errors: 10,
+      max_error_ratio: 0.15,
+      window_size: 100,
+      error_queue_size: 20
+    )
+      @max_errors = max_errors
+      @max_consecutive_errors = max_consecutive_errors
+      @max_error_ratio = max_error_ratio
+      @window_size = window_size
+      @total_error_count = 0
+      @success_count = 0
+      @consecutive_error_count = 0
+      @window_errors = Array.new(window_size) { false }
+      @window_index = 0
+      @last_error = nil
+      @error_queue_size = error_queue_size
+      @error_queue = []
+    end
+
+    def note_success
+      @consecutive_error_count = 0
+      @success_count += 1
+      increment_window_index
+    end
+
+    def note_error(error, id: Time.now.to_i)
+      stack_trace = Utility::ExceptionTracking.generate_stack_trace(error)
+      error_message = Utility::ExceptionTracking.generate_error_message(error, nil, nil)
+      Utility::Logger.debug("Message id: #{id} - #{error_message}\n#{stack_trace}")
+      @total_error_count += 1
+      @consecutive_error_count += 1
+      @window_errors[@window_index] = true
+      @error_queue << DocumentError.new(error.class.name, error_message, stack_trace, id)
+      @error_queue = @error_queue.drop(1) if @error_queue.size > @error_queue_size
+      increment_window_index
+      @last_error = error
+
+      raise_if_necessary
+    end
+
+    def finalize
+      total_documents = @total_error_count + @success_count
+      if total_documents > 0 && @total_error_count.to_f / total_documents > @max_error_ratio
+        raise_with_last_cause(MaxErrorsInWindowExceededError.new("There were #{@total_error_count} errors out of #{total_documents} total documents", :tripped_by => @last_error))
+      end
+    end
+
+    private
+
+    def raise_if_necessary
+      error =
+        if @consecutive_error_count > @max_consecutive_errors
+          MaxSuccessiveErrorsExceededError.new("Exceeded maximum consecutive errors - saw #{@consecutive_error_count} errors in a row.", :tripped_by => @last_error)
+        elsif @total_error_count > @max_errors
+          MaxErrorsExceededError.new("Exceeded maximum number of errors - saw #{@total_error_count} errors in total.", :tripped_by => @last_error)
+        elsif @window_size > 0 && num_errors_in_window / @window_size > @max_error_ratio
+          MaxErrorsInWindowExceededError.new("Exceeded maximum error ratio of #{@max_error_ratio}. Of the last #{@window_size} documents, #{num_errors_in_window} had errors", :tripped_by => @last_error)
+        end
+
+      raise_with_last_cause(error) if error
+    end
+
+    def num_errors_in_window
+      @window_errors.count(&:itself).to_f
+    end
+
+    def increment_window_index
+      @window_index = (@window_index + 1) % @window_size
+    end
+
+    def raise_with_last_cause(error)
+      raise @last_error
+    rescue StandardError
+      raise error
+    end
+  end
+end
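
ErrorMonitor is the MonitoringError hierarchy extracted from utility/errors.rb (removed in the next hunk) plus the counting logic: it tracks total, consecutive, and windowed error counts, and re-raises the most recent underlying error as the cause of whichever threshold error trips. A small usage sketch (docs and index_document are hypothetical stand-ins for a connector's ingestion loop):

monitor = Utility::ErrorMonitor.new(max_consecutive_errors: 3)

docs.each do |doc|
  index_document(doc)
  monitor.note_success
rescue StandardError => e
  # Raises MaxSuccessiveErrorsExceededError once more than three errors
  # occur in a row, with the triggering error attached as tripped_by.
  monitor.note_error(e, id: doc['id'])
end

monitor.finalize # raises if the overall error ratio exceeded max_error_ratio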
data/lib/utility/errors.rb CHANGED
@@ -60,18 +60,6 @@ module Utility
   class JobDocumentLimitError < StandardError; end
   class JobClaimingError < StandardError; end
 
-  class MonitoringError < StandardError
-    attr_accessor :tripped_by
-
-    def initialize(message = nil, tripped_by: nil)
-      super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
-      @tripped_by = tripped_by
-    end
-  end
-  class MaxSuccessiveErrorsExceededError < MonitoringError; end
-  class MaxErrorsExceededError < MonitoringError; end
-  class MaxErrorsInWindowExceededError < MonitoringError; end
-
   class JobSyncNotPossibleYetError < StandardError
     attr_accessor :sync_will_be_possible_at