connectors_service 8.6.0.4.pre.20221114T233727Z → 8.6.0.4.pre.20221116T024501Z

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +4 -4
  3. data/lib/app/app.rb +4 -0
  4. data/lib/app/dispatcher.rb +30 -17
  5. data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
  6. data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
  7. data/lib/connectors/base/connector.rb +27 -5
  8. data/lib/connectors/example/connector.rb +3 -12
  9. data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
  10. data/lib/connectors/gitlab/connector.rb +3 -12
  11. data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
  12. data/lib/connectors/mongodb/connector.rb +9 -24
  13. data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
  14. data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
  15. data/lib/connectors/sync_status.rb +6 -1
  16. data/lib/connectors/tolerable_error_helper.rb +43 -0
  17. data/lib/core/connector_job.rb +96 -23
  18. data/lib/core/connector_settings.rb +29 -6
  19. data/lib/core/elastic_connector_actions.rb +77 -55
  20. data/lib/core/filtering/validation_job_runner.rb +1 -1
  21. data/lib/core/ingestion/es_sink.rb +68 -9
  22. data/lib/core/ingestion.rb +0 -1
  23. data/lib/core/jobs/consumer.rb +114 -0
  24. data/lib/core/jobs/producer.rb +26 -0
  25. data/lib/core/single_scheduler.rb +1 -1
  26. data/lib/core/sync_job_runner.rb +20 -12
  27. data/lib/core.rb +2 -0
  28. data/lib/utility/error_monitor.rb +108 -0
  29. data/lib/utility/errors.rb +0 -12
  30. data/lib/utility/logger.rb +0 -1
  31. data/lib/utility.rb +6 -0
  32. metadata +12 -3
  33. data/lib/core/ingestion/ingester.rb +0 -90
data/lib/core/elastic_connector_actions.rb CHANGED
@@ -132,11 +132,35 @@ module Core
         update_connector_fields(connector_id, { :filtering => filtering })
       end
 
-      def claim_job(connector_id)
+      def update_connector_sync_now(connector_id, sync_now)
+        doc = connector_with_concurrency_control(connector_id)
+
+        body = { sync_now: sync_now, last_synced: Time.now }
+
+        update_connector_fields(
+          connector_id,
+          body,
+          doc[:seq_no],
+          doc[:primary_term]
+        )
+      end
+
+      def update_connector_last_sync_status(connector_id, last_sync_status)
+        doc = connector_with_concurrency_control(connector_id)
+
+        update_connector_fields(
+          connector_id,
+          { last_sync_status: last_sync_status },
+          doc[:seq_no],
+          doc[:primary_term]
+        )
+      end
+
+      def connector_with_concurrency_control(connector_id)
         seq_no = nil
         primary_term = nil
-        sync_in_progress = false
-        connector_record = client.get(
+
+        doc = client.get(
          :index => Utility::Constants::CONNECTORS_INDEX,
          :id => connector_id,
          :ignore => 404,
@@ -144,42 +168,31 @@ module Core
        ).tap do |response|
          seq_no = response['_seq_no']
          primary_term = response['_primary_term']
-          sync_in_progress = response.dig('_source', 'last_sync_status') == Connectors::SyncStatus::IN_PROGRESS
-        end
-        if sync_in_progress
-          raise JobAlreadyRunningError.new(connector_id)
        end
-        update_connector_fields(
-          connector_id,
-          { :sync_now => false,
-            :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
-            :last_synced => Time.now },
-          seq_no,
-          primary_term
-        )
 
+        { doc: doc, seq_no: seq_no, primary_term: primary_term }
+      end
+
+      def create_job(connector_settings:)
        body = {
-          :status => Connectors::SyncStatus::IN_PROGRESS,
-          :worker_hostname => Socket.gethostname,
-          :created_at => Time.now,
-          :started_at => Time.now,
-          :last_seen => Time.now,
-          :connector => {
-            :id => connector_id,
-            :filtering => convert_connector_filtering_to_job_filtering(connector_record.dig('_source', 'filtering'))
+          status: Connectors::SyncStatus::PENDING,
+          created_at: Time.now,
+          last_seen: Time.now,
+          connector: {
+            id: connector_settings.id,
+            filtering: convert_connector_filtering_to_job_filtering(connector_settings.filtering),
+            index_name: connector_settings.index_name,
+            language: connector_settings[:language],
+            pipeline: connector_settings[:pipeline],
+            service_type: connector_settings.service_type
          }
        }
 
-        index_response = client.index(:index => Utility::Constants::JOB_INDEX, :body => body, :refresh => true)
-        if index_response['result'] == 'created'
-          # TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
-          return client.get(
-            :index => Utility::Constants::JOB_INDEX,
-            :id => index_response['_id'],
-            :ignore => 404
-          ).with_indifferent_access
-        end
-        raise JobNotCreatedError.new(connector_id, index_response)
+        index_response = client.index(index: Utility::Constants::JOB_INDEX, body: body, refresh: true)
+
+        return index_response if index_response['result'] == 'created'
+
+        raise JobNotCreatedError.new(connector_settings.id, index_response)
      end
 
      def convert_connector_filtering_to_job_filtering(connector_filtering)
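
Note the lifecycle change in this hunk: the removed claim_job both marked the connector as syncing and indexed an in-progress job document, while the new create_job only indexes a PENDING job and returns the raw index response; claiming now happens later, on the consumer side. A minimal calling sketch, assuming a Core::ConnectorSettings instance fetched elsewhere (variable names here are illustrative, not from the diff):

    # Illustrative only: `settings` stands in for a real ConnectorSettings object.
    settings = Core::ConnectorSettings.fetch_by_id(connector_id)

    # Indexes a job document with status PENDING and returns the raw
    # Elasticsearch index response, not the job document itself.
    response = Core::ElasticConnectorActions.create_job(connector_settings: settings)
    job_id = response['_id'] # JobNotCreatedError is raised unless result == 'created'
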
@@ -507,31 +520,15 @@ module Core
      end
 
      def update_connector_fields(connector_id, doc = {}, seq_no = nil, primary_term = nil)
-        return if doc.empty?
-        update_args = {
-          :index => Utility::Constants::CONNECTORS_INDEX,
-          :id => connector_id,
-          :body => { :doc => doc },
-          :refresh => true,
-          :retry_on_conflict => 3
-        }
-        # seq_no and primary_term are used for optimistic concurrency control
-        # see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
-        if seq_no && primary_term
-          update_args[:if_seq_no] = seq_no
-          update_args[:if_primary_term] = primary_term
-          update_args.delete(:retry_on_conflict)
-        end
-        begin
-          client.update(update_args)
-        rescue Elastic::Transport::Transport::Errors::Conflict
-          # VersionConflictException
-          # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
-          raise ConnectorVersionChangedError.new(connector_id, seq_no, primary_term)
-        end
+        update_doc_fields(Utility::Constants::CONNECTORS_INDEX, connector_id, doc, seq_no, primary_term)
+      end
+
+      def update_job_fields(job_id, doc = {}, seq_no = nil, primary_term = nil)
+        update_doc_fields(Utility::Constants::JOB_INDEX, job_id, doc, seq_no, primary_term)
      end
 
      def document_count(index_name)
+        client.indices.refresh(:index => index_name)
        client.count(:index => index_name)['count']
      end
 
@@ -563,6 +560,31 @@ module Core
          filter.deep_merge!(new_validation_state)
        end
      end
+
+      def update_doc_fields(index, id, doc = {}, seq_no = nil, primary_term = nil)
+        return if doc.empty?
+        update_args = {
+          :index => index,
+          :id => id,
+          :body => { :doc => doc },
+          :refresh => true,
+          :retry_on_conflict => 3
+        }
+
+        if seq_no && primary_term
+          update_args[:if_seq_no] = seq_no
+          update_args[:if_primary_term] = primary_term
+          update_args.delete(:retry_on_conflict)
+        end
+
+        begin
+          client.update(update_args)
+        rescue Elastic::Transport::Transport::Errors::Conflict
+          # VersionConflictException
+          # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
+          raise ConnectorVersionChangedError.new(id, seq_no, primary_term)
+        end
+      end
    end
  end
end
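
Both update_connector_fields and the new update_job_fields now funnel into update_doc_fields, which implements Elasticsearch optimistic concurrency control: with a seq_no/primary_term pair the update succeeds only if the document is unchanged since it was read, and without one it falls back to retry_on_conflict. A hedged sketch of the read-then-write pattern this enables, using the helpers added in this diff:

    # Read the connector along with its concurrency-control metadata...
    control = connector_with_concurrency_control(connector_id)

    # ...then write back conditionally. If another process modified the
    # document in between, Elasticsearch reports a version conflict and
    # update_doc_fields raises ConnectorVersionChangedError.
    update_connector_fields(
      connector_id,
      { last_sync_status: Connectors::SyncStatus::IN_PROGRESS },
      control[:seq_no],
      control[:primary_term]
    )
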
data/lib/core/filtering/validation_job_runner.rb CHANGED
@@ -24,7 +24,7 @@ module Core
    def execute
      Utility::Logger.info("Starting a validation job for connector #{@connector_settings.id}.")
 
-      validation_result = @connector_class.validate_filtering(@connector_settings.filtering)
+      validation_result = @connector_class.validate_filtering(@connector_settings.filtering[:draft])
 
      # currently only used for connectors -> DEFAULT domain can be assumed (will be changed with the integration of crawler)
      ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_result })
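
The validator now receives only the draft rules rather than the whole filtering object. Assuming the filtering hash carries draft and active sections (the real document shape is richer than shown; this is an illustrative sketch only):

    # Hypothetical, simplified filtering shape for illustration.
    filtering = {
      :draft  => { 'rules' => [], 'advanced_snippet' => { 'value' => {} } },
      :active => { 'rules' => [], 'advanced_snippet' => { 'value' => {} } }
    }

    # A validation job only needs to check the draft awaiting activation.
    validation_result = @connector_class.validate_filtering(filtering[:draft])
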
data/lib/core/ingestion/es_sink.rb CHANGED
@@ -11,19 +11,54 @@ require 'utility/bulk_queue'
 require 'utility/es_client'
 require 'utility/logger'
 require 'elasticsearch/api'
-
+#
+# This class is responsible for sending the data to the data storage.
+# While we don't actually allow to output our data anywhere except
+# Elasticsearch, we still want to be able to do so sometime in future.
+#
+# This class should stay simple and any change to the class should be careful
+# with the thought of introducing other sinks in future.
 module Core
   module Ingestion
     class EsSink
-      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new)
+      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new, max_allowed_document_size = 5 * 1024 * 1024)
        @client = Utility::EsClient.new(App::Config[:elasticsearch])
        @index_name = index_name
        @request_pipeline = request_pipeline
        @operation_queue = bulk_queue
+
+        @max_allowed_document_size = max_allowed_document_size
+
+        @queued = {
+          :indexed_document_count => 0,
+          :deleted_document_count => 0,
+          :indexed_document_volume => 0
+        }
+
+        @completed = {
+          :indexed_document_count => 0,
+          :deleted_document_count => 0,
+          :indexed_document_volume => 0
+        }
      end
 
-      def ingest(id, serialized_document)
-        index_op = serialize({ 'index' => { '_index' => index_name, '_id' => id } })
+      def ingest(document)
+        if document.nil? || document.empty?
+          Utility::Logger.warn('Connector attempted to ingest an empty document, skipping')
+          return
+        end
+
+        id = document['id']
+        serialized_document = serialize(document)
+
+        document_size = serialized_document.bytesize
+
+        if @max_allowed_document_size > 0 && document_size > @max_allowed_document_size
+          Utility::Logger.warn("Connector attempted to ingest too large document with id=#{document['id']} [#{document_size}/#{@max_allowed_document_size}], skipping the document.")
+          return
+        end
+
+        index_op = serialize({ 'index' => { '_index' => @index_name, '_id' => id } })
 
        flush unless @operation_queue.will_fit?(index_op, serialized_document)
 
@@ -31,13 +66,27 @@ module Core
          index_op,
          serialized_document
        )
+
+        @queued[:indexed_document_count] += 1
+        @queued[:indexed_document_volume] += document_size
+      end
+
+      def ingest_multiple(documents)
+        documents.each { |doc| ingest(doc) }
      end
 
-      def delete(doc_id)
-        delete_op = serialize({ 'delete' => { '_index' => index_name, '_id' => doc_id } })
+      def delete(id)
+        return if id.nil?
+
+        delete_op = serialize({ 'delete' => { '_index' => @index_name, '_id' => id } })
        flush unless @operation_queue.will_fit?(delete_op)
 
        @operation_queue.add(delete_op)
+        @queued[:deleted_document_count] += 1
+      end
+
+      def delete_multiple(ids)
+        ids.each { |id| delete(id) }
      end
 
      def flush
@@ -45,15 +94,25 @@ module Core
        return if data.empty?
 
        @client.bulk(:body => data, :pipeline => @request_pipeline)
+
+        @completed[:indexed_document_count] += @queued[:indexed_document_count]
+        @completed[:deleted_document_count] += @queued[:deleted_document_count]
+        @completed[:indexed_document_volume] += @queued[:indexed_document_volume]
+
+        @queued[:indexed_document_count] = 0
+        @queued[:deleted_document_count] = 0
+        @queued[:indexed_document_volume] = 0
      end
 
-      def serialize(obj)
-        Elasticsearch::API.serializer.dump(obj)
+      def ingestion_stats
+        @completed.dup
      end
 
      private
 
-      attr_accessor :index_name
+      def serialize(document)
+        Elasticsearch::API.serializer.dump(document)
+      end
    end
  end
end
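
Taken together, the sink now distinguishes queued work from completed work: ingest and delete only bump the @queued counters, and flush folds them into @completed after the bulk call succeeds, so ingestion_stats reports only documents that actually reached Elasticsearch. A usage sketch; the index and pipeline names are placeholders:

    sink = Core::Ingestion::EsSink.new('search-example', 'my-ingest-pipeline')

    sink.ingest({ 'id' => '1', 'title' => 'Hello' }) # queued, not yet visible in stats
    sink.delete('stale-doc-id')                      # queued as well
    sink.flush                                       # bulk request; counters move to completed

    sink.ingestion_stats
    # => { :indexed_document_count => 1, :deleted_document_count => 1, :indexed_document_volume => ... }
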
data/lib/core/ingestion.rb CHANGED
@@ -6,5 +6,4 @@
 
 # frozen_string_literal: true
 
-require 'core/ingestion/ingester'
 require 'core/ingestion/es_sink'
data/lib/core/jobs/consumer.rb ADDED
@@ -0,0 +1,114 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+module Core
+  module Jobs
+    class Consumer
+      def initialize(scheduler:, poll_interval: 3, termination_timeout: 60, min_threads: 1, max_threads: 5, max_queue: 100, idle_time: 5)
+        @scheduler = scheduler
+        @poll_interval = poll_interval
+        @termination_timeout = termination_timeout
+        @min_threads = min_threads
+        @max_threads = max_threads
+        @max_queue = max_queue
+        @idle_time = idle_time
+
+        @running = Concurrent::AtomicBoolean.new(false)
+      end
+
+      def subscribe!(index_name:)
+        @index_name = index_name
+
+        start_loop!
+      end
+
+      def running?
+        # @TODO check if a loop thread is alive
+        pool.running? && @running.true?
+      end
+
+      def shutdown!
+        Utility::Logger.info("Shutting down consumer for #{@index_name} index")
+        @running.make_false
+        pool.shutdown
+        pool.wait_for_termination(@termination_timeout)
+        # reset pool
+        @pool = nil
+      end
+
+      private
+
+      def start_loop!
+        Utility::Logger.info("Starting a new consumer for #{@index_name} index")
+
+        Thread.new do
+          # assign a name to the thread
+          # see @TODO in #self.running?
+          Thread.current[:name] = "consumer-group-#{@index_name}"
+
+          loop do
+            if @running.false?
+              Utility::Logger.info('Shutting down the loop')
+              break
+            end
+
+            sleep(@poll_interval)
+            Utility::Logger.debug('Getting registered connectors')
+
+            connectors = ready_for_sync_connectors
+            next unless connectors.any?
+
+            Utility::Logger.debug("Number of available connectors: #{connectors.size}")
+
+            # @TODO It is assumed that @index_name is used to retrieve pending jobs.
+            # This will be discussed after 8.6 release
+            pending_jobs = Core::ConnectorJob.pending_jobs(connectors_ids: connectors.keys)
+            Utility::Logger.info("Number of pending jobs: #{pending_jobs.size}")
+
+            pending_jobs.each do |job|
+              connector_settings = connectors[job.connector_id]
+
+              pool.post do
+                Utility::Logger.info("Connector #{connector_settings.formatted} picked up the job #{job.id}")
+                Core::ElasticConnectorActions.ensure_content_index_exists(connector_settings.index_name)
+                job_runner = Core::SyncJobRunner.new(connector_settings, job)
+                job_runner.execute
+              rescue Core::JobAlreadyRunningError
+                Utility::Logger.info("Sync job for #{connector_settings.formatted} is already running, skipping.")
+              rescue Core::ConnectorVersionChangedError => e
+                Utility::Logger.info("Could not start the job because #{connector_settings.formatted} has been updated externally. Message: #{e.message}")
+              rescue StandardError => e
+                Utility::ExceptionTracking.log_exception(e, "Sync job for #{connector_settings.formatted} failed due to unexpected error.")
+              end
+            end
+          rescue StandardError => e
+            Utility::ExceptionTracking.log_exception(e, 'The consumer group failed')
+          end
+        end
+
+        @running.make_true
+      end
+
+      def pool
+        @pool ||= Concurrent::ThreadPoolExecutor.new(
+          min_threads: @min_threads,
+          max_threads: @max_threads,
+          max_queue: @max_queue,
+          fallback_policy: :abort,
+          idletime: @idle_time
+        )
+      end
+
+      def ready_for_sync_connectors
+        @scheduler.connector_settings
+                  .select(&:ready_for_sync?)
+                  .inject({}) { |memo, cs| memo.merge(cs.id => cs) }
+      end
+    end
+  end
+end
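
The consumer polls from a single named thread and fans each pending job out to a Concurrent::ThreadPoolExecutor, so one slow sync cannot stall polling for other connectors. A hedged wiring sketch; the scheduler is assumed to respond to connector_settings, as the class requires (for example a Core::NativeScheduler):

    consumer = Core::Jobs::Consumer.new(scheduler: scheduler, max_threads: 5)

    consumer.subscribe!(index_name: Utility::Constants::JOB_INDEX)
    consumer.running? # => true once the polling thread and pool are up

    # Stop polling, then let in-flight jobs drain for up to
    # termination_timeout seconds before the pool is discarded.
    consumer.shutdown!
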
data/lib/core/jobs/producer.rb ADDED
@@ -0,0 +1,26 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+module Core
+  module Jobs
+    class Producer
+      JOB_TYPES = %i(sync).freeze
+
+      class << self
+        def enqueue_job(job_type:, connector_settings:)
+          raise UnsupportedJobType unless JOB_TYPES.include?(job_type)
+          raise ArgumentError unless connector_settings.kind_of?(ConnectorSettings)
+
+          ElasticConnectorActions.create_job(connector_settings: connector_settings)
+        end
+      end
+    end
+
+    class UnsupportedJobType < StandardError; end
+  end
+end
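
Producer is the write side of the queue: it validates the job type and the settings object, then delegates to ElasticConnectorActions.create_job. Usage is a one-liner; `settings` is assumed to be a Core::ConnectorSettings instance:

    Core::Jobs::Producer.enqueue_job(job_type: :sync, connector_settings: settings)

    # Anything other than :sync is rejected up front:
    Core::Jobs::Producer.enqueue_job(job_type: :delete, connector_settings: settings)
    # => raises Core::Jobs::UnsupportedJobType
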
data/lib/core/single_scheduler.rb CHANGED
@@ -20,7 +20,7 @@ module Core
 
    def connector_settings
      connector_settings = Core::ConnectorSettings.fetch_by_id(@connector_id)
-      [connector_settings]
+      [connector_settings].compact
    rescue *Utility::AUTHORIZATION_ERRORS => e
      # should be handled by the general scheduler
      raise e
data/lib/core/sync_job_runner.rb CHANGED
@@ -23,9 +23,9 @@ module Core
  class SyncJobRunner
    JOB_REPORTING_INTERVAL = 10
 
-    def initialize(connector_settings)
+    def initialize(connector_settings, job)
      @connector_settings = connector_settings
-      @ingester = Core::Ingestion::Ingester.new(Core::Ingestion::EsSink.new(connector_settings.index_name, @connector_settings.request_pipeline))
+      @sink = Core::Ingestion::EsSink.new(connector_settings.index_name, @connector_settings.request_pipeline)
      @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
      @sync_finished = false
      @sync_error = nil
@@ -35,6 +35,7 @@ module Core
        :indexed_document_volume => 0,
        :error => nil
      }
+      @job = job
    end
 
    def execute
@@ -47,9 +48,16 @@ module Core
    def do_sync!
      Utility::Logger.info("Claiming a sync job for connector #{@connector_settings.id}.")
 
-      job_record = ElasticConnectorActions.claim_job(@connector_settings.id)
-      job_description = job_record['_source']
-      job_id = job_record['_id']
+      # connector service doesn't support multiple jobs running simultaneously
+      raise Core::JobAlreadyRunningError.new(@connector_settings.id) if @connector_settings.running?
+
+      Core::ElasticConnectorActions.update_connector_last_sync_status(@connector_settings.id, Connectors::SyncStatus::IN_PROGRESS)
+
+      # claim the job
+      @job.make_running!
+
+      job_description = @job.es_source
+      job_id = @job.id
      job_description['_id'] = job_id
 
      unless job_id.present?
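
The claim sequence that used to live inside ElasticConnectorActions.claim_job is now explicit in the runner, split between the connector document and the job document. Condensed from do_sync! above:

    # 1. Refuse to run two syncs for one connector at the same time.
    raise Core::JobAlreadyRunningError.new(@connector_settings.id) if @connector_settings.running?

    # 2. Mark the connector as syncing; this goes through the optimistic
    #    concurrency control path shown earlier in this diff.
    Core::ElasticConnectorActions.update_connector_last_sync_status(
      @connector_settings.id, Connectors::SyncStatus::IN_PROGRESS
    )

    # 3. Flip the PENDING job document to in-progress, claiming it for this worker.
    @job.make_running!
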
@@ -80,12 +88,12 @@ module Core
        document = add_ingest_metadata(document)
        post_process_result = post_processing_engine.process(document)
        if post_process_result.is_include?
-          @ingester.ingest(document)
+          @sink.ingest(document)
          incoming_ids << document['id']
        end
 
        if Time.now - reporting_cycle_start >= JOB_REPORTING_INTERVAL
-          ElasticConnectorActions.update_sync(job_id, @ingester.ingestion_stats.merge(:metadata => connector_instance.metadata))
+          ElasticConnectorActions.update_sync(job_id, @sink.ingestion_stats.merge(:metadata => connector_instance.metadata))
          reporting_cycle_start = Time.now
        end
      end
@@ -95,15 +103,15 @@ module Core
      Utility::Logger.info("Deleting #{ids_to_delete.size} documents from index #{@connector_settings.index_name}.")
 
      ids_to_delete.each do |id|
-        @ingester.delete(id)
+        @sink.delete(id)
 
        if Time.now - reporting_cycle_start >= JOB_REPORTING_INTERVAL
-          ElasticConnectorActions.update_sync(job_id, @ingester.ingestion_stats.merge(:metadata => connector_instance.metadata))
+          ElasticConnectorActions.update_sync(job_id, @sink.ingestion_stats.merge(:metadata => connector_instance.metadata))
          reporting_cycle_start = Time.now
        end
      end
 
-      @ingester.flush
+      @sink.flush
 
      # We use this mechanism for checking, whether an interrupt (or something else lead to the thread not finishing)
      # occurred as most of the time the main execution thread is interrupted and we miss this Signal/Exception here
@@ -112,7 +120,7 @@ module Core
      @sync_error = e.message
      Utility::ExceptionTracking.log_exception(e)
    ensure
-      stats = @ingester.ingestion_stats
+      stats = @sink.ingestion_stats
 
      Utility::Logger.debug("Sync stats are: #{stats}")
 
@@ -129,7 +137,7 @@ module Core
      end
 
      unless connector_instance.nil?
-        metadata = @ingester.ingestion_stats.merge(:metadata => connector_instance.metadata)
+        metadata = @sink.ingestion_stats.merge(:metadata => connector_instance.metadata)
        metadata[:total_document_count] = ElasticConnectorActions.document_count(@connector_settings.index_name)
      end
 
data/lib/core.rb CHANGED
@@ -16,3 +16,5 @@ require 'core/scheduler'
 require 'core/single_scheduler'
 require 'core/native_scheduler'
 require 'core/sync_job_runner'
+require 'core/jobs/producer'
+require 'core/jobs/consumer'
data/lib/utility/error_monitor.rb ADDED
@@ -0,0 +1,108 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+require 'time'
+require 'utility/errors'
+require 'utility/exception_tracking'
+
+module Utility
+  class ErrorMonitor
+    class MonitoringError < StandardError
+      attr_accessor :tripped_by
+
+      def initialize(message = nil, tripped_by: nil)
+        super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
+        @tripped_by = tripped_by
+      end
+    end
+
+    class MaxSuccessiveErrorsExceededError < MonitoringError; end
+    class MaxErrorsExceededError < MonitoringError; end
+    class MaxErrorsInWindowExceededError < MonitoringError; end
+
+    attr_reader :total_error_count, :success_count, :consecutive_error_count, :error_queue
+
+    def initialize(
+      max_errors: 1000,
+      max_consecutive_errors: 10,
+      max_error_ratio: 0.15,
+      window_size: 100,
+      error_queue_size: 20
+    )
+      @max_errors = max_errors
+      @max_consecutive_errors = max_consecutive_errors
+      @max_error_ratio = max_error_ratio
+      @window_size = window_size
+      @total_error_count = 0
+      @success_count = 0
+      @consecutive_error_count = 0
+      @window_errors = Array.new(window_size) { false }
+      @window_index = 0
+      @last_error = nil
+      @error_queue_size = error_queue_size
+      @error_queue = []
+    end
+
+    def note_success
+      @consecutive_error_count = 0
+      @success_count += 1
+      increment_window_index
+    end
+
+    def note_error(error, id: Time.now.to_i)
+      stack_trace = Utility::ExceptionTracking.generate_stack_trace(error)
+      error_message = Utility::ExceptionTracking.generate_error_message(error, nil, nil)
+      Utility::Logger.debug("Message id: #{id} - #{error_message}\n#{stack_trace}")
+      @total_error_count += 1
+      @consecutive_error_count += 1
+      @window_errors[@window_index] = true
+      @error_queue << DocumentError.new(error.class.name, error_message, stack_trace, id)
+      @error_queue = @error_queue.drop(1) if @error_queue.size > @error_queue_size
+      increment_window_index
+      @last_error = error
+
+      raise_if_necessary
+    end
+
+    def finalize
+      total_documents = @total_error_count + @success_count
+      if total_documents > 0 && @total_error_count.to_f / total_documents > @max_error_ratio
+        raise_with_last_cause(MaxErrorsInWindowExceededError.new("There were #{@total_error_count} errors out of #{total_documents} total documents", :tripped_by => @last_error))
+      end
+    end
+
+    private
+
+    def raise_if_necessary
+      error =
+        if @consecutive_error_count > @max_consecutive_errors
+          MaxSuccessiveErrorsExceededError.new("Exceeded maximum consecutive errors - saw #{@consecutive_error_count} errors in a row.", :tripped_by => @last_error)
+        elsif @total_error_count > @max_errors
+          MaxErrorsExceededError.new("Exceeded maximum number of errors - saw #{@total_error_count} errors in total.", :tripped_by => @last_error)
+        elsif @window_size > 0 && num_errors_in_window / @window_size > @max_error_ratio
+          MaxErrorsInWindowExceededError.new("Exceeded maximum error ratio of #{@max_error_ratio}. Of the last #{@window_size} documents, #{num_errors_in_window} had errors", :tripped_by => @last_error)
+        end
+
+      raise_with_last_cause(error) if error
+    end
+
+    def num_errors_in_window
+      @window_errors.count(&:itself).to_f
+    end
+
+    def increment_window_index
+      @window_index = (@window_index + 1) % @window_size
+    end
+
+    def raise_with_last_cause(error)
+      raise @last_error
+    rescue StandardError
+      raise error
+    end
+  end
+end
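
ErrorMonitor is effectively a circuit breaker with three trip conditions: too many consecutive errors, too many errors in total, and too high an error ratio within a rolling window of the last window_size documents (finalize re-checks the overall ratio at the end of a sync). A small usage sketch with illustrative limits:

    monitor = Utility::ErrorMonitor.new(max_consecutive_errors: 2)

    monitor.note_success
    monitor.note_error(StandardError.new('boom'), id: 'doc-1')
    monitor.note_error(StandardError.new('boom'), id: 'doc-2')

    # The third consecutive failure exceeds the threshold and raises
    # MaxSuccessiveErrorsExceededError, with the last error attached as its cause.
    monitor.note_error(StandardError.new('boom'), id: 'doc-3')
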
data/lib/utility/errors.rb CHANGED
@@ -60,18 +60,6 @@ module Utility
  class JobDocumentLimitError < StandardError; end
  class JobClaimingError < StandardError; end
 
-  class MonitoringError < StandardError
-    attr_accessor :tripped_by
-
-    def initialize(message = nil, tripped_by: nil)
-      super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
-      @tripped_by = tripped_by
-    end
-  end
-  class MaxSuccessiveErrorsExceededError < MonitoringError; end
-  class MaxErrorsExceededError < MonitoringError; end
-  class MaxErrorsInWindowExceededError < MonitoringError; end
-
  class JobSyncNotPossibleYetError < StandardError
    attr_accessor :sync_will_be_possible_at
 