connectors_service 8.6.0.4.pre.20221114T233727Z → 8.6.0.4.pre.20221116T024501Z
- checksums.yaml +4 -4
- data/config/connectors.yml +4 -4
- data/lib/app/app.rb +4 -0
- data/lib/app/dispatcher.rb +30 -17
- data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
- data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
- data/lib/connectors/base/connector.rb +27 -5
- data/lib/connectors/example/connector.rb +3 -12
- data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
- data/lib/connectors/gitlab/connector.rb +3 -12
- data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
- data/lib/connectors/mongodb/connector.rb +9 -24
- data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
- data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
- data/lib/connectors/sync_status.rb +6 -1
- data/lib/connectors/tolerable_error_helper.rb +43 -0
- data/lib/core/connector_job.rb +96 -23
- data/lib/core/connector_settings.rb +29 -6
- data/lib/core/elastic_connector_actions.rb +77 -55
- data/lib/core/filtering/validation_job_runner.rb +1 -1
- data/lib/core/ingestion/es_sink.rb +68 -9
- data/lib/core/ingestion.rb +0 -1
- data/lib/core/jobs/consumer.rb +114 -0
- data/lib/core/jobs/producer.rb +26 -0
- data/lib/core/single_scheduler.rb +1 -1
- data/lib/core/sync_job_runner.rb +20 -12
- data/lib/core.rb +2 -0
- data/lib/utility/error_monitor.rb +108 -0
- data/lib/utility/errors.rb +0 -12
- data/lib/utility/logger.rb +0 -1
- data/lib/utility.rb +6 -0
- metadata +12 -3
- data/lib/core/ingestion/ingester.rb +0 -90
data/lib/core/elastic_connector_actions.rb CHANGED
@@ -132,11 +132,35 @@ module Core
         update_connector_fields(connector_id, { :filtering => filtering })
       end
 
-      def
+      def update_connector_sync_now(connector_id, sync_now)
+        doc = connector_with_concurrency_control(connector_id)
+
+        body = { sync_now: sync_now, last_synced: Time.now }
+
+        update_connector_fields(
+          connector_id,
+          body,
+          doc[:seq_no],
+          doc[:primary_term]
+        )
+      end
+
+      def update_connector_last_sync_status(connector_id, last_sync_status)
+        doc = connector_with_concurrency_control(connector_id)
+
+        update_connector_fields(
+          connector_id,
+          { last_sync_status: last_sync_status },
+          doc[:seq_no],
+          doc[:primary_term]
+        )
+      end
+
+      def connector_with_concurrency_control(connector_id)
         seq_no = nil
         primary_term = nil
-
-
+
+        doc = client.get(
           :index => Utility::Constants::CONNECTORS_INDEX,
           :id => connector_id,
           :ignore => 404,
@@ -144,42 +168,31 @@ module Core
         ).tap do |response|
           seq_no = response['_seq_no']
           primary_term = response['_primary_term']
-          sync_in_progress = response.dig('_source', 'last_sync_status') == Connectors::SyncStatus::IN_PROGRESS
-        end
-        if sync_in_progress
-          raise JobAlreadyRunningError.new(connector_id)
         end
-        update_connector_fields(
-          connector_id,
-          { :sync_now => false,
-            :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
-            :last_synced => Time.now },
-          seq_no,
-          primary_term
-        )
 
+        { doc: doc, seq_no: seq_no, primary_term: primary_term }
+      end
+
+      def create_job(connector_settings:)
         body = {
-          :
-          :
-          :
-          :
-
-
-          :
-          :
+          status: Connectors::SyncStatus::PENDING,
+          created_at: Time.now,
+          last_seen: Time.now,
+          connector: {
+            id: connector_settings.id,
+            filtering: convert_connector_filtering_to_job_filtering(connector_settings.filtering),
+            index_name: connector_settings.index_name,
+            language: connector_settings[:language],
+            pipeline: connector_settings[:pipeline],
+            service_type: connector_settings.service_type
           }
         }
 
-        index_response = client.index(:
-
-
-
-
-          :id => index_response['_id'],
-          :ignore => 404
-        ).with_indifferent_access
-        end
-        raise JobNotCreatedError.new(connector_id, index_response)
+        index_response = client.index(index: Utility::Constants::JOB_INDEX, body: body, refresh: true)
+
+        return index_response if index_response['result'] == 'created'
+
+        raise JobNotCreatedError.new(connector_settings.id, index_response)
       end
 
       def convert_connector_filtering_to_job_filtering(connector_filtering)
@@ -507,31 +520,15 @@ module Core
       end
 
       def update_connector_fields(connector_id, doc = {}, seq_no = nil, primary_term = nil)
-
-
-
-
-
-          :refresh => true,
-          :retry_on_conflict => 3
-        }
-        # seq_no and primary_term are used for optimistic concurrency control
-        # see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
-        if seq_no && primary_term
-          update_args[:if_seq_no] = seq_no
-          update_args[:if_primary_term] = primary_term
-          update_args.delete(:retry_on_conflict)
-        end
-        begin
-          client.update(update_args)
-        rescue Elastic::Transport::Transport::Errors::Conflict
-          # VersionConflictException
-          # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
-          raise ConnectorVersionChangedError.new(connector_id, seq_no, primary_term)
-        end
+        update_doc_fields(Utility::Constants::CONNECTORS_INDEX, connector_id, doc, seq_no, primary_term)
+      end
+
+      def update_job_fields(job_id, doc = {}, seq_no = nil, primary_term = nil)
+        update_doc_fields(Utility::Constants::JOB_INDEX, job_id, doc, seq_no, primary_term)
       end
 
       def document_count(index_name)
+        client.indices.refresh(:index => index_name)
         client.count(:index => index_name)['count']
       end
 
@@ -563,6 +560,31 @@ module Core
           filter.deep_merge!(new_validation_state)
         end
       end
+
+      def update_doc_fields(index, id, doc = {}, seq_no = nil, primary_term = nil)
+        return if doc.empty?
+        update_args = {
+          :index => index,
+          :id => id,
+          :body => { :doc => doc },
+          :refresh => true,
+          :retry_on_conflict => 3
+        }
+
+        if seq_no && primary_term
+          update_args[:if_seq_no] = seq_no
+          update_args[:if_primary_term] = primary_term
+          update_args.delete(:retry_on_conflict)
+        end
+
+        begin
+          client.update(update_args)
+        rescue Elastic::Transport::Transport::Errors::Conflict
+          # VersionConflictException
+          # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
+          raise ConnectorVersionChangedError.new(id, seq_no, primary_term)
+        end
+      end
     end
   end
 end
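update_doc_fields centralizes the optimistic-concurrency pattern that update_connector_fields previously inlined: the update carries the _seq_no and _primary_term read earlier, and Elasticsearch rejects it with a 409 Conflict if another writer changed the document in between. A minimal standalone sketch of the same pattern against a plain Elasticsearch Ruby client follows; the index name, document id, and updated field are illustrative only.

require 'elasticsearch'

client = Elasticsearch::Client.new(url: 'http://localhost:9200')

# Read the document together with its concurrency-control metadata.
response = client.get(index: '.elastic-connectors', id: 'connector-1')
seq_no = response['_seq_no']
primary_term = response['_primary_term']

begin
  # The update succeeds only if nobody wrote the document since we read it;
  # otherwise Elasticsearch answers 409 Conflict instead of clobbering.
  client.update(
    index: '.elastic-connectors',
    id: 'connector-1',
    body: { doc: { last_sync_status: 'in_progress' } },
    if_seq_no: seq_no,
    if_primary_term: primary_term,
    refresh: true
  )
rescue Elastic::Transport::Transport::Errors::Conflict
  # Another writer won the race; re-read and retry, or give up. This is the
  # condition update_doc_fields surfaces as ConnectorVersionChangedError.
end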
data/lib/core/filtering/validation_job_runner.rb CHANGED
@@ -24,7 +24,7 @@ module Core
     def execute
       Utility::Logger.info("Starting a validation job for connector #{@connector_settings.id}.")
 
-      validation_result = @connector_class.validate_filtering(@connector_settings.filtering)
+      validation_result = @connector_class.validate_filtering(@connector_settings.filtering[:draft])
 
       # currently only used for connectors -> DEFAULT domain can be assumed (will be changed with the integration of crawler)
       ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_result })
data/lib/core/ingestion/es_sink.rb CHANGED
@@ -11,19 +11,54 @@ require 'utility/bulk_queue'
 require 'utility/es_client'
 require 'utility/logger'
 require 'elasticsearch/api'
-
+#
+# This class is responsible for sending the data to the data storage.
+# While we don't actually allow to output our data anywhere except
+# Elasticsearch, we still want to be able to do so sometime in future.
+#
+# This class should stay simple and any change to the class should be careful
+# with the thought of introducing other sinks in future.
 module Core
   module Ingestion
     class EsSink
-      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new)
+      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new, max_allowed_document_size = 5 * 1024 * 1024)
         @client = Utility::EsClient.new(App::Config[:elasticsearch])
         @index_name = index_name
         @request_pipeline = request_pipeline
         @operation_queue = bulk_queue
+
+        @max_allowed_document_size = max_allowed_document_size
+
+        @queued = {
+          :indexed_document_count => 0,
+          :deleted_document_count => 0,
+          :indexed_document_volume => 0
+        }
+
+        @completed = {
+          :indexed_document_count => 0,
+          :deleted_document_count => 0,
+          :indexed_document_volume => 0
+        }
       end
 
-      def ingest(
-
+      def ingest(document)
+        if document.nil? || document.empty?
+          Utility::Logger.warn('Connector attempted to ingest an empty document, skipping')
+          return
+        end
+
+        id = document['id']
+        serialized_document = serialize(document)
+
+        document_size = serialized_document.bytesize
+
+        if @max_allowed_document_size > 0 && document_size > @max_allowed_document_size
+          Utility::Logger.warn("Connector attempted to ingest too large document with id=#{document['id']} [#{document_size}/#{@max_allowed_document_size}], skipping the document.")
+          return
+        end
+
+        index_op = serialize({ 'index' => { '_index' => @index_name, '_id' => id } })
 
         flush unless @operation_queue.will_fit?(index_op, serialized_document)
 
@@ -31,13 +66,27 @@ module Core
           index_op,
           serialized_document
         )
+
+        @queued[:indexed_document_count] += 1
+        @queued[:indexed_document_volume] += document_size
+      end
+
+      def ingest_multiple(documents)
+        documents.each { |doc| ingest(doc) }
       end
 
-      def delete(
-
+      def delete(id)
+        return if id.nil?
+
+        delete_op = serialize({ 'delete' => { '_index' => @index_name, '_id' => id } })
         flush unless @operation_queue.will_fit?(delete_op)
 
         @operation_queue.add(delete_op)
+        @queued[:deleted_document_count] += 1
+      end
+
+      def delete_multiple(ids)
+        ids.each { |id| delete(id) }
       end
 
       def flush
@@ -45,15 +94,25 @@ module Core
         return if data.empty?
 
         @client.bulk(:body => data, :pipeline => @request_pipeline)
+
+        @completed[:indexed_document_count] += @queued[:indexed_document_count]
+        @completed[:deleted_document_count] += @queued[:deleted_document_count]
+        @completed[:indexed_document_volume] += @queued[:indexed_document_volume]
+
+        @queued[:indexed_document_count] = 0
+        @queued[:deleted_document_count] = 0
+        @queued[:indexed_document_volume] = 0
       end
 
-      def
-
+      def ingestion_stats
+        @completed.dup
       end
 
       private
 
-
+      def serialize(document)
+        Elasticsearch::API.serializer.dump(document)
+      end
     end
   end
 end
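With these changes the sink buffers index and delete operations, counts them in @queued, and promotes the counters into @completed only after a successful bulk flush, so ingestion_stats never reports documents that were not actually sent. A hypothetical usage sketch follows; the index and pipeline names are made up, and it assumes the gem's classes and its App::Config are already loaded.

sink = Core::Ingestion::EsSink.new('search-example-index', 'ent-search-generic-ingestion')

sink.ingest('id' => '1', 'title' => 'Hello')  # queued; not yet visible in stats
sink.delete('stale-document-id')              # queued delete
sink.flush                                    # one bulk request, then counters move to @completed

sink.ingestion_stats
# => { :indexed_document_count => 1, :deleted_document_count => 1, :indexed_document_volume => ... }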
data/lib/core/jobs/consumer.rb ADDED
@@ -0,0 +1,114 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+module Core
+  module Jobs
+    class Consumer
+      def initialize(scheduler:, poll_interval: 3, termination_timeout: 60, min_threads: 1, max_threads: 5, max_queue: 100, idle_time: 5)
+        @scheduler = scheduler
+        @poll_interval = poll_interval
+        @termination_timeout = termination_timeout
+        @min_threads = min_threads
+        @max_threads = max_threads
+        @max_queue = max_queue
+        @idle_time = idle_time
+
+        @running = Concurrent::AtomicBoolean.new(false)
+      end
+
+      def subscribe!(index_name:)
+        @index_name = index_name
+
+        start_loop!
+      end
+
+      def running?
+        # @TODO check if a loop thread is alive
+        pool.running? && @running.true?
+      end
+
+      def shutdown!
+        Utility::Logger.info("Shutting down consumer for #{@index_name} index")
+        @running.make_false
+        pool.shutdown
+        pool.wait_for_termination(@termination_timeout)
+        # reset pool
+        @pool = nil
+      end
+
+      private
+
+      def start_loop!
+        Utility::Logger.info("Starting a new consumer for #{@index_name} index")
+
+        Thread.new do
+          # assign a name to the thread
+          # see @TODO in #self.running?
+          Thread.current[:name] = "consumer-group-#{@index_name}"
+
+          loop do
+            if @running.false?
+              Utility::Logger.info('Shutting down the loop')
+              break
+            end
+
+            sleep(@poll_interval)
+            Utility::Logger.debug('Getting registered connectors')
+
+            connectors = ready_for_sync_connectors
+            next unless connectors.any?
+
+            Utility::Logger.debug("Number of available connectors: #{connectors.size}")
+
+            # @TODO It is assumed that @index_name is used to retrive pending jobs.
+            # This will be discussed after 8.6 release
+            pending_jobs = Core::ConnectorJob.pending_jobs(connectors_ids: connectors.keys)
+            Utility::Logger.info("Number of pending jobs: #{pending_jobs.size}")
+
+            pending_jobs.each do |job|
+              connector_settings = connectors[job.connector_id]
+
+              pool.post do
+                Utility::Logger.info("Connector #{connector_settings.formatted} picked up the job #{job.id}")
+                Core::ElasticConnectorActions.ensure_content_index_exists(connector_settings.index_name)
+                job_runner = Core::SyncJobRunner.new(connector_settings, job)
+                job_runner.execute
+              rescue Core::JobAlreadyRunningError
+                Utility::Logger.info("Sync job for #{connector_settings.formatted} is already running, skipping.")
+              rescue Core::ConnectorVersionChangedError => e
+                Utility::Logger.info("Could not start the job because #{connector_settings.formatted} has been updated externally. Message: #{e.message}")
+              rescue StandardError => e
+                Utility::ExceptionTracking.log_exception(e, "Sync job for #{connector_settings.formatted} failed due to unexpected error.")
+              end
+            end
+          rescue StandardError => e
+            Utility::ExceptionTracking.log_exception(e, 'The consumer group failed')
+          end
+        end
+
+        @running.make_true
+      end
+
+      def pool
+        @pool ||= Concurrent::ThreadPoolExecutor.new(
+          min_threads: @min_threads,
+          max_threads: @max_threads,
+          max_queue: @max_queue,
+          fallback_policy: :abort,
+          idletime: @idle_time
+        )
+      end
+
+      def ready_for_sync_connectors
+        @scheduler.connector_settings
+                  .select(&:ready_for_sync?)
+                  .inject({}) { |memo, cs| memo.merge(cs.id => cs) }
+      end
+    end
+  end
+end
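The consumer polls for pending jobs on a background thread and hands each one to a bounded thread pool. A rough wiring sketch follows; the consumer only needs a scheduler that responds to #connector_settings, so a small stand-in object is used here, and the job index name is an assumption rather than the gem's documented API.

# Any object exposing #connector_settings can act as the scheduler.
FakeScheduler = Struct.new(:settings) do
  def connector_settings
    settings
  end
end

consumer = Core::Jobs::Consumer.new(
  scheduler: FakeScheduler.new([]), # in the service this is a real scheduler
  poll_interval: 5,
  max_threads: 2
)

consumer.subscribe!(index_name: '.elastic-connectors-sync-jobs')
consumer.running? # => true once the loop thread has started
# ... later, during shutdown:
consumer.shutdown!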
data/lib/core/jobs/producer.rb ADDED
@@ -0,0 +1,26 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+module Core
+  module Jobs
+    class Producer
+      JOB_TYPES = %i(sync).freeze
+
+      class << self
+        def enqueue_job(job_type:, connector_settings:)
+          raise UnsupportedJobType unless JOB_TYPES.include?(job_type)
+          raise ArgumentError unless connector_settings.kind_of?(ConnectorSettings)
+
+          ElasticConnectorActions.create_job(connector_settings: connector_settings)
+        end
+      end
+    end
+
+    class UnsupportedJobType < StandardError; end
+  end
+end
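Producer is intentionally thin: it validates the job type and the settings object, then delegates to ElasticConnectorActions.create_job, which writes a pending job document into the job index. A hypothetical call site (the connector id is illustrative):

connector_settings = Core::ConnectorSettings.fetch_by_id('my-connector-id')

# Raises Core::Jobs::UnsupportedJobType for anything other than :sync, and
# ArgumentError if the second argument is not a ConnectorSettings instance.
Core::Jobs::Producer.enqueue_job(job_type: :sync, connector_settings: connector_settings)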
data/lib/core/single_scheduler.rb CHANGED
@@ -20,7 +20,7 @@ module Core
 
     def connector_settings
       connector_settings = Core::ConnectorSettings.fetch_by_id(@connector_id)
-      [connector_settings]
+      [connector_settings].compact
     rescue *Utility::AUTHORIZATION_ERRORS => e
       # should be handled by the general scheduler
       raise e
data/lib/core/sync_job_runner.rb CHANGED
@@ -23,9 +23,9 @@ module Core
   class SyncJobRunner
     JOB_REPORTING_INTERVAL = 10
 
-    def initialize(connector_settings)
+    def initialize(connector_settings, job)
       @connector_settings = connector_settings
-      @
+      @sink = Core::Ingestion::EsSink.new(connector_settings.index_name, @connector_settings.request_pipeline)
       @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
       @sync_finished = false
       @sync_error = nil
@@ -35,6 +35,7 @@ module Core
         :indexed_document_volume => 0,
         :error => nil
       }
+      @job = job
     end
 
     def execute
@@ -47,9 +48,16 @@ module Core
     def do_sync!
       Utility::Logger.info("Claiming a sync job for connector #{@connector_settings.id}.")
 
-
-
-
+      # connector service doesn't support multiple jobs running simultaneously
+      raise Core::JobAlreadyRunningError.new(@connector_settings.id) if @connector_settings.running?
+
+      Core::ElasticConnectorActions.update_connector_last_sync_status(@connector_settings.id, Connectors::SyncStatus::IN_PROGRESS)
+
+      # claim the job
+      @job.make_running!
+
+      job_description = @job.es_source
+      job_id = @job.id
       job_description['_id'] = job_id
 
       unless job_id.present?
@@ -80,12 +88,12 @@ module Core
         document = add_ingest_metadata(document)
         post_process_result = post_processing_engine.process(document)
         if post_process_result.is_include?
-          @
+          @sink.ingest(document)
           incoming_ids << document['id']
         end
 
         if Time.now - reporting_cycle_start >= JOB_REPORTING_INTERVAL
-          ElasticConnectorActions.update_sync(job_id, @
+          ElasticConnectorActions.update_sync(job_id, @sink.ingestion_stats.merge(:metadata => connector_instance.metadata))
           reporting_cycle_start = Time.now
         end
       end
@@ -95,15 +103,15 @@ module Core
       Utility::Logger.info("Deleting #{ids_to_delete.size} documents from index #{@connector_settings.index_name}.")
 
       ids_to_delete.each do |id|
-        @
+        @sink.delete(id)
 
         if Time.now - reporting_cycle_start >= JOB_REPORTING_INTERVAL
-          ElasticConnectorActions.update_sync(job_id, @
+          ElasticConnectorActions.update_sync(job_id, @sink.ingestion_stats.merge(:metadata => connector_instance.metadata))
           reporting_cycle_start = Time.now
         end
       end
 
-      @
+      @sink.flush
 
      # We use this mechanism for checking, whether an interrupt (or something else lead to the thread not finishing)
      # occurred as most of the time the main execution thread is interrupted and we miss this Signal/Exception here
@@ -112,7 +120,7 @@ module Core
       @sync_error = e.message
       Utility::ExceptionTracking.log_exception(e)
     ensure
-      stats = @
+      stats = @sink.ingestion_stats
 
       Utility::Logger.debug("Sync stats are: #{stats}")
 
@@ -129,7 +137,7 @@ module Core
      end
 
      unless connector_instance.nil?
-        metadata = @
+        metadata = @sink.ingestion_stats.merge(:metadata => connector_instance.metadata)
        metadata[:total_document_count] = ElasticConnectorActions.document_count(@connector_settings.index_name)
      end
 
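The claim sequence in do_sync! is now explicit: bail out if the connector already reports a running sync, move the connector's last_sync_status to IN_PROGRESS (guarded by seq_no/primary_term via connector_with_concurrency_control), then mark the job document itself as running. Condensed, the ordering looks like this, assuming a connector_settings and job pair handed over by the consumer:

raise Core::JobAlreadyRunningError.new(connector_settings.id) if connector_settings.running?

# Concurrency-controlled update: two workers cannot both move the same
# connector into IN_PROGRESS; the loser gets ConnectorVersionChangedError.
Core::ElasticConnectorActions.update_connector_last_sync_status(
  connector_settings.id, Connectors::SyncStatus::IN_PROGRESS
)

job.make_running! # finally claim the job document itself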
data/lib/utility/error_monitor.rb ADDED
@@ -0,0 +1,108 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+require 'time'
+require 'utility/errors'
+require 'utility/exception_tracking'
+
+module Utility
+  class ErrorMonitor
+    class MonitoringError < StandardError
+      attr_accessor :tripped_by
+
+      def initialize(message = nil, tripped_by: nil)
+        super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
+        @tripped_by = tripped_by
+      end
+    end
+
+    class MaxSuccessiveErrorsExceededError < MonitoringError; end
+    class MaxErrorsExceededError < MonitoringError; end
+    class MaxErrorsInWindowExceededError < MonitoringError; end
+
+    attr_reader :total_error_count, :success_count, :consecutive_error_count, :error_queue
+
+    def initialize(
+      max_errors: 1000,
+      max_consecutive_errors: 10,
+      max_error_ratio: 0.15,
+      window_size: 100,
+      error_queue_size: 20
+    )
+      @max_errors = max_errors
+      @max_consecutive_errors = max_consecutive_errors
+      @max_error_ratio = max_error_ratio
+      @window_size = window_size
+      @total_error_count = 0
+      @success_count = 0
+      @consecutive_error_count = 0
+      @window_errors = Array.new(window_size) { false }
+      @window_index = 0
+      @last_error = nil
+      @error_queue_size = error_queue_size
+      @error_queue = []
+    end
+
+    def note_success
+      @consecutive_error_count = 0
+      @success_count += 1
+      increment_window_index
+    end
+
+    def note_error(error, id: Time.now.to_i)
+      stack_trace = Utility::ExceptionTracking.generate_stack_trace(error)
+      error_message = Utility::ExceptionTracking.generate_error_message(error, nil, nil)
+      Utility::Logger.debug("Message id: #{id} - #{error_message}\n#{stack_trace}")
+      @total_error_count += 1
+      @consecutive_error_count += 1
+      @window_errors[@window_index] = true
+      @error_queue << DocumentError.new(error.class.name, error_message, stack_trace, id)
+      @error_queue = @error_queue.drop(1) if @error_queue.size > @error_queue_size
+      increment_window_index
+      @last_error = error
+
+      raise_if_necessary
+    end
+
+    def finalize
+      total_documents = @total_error_count + @success_count
+      if total_documents > 0 && @total_error_count.to_f / total_documents > @max_error_ratio
+        raise_with_last_cause(MaxErrorsInWindowExceededError.new("There were #{@total_error_count} errors out of #{total_documents} total documents", :tripped_by => @last_error))
+      end
+    end
+
+    private
+
+    def raise_if_necessary
+      error =
+        if @consecutive_error_count > @max_consecutive_errors
+          MaxSuccessiveErrorsExceededError.new("Exceeded maximum consecutive errors - saw #{@consecutive_error_count} errors in a row.", :tripped_by => @last_error)
+        elsif @total_error_count > @max_errors
+          MaxErrorsExceededError.new("Exceeded maximum number of errors - saw #{@total_error_count} errors in total.", :tripped_by => @last_error)
+        elsif @window_size > 0 && num_errors_in_window / @window_size > @max_error_ratio
+          MaxErrorsInWindowExceededError.new("Exceeded maximum error ratio of #{@max_error_ratio}. Of the last #{@window_size} documents, #{num_errors_in_window} had errors", :tripped_by => @last_error)
+        end
+
+      raise_with_last_cause(error) if error
+    end
+
+    def num_errors_in_window
+      @window_errors.count(&:itself).to_f
+    end
+
+    def increment_window_index
+      @window_index = (@window_index + 1) % @window_size
+    end
+
+    def raise_with_last_cause(error)
+      raise @last_error
+    rescue StandardError
+      raise error
+    end
+  end
+end
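ErrorMonitor trips on three independent conditions: too many consecutive errors, too many errors in total, or too high an error ratio within a sliding window of recent documents; finalize then enforces the ratio over the whole run. A hypothetical ingest loop showing the intended call pattern (documents and index_document are stand-ins, not gem API):

monitor = Utility::ErrorMonitor.new(max_consecutive_errors: 3, max_error_ratio: 0.1)

documents.each do |document|
  index_document(document)
  monitor.note_success
rescue StandardError => e
  # Once a threshold trips this raises MaxSuccessiveErrorsExceededError (or a
  # sibling), chaining the original error as the cause via raise_with_last_cause.
  monitor.note_error(e, id: document['id'])
end

monitor.finalize # raises if the overall error ratio exceeded max_error_ratio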
data/lib/utility/errors.rb CHANGED
@@ -60,18 +60,6 @@ module Utility
   class JobDocumentLimitError < StandardError; end
   class JobClaimingError < StandardError; end
 
-  class MonitoringError < StandardError
-    attr_accessor :tripped_by
-
-    def initialize(message = nil, tripped_by: nil)
-      super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
-      @tripped_by = tripped_by
-    end
-  end
-  class MaxSuccessiveErrorsExceededError < MonitoringError; end
-  class MaxErrorsExceededError < MonitoringError; end
-  class MaxErrorsInWindowExceededError < MonitoringError; end
-
 
   class JobSyncNotPossibleYetError < StandardError
     attr_accessor :sync_will_be_possible_at