connectors_service 8.6.0.4.pre.20221114T233727Z → 8.6.0.4.pre.20221116T024501Z
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/config/connectors.yml +4 -4
- data/lib/app/app.rb +4 -0
- data/lib/app/dispatcher.rb +30 -17
- data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
- data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
- data/lib/connectors/base/connector.rb +27 -5
- data/lib/connectors/example/connector.rb +3 -12
- data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
- data/lib/connectors/gitlab/connector.rb +3 -12
- data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
- data/lib/connectors/mongodb/connector.rb +9 -24
- data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
- data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
- data/lib/connectors/sync_status.rb +6 -1
- data/lib/connectors/tolerable_error_helper.rb +43 -0
- data/lib/core/connector_job.rb +96 -23
- data/lib/core/connector_settings.rb +29 -6
- data/lib/core/elastic_connector_actions.rb +77 -55
- data/lib/core/filtering/validation_job_runner.rb +1 -1
- data/lib/core/ingestion/es_sink.rb +68 -9
- data/lib/core/ingestion.rb +0 -1
- data/lib/core/jobs/consumer.rb +114 -0
- data/lib/core/jobs/producer.rb +26 -0
- data/lib/core/single_scheduler.rb +1 -1
- data/lib/core/sync_job_runner.rb +20 -12
- data/lib/core.rb +2 -0
- data/lib/utility/error_monitor.rb +108 -0
- data/lib/utility/errors.rb +0 -12
- data/lib/utility/logger.rb +0 -1
- data/lib/utility.rb +6 -0
- metadata +12 -3
- data/lib/core/ingestion/ingester.rb +0 -90
data/lib/core/elastic_connector_actions.rb CHANGED
@@ -132,11 +132,35 @@ module Core
         update_connector_fields(connector_id, { :filtering => filtering })
       end
 
-      def
+      def update_connector_sync_now(connector_id, sync_now)
+        doc = connector_with_concurrency_control(connector_id)
+
+        body = { sync_now: sync_now, last_synced: Time.now }
+
+        update_connector_fields(
+          connector_id,
+          body,
+          doc[:seq_no],
+          doc[:primary_term]
+        )
+      end
+
+      def update_connector_last_sync_status(connector_id, last_sync_status)
+        doc = connector_with_concurrency_control(connector_id)
+
+        update_connector_fields(
+          connector_id,
+          { last_sync_status: last_sync_status },
+          doc[:seq_no],
+          doc[:primary_term]
+        )
+      end
+
+      def connector_with_concurrency_control(connector_id)
         seq_no = nil
         primary_term = nil
-
-
+
+        doc = client.get(
           :index => Utility::Constants::CONNECTORS_INDEX,
           :id => connector_id,
           :ignore => 404,
@@ -144,42 +168,31 @@ module Core
         ).tap do |response|
           seq_no = response['_seq_no']
           primary_term = response['_primary_term']
-          sync_in_progress = response.dig('_source', 'last_sync_status') == Connectors::SyncStatus::IN_PROGRESS
-        end
-        if sync_in_progress
-          raise JobAlreadyRunningError.new(connector_id)
         end
-        update_connector_fields(
-          connector_id,
-          { :sync_now => false,
-            :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
-            :last_synced => Time.now },
-          seq_no,
-          primary_term
-        )
 
+        { doc: doc, seq_no: seq_no, primary_term: primary_term }
+      end
+
+      def create_job(connector_settings:)
         body = {
-          :
-          :
-          :
-          :
-
-
-          :
-          :
+          status: Connectors::SyncStatus::PENDING,
+          created_at: Time.now,
+          last_seen: Time.now,
+          connector: {
+            id: connector_settings.id,
+            filtering: convert_connector_filtering_to_job_filtering(connector_settings.filtering),
+            index_name: connector_settings.index_name,
+            language: connector_settings[:language],
+            pipeline: connector_settings[:pipeline],
+            service_type: connector_settings.service_type
           }
         }
 
-        index_response = client.index(:
-
-
-
-
-          :id => index_response['_id'],
-          :ignore => 404
-        ).with_indifferent_access
-        end
-        raise JobNotCreatedError.new(connector_id, index_response)
+        index_response = client.index(index: Utility::Constants::JOB_INDEX, body: body, refresh: true)
+
+        return index_response if index_response['result'] == 'created'
+
+        raise JobNotCreatedError.new(connector_settings.id, index_response)
       end
 
       def convert_connector_filtering_to_job_filtering(connector_filtering)
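For reference, the job document that the new `create_job` writes looks roughly like the sketch below. All values are illustrative placeholders, not taken from the gem: the real status comes from `Connectors::SyncStatus::PENDING`, the connector fields from the passed-in `ConnectorSettings`, and the target index from `Utility::Constants::JOB_INDEX`.

```ruby
# Illustrative sketch of the indexed job document; values are placeholders.
{
  status: 'pending',        # Connectors::SyncStatus::PENDING
  created_at: Time.now,
  last_seen: Time.now,
  connector: {
    id: 'connector-id-1',
    filtering: [],          # via convert_connector_filtering_to_job_filtering
    index_name: 'search-my-index',
    language: 'en',
    pipeline: { 'name' => 'ent-search-generic-ingestion' },
    service_type: 'example'
  }
}
```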
@@ -507,31 +520,15 @@ module Core
       end
 
       def update_connector_fields(connector_id, doc = {}, seq_no = nil, primary_term = nil)
-
-
-
-
-
-          :refresh => true,
-          :retry_on_conflict => 3
-        }
-        # seq_no and primary_term are used for optimistic concurrency control
-        # see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
-        if seq_no && primary_term
-          update_args[:if_seq_no] = seq_no
-          update_args[:if_primary_term] = primary_term
-          update_args.delete(:retry_on_conflict)
-        end
-        begin
-          client.update(update_args)
-        rescue Elastic::Transport::Transport::Errors::Conflict
-          # VersionConflictException
-          # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
-          raise ConnectorVersionChangedError.new(connector_id, seq_no, primary_term)
-        end
+        update_doc_fields(Utility::Constants::CONNECTORS_INDEX, connector_id, doc, seq_no, primary_term)
+      end
+
+      def update_job_fields(job_id, doc = {}, seq_no = nil, primary_term = nil)
+        update_doc_fields(Utility::Constants::JOB_INDEX, job_id, doc, seq_no, primary_term)
       end
 
       def document_count(index_name)
+        client.indices.refresh(:index => index_name)
         client.count(:index => index_name)['count']
       end
 
@@ -563,6 +560,31 @@ module Core
           filter.deep_merge!(new_validation_state)
         end
       end
+
+      def update_doc_fields(index, id, doc = {}, seq_no = nil, primary_term = nil)
+        return if doc.empty?
+        update_args = {
+          :index => index,
+          :id => id,
+          :body => { :doc => doc },
+          :refresh => true,
+          :retry_on_conflict => 3
+        }
+
+        if seq_no && primary_term
+          update_args[:if_seq_no] = seq_no
+          update_args[:if_primary_term] = primary_term
+          update_args.delete(:retry_on_conflict)
+        end
+
+        begin
+          client.update(update_args)
+        rescue Elastic::Transport::Transport::Errors::Conflict
+          # VersionConflictException
+          # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
+          raise ConnectorVersionChangedError.new(id, seq_no, primary_term)
+        end
+      end
     end
   end
 end
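The extracted `update_doc_fields` helper is the standard Elasticsearch optimistic-concurrency pattern: read `_seq_no`/`_primary_term` along with the document, then make the update conditional on both. A minimal standalone sketch of the same idea with the `elasticsearch` Ruby client (URL, index name, and document id are placeholders):

```ruby
require 'elasticsearch'

client = Elasticsearch::Client.new(url: 'http://localhost:9200')

# Read the document together with its concurrency-control metadata.
doc = client.get(index: '.elastic-connectors', id: 'connector-id-1')

begin
  client.update(
    index: '.elastic-connectors',
    id: 'connector-id-1',
    body: { doc: { last_sync_status: 'in_progress' } },
    # The update succeeds only if no other writer touched the document
    # since our read; otherwise Elasticsearch answers 409 Conflict.
    if_seq_no: doc['_seq_no'],
    if_primary_term: doc['_primary_term']
  )
rescue Elastic::Transport::Transport::Errors::Conflict
  # Another process won the race - the analogue of ConnectorVersionChangedError above.
end
```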
data/lib/core/filtering/validation_job_runner.rb CHANGED
@@ -24,7 +24,7 @@ module Core
     def execute
       Utility::Logger.info("Starting a validation job for connector #{@connector_settings.id}.")
 
-      validation_result = @connector_class.validate_filtering(@connector_settings.filtering)
+      validation_result = @connector_class.validate_filtering(@connector_settings.filtering[:draft])
 
       # currently only used for connectors -> DEFAULT domain can be assumed (will be changed with the integration of crawler)
       ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_result })
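The validation runner now validates only the draft filter instead of the whole filtering object. A sketch of the shape this call site implies (field names inferred from the connectors filtering protocol, not quoted from the gem):

```ruby
# Hypothetical filtering object: validation now targets only :draft.
filtering = {
  :active => { :advanced_snippet => { :value => {} }, :rules => [] },
  :draft  => { :advanced_snippet => { :value => { 'find' => {} } }, :rules => [] }
}

validation_result = connector_class.validate_filtering(filtering[:draft])
```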
data/lib/core/ingestion/es_sink.rb CHANGED
@@ -11,19 +11,54 @@ require 'utility/bulk_queue'
 require 'utility/es_client'
 require 'utility/logger'
 require 'elasticsearch/api'
-
+#
+# This class is responsible for sending the data to the data storage.
+# While we don't actually allow to output our data anywhere except
+# Elasticsearch, we still want to be able to do so sometime in future.
+#
+# This class should stay simple and any change to the class should be careful
+# with the thought of introducing other sinks in future.
 module Core
   module Ingestion
     class EsSink
-      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new)
+      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new, max_allowed_document_size = 5 * 1024 * 1024)
         @client = Utility::EsClient.new(App::Config[:elasticsearch])
         @index_name = index_name
         @request_pipeline = request_pipeline
         @operation_queue = bulk_queue
+
+        @max_allowed_document_size = max_allowed_document_size
+
+        @queued = {
+          :indexed_document_count => 0,
+          :deleted_document_count => 0,
+          :indexed_document_volume => 0
+        }
+
+        @completed = {
+          :indexed_document_count => 0,
+          :deleted_document_count => 0,
+          :indexed_document_volume => 0
+        }
       end
 
-      def ingest(
-
+      def ingest(document)
+        if document.nil? || document.empty?
+          Utility::Logger.warn('Connector attempted to ingest an empty document, skipping')
+          return
+        end
+
+        id = document['id']
+        serialized_document = serialize(document)
+
+        document_size = serialized_document.bytesize
+
+        if @max_allowed_document_size > 0 && document_size > @max_allowed_document_size
+          Utility::Logger.warn("Connector attempted to ingest too large document with id=#{document['id']} [#{document_size}/#{@max_allowed_document_size}], skipping the document.")
+          return
+        end
+
+        index_op = serialize({ 'index' => { '_index' => @index_name, '_id' => id } })
 
         flush unless @operation_queue.will_fit?(index_op, serialized_document)
 
@@ -31,13 +66,27 @@ module Core
           index_op,
           serialized_document
         )
+
+        @queued[:indexed_document_count] += 1
+        @queued[:indexed_document_volume] += document_size
+      end
+
+      def ingest_multiple(documents)
+        documents.each { |doc| ingest(doc) }
       end
 
-      def delete(
-
+      def delete(id)
+        return if id.nil?
+
+        delete_op = serialize({ 'delete' => { '_index' => @index_name, '_id' => id } })
         flush unless @operation_queue.will_fit?(delete_op)
 
         @operation_queue.add(delete_op)
+        @queued[:deleted_document_count] += 1
+      end
+
+      def delete_multiple(ids)
+        ids.each { |id| delete(id) }
       end
 
       def flush
@@ -45,15 +94,25 @@ module Core
         return if data.empty?
 
         @client.bulk(:body => data, :pipeline => @request_pipeline)
+
+        @completed[:indexed_document_count] += @queued[:indexed_document_count]
+        @completed[:deleted_document_count] += @queued[:deleted_document_count]
+        @completed[:indexed_document_volume] += @queued[:indexed_document_volume]
+
+        @queued[:indexed_document_count] = 0
+        @queued[:deleted_document_count] = 0
+        @queued[:indexed_document_volume] = 0
       end
 
-      def
-
+      def ingestion_stats
+        @completed.dup
       end
 
       private
 
-
+      def serialize(document)
+        Elasticsearch::API.serializer.dump(document)
+      end
     end
   end
 end
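Taken together, the sink now tracks per-flush (`@queued`) and lifetime (`@completed`) counters. A hedged usage sketch, assuming a configured connector service process (index and pipeline names are placeholders):

```ruby
sink = Core::Ingestion::EsSink.new('search-my-index', 'ent-search-generic-ingestion')

sink.ingest('id' => '1', 'title' => 'Hello')  # buffered; counted in @queued
sink.delete('stale-id')                       # buffered delete operation
sink.flush                                    # one _bulk request; @queued rolls into @completed

sink.ingestion_stats
# => { :indexed_document_count => 1, :deleted_document_count => 1, :indexed_document_volume => ... }
```

Note that `ingestion_stats` reports only completed (flushed) work, so progress reported mid-sync never counts documents still sitting in the bulk queue.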
data/lib/core/jobs/consumer.rb ADDED
@@ -0,0 +1,114 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+module Core
+  module Jobs
+    class Consumer
+      def initialize(scheduler:, poll_interval: 3, termination_timeout: 60, min_threads: 1, max_threads: 5, max_queue: 100, idle_time: 5)
+        @scheduler = scheduler
+        @poll_interval = poll_interval
+        @termination_timeout = termination_timeout
+        @min_threads = min_threads
+        @max_threads = max_threads
+        @max_queue = max_queue
+        @idle_time = idle_time
+
+        @running = Concurrent::AtomicBoolean.new(false)
+      end
+
+      def subscribe!(index_name:)
+        @index_name = index_name
+
+        start_loop!
+      end
+
+      def running?
+        # @TODO check if a loop thread is alive
+        pool.running? && @running.true?
+      end
+
+      def shutdown!
+        Utility::Logger.info("Shutting down consumer for #{@index_name} index")
+        @running.make_false
+        pool.shutdown
+        pool.wait_for_termination(@termination_timeout)
+        # reset pool
+        @pool = nil
+      end
+
+      private
+
+      def start_loop!
+        Utility::Logger.info("Starting a new consumer for #{@index_name} index")
+
+        Thread.new do
+          # assign a name to the thread
+          # see @TODO in #self.running?
+          Thread.current[:name] = "consumer-group-#{@index_name}"
+
+          loop do
+            if @running.false?
+              Utility::Logger.info('Shutting down the loop')
+              break
+            end
+
+            sleep(@poll_interval)
+            Utility::Logger.debug('Getting registered connectors')
+
+            connectors = ready_for_sync_connectors
+            next unless connectors.any?
+
+            Utility::Logger.debug("Number of available connectors: #{connectors.size}")
+
+            # @TODO It is assumed that @index_name is used to retrive pending jobs.
+            # This will be discussed after 8.6 release
+            pending_jobs = Core::ConnectorJob.pending_jobs(connectors_ids: connectors.keys)
+            Utility::Logger.info("Number of pending jobs: #{pending_jobs.size}")
+
+            pending_jobs.each do |job|
+              connector_settings = connectors[job.connector_id]
+
+              pool.post do
+                Utility::Logger.info("Connector #{connector_settings.formatted} picked up the job #{job.id}")
+                Core::ElasticConnectorActions.ensure_content_index_exists(connector_settings.index_name)
+                job_runner = Core::SyncJobRunner.new(connector_settings, job)
+                job_runner.execute
+              rescue Core::JobAlreadyRunningError
+                Utility::Logger.info("Sync job for #{connector_settings.formatted} is already running, skipping.")
+              rescue Core::ConnectorVersionChangedError => e
+                Utility::Logger.info("Could not start the job because #{connector_settings.formatted} has been updated externally. Message: #{e.message}")
+              rescue StandardError => e
+                Utility::ExceptionTracking.log_exception(e, "Sync job for #{connector_settings.formatted} failed due to unexpected error.")
+              end
+            end
+          rescue StandardError => e
+            Utility::ExceptionTracking.log_exception(e, 'The consumer group failed')
+          end
+        end
+
+        @running.make_true
+      end
+
+      def pool
+        @pool ||= Concurrent::ThreadPoolExecutor.new(
+          min_threads: @min_threads,
+          max_threads: @max_threads,
+          max_queue: @max_queue,
+          fallback_policy: :abort,
+          idletime: @idle_time
+        )
+      end
+
+      def ready_for_sync_connectors
+        @scheduler.connector_settings
+                  .select(&:ready_for_sync?)
+                  .inject({}) { |memo, cs| memo.merge(cs.id => cs) }
+      end
+    end
+  end
+end
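A sketch of how the consumer might be wired up (the scheduler object is assumed to expose `connector_settings`, as the constructor requires; the index name is a placeholder):

```ruby
consumer = Core::Jobs::Consumer.new(
  scheduler: scheduler,   # assumed: responds to #connector_settings
  poll_interval: 3,
  max_threads: 5
)

consumer.subscribe!(index_name: '.elastic-connectors-sync-jobs')

# ... on service shutdown:
consumer.shutdown! if consumer.running?
```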
data/lib/core/jobs/producer.rb ADDED
@@ -0,0 +1,26 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+module Core
+  module Jobs
+    class Producer
+      JOB_TYPES = %i(sync).freeze
+
+      class << self
+        def enqueue_job(job_type:, connector_settings:)
+          raise UnsupportedJobType unless JOB_TYPES.include?(job_type)
+          raise ArgumentError unless connector_settings.kind_of?(ConnectorSettings)
+
+          ElasticConnectorActions.create_job(connector_settings: connector_settings)
+        end
+      end
+    end
+
+    class UnsupportedJobType < StandardError; end
+  end
+end
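Usage is a single class-level call; only `:sync` jobs are accepted in this release:

```ruby
# connector id is a placeholder
connector_settings = Core::ConnectorSettings.fetch_by_id('connector-id-1')

Core::Jobs::Producer.enqueue_job(job_type: :sync, connector_settings: connector_settings)
# Raises Core::Jobs::UnsupportedJobType for any other job_type, and
# ArgumentError if connector_settings is not a ConnectorSettings.
```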
data/lib/core/single_scheduler.rb CHANGED
@@ -20,7 +20,7 @@ module Core
 
     def connector_settings
       connector_settings = Core::ConnectorSettings.fetch_by_id(@connector_id)
-      [connector_settings]
+      [connector_settings].compact
     rescue *Utility::AUTHORIZATION_ERRORS => e
       # should be handled by the general scheduler
       raise e
data/lib/core/sync_job_runner.rb CHANGED
@@ -23,9 +23,9 @@ module Core
   class SyncJobRunner
     JOB_REPORTING_INTERVAL = 10
 
-    def initialize(connector_settings)
+    def initialize(connector_settings, job)
       @connector_settings = connector_settings
-      @
+      @sink = Core::Ingestion::EsSink.new(connector_settings.index_name, @connector_settings.request_pipeline)
       @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
       @sync_finished = false
       @sync_error = nil
@@ -35,6 +35,7 @@ module Core
         :indexed_document_volume => 0,
         :error => nil
       }
+      @job = job
     end
 
     def execute
@@ -47,9 +48,16 @@ module Core
     def do_sync!
       Utility::Logger.info("Claiming a sync job for connector #{@connector_settings.id}.")
 
-
-
-
+      # connector service doesn't support multiple jobs running simultaneously
+      raise Core::JobAlreadyRunningError.new(@connector_settings.id) if @connector_settings.running?
+
+      Core::ElasticConnectorActions.update_connector_last_sync_status(@connector_settings.id, Connectors::SyncStatus::IN_PROGRESS)
+
+      # claim the job
+      @job.make_running!
+
+      job_description = @job.es_source
+      job_id = @job.id
       job_description['_id'] = job_id
 
       unless job_id.present?
@@ -80,12 +88,12 @@ module Core
           document = add_ingest_metadata(document)
           post_process_result = post_processing_engine.process(document)
           if post_process_result.is_include?
-            @
+            @sink.ingest(document)
             incoming_ids << document['id']
           end
 
           if Time.now - reporting_cycle_start >= JOB_REPORTING_INTERVAL
-            ElasticConnectorActions.update_sync(job_id, @
+            ElasticConnectorActions.update_sync(job_id, @sink.ingestion_stats.merge(:metadata => connector_instance.metadata))
             reporting_cycle_start = Time.now
           end
         end
@@ -95,15 +103,15 @@ module Core
       Utility::Logger.info("Deleting #{ids_to_delete.size} documents from index #{@connector_settings.index_name}.")
 
       ids_to_delete.each do |id|
-        @
+        @sink.delete(id)
 
         if Time.now - reporting_cycle_start >= JOB_REPORTING_INTERVAL
-          ElasticConnectorActions.update_sync(job_id, @
+          ElasticConnectorActions.update_sync(job_id, @sink.ingestion_stats.merge(:metadata => connector_instance.metadata))
           reporting_cycle_start = Time.now
         end
      end
 
-      @
+      @sink.flush
 
       # We use this mechanism for checking, whether an interrupt (or something else lead to the thread not finishing)
       # occurred as most of the time the main execution thread is interrupted and we miss this Signal/Exception here
@@ -112,7 +120,7 @@ module Core
       @sync_error = e.message
       Utility::ExceptionTracking.log_exception(e)
     ensure
-      stats = @
+      stats = @sink.ingestion_stats
 
       Utility::Logger.debug("Sync stats are: #{stats}")
 
@@ -129,7 +137,7 @@ module Core
       end
 
       unless connector_instance.nil?
-        metadata = @
+        metadata = @sink.ingestion_stats.merge(:metadata => connector_instance.metadata)
         metadata[:total_document_count] = ElasticConnectorActions.document_count(@connector_settings.index_name)
       end
 
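The runner no longer claims work implicitly; it receives a pre-created job and marks it running itself. A sketch of the new calling convention (mirroring what the consumer above does):

```ruby
job = Core::ConnectorJob.pending_jobs(connectors_ids: [connector_settings.id]).first

if job
  runner = Core::SyncJobRunner.new(connector_settings, job)
  runner.execute # raises Core::JobAlreadyRunningError if a sync is already in progress
end
```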
data/lib/utility/error_monitor.rb ADDED
@@ -0,0 +1,108 @@
+#
+# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+# or more contributor license agreements. Licensed under the Elastic License;
+# you may not use this file except in compliance with the Elastic License.
+#
+
+# frozen_string_literal: true
+
+require 'time'
+require 'utility/errors'
+require 'utility/exception_tracking'
+
+module Utility
+  class ErrorMonitor
+    class MonitoringError < StandardError
+      attr_accessor :tripped_by
+
+      def initialize(message = nil, tripped_by: nil)
+        super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
+        @tripped_by = tripped_by
+      end
+    end
+
+    class MaxSuccessiveErrorsExceededError < MonitoringError; end
+    class MaxErrorsExceededError < MonitoringError; end
+    class MaxErrorsInWindowExceededError < MonitoringError; end
+
+    attr_reader :total_error_count, :success_count, :consecutive_error_count, :error_queue
+
+    def initialize(
+      max_errors: 1000,
+      max_consecutive_errors: 10,
+      max_error_ratio: 0.15,
+      window_size: 100,
+      error_queue_size: 20
+    )
+      @max_errors = max_errors
+      @max_consecutive_errors = max_consecutive_errors
+      @max_error_ratio = max_error_ratio
+      @window_size = window_size
+      @total_error_count = 0
+      @success_count = 0
+      @consecutive_error_count = 0
+      @window_errors = Array.new(window_size) { false }
+      @window_index = 0
+      @last_error = nil
+      @error_queue_size = error_queue_size
+      @error_queue = []
+    end
+
+    def note_success
+      @consecutive_error_count = 0
+      @success_count += 1
+      increment_window_index
+    end
+
+    def note_error(error, id: Time.now.to_i)
+      stack_trace = Utility::ExceptionTracking.generate_stack_trace(error)
+      error_message = Utility::ExceptionTracking.generate_error_message(error, nil, nil)
+      Utility::Logger.debug("Message id: #{id} - #{error_message}\n#{stack_trace}")
+      @total_error_count += 1
+      @consecutive_error_count += 1
+      @window_errors[@window_index] = true
+      @error_queue << DocumentError.new(error.class.name, error_message, stack_trace, id)
+      @error_queue = @error_queue.drop(1) if @error_queue.size > @error_queue_size
+      increment_window_index
+      @last_error = error
+
+      raise_if_necessary
+    end
+
+    def finalize
+      total_documents = @total_error_count + @success_count
+      if total_documents > 0 && @total_error_count.to_f / total_documents > @max_error_ratio
+        raise_with_last_cause(MaxErrorsInWindowExceededError.new("There were #{@total_error_count} errors out of #{total_documents} total documents", :tripped_by => @last_error))
+      end
+    end
+
+    private
+
+    def raise_if_necessary
+      error =
+        if @consecutive_error_count > @max_consecutive_errors
+          MaxSuccessiveErrorsExceededError.new("Exceeded maximum consecutive errors - saw #{@consecutive_error_count} errors in a row.", :tripped_by => @last_error)
+        elsif @total_error_count > @max_errors
+          MaxErrorsExceededError.new("Exceeded maximum number of errors - saw #{@total_error_count} errors in total.", :tripped_by => @last_error)
+        elsif @window_size > 0 && num_errors_in_window / @window_size > @max_error_ratio
+          MaxErrorsInWindowExceededError.new("Exceeded maximum error ratio of #{@max_error_ratio}. Of the last #{@window_size} documents, #{num_errors_in_window} had errors", :tripped_by => @last_error)
+        end
+
+      raise_with_last_cause(error) if error
+    end
+
+    def num_errors_in_window
+      @window_errors.count(&:itself).to_f
+    end
+
+    def increment_window_index
+      @window_index = (@window_index + 1) % @window_size
+    end
+
+    def raise_with_last_cause(error)
+      raise @last_error
+    rescue StandardError
+      raise error
+    end
+  end
+end
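The monitor's contract is unchanged from its previous home in `utility/errors.rb`: feed it successes and failures, and it raises once a threshold trips. A minimal sketch (`docs` and `ingest` are hypothetical stand-ins for a sync loop):

```ruby
monitor = Utility::ErrorMonitor.new(max_consecutive_errors: 10, max_error_ratio: 0.15)

docs.each do |doc|
  ingest(doc) # hypothetical per-document operation
  monitor.note_success
rescue StandardError => e
  # Raises MaxSuccessiveErrorsExceededError, MaxErrorsExceededError or
  # MaxErrorsInWindowExceededError once the corresponding threshold trips.
  monitor.note_error(e, id: doc['id'])
end

monitor.finalize # final whole-run error-ratio check
```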
data/lib/utility/errors.rb CHANGED
@@ -60,18 +60,6 @@ module Utility
   class JobDocumentLimitError < StandardError; end
   class JobClaimingError < StandardError; end
 
-  class MonitoringError < StandardError
-    attr_accessor :tripped_by
-
-    def initialize(message = nil, tripped_by: nil)
-      super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
-      @tripped_by = tripped_by
-    end
-  end
-  class MaxSuccessiveErrorsExceededError < MonitoringError; end
-  class MaxErrorsExceededError < MonitoringError; end
-  class MaxErrorsInWindowExceededError < MonitoringError; end
-
   class JobSyncNotPossibleYetError < StandardError
     attr_accessor :sync_will_be_possible_at
 