connectors_utility 8.6.0.4.pre.20221115T002329Z → 8.6.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 996cc59bf15e82bf245bc30716d0ac93861514fe9f8e73f92948f80e60adccc4
4
- data.tar.gz: f904929c8d82a73e1763cf30536c7e703b885d633f05e9cdecbba310eb44d99d
3
+ metadata.gz: bd3d8a98ffaf8965434e1c01467c4ff30d87a1b25e414d5ce36f8a8529178053
4
+ data.tar.gz: 07d54a470c31ba311aeee4aca41401bca11178d20ed7de72c8dadf7926e910d4
5
5
  SHA512:
6
- metadata.gz: d22401f2411e468c734bb015e6027ede2858a64fcdeb6ea446ad9cc4871c68791e736dea230e5a2fcee2c77b84514c41314cd866f7766bb2906e4396b1c557e8
7
- data.tar.gz: e3ed8f7cb8e0d10bfbc220520ff7d98dcc7d221ee65f4790976eb6e1f538f78ac781eaa453258896749ed56e41eb3cf3c065a206cbb209740d43dbe6b37f7b52
6
+ metadata.gz: eddc02838146053fa61173a5dc6060eb62c92c70930bd5c1338586572af08078288455c65fb2301796d99b338b996ec889966e535bbceaf9f1f39bac6c9888bc
7
+ data.tar.gz: e0ec181ccc3cb8dd59f123003d873c1218f2e75c6d91569efe075330d913d6bb366c4bcfbc389844b8040a10e22e5b8c797525379d042cb81a2fa2c7e3af08a4
@@ -26,11 +26,16 @@ module Connectors
26
26
  ERROR
27
27
  ]
28
28
 
29
- PENDING_STATUES = [
29
+ PENDING_STATUSES = [
30
30
  PENDING,
31
31
  SUSPENDED
32
32
  ]
33
33
 
34
+ ACTIVE_STATUSES = [
35
+ IN_PROGRESS,
36
+ CANCELING
37
+ ]
38
+
34
39
  TERMINAL_STATUSES = [
35
40
  CANCELED,
36
41
  COMPLETED,
@@ -0,0 +1,240 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'active_support/core_ext/hash/indifferent_access'
10
+ require 'connectors/sync_status'
11
+ require 'core/connector_settings'
12
+ require 'core/elastic_connector_actions'
13
+ require 'utility'
14
+
15
module Core
  # Read/update wrapper around a single sync-job document.
  #
  # An instance holds the raw Elasticsearch response (or search hit) for the job and
  # exposes the document id, the `_source` fields, and the connector snapshot stored
  # under `_source.connector`. All reads and writes go through ElasticConnectorActions.
  class ConnectorJob
    DEFAULT_PAGE_SIZE = 100
    # Seconds since `last_seen` after which an active job is reported by .stuck_jobs.
    STUCK_THRESHOLD = 60

    # Loads one job by document id.
    #
    # @param job_id [String] id of the job document
    # @return [ConnectorJob, nil] nil when the response is not flagged as found
    def self.fetch_by_id(job_id)
      es_response = ElasticConnectorActions.get_job(job_id)
      return nil unless es_response[:found]

      new(es_response)
    end

    # Jobs whose status is one of Connectors::SyncStatus::PENDING_STATUSES,
    # optionally restricted to the given connector ids.
    #
    # @param connectors_ids [Array] connector ids to filter by; empty means no filter
    # @param page_size [Integer] page size used while paginating the search
    # @return [Array<ConnectorJob>]
    def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
      must = [{ terms: { status: Connectors::SyncStatus::PENDING_STATUSES } }]
      must << { terms: { 'connector.id' => connectors_ids } } unless connectors_ids.empty?

      fetch_jobs_by_query({ bool: { must: must } }, page_size)
    end

    # Jobs whose `connector.id` does not match any connector returned by
    # ConnectorSettings.fetch_all_connectors.
    #
    # @param page_size [Integer]
    # @return [Array<ConnectorJob>]
    def self.orphaned_jobs(page_size = DEFAULT_PAGE_SIZE)
      connector_ids = ConnectorSettings.fetch_all_connectors.map(&:id)
      query = { bool: { must_not: { terms: { 'connector.id': connector_ids } } } }
      fetch_jobs_by_query(query, page_size)
    end

    # Deletes the given jobs with a single delete-by-query on their ids.
    #
    # @param jobs [Array<ConnectorJob>]
    def self.delete_jobs(jobs)
      query = { terms: { '_id': jobs.map(&:id) } }
      ElasticConnectorActions.delete_jobs_by_query(query)
    end

    # Active jobs (see Connectors::SyncStatus::ACTIVE_STATUSES) whose `last_seen`
    # is at least STUCK_THRESHOLD seconds old.
    #
    # @param connector_id [String, nil] restrict to this connector; when nil, all
    #   connectors returned by ConnectorSettings.fetch_native_connectors are used
    # @param page_size [Integer]
    # @return [Array<ConnectorJob>]
    def self.stuck_jobs(connector_id = nil, page_size = DEFAULT_PAGE_SIZE)
      connector_ids =
        if connector_id
          [connector_id]
        else
          ConnectorSettings.fetch_native_connectors.map(&:id)
        end

      query = {
        bool: {
          filter: [
            { terms: { 'connector.id': connector_ids } },
            { terms: { status: Connectors::SyncStatus::ACTIVE_STATUSES } },
            { range: { last_seen: { lte: "now-#{STUCK_THRESHOLD}s" } } }
          ]
        }
      }
      fetch_jobs_by_query(query, page_size)
    end

    # Placeholder: currently does nothing and returns nil.
    def self.enqueue(_connector_id)
      nil
    end

    # Pages through ElasticConnectorActions.search_jobs and wraps every hit.
    #
    # Pagination stops when as many jobs as the reported total were collected, or when
    # a page comes back empty. The empty-page check prevents an endless loop when the
    # reported total is larger than the number of hits that can actually be fetched.
    #
    # NOTE: this is defined with `def self.`, which a bare `private` keyword does not
    # affect, so it is callable from outside the class.
    #
    # @param query [Hash] Elasticsearch query
    # @param page_size [Integer]
    # @return [Array<ConnectorJob>]
    def self.fetch_jobs_by_query(query, page_size)
      results = []
      offset = 0
      loop do
        response = ElasticConnectorActions.search_jobs(query, page_size, offset)

        hits = response.dig('hits', 'hits') || []
        total = response.dig('hits', 'total', 'value') || 0
        results.concat(hits.map { |hit| new(hit) })
        break if hits.empty? || results.size >= total

        offset += hits.size
      end

      results
    end

    # @return [String] the job document id
    def id
      @elasticsearch_response[:_id]
    end

    # Reads a field from the job document's `_source`.
    def [](property_name)
      @elasticsearch_response[:_source][property_name]
    end

    def error
      self[:error]
    end

    def status
      self[:status]
    end

    def in_progress?
      status == Connectors::SyncStatus::IN_PROGRESS
    end

    def canceling?
      status == Connectors::SyncStatus::CANCELING
    end

    def suspended?
      status == Connectors::SyncStatus::SUSPENDED
    end

    def canceled?
      status == Connectors::SyncStatus::CANCELED
    end

    def pending?
      Connectors::SyncStatus::PENDING_STATUSES.include?(status)
    end

    def active?
      Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
    end

    def terminated?
      Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
    end

    # The connector data stored on the job document, or an empty hash when absent.
    def connector_snapshot
      self[:connector] || {}
    end

    def connector_id
      connector_snapshot[:id]
    end

    def index_name
      connector_snapshot[:index_name]
    end

    def language
      connector_snapshot[:language]
    end

    def service_type
      connector_snapshot[:service_type]
    end

    def configuration
      connector_snapshot[:configuration]
    end

    def filtering
      connector_snapshot[:filtering]
    end

    def pipeline
      connector_snapshot[:pipeline]
    end

    # The connector referenced by the snapshot id, fetched once and memoized.
    def connector
      @connector ||= ConnectorSettings.fetch_by_id(connector_id)
    end

    # Refreshes `last_seen` and writes the given stats (and metadata, when non-empty).
    #
    # @param ingestion_stats [Hash, nil] fields merged into the update document
    # @param connector_metadata [Hash, nil] stored under `metadata` when it has entries
    def update_metadata(ingestion_stats = {}, connector_metadata = {})
      ingestion_stats ||= {}
      doc = { :last_seen => Time.now }.merge(ingestion_stats)
      doc[:metadata] = connector_metadata if connector_metadata&.any?
      ElasticConnectorActions.update_job_fields(id, doc)
    end

    # Marks the job as completed.
    def done!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
    end

    # Marks the job as failed with the given error message.
    def error!(message, ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
    end

    # Marks the job as canceled.
    def cancel!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
    end

    # Re-fetches the job and yields the fresh document together with its
    # `_seq_no` and `_primary_term`, so the caller can do a conditional update.
    def with_concurrency_control
      response = ElasticConnectorActions.get_job(id)

      yield response, response['_seq_no'], response['_primary_term']
    end

    # Moves the job to IN_PROGRESS, stamping start time, last_seen and the worker
    # hostname; the update is conditional on the freshly read seq_no/primary_term.
    def make_running!
      with_concurrency_control do |es_doc, seq_no, primary_term|
        now = Time.now
        doc = {
          status: Connectors::SyncStatus::IN_PROGRESS,
          started_at: now,
          last_seen: now,
          worker_hostname: Socket.gethostname
        }

        ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
      end
    end

    # The raw `_source` of the job document.
    def es_source
      @elasticsearch_response[:_source]
    end

    private

    def initialize(es_response)
      # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
      @elasticsearch_response = es_response.with_indifferent_access
    end

    # Writes the terminal state of the job.
    #
    # The caller's `ingestion_stats` hash is not modified: the total document count is
    # merged into a copy, so frozen or reused hashes are safe to pass in.
    #
    # @param status [String] terminal status to store
    # @param error [String, nil] error message to store
    # @param ingestion_stats [Hash, nil] extra fields merged into the update document
    # @param connector_metadata [Hash, nil] stored under `metadata` when it has entries
    def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
      now = Time.now
      stats = (ingestion_stats || {}).merge(
        :total_document_count => ElasticConnectorActions.document_count(index_name)
      )
      doc = {
        :last_seen => now,
        :completed_at => now,
        :status => status,
        :error => error
      }.merge(stats)
      doc[:canceled_at] = now if status == Connectors::SyncStatus::CANCELED
      doc[:metadata] = connector_metadata if connector_metadata&.any?
      ElasticConnectorActions.update_job_fields(id, doc)
    end

    def seq_no
      @elasticsearch_response[:_seq_no]
    end

    def primary_term
      @elasticsearch_response[:_primary_term]
    end
  end
end
@@ -23,14 +23,11 @@ module Core
23
23
 
24
24
  DEFAULT_PAGE_SIZE = 100
25
25
 
26
- # Error Classes
27
- class ConnectorNotFoundError < StandardError; end
28
-
29
26
  def self.fetch_by_id(connector_id)
30
27
  es_response = ElasticConnectorActions.get_connector(connector_id)
31
- connectors_meta = ElasticConnectorActions.connectors_meta
28
+ return nil unless es_response[:found]
32
29
 
33
- raise ConnectorNotFoundError.new("Connector with id=#{connector_id} was not found.") unless es_response[:found]
30
+ connectors_meta = ElasticConnectorActions.connectors_meta
34
31
  new(es_response, connectors_meta)
35
32
  end
36
33
 
@@ -52,6 +49,11 @@ module Core
52
49
  fetch_connectors_by_query(query, page_size)
53
50
  end
54
51
 
52
# Returns every connector by running a match_all query through
# fetch_connectors_by_query.
#
# @param page_size [Integer] page size handed to fetch_connectors_by_query
def self.fetch_all_connectors(page_size = DEFAULT_PAGE_SIZE)
  fetch_connectors_by_query({ match_all: {} }, page_size)
end
56
+
55
57
  def id
56
58
  @elasticsearch_response[:_id]
57
59
  end
@@ -122,6 +124,34 @@ module Core
122
124
  index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
123
125
  end
124
126
 
127
# Whether a sync may be started for this connector: its service type is registered
# in Connectors::REGISTRY, the index name is valid, and the connector status allows
# syncing. Checks run in that order and stop at the first falsy result.
def ready_for_sync?
  service_registered = Connectors::REGISTRY.registered?(service_type)
  service_registered && valid_index_name? && connector_status_allows_sync?
end
132
+
133
# True when the connector document's `last_sync_status` is IN_PROGRESS.
def running?
  source = @elasticsearch_response[:_source]
  source[:last_sync_status] == Connectors::SyncStatus::IN_PROGRESS
end
136
+
137
# Copies the outcome of a sync job onto the connector document.
#
# @param job [#status, #error, #terminated?, #[], nil] the finished job; may be nil
#   when the job could not be found, in which case the connector is marked as errored
def update_last_sync!(job)
  # Even when the job is nil the connector must be updated, otherwise it would stay
  # stuck at in_progress.
  job_status = job&.status || Connectors::SyncStatus::ERROR
  job_error = job.nil? ? "Couldn't find the job" : job.error
  # An errored job without a message still needs a non-empty error on the connector.
  job_error ||= 'unknown error' if job_status == Connectors::SyncStatus::ERROR
  doc = {
    :last_sync_status => job_status,
    :last_synced => Time.now,
    :last_sync_error => job_error,
    :error => job_error
  }
  # Document counts are only copied once the job has reached a terminal status.
  if job&.terminated?
    doc[:last_indexed_document_count] = job[:indexed_document_count]
    doc[:last_deleted_document_count] = job[:deleted_document_count]
  end
  Core::ElasticConnectorActions.update_connector_fields(id, doc)
end
154
+
125
155
  private
126
156
 
127
157
  def initialize(es_response, connectors_meta)
@@ -91,6 +91,17 @@ module Core
91
91
  )
92
92
  end
93
93
 
94
# Runs a delete-by-query against the job index.
#
# @param query [Hash] Elasticsearch query selecting the job documents to delete
def delete_jobs_by_query(query)
  request_body = { :query => query }
  client.delete_by_query(:index => Utility::Constants::JOB_INDEX, :body => request_body)
end
100
+
101
# Deletes the given indices; `ignore_unavailable` is set on the request.
#
# @param indices [String, Array<String>] index name(s) passed as `:index`
def delete_indices(indices)
  indices_api = client.indices
  indices_api.delete(:index => indices, :ignore_unavailable => true)
end
104
+
94
105
  def update_connector_configuration(connector_id, configuration)
95
106
  update_connector_fields(connector_id, :configuration => configuration)
96
107
  end
@@ -132,11 +143,35 @@ module Core
132
143
  update_connector_fields(connector_id, { :filtering => filtering })
133
144
  end
134
145
 
135
- def claim_job(connector_id)
146
# Sets the connector's `sync_now` flag and refreshes `last_synced`.
#
# The seq_no/primary_term read via connector_with_concurrency_control are passed on
# to update_connector_fields together with the new field values.
#
# @param connector_id [String]
# @param sync_now [Boolean] new value of the `sync_now` field
def update_connector_sync_now(connector_id, sync_now)
  concurrency = connector_with_concurrency_control(connector_id)
  fields = { sync_now: sync_now, last_synced: Time.now }

  update_connector_fields(connector_id, fields, concurrency[:seq_no], concurrency[:primary_term])
end
158
+
159
# Writes a new `last_sync_status` to the connector document.
#
# The seq_no/primary_term read via connector_with_concurrency_control are passed on
# to update_connector_fields together with the new status.
#
# @param connector_id [String]
# @param last_sync_status [String] status value to store
def update_connector_last_sync_status(connector_id, last_sync_status)
  concurrency = connector_with_concurrency_control(connector_id)
  fields = { last_sync_status: last_sync_status }

  update_connector_fields(connector_id, fields, concurrency[:seq_no], concurrency[:primary_term])
end
169
+
170
+ def connector_with_concurrency_control(connector_id)
136
171
  seq_no = nil
137
172
  primary_term = nil
138
- sync_in_progress = false
139
- connector_record = client.get(
173
+
174
+ doc = client.get(
140
175
  :index => Utility::Constants::CONNECTORS_INDEX,
141
176
  :id => connector_id,
142
177
  :ignore => 404,
@@ -144,42 +179,31 @@ module Core
144
179
  ).tap do |response|
145
180
  seq_no = response['_seq_no']
146
181
  primary_term = response['_primary_term']
147
- sync_in_progress = response.dig('_source', 'last_sync_status') == Connectors::SyncStatus::IN_PROGRESS
148
- end
149
- if sync_in_progress
150
- raise JobAlreadyRunningError.new(connector_id)
151
182
  end
152
- update_connector_fields(
153
- connector_id,
154
- { :sync_now => false,
155
- :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
156
- :last_synced => Time.now },
157
- seq_no,
158
- primary_term
159
- )
160
183
 
184
+ { doc: doc, seq_no: seq_no, primary_term: primary_term }
185
+ end
186
+
187
+ def create_job(connector_settings:)
161
188
  body = {
162
- :status => Connectors::SyncStatus::IN_PROGRESS,
163
- :worker_hostname => Socket.gethostname,
164
- :created_at => Time.now,
165
- :started_at => Time.now,
166
- :last_seen => Time.now,
167
- :connector => {
168
- :id => connector_id,
169
- :filtering => convert_connector_filtering_to_job_filtering(connector_record.dig('_source', 'filtering'))
189
+ status: Connectors::SyncStatus::PENDING,
190
+ created_at: Time.now,
191
+ last_seen: Time.now,
192
+ connector: {
193
+ id: connector_settings.id,
194
+ filtering: convert_connector_filtering_to_job_filtering(connector_settings.filtering),
195
+ index_name: connector_settings.index_name,
196
+ language: connector_settings[:language],
197
+ pipeline: connector_settings[:pipeline],
198
+ service_type: connector_settings.service_type
170
199
  }
171
200
  }
172
201
 
173
- index_response = client.index(:index => Utility::Constants::JOB_INDEX, :body => body, :refresh => true)
174
- if index_response['result'] == 'created'
175
- # TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
176
- return client.get(
177
- :index => Utility::Constants::JOB_INDEX,
178
- :id => index_response['_id'],
179
- :ignore => 404
180
- ).with_indifferent_access
181
- end
182
- raise JobNotCreatedError.new(connector_id, index_response)
202
+ index_response = client.index(index: Utility::Constants::JOB_INDEX, body: body, refresh: true)
203
+
204
+ return index_response if index_response['result'] == 'created'
205
+
206
+ raise JobNotCreatedError.new(connector_settings.id, index_response)
183
207
  end
184
208
 
185
209
  def convert_connector_filtering_to_job_filtering(connector_filtering)
@@ -207,37 +231,6 @@ module Core
207
231
  update_connector_fields(connector_id, body)
208
232
  end
209
233
 
210
- def update_sync(job_id, metadata)
211
- body = {
212
- :doc => { :last_seen => Time.now }.merge(metadata)
213
- }
214
- client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
215
- end
216
-
217
- def complete_sync(connector_id, job_id, metadata, error)
218
- sync_status = error ? Connectors::SyncStatus::ERROR : Connectors::SyncStatus::COMPLETED
219
-
220
- metadata ||= {}
221
-
222
- update_connector_fields(connector_id,
223
- :last_sync_status => sync_status,
224
- :last_sync_error => error,
225
- :error => error,
226
- :last_synced => Time.now,
227
- :last_indexed_document_count => metadata[:indexed_document_count],
228
- :last_deleted_document_count => metadata[:deleted_document_count])
229
-
230
- body = {
231
- :doc => {
232
- :status => sync_status,
233
- :completed_at => Time.now,
234
- :last_seen => Time.now,
235
- :error => error
236
- }.merge(metadata)
237
- }
238
- client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
239
- end
240
-
241
234
  def fetch_document_ids(index_name)
242
235
  page_size = 1000
243
236
  result = []
@@ -507,31 +500,15 @@ module Core
507
500
  end
508
501
 
509
502
  def update_connector_fields(connector_id, doc = {}, seq_no = nil, primary_term = nil)
510
- return if doc.empty?
511
- update_args = {
512
- :index => Utility::Constants::CONNECTORS_INDEX,
513
- :id => connector_id,
514
- :body => { :doc => doc },
515
- :refresh => true,
516
- :retry_on_conflict => 3
517
- }
518
- # seq_no and primary_term are used for optimistic concurrency control
519
- # see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
520
- if seq_no && primary_term
521
- update_args[:if_seq_no] = seq_no
522
- update_args[:if_primary_term] = primary_term
523
- update_args.delete(:retry_on_conflict)
524
- end
525
- begin
526
- client.update(update_args)
527
- rescue Elastic::Transport::Transport::Errors::Conflict
528
- # VersionConflictException
529
- # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
530
- raise ConnectorVersionChangedError.new(connector_id, seq_no, primary_term)
531
- end
503
+ update_doc_fields(Utility::Constants::CONNECTORS_INDEX, connector_id, doc, seq_no, primary_term)
504
+ end
505
+
506
+ def update_job_fields(job_id, doc = {}, seq_no = nil, primary_term = nil)
507
+ update_doc_fields(Utility::Constants::JOB_INDEX, job_id, doc, seq_no, primary_term)
532
508
  end
533
509
 
534
510
  def document_count(index_name)
511
+ client.indices.refresh(:index => index_name)
535
512
  client.count(:index => index_name)['count']
536
513
  end
537
514
 
@@ -563,6 +540,31 @@ module Core
563
540
  filter.deep_merge!(new_validation_state)
564
541
  end
565
542
  end
543
+
544
# Partially updates one document and refreshes the index.
#
# When both seq_no and primary_term are given they are sent as `if_seq_no` /
# `if_primary_term` (optimistic concurrency control); otherwise the update is sent
# with `retry_on_conflict: 3`.
# see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
#
# @param index [String] index that holds the document
# @param id [String] document id
# @param doc [Hash] fields to update; nothing is sent when empty
# @param seq_no [Integer, nil]
# @param primary_term [Integer, nil]
# @raise [ConnectorVersionChangedError] when Elasticsearch reports a version conflict
def update_doc_fields(index, id, doc = {}, seq_no = nil, primary_term = nil)
  return if doc.empty?

  conflict_handling =
    if seq_no && primary_term
      { :if_seq_no => seq_no, :if_primary_term => primary_term }
    else
      { :retry_on_conflict => 3 }
    end
  request = {
    :index => index,
    :id => id,
    :body => { :doc => doc },
    :refresh => true
  }.merge(conflict_handling)

  client.update(request)
rescue Elastic::Transport::Transport::Errors::Conflict
  # VersionConflictException
  # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
  raise ConnectorVersionChangedError.new(id, seq_no, primary_term)
end
566
568
  end
567
569
  end
568
570
  end
@@ -90,13 +90,6 @@ module Core
90
90
  return false
91
91
  end
92
92
 
93
- # We want to sync when sync never actually happened
94
- last_synced = connector_settings[:last_synced]
95
- if last_synced.nil? || last_synced.empty?
96
- Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
97
- return true
98
- end
99
-
100
93
  current_schedule = scheduling_settings[:interval]
101
94
 
102
95
  # Don't sync if there is no actual scheduling interval
@@ -119,6 +112,13 @@ module Core
119
112
  return false
120
113
  end
121
114
 
115
+ # We want to sync when sync never actually happened
116
+ last_synced = connector_settings[:last_synced]
117
+ if last_synced.nil? || last_synced.empty?
118
+ Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
119
+ return true
120
+ end
121
+
122
122
  next_trigger_time = cron_parser.next_time(Time.parse(last_synced))
123
123
 
124
124
  # Sync if next trigger for the connector is in past
@@ -6,12 +6,14 @@
6
6
 
7
7
  require 'json'
8
8
 
9
+ require 'utility/constants'
10
+
9
11
  module Utility
10
12
  class BulkQueue
11
13
  class QueueOverflowError < StandardError; end
12
14
 
13
15
  # 500 items or 5MB
14
- def initialize(operation_count_threshold = 500, size_threshold = 5 * 1024 * 1024)
16
+ def initialize(operation_count_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_SIZE, size_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_BYTES)
15
17
  @operation_count_threshold = operation_count_threshold.freeze
16
18
  @size_threshold = size_threshold.freeze
17
19
 
@@ -18,5 +18,10 @@ module Utility
18
18
  CRAWLER_SERVICE_TYPE = 'elastic-crawler'
19
19
  FILTERING_RULES_FEATURE = 'filtering_rules'
20
20
  FILTERING_ADVANCED_FEATURE = 'filtering_advanced_config'
21
+
22
+ # Maximum number of operations in a single Elasticsearch BULK request used to ingest data
23
+ DEFAULT_MAX_INGESTION_QUEUE_SIZE = 500
24
+ # Maximum size, in bytes, of either a whole Elasticsearch BULK request or a single document in it
25
+ DEFAULT_MAX_INGESTION_QUEUE_BYTES = 5 * 1024 * 1024
21
26
  end
22
27
  end
@@ -0,0 +1,108 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'time'
10
+ require 'utility/errors'
11
+ require 'utility/exception_tracking'
12
+
13
module Utility
  # Counts per-document successes and failures and raises once too many failures
  # accumulate: in total, consecutively, or within a rolling window of the most
  # recent documents.
  class ErrorMonitor
    # Base class for the errors raised by the monitor. `tripped_by` carries the
    # document error that pushed the monitor over a threshold.
    class MonitoringError < StandardError
      attr_accessor :tripped_by

      def initialize(message = nil, tripped_by: nil)
        super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
        @tripped_by = tripped_by
      end
    end

    class MaxSuccessiveErrorsExceededError < MonitoringError; end
    class MaxErrorsExceededError < MonitoringError; end
    class MaxErrorsInWindowExceededError < MonitoringError; end

    attr_reader :total_error_count, :success_count, :consecutive_error_count, :error_queue

    # @param max_errors [Integer] total errors tolerated before MaxErrorsExceededError
    # @param max_consecutive_errors [Integer] errors in a row tolerated before
    #   MaxSuccessiveErrorsExceededError
    # @param max_error_ratio [Float] tolerated error ratio, used both for the rolling
    #   window and for the overall check in #finalize
    # @param window_size [Integer] number of most recent documents in the rolling
    #   window; 0 disables window tracking
    # @param error_queue_size [Integer] number of most recent errors kept in #error_queue
    def initialize(
      max_errors: 1000,
      max_consecutive_errors: 10,
      max_error_ratio: 0.15,
      window_size: 100,
      error_queue_size: 20
    )
      @max_errors = max_errors
      @max_consecutive_errors = max_consecutive_errors
      @max_error_ratio = max_error_ratio
      @window_size = window_size
      @total_error_count = 0
      @success_count = 0
      @consecutive_error_count = 0
      @window_errors = Array.new(window_size) { false }
      @window_index = 0
      @last_error = nil
      @error_queue_size = error_queue_size
      @error_queue = []
    end

    # Records a successfully processed document.
    def note_success
      @consecutive_error_count = 0
      @success_count += 1
      track_window_result(false)
    end

    # Records a failed document and raises if any threshold is now exceeded.
    #
    # @param error [Exception] the failure
    # @param id [Object] identifier stored with the queued error; defaults to the
    #   current epoch seconds
    # @raise [MaxSuccessiveErrorsExceededError, MaxErrorsExceededError,
    #   MaxErrorsInWindowExceededError]
    def note_error(error, id: Time.now.to_i)
      stack_trace = Utility::ExceptionTracking.generate_stack_trace(error)
      error_message = Utility::ExceptionTracking.generate_error_message(error, nil, nil)
      Utility::Logger.debug("Message id: #{id} - #{error_message}\n#{stack_trace}")
      @total_error_count += 1
      @consecutive_error_count += 1
      @error_queue << DocumentError.new(error.class.name, error_message, stack_trace, id)
      # Keep only the most recent @error_queue_size errors.
      @error_queue = @error_queue.drop(1) if @error_queue.size > @error_queue_size
      track_window_result(true)
      @last_error = error

      raise_if_necessary
    end

    # Final check over all documents seen: raises when the overall error ratio
    # exceeds max_error_ratio.
    #
    # @raise [MaxErrorsInWindowExceededError]
    def finalize
      total_documents = @total_error_count + @success_count
      if total_documents > 0 && @total_error_count.to_f / total_documents > @max_error_ratio
        raise_with_last_cause(MaxErrorsInWindowExceededError.new("There were #{@total_error_count} errors out of #{total_documents} total documents", :tripped_by => @last_error))
      end
    end

    private

    # Checks the thresholds in order (consecutive, total, window ratio) and raises
    # the matching error for the first one exceeded.
    def raise_if_necessary
      error =
        if @consecutive_error_count > @max_consecutive_errors
          MaxSuccessiveErrorsExceededError.new("Exceeded maximum consecutive errors - saw #{@consecutive_error_count} errors in a row.", :tripped_by => @last_error)
        elsif @total_error_count > @max_errors
          MaxErrorsExceededError.new("Exceeded maximum number of errors - saw #{@total_error_count} errors in total.", :tripped_by => @last_error)
        elsif @window_size > 0 && num_errors_in_window / @window_size > @max_error_ratio
          MaxErrorsInWindowExceededError.new("Exceeded maximum error ratio of #{@max_error_ratio}. Of the last #{@window_size} documents, #{num_errors_in_window} had errors", :tripped_by => @last_error)
        end

      raise_with_last_cause(error) if error
    end

    # Number of error slots currently set in the window, as a Float so that the
    # ratio computation does not use integer division.
    def num_errors_in_window
      @window_errors.count(&:itself).to_f
    end

    # Stores the outcome of the latest document in the current window slot and moves
    # to the next slot, wrapping around. Writing `false` for successes is what evicts
    # old errors, so the window reflects only the last @window_size documents.
    # Does nothing when the window is disabled (window_size of 0).
    def track_window_result(is_error)
      return unless @window_size.positive?

      @window_errors[@window_index] = is_error
      @window_index = (@window_index + 1) % @window_size
    end

    # Raises `error` with the last recorded document error attached as its cause.
    def raise_with_last_cause(error)
      raise error, cause: @last_error
    end
  end
end
@@ -60,18 +60,6 @@ module Utility
60
60
  class JobDocumentLimitError < StandardError; end
61
61
  class JobClaimingError < StandardError; end
62
62
 
63
- class MonitoringError < StandardError
64
- attr_accessor :tripped_by
65
-
66
- def initialize(message = nil, tripped_by: nil)
67
- super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
68
- @tripped_by = tripped_by
69
- end
70
- end
71
- class MaxSuccessiveErrorsExceededError < MonitoringError; end
72
- class MaxErrorsExceededError < MonitoringError; end
73
- class MaxErrorsInWindowExceededError < MonitoringError; end
74
-
75
63
  class JobSyncNotPossibleYetError < StandardError
76
64
  attr_accessor :sync_will_be_possible_at
77
65
 
data/lib/utility.rb CHANGED
@@ -4,6 +4,8 @@
4
4
  # you may not use this file except in compliance with the Elastic License.
5
5
  #
6
6
 
7
+ # !!!!!!!!
8
+ # IF YOU EDIT THIS FILE, YOU MUST EDIT THE `connectors_utility.gemspec`
7
9
  require 'utility/bulk_queue'
8
10
  require 'utility/common'
9
11
  require 'utility/constants'
@@ -11,9 +13,12 @@ require 'utility/cron'
11
13
  require 'utility/elasticsearch/index/mappings'
12
14
  require 'utility/elasticsearch/index/text_analysis_settings'
13
15
  require 'utility/environment'
16
+ require 'utility/error_monitor'
14
17
  require 'utility/errors'
15
18
  require 'utility/filtering'
16
19
  require 'utility/es_client'
17
20
  require 'utility/exception_tracking'
18
21
  require 'utility/extension_mapping_util'
19
22
  require 'utility/logger'
23
+ # IF YOU EDIT THIS FILE, YOU MUST EDIT THE `connectors_utility.gemspec`
24
+ # !!!!!!!!
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: connectors_utility
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.6.0.4.pre.20221115T002329Z
4
+ version: 8.6.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elastic
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-11-15 00:00:00.000000000 Z
11
+ date: 2022-11-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -106,6 +106,7 @@ files:
106
106
  - lib/connectors/crawler/scheduler.rb
107
107
  - lib/connectors/sync_status.rb
108
108
  - lib/connectors_utility.rb
109
+ - lib/core/connector_job.rb
109
110
  - lib/core/connector_settings.rb
110
111
  - lib/core/elastic_connector_actions.rb
111
112
  - lib/core/filtering/validation_status.rb
@@ -119,6 +120,7 @@ files:
119
120
  - lib/utility/elasticsearch/index/mappings.rb
120
121
  - lib/utility/elasticsearch/index/text_analysis_settings.rb
121
122
  - lib/utility/environment.rb
123
+ - lib/utility/error_monitor.rb
122
124
  - lib/utility/errors.rb
123
125
  - lib/utility/es_client.rb
124
126
  - lib/utility/exception_tracking.rb
@@ -129,8 +131,8 @@ homepage: https://github.com/elastic/connectors-ruby
129
131
  licenses:
130
132
  - Elastic-2.0
131
133
  metadata:
132
- revision: f506d5e5ebedfb0c6058d347d8ce22adc42e2cc0
133
- repository: git@github.com:elastic/ent-search-connectors.git
134
+ revision: 39cbb85dbae57a2c92e6e0da272d05aa24ca99a9
135
+ repository: git@github.com:elastic/connectors-ruby.git
134
136
  post_install_message:
135
137
  rdoc_options: []
136
138
  require_paths:
@@ -142,9 +144,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
142
144
  version: '0'
143
145
  required_rubygems_version: !ruby/object:Gem::Requirement
144
146
  requirements:
145
- - - ">"
147
+ - - ">="
146
148
  - !ruby/object:Gem::Version
147
- version: 1.3.1
149
+ version: '0'
148
150
  requirements: []
149
151
  rubygems_version: 3.0.3.1
150
152
  signing_key: