connectors_utility 8.6.0.4.pre.20221116T024609Z → 8.6.0.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6ae5315b00ad59db8c9a0300ec34f560a8293a285aa0d62fb5b9f662996b432d
4
- data.tar.gz: 6eec049ff3a257bcff893edefd841e9b441dcc1858f7f88c4b0376fe47961bcc
3
+ metadata.gz: bd3d8a98ffaf8965434e1c01467c4ff30d87a1b25e414d5ce36f8a8529178053
4
+ data.tar.gz: 07d54a470c31ba311aeee4aca41401bca11178d20ed7de72c8dadf7926e910d4
5
5
  SHA512:
6
- metadata.gz: 296e79a4f866b91a98d02cf45ab7742efc2e10dd209be9456fcf6be34600247d55b09e7313030a386ccef693207b20b8802d5cb2ded3662cc289ed70fe5c1c3f
7
- data.tar.gz: 7d59558a77f22e14a01ef279ae1da29195b1f1216920e6aecde81c2c5b5f11371a5f5fc23b87df5587ca3c1969389d86e5b02c189e24f94ea5a0590047af4b98
6
+ metadata.gz: eddc02838146053fa61173a5dc6060eb62c92c70930bd5c1338586572af08078288455c65fb2301796d99b338b996ec889966e535bbceaf9f1f39bac6c9888bc
7
+ data.tar.gz: e0ec181ccc3cb8dd59f123003d873c1218f2e75c6d91569efe075330d913d6bb366c4bcfbc389844b8040a10e22e5b8c797525379d042cb81a2fa2c7e3af08a4
@@ -0,0 +1,240 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'active_support/core_ext/hash/indifferent_access'
10
+ require 'connectors/sync_status'
11
+ require 'core/connector_settings'
12
+ require 'core/elastic_connector_actions'
13
+ require 'utility'
14
+
15
+ module Core
16
+ class ConnectorJob
17
+ DEFAULT_PAGE_SIZE = 100
18
+ STUCK_THRESHOLD = 60
19
+
20
+ def self.fetch_by_id(job_id)
21
+ es_response = ElasticConnectorActions.get_job(job_id)
22
+ return nil unless es_response[:found]
23
+
24
+ new(es_response)
25
+ end
26
+
27
+ def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
28
+ status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
29
+
30
+ query = { bool: { must: [{ terms: status_term }] } }
31
+
32
+ return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
33
+
34
+ query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
35
+
36
+ fetch_jobs_by_query(query, page_size)
37
+ end
38
+
39
+ def self.orphaned_jobs(page_size = DEFAULT_PAGE_SIZE)
40
+ connector_ids = ConnectorSettings.fetch_all_connectors.map(&:id)
41
+ query = { bool: { must_not: { terms: { 'connector.id': connector_ids } } } }
42
+ fetch_jobs_by_query(query, page_size)
43
+ end
44
+
45
+ def self.delete_jobs(jobs)
46
+ query = { terms: { '_id': jobs.map(&:id) } }
47
+ ElasticConnectorActions.delete_jobs_by_query(query)
48
+ end
49
+
50
+ def self.stuck_jobs(connector_id = nil, page_size = DEFAULT_PAGE_SIZE)
51
+ connector_ids = if connector_id
52
+ [connector_id]
53
+ else
54
+ ConnectorSettings.fetch_native_connectors.map(&:id)
55
+ end
56
+ query = {
57
+ bool: {
58
+ filter: [
59
+ { terms: { 'connector.id': connector_ids } },
60
+ { terms: { status: Connectors::SyncStatus::ACTIVE_STATUSES } },
61
+ { range: { last_seen: { lte: "now-#{STUCK_THRESHOLD}s" } } }
62
+ ]
63
+ }
64
+ }
65
+ fetch_jobs_by_query(query, page_size)
66
+ end
67
+
68
+ def self.enqueue(_connector_id)
69
+ nil
70
+ end
71
+
72
+ def id
73
+ @elasticsearch_response[:_id]
74
+ end
75
+
76
+ def [](property_name)
77
+ @elasticsearch_response[:_source][property_name]
78
+ end
79
+
80
+ def error
81
+ self[:error]
82
+ end
83
+
84
+ def status
85
+ self[:status]
86
+ end
87
+
88
+ def in_progress?
89
+ status == Connectors::SyncStatus::IN_PROGRESS
90
+ end
91
+
92
+ def canceling?
93
+ status == Connectors::SyncStatus::CANCELING
94
+ end
95
+
96
+ def suspended?
97
+ status == Connectors::SyncStatus::SUSPENDED
98
+ end
99
+
100
+ def canceled?
101
+ status == Connectors::SyncStatus::CANCELED
102
+ end
103
+
104
+ def pending?
105
+ Connectors::SyncStatus::PENDING_STATUSES.include?(status)
106
+ end
107
+
108
+ def active?
109
+ Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
110
+ end
111
+
112
+ def terminated?
113
+ Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
114
+ end
115
+
116
+ def connector_snapshot
117
+ self[:connector] || {}
118
+ end
119
+
120
+ def connector_id
121
+ connector_snapshot[:id]
122
+ end
123
+
124
+ def index_name
125
+ connector_snapshot[:index_name]
126
+ end
127
+
128
+ def language
129
+ connector_snapshot[:language]
130
+ end
131
+
132
+ def service_type
133
+ connector_snapshot[:service_type]
134
+ end
135
+
136
+ def configuration
137
+ connector_snapshot[:configuration]
138
+ end
139
+
140
+ def filtering
141
+ connector_snapshot[:filtering]
142
+ end
143
+
144
+ def pipeline
145
+ connector_snapshot[:pipeline]
146
+ end
147
+
148
+ def connector
149
+ @connector ||= ConnectorSettings.fetch_by_id(connector_id)
150
+ end
151
+
152
+ def update_metadata(ingestion_stats = {}, connector_metadata = {})
153
+ ingestion_stats ||= {}
154
+ doc = { :last_seen => Time.now }.merge(ingestion_stats)
155
+ doc[:metadata] = connector_metadata if connector_metadata&.any?
156
+ ElasticConnectorActions.update_job_fields(id, doc)
157
+ end
158
+
159
+ def done!(ingestion_stats = {}, connector_metadata = {})
160
+ terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
161
+ end
162
+
163
+ def error!(message, ingestion_stats = {}, connector_metadata = {})
164
+ terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
165
+ end
166
+
167
+ def cancel!(ingestion_stats = {}, connector_metadata = {})
168
+ terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
169
+ end
170
+
171
+ def with_concurrency_control
172
+ response = ElasticConnectorActions.get_job(id)
173
+
174
+ yield response, response['_seq_no'], response['_primary_term']
175
+ end
176
+
177
+ def make_running!
178
+ with_concurrency_control do |es_doc, seq_no, primary_term|
179
+ now = Time.now
180
+ doc = {
181
+ status: Connectors::SyncStatus::IN_PROGRESS,
182
+ started_at: now,
183
+ last_seen: now,
184
+ worker_hostname: Socket.gethostname
185
+ }
186
+
187
+ ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
188
+ end
189
+ end
190
+
191
+ def es_source
192
+ @elasticsearch_response[:_source]
193
+ end
194
+
195
+ private
196
+
197
+ def self.fetch_jobs_by_query(query, page_size)
198
+ results = []
199
+ offset = 0
200
+ loop do
201
+ response = ElasticConnectorActions.search_jobs(query, page_size, offset)
202
+
203
+ hits = response.dig('hits', 'hits') || []
204
+ total = response.dig('hits', 'total', 'value') || 0
205
+ results += hits.map { |hit| new(hit) }
206
+ break if results.size >= total
207
+ offset += hits.size
208
+ end
209
+
210
+ results
211
+ end
212
+
213
+ def initialize(es_response)
214
+ # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
215
+ @elasticsearch_response = es_response.with_indifferent_access
216
+ end
217
+
218
+ def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
219
+ ingestion_stats ||= {}
220
+ ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
221
+ doc = {
222
+ :last_seen => Time.now,
223
+ :completed_at => Time.now,
224
+ :status => status,
225
+ :error => error
226
+ }.merge(ingestion_stats)
227
+ doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
228
+ doc[:metadata] = connector_metadata if connector_metadata&.any?
229
+ ElasticConnectorActions.update_job_fields(id, doc)
230
+ end
231
+
232
+ def seq_no
233
+ @elasticsearch_response[:_seq_no]
234
+ end
235
+
236
+ def primary_term
237
+ @elasticsearch_response[:_primary_term]
238
+ end
239
+ end
240
+ end
@@ -49,6 +49,11 @@ module Core
49
49
  fetch_connectors_by_query(query, page_size)
50
50
  end
51
51
 
52
+ def self.fetch_all_connectors(page_size = DEFAULT_PAGE_SIZE)
53
+ query = { match_all: {} }
54
+ fetch_connectors_by_query(query, page_size)
55
+ end
56
+
52
57
  def id
53
58
  @elasticsearch_response[:_id]
54
59
  end
@@ -130,19 +135,21 @@ module Core
130
135
  end
131
136
 
132
137
  def update_last_sync!(job)
138
+ # if job is nil, connector still needs to be updated, to avoid it stuck at in_progress
139
+ job_status = job&.status || Connectors::SyncStatus::ERROR
140
+ job_error = job.nil? ? 'Could\'t find the job' : job.error
141
+ job_error ||= 'unknown error' if job_status == Connectors::SyncStatus::ERROR
133
142
  doc = {
134
- :last_sync_status => job.status,
143
+ :last_sync_status => job_status,
135
144
  :last_synced => Time.now,
136
- :last_sync_error => job.error,
137
- :error => job.error
145
+ :last_sync_error => job_error,
146
+ :error => job_error
138
147
  }
139
-
140
- if job.terminated?
148
+ if job&.terminated?
141
149
  doc[:last_indexed_document_count] = job[:indexed_document_count]
142
150
  doc[:last_deleted_document_count] = job[:deleted_document_count]
143
151
  end
144
-
145
- Core::ElasticConnectorActions.update_connector_fields(job.connector_id, doc)
152
+ Core::ElasticConnectorActions.update_connector_fields(id, doc)
146
153
  end
147
154
 
148
155
  private
@@ -91,6 +91,17 @@ module Core
91
91
  )
92
92
  end
93
93
 
94
+ def delete_jobs_by_query(query)
95
+ client.delete_by_query(
96
+ :index => Utility::Constants::JOB_INDEX,
97
+ :body => { :query => query }
98
+ )
99
+ end
100
+
101
+ def delete_indices(indices)
102
+ client.indices.delete(:index => indices, :ignore_unavailable => true)
103
+ end
104
+
94
105
  def update_connector_configuration(connector_id, configuration)
95
106
  update_connector_fields(connector_id, :configuration => configuration)
96
107
  end
@@ -220,37 +231,6 @@ module Core
220
231
  update_connector_fields(connector_id, body)
221
232
  end
222
233
 
223
- def update_sync(job_id, metadata)
224
- body = {
225
- :doc => { :last_seen => Time.now }.merge(metadata)
226
- }
227
- client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
228
- end
229
-
230
- def complete_sync(connector_id, job_id, metadata, error)
231
- sync_status = error ? Connectors::SyncStatus::ERROR : Connectors::SyncStatus::COMPLETED
232
-
233
- metadata ||= {}
234
-
235
- update_connector_fields(connector_id,
236
- :last_sync_status => sync_status,
237
- :last_sync_error => error,
238
- :error => error,
239
- :last_synced => Time.now,
240
- :last_indexed_document_count => metadata[:indexed_document_count],
241
- :last_deleted_document_count => metadata[:deleted_document_count])
242
-
243
- body = {
244
- :doc => {
245
- :status => sync_status,
246
- :completed_at => Time.now,
247
- :last_seen => Time.now,
248
- :error => error
249
- }.merge(metadata)
250
- }
251
- client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
252
- end
253
-
254
234
  def fetch_document_ids(index_name)
255
235
  page_size = 1000
256
236
  result = []
@@ -90,13 +90,6 @@ module Core
90
90
  return false
91
91
  end
92
92
 
93
- # We want to sync when sync never actually happened
94
- last_synced = connector_settings[:last_synced]
95
- if last_synced.nil? || last_synced.empty?
96
- Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
97
- return true
98
- end
99
-
100
93
  current_schedule = scheduling_settings[:interval]
101
94
 
102
95
  # Don't sync if there is no actual scheduling interval
@@ -119,6 +112,13 @@ module Core
119
112
  return false
120
113
  end
121
114
 
115
+ # We want to sync when sync never actually happened
116
+ last_synced = connector_settings[:last_synced]
117
+ if last_synced.nil? || last_synced.empty?
118
+ Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
119
+ return true
120
+ end
121
+
122
122
  next_trigger_time = cron_parser.next_time(Time.parse(last_synced))
123
123
 
124
124
  # Sync if next trigger for the connector is in past
@@ -6,12 +6,14 @@
6
6
 
7
7
  require 'json'
8
8
 
9
+ require 'utility/constants'
10
+
9
11
  module Utility
10
12
  class BulkQueue
11
13
  class QueueOverflowError < StandardError; end
12
14
 
13
15
  # 500 items or 5MB
14
- def initialize(operation_count_threshold = 500, size_threshold = 5 * 1024 * 1024)
16
+ def initialize(operation_count_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_SIZE, size_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_BYTES)
15
17
  @operation_count_threshold = operation_count_threshold.freeze
16
18
  @size_threshold = size_threshold.freeze
17
19
 
@@ -18,5 +18,10 @@ module Utility
18
18
  CRAWLER_SERVICE_TYPE = 'elastic-crawler'
19
19
  FILTERING_RULES_FEATURE = 'filtering_rules'
20
20
  FILTERING_ADVANCED_FEATURE = 'filtering_advanced_config'
21
+
22
+ # Maximum number of operations in BULK Elasticsearch operation that will ingest the data
23
+ DEFAULT_MAX_INGESTION_QUEUE_SIZE = 500
24
+ # Maximum size of either whole BULK Elasticsearch operation or one document in it
25
+ DEFAULT_MAX_INGESTION_QUEUE_BYTES = 5 * 1024 * 1024
21
26
  end
22
27
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: connectors_utility
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.6.0.4.pre.20221116T024609Z
4
+ version: 8.6.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elastic
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-11-16 00:00:00.000000000 Z
11
+ date: 2022-11-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -106,6 +106,7 @@ files:
106
106
  - lib/connectors/crawler/scheduler.rb
107
107
  - lib/connectors/sync_status.rb
108
108
  - lib/connectors_utility.rb
109
+ - lib/core/connector_job.rb
109
110
  - lib/core/connector_settings.rb
110
111
  - lib/core/elastic_connector_actions.rb
111
112
  - lib/core/filtering/validation_status.rb
@@ -130,8 +131,8 @@ homepage: https://github.com/elastic/connectors-ruby
130
131
  licenses:
131
132
  - Elastic-2.0
132
133
  metadata:
133
- revision: b3cc1332879a38930a272a63f8c6be1847578204
134
- repository: git@github.com:elastic/ent-search-connectors.git
134
+ revision: 39cbb85dbae57a2c92e6e0da272d05aa24ca99a9
135
+ repository: git@github.com:elastic/connectors-ruby.git
135
136
  post_install_message:
136
137
  rdoc_options: []
137
138
  require_paths:
@@ -143,9 +144,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
143
144
  version: '0'
144
145
  required_rubygems_version: !ruby/object:Gem::Requirement
145
146
  requirements:
146
- - - ">"
147
+ - - ">="
147
148
  - !ruby/object:Gem::Version
148
- version: 1.3.1
149
+ version: '0'
149
150
  requirements: []
150
151
  rubygems_version: 3.0.3.1
151
152
  signing_key: