connectors_utility 8.6.0.4.pre.20221116T024609Z → 8.6.0.4

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6ae5315b00ad59db8c9a0300ec34f560a8293a285aa0d62fb5b9f662996b432d
4
- data.tar.gz: 6eec049ff3a257bcff893edefd841e9b441dcc1858f7f88c4b0376fe47961bcc
3
+ metadata.gz: 29244c3e2240e8989ebce72be48105fefb09d84b6f2228d156f8247ce27a31bf
4
+ data.tar.gz: cd5e7c9ad5ff3d4934f662c81452e5c6999a35917cecc3b763d4a0383eebefa5
5
5
  SHA512:
6
- metadata.gz: 296e79a4f866b91a98d02cf45ab7742efc2e10dd209be9456fcf6be34600247d55b09e7313030a386ccef693207b20b8802d5cb2ded3662cc289ed70fe5c1c3f
7
- data.tar.gz: 7d59558a77f22e14a01ef279ae1da29195b1f1216920e6aecde81c2c5b5f11371a5f5fc23b87df5587ca3c1969389d86e5b02c189e24f94ea5a0590047af4b98
6
+ metadata.gz: 58ea6b8c80af406fad11c3f3f3dd15164a859e8af7ed148b8befe827ddfec45248b4c2cb314ffaecd04fc061d7643423d2d4174fac2a32265aab6906c32e8d72
7
+ data.tar.gz: ed76592a6b609fb171003bc04d08b618479169bcd929297dc239d59fbbe1fcf9f21050a2e5c7e3a606cad9fc3848fc2bb7db402701553e50196f94baf76440f5
@@ -0,0 +1,240 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'active_support/core_ext/hash/indifferent_access'
10
+ require 'connectors/sync_status'
11
+ require 'core/connector_settings'
12
+ require 'core/elastic_connector_actions'
13
+ require 'utility'
14
+
15
+ module Core
16
+ class ConnectorJob
17
+ DEFAULT_PAGE_SIZE = 100
18
+ STUCK_THRESHOLD = 60
19
+
20
+ def self.fetch_by_id(job_id)
21
+ es_response = ElasticConnectorActions.get_job(job_id)
22
+ return nil unless es_response[:found]
23
+
24
+ new(es_response)
25
+ end
26
+
27
+ def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
28
+ status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
29
+
30
+ query = { bool: { must: [{ terms: status_term }] } }
31
+
32
+ return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
33
+
34
+ query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
35
+
36
+ fetch_jobs_by_query(query, page_size)
37
+ end
38
+
39
+ def self.orphaned_jobs(page_size = DEFAULT_PAGE_SIZE)
40
+ connector_ids = ConnectorSettings.fetch_all_connectors.map(&:id)
41
+ query = { bool: { must_not: { terms: { 'connector.id': connector_ids } } } }
42
+ fetch_jobs_by_query(query, page_size)
43
+ end
44
+
45
+ def self.delete_jobs(jobs)
46
+ query = { terms: { '_id': jobs.map(&:id) } }
47
+ ElasticConnectorActions.delete_jobs_by_query(query)
48
+ end
49
+
50
+ def self.stuck_jobs(connector_id = nil, page_size = DEFAULT_PAGE_SIZE)
51
+ connector_ids = if connector_id
52
+ [connector_id]
53
+ else
54
+ ConnectorSettings.fetch_native_connectors.map(&:id)
55
+ end
56
+ query = {
57
+ bool: {
58
+ filter: [
59
+ { terms: { 'connector.id': connector_ids } },
60
+ { terms: { status: Connectors::SyncStatus::ACTIVE_STATUSES } },
61
+ { range: { last_seen: { lte: "now-#{STUCK_THRESHOLD}s" } } }
62
+ ]
63
+ }
64
+ }
65
+ fetch_jobs_by_query(query, page_size)
66
+ end
67
+
68
+ def self.enqueue(_connector_id)
69
+ nil
70
+ end
71
+
72
+ def id
73
+ @elasticsearch_response[:_id]
74
+ end
75
+
76
+ def [](property_name)
77
+ @elasticsearch_response[:_source][property_name]
78
+ end
79
+
80
+ def error
81
+ self[:error]
82
+ end
83
+
84
+ def status
85
+ self[:status]
86
+ end
87
+
88
+ def in_progress?
89
+ status == Connectors::SyncStatus::IN_PROGRESS
90
+ end
91
+
92
+ def canceling?
93
+ status == Connectors::SyncStatus::CANCELING
94
+ end
95
+
96
+ def suspended?
97
+ status == Connectors::SyncStatus::SUSPENDED
98
+ end
99
+
100
+ def canceled?
101
+ status == Connectors::SyncStatus::CANCELED
102
+ end
103
+
104
+ def pending?
105
+ Connectors::SyncStatus::PENDING_STATUSES.include?(status)
106
+ end
107
+
108
+ def active?
109
+ Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
110
+ end
111
+
112
+ def terminated?
113
+ Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
114
+ end
115
+
116
+ def connector_snapshot
117
+ self[:connector] || {}
118
+ end
119
+
120
+ def connector_id
121
+ connector_snapshot[:id]
122
+ end
123
+
124
+ def index_name
125
+ connector_snapshot[:index_name]
126
+ end
127
+
128
+ def language
129
+ connector_snapshot[:language]
130
+ end
131
+
132
+ def service_type
133
+ connector_snapshot[:service_type]
134
+ end
135
+
136
+ def configuration
137
+ connector_snapshot[:configuration]
138
+ end
139
+
140
+ def filtering
141
+ connector_snapshot[:filtering]
142
+ end
143
+
144
+ def pipeline
145
+ connector_snapshot[:pipeline]
146
+ end
147
+
148
+ def connector
149
+ @connector ||= ConnectorSettings.fetch_by_id(connector_id)
150
+ end
151
+
152
+ def update_metadata(ingestion_stats = {}, connector_metadata = {})
153
+ ingestion_stats ||= {}
154
+ doc = { :last_seen => Time.now }.merge(ingestion_stats)
155
+ doc[:metadata] = connector_metadata if connector_metadata&.any?
156
+ ElasticConnectorActions.update_job_fields(id, doc)
157
+ end
158
+
159
+ def done!(ingestion_stats = {}, connector_metadata = {})
160
+ terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
161
+ end
162
+
163
+ def error!(message, ingestion_stats = {}, connector_metadata = {})
164
+ terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
165
+ end
166
+
167
+ def cancel!(ingestion_stats = {}, connector_metadata = {})
168
+ terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
169
+ end
170
+
171
+ def with_concurrency_control
172
+ response = ElasticConnectorActions.get_job(id)
173
+
174
+ yield response, response['_seq_no'], response['_primary_term']
175
+ end
176
+
177
+ def make_running!
178
+ with_concurrency_control do |es_doc, seq_no, primary_term|
179
+ now = Time.now
180
+ doc = {
181
+ status: Connectors::SyncStatus::IN_PROGRESS,
182
+ started_at: now,
183
+ last_seen: now,
184
+ worker_hostname: Socket.gethostname
185
+ }
186
+
187
+ ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
188
+ end
189
+ end
190
+
191
+ def es_source
192
+ @elasticsearch_response[:_source]
193
+ end
194
+
195
+ private
196
+
197
+ def self.fetch_jobs_by_query(query, page_size)
198
+ results = []
199
+ offset = 0
200
+ loop do
201
+ response = ElasticConnectorActions.search_jobs(query, page_size, offset)
202
+
203
+ hits = response.dig('hits', 'hits') || []
204
+ total = response.dig('hits', 'total', 'value') || 0
205
+ results += hits.map { |hit| new(hit) }
206
+ break if results.size >= total
207
+ offset += hits.size
208
+ end
209
+
210
+ results
211
+ end
212
+
213
+ def initialize(es_response)
214
+ # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
215
+ @elasticsearch_response = es_response.with_indifferent_access
216
+ end
217
+
218
+ def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
219
+ ingestion_stats ||= {}
220
+ ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
221
+ doc = {
222
+ :last_seen => Time.now,
223
+ :completed_at => Time.now,
224
+ :status => status,
225
+ :error => error
226
+ }.merge(ingestion_stats)
227
+ doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
228
+ doc[:metadata] = connector_metadata if connector_metadata&.any?
229
+ ElasticConnectorActions.update_job_fields(id, doc)
230
+ end
231
+
232
+ def seq_no
233
+ @elasticsearch_response[:_seq_no]
234
+ end
235
+
236
+ def primary_term
237
+ @elasticsearch_response[:_primary_term]
238
+ end
239
+ end
240
+ end
@@ -49,6 +49,11 @@ module Core
49
49
  fetch_connectors_by_query(query, page_size)
50
50
  end
51
51
 
52
+ def self.fetch_all_connectors(page_size = DEFAULT_PAGE_SIZE)
53
+ query = { match_all: {} }
54
+ fetch_connectors_by_query(query, page_size)
55
+ end
56
+
52
57
  def id
53
58
  @elasticsearch_response[:_id]
54
59
  end
@@ -130,19 +135,21 @@ module Core
130
135
  end
131
136
 
132
137
  def update_last_sync!(job)
138
+ # If job is nil, the connector still needs to be updated so that it does not get stuck at in_progress
139
+ job_status = job&.status || Connectors::SyncStatus::ERROR
140
+ job_error = job.nil? ? 'Could\'t find the job' : job.error
141
+ job_error ||= 'unknown error' if job_status == Connectors::SyncStatus::ERROR
133
142
  doc = {
134
- :last_sync_status => job.status,
143
+ :last_sync_status => job_status,
135
144
  :last_synced => Time.now,
136
- :last_sync_error => job.error,
137
- :error => job.error
145
+ :last_sync_error => job_error,
146
+ :error => job_error
138
147
  }
139
-
140
- if job.terminated?
148
+ if job&.terminated?
141
149
  doc[:last_indexed_document_count] = job[:indexed_document_count]
142
150
  doc[:last_deleted_document_count] = job[:deleted_document_count]
143
151
  end
144
-
145
- Core::ElasticConnectorActions.update_connector_fields(job.connector_id, doc)
152
+ Core::ElasticConnectorActions.update_connector_fields(id, doc)
146
153
  end
147
154
 
148
155
  private
@@ -91,6 +91,17 @@ module Core
91
91
  )
92
92
  end
93
93
 
94
+ def delete_jobs_by_query(query)
95
+ client.delete_by_query(
96
+ :index => Utility::Constants::JOB_INDEX,
97
+ :body => { :query => query }
98
+ )
99
+ end
100
+
101
+ def delete_indices(indices)
102
+ client.indices.delete(:index => indices, :ignore_unavailable => true)
103
+ end
104
+
94
105
  def update_connector_configuration(connector_id, configuration)
95
106
  update_connector_fields(connector_id, :configuration => configuration)
96
107
  end
@@ -220,37 +231,6 @@ module Core
220
231
  update_connector_fields(connector_id, body)
221
232
  end
222
233
 
223
- def update_sync(job_id, metadata)
224
- body = {
225
- :doc => { :last_seen => Time.now }.merge(metadata)
226
- }
227
- client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
228
- end
229
-
230
- def complete_sync(connector_id, job_id, metadata, error)
231
- sync_status = error ? Connectors::SyncStatus::ERROR : Connectors::SyncStatus::COMPLETED
232
-
233
- metadata ||= {}
234
-
235
- update_connector_fields(connector_id,
236
- :last_sync_status => sync_status,
237
- :last_sync_error => error,
238
- :error => error,
239
- :last_synced => Time.now,
240
- :last_indexed_document_count => metadata[:indexed_document_count],
241
- :last_deleted_document_count => metadata[:deleted_document_count])
242
-
243
- body = {
244
- :doc => {
245
- :status => sync_status,
246
- :completed_at => Time.now,
247
- :last_seen => Time.now,
248
- :error => error
249
- }.merge(metadata)
250
- }
251
- client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
252
- end
253
-
254
234
  def fetch_document_ids(index_name)
255
235
  page_size = 1000
256
236
  result = []
@@ -90,13 +90,6 @@ module Core
90
90
  return false
91
91
  end
92
92
 
93
- # We want to sync when sync never actually happened
94
- last_synced = connector_settings[:last_synced]
95
- if last_synced.nil? || last_synced.empty?
96
- Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
97
- return true
98
- end
99
-
100
93
  current_schedule = scheduling_settings[:interval]
101
94
 
102
95
  # Don't sync if there is no actual scheduling interval
@@ -119,6 +112,13 @@ module Core
119
112
  return false
120
113
  end
121
114
 
115
+ # We want to sync when sync never actually happened
116
+ last_synced = connector_settings[:last_synced]
117
+ if last_synced.nil? || last_synced.empty?
118
+ Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
119
+ return true
120
+ end
121
+
122
122
  next_trigger_time = cron_parser.next_time(Time.parse(last_synced))
123
123
 
124
124
  # Sync if next trigger for the connector is in past
@@ -6,12 +6,14 @@
6
6
 
7
7
  require 'json'
8
8
 
9
+ require 'utility/constants'
10
+
9
11
  module Utility
10
12
  class BulkQueue
11
13
  class QueueOverflowError < StandardError; end
12
14
 
13
15
  # 500 items or 5MB
14
- def initialize(operation_count_threshold = 500, size_threshold = 5 * 1024 * 1024)
16
+ def initialize(operation_count_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_SIZE, size_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_BYTES)
15
17
  @operation_count_threshold = operation_count_threshold.freeze
16
18
  @size_threshold = size_threshold.freeze
17
19
 
@@ -18,5 +18,10 @@ module Utility
18
18
  CRAWLER_SERVICE_TYPE = 'elastic-crawler'
19
19
  FILTERING_RULES_FEATURE = 'filtering_rules'
20
20
  FILTERING_ADVANCED_FEATURE = 'filtering_advanced_config'
21
+
22
+ # Maximum number of operations in BULK Elasticsearch operation that will ingest the data
23
+ DEFAULT_MAX_INGESTION_QUEUE_SIZE = 500
24
+ # Maximum size of either whole BULK Elasticsearch operation or one document in it
25
+ DEFAULT_MAX_INGESTION_QUEUE_BYTES = 5 * 1024 * 1024
21
26
  end
22
27
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: connectors_utility
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.6.0.4.pre.20221116T024609Z
4
+ version: 8.6.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elastic
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-11-16 00:00:00.000000000 Z
11
+ date: 2022-11-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -106,6 +106,7 @@ files:
106
106
  - lib/connectors/crawler/scheduler.rb
107
107
  - lib/connectors/sync_status.rb
108
108
  - lib/connectors_utility.rb
109
+ - lib/core/connector_job.rb
109
110
  - lib/core/connector_settings.rb
110
111
  - lib/core/elastic_connector_actions.rb
111
112
  - lib/core/filtering/validation_status.rb
@@ -130,8 +131,8 @@ homepage: https://github.com/elastic/connectors-ruby
130
131
  licenses:
131
132
  - Elastic-2.0
132
133
  metadata:
133
- revision: b3cc1332879a38930a272a63f8c6be1847578204
134
- repository: git@github.com:elastic/ent-search-connectors.git
134
+ revision: 39cbb85dbae57a2c92e6e0da272d05aa24ca99a9
135
+ repository: git@github.com:elastic/connectors-ruby.git
135
136
  post_install_message:
136
137
  rdoc_options: []
137
138
  require_paths:
@@ -143,9 +144,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
143
144
  version: '0'
144
145
  required_rubygems_version: !ruby/object:Gem::Requirement
145
146
  requirements:
146
- - - ">"
147
+ - - ">="
147
148
  - !ruby/object:Gem::Version
148
- version: 1.3.1
149
+ version: '0'
149
150
  requirements: []
150
151
  rubygems_version: 3.0.3.1
151
152
  signing_key: