connectors_utility 8.6.0.4.pre.20221116T024609Z → 8.6.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/core/connector_job.rb +240 -0
- data/lib/core/connector_settings.rb +14 -7
- data/lib/core/elastic_connector_actions.rb +11 -31
- data/lib/core/scheduler.rb +7 -7
- data/lib/utility/bulk_queue.rb +3 -1
- data/lib/utility/constants.rb +5 -0
- metadata +7 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 29244c3e2240e8989ebce72be48105fefb09d84b6f2228d156f8247ce27a31bf
|
4
|
+
data.tar.gz: cd5e7c9ad5ff3d4934f662c81452e5c6999a35917cecc3b763d4a0383eebefa5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 58ea6b8c80af406fad11c3f3f3dd15164a859e8af7ed148b8befe827ddfec45248b4c2cb314ffaecd04fc061d7643423d2d4174fac2a32265aab6906c32e8d72
|
7
|
+
data.tar.gz: ed76592a6b609fb171003bc04d08b618479169bcd929297dc239d59fbbe1fcf9f21050a2e5c7e3a606cad9fc3848fc2bb7db402701553e50196f94baf76440f5
|
data/lib/core/connector_job.rb
ADDED
@@ -0,0 +1,240 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
10
|
+
require 'connectors/sync_status'
|
11
|
+
require 'core/connector_settings'
|
12
|
+
require 'core/elastic_connector_actions'
|
13
|
+
require 'utility'
|
14
|
+
|
15
|
+
module Core
|
16
|
+
class ConnectorJob
|
17
|
+
DEFAULT_PAGE_SIZE = 100
|
18
|
+
STUCK_THRESHOLD = 60
|
19
|
+
|
20
|
+
def self.fetch_by_id(job_id)
|
21
|
+
es_response = ElasticConnectorActions.get_job(job_id)
|
22
|
+
return nil unless es_response[:found]
|
23
|
+
|
24
|
+
new(es_response)
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
|
28
|
+
status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
|
29
|
+
|
30
|
+
query = { bool: { must: [{ terms: status_term }] } }
|
31
|
+
|
32
|
+
return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
|
33
|
+
|
34
|
+
query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
|
35
|
+
|
36
|
+
fetch_jobs_by_query(query, page_size)
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.orphaned_jobs(page_size = DEFAULT_PAGE_SIZE)
|
40
|
+
connector_ids = ConnectorSettings.fetch_all_connectors.map(&:id)
|
41
|
+
query = { bool: { must_not: { terms: { 'connector.id': connector_ids } } } }
|
42
|
+
fetch_jobs_by_query(query, page_size)
|
43
|
+
end
|
44
|
+
|
45
|
+
def self.delete_jobs(jobs)
|
46
|
+
query = { terms: { '_id': jobs.map(&:id) } }
|
47
|
+
ElasticConnectorActions.delete_jobs_by_query(query)
|
48
|
+
end
|
49
|
+
|
50
|
+
def self.stuck_jobs(connector_id = nil, page_size = DEFAULT_PAGE_SIZE)
|
51
|
+
connector_ids = if connector_id
|
52
|
+
[connector_id]
|
53
|
+
else
|
54
|
+
ConnectorSettings.fetch_native_connectors.map(&:id)
|
55
|
+
end
|
56
|
+
query = {
|
57
|
+
bool: {
|
58
|
+
filter: [
|
59
|
+
{ terms: { 'connector.id': connector_ids } },
|
60
|
+
{ terms: { status: Connectors::SyncStatus::ACTIVE_STATUSES } },
|
61
|
+
{ range: { last_seen: { lte: "now-#{STUCK_THRESHOLD}s" } } }
|
62
|
+
]
|
63
|
+
}
|
64
|
+
}
|
65
|
+
fetch_jobs_by_query(query, page_size)
|
66
|
+
end
|
67
|
+
|
68
|
+
def self.enqueue(_connector_id)
|
69
|
+
nil
|
70
|
+
end
|
71
|
+
|
72
|
+
def id
|
73
|
+
@elasticsearch_response[:_id]
|
74
|
+
end
|
75
|
+
|
76
|
+
def [](property_name)
|
77
|
+
@elasticsearch_response[:_source][property_name]
|
78
|
+
end
|
79
|
+
|
80
|
+
def error
|
81
|
+
self[:error]
|
82
|
+
end
|
83
|
+
|
84
|
+
def status
|
85
|
+
self[:status]
|
86
|
+
end
|
87
|
+
|
88
|
+
def in_progress?
|
89
|
+
status == Connectors::SyncStatus::IN_PROGRESS
|
90
|
+
end
|
91
|
+
|
92
|
+
def canceling?
|
93
|
+
status == Connectors::SyncStatus::CANCELING
|
94
|
+
end
|
95
|
+
|
96
|
+
def suspended?
|
97
|
+
status == Connectors::SyncStatus::SUSPENDED
|
98
|
+
end
|
99
|
+
|
100
|
+
def canceled?
|
101
|
+
status == Connectors::SyncStatus::CANCELED
|
102
|
+
end
|
103
|
+
|
104
|
+
def pending?
|
105
|
+
Connectors::SyncStatus::PENDING_STATUSES.include?(status)
|
106
|
+
end
|
107
|
+
|
108
|
+
def active?
|
109
|
+
Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
|
110
|
+
end
|
111
|
+
|
112
|
+
def terminated?
|
113
|
+
Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
|
114
|
+
end
|
115
|
+
|
116
|
+
def connector_snapshot
|
117
|
+
self[:connector] || {}
|
118
|
+
end
|
119
|
+
|
120
|
+
def connector_id
|
121
|
+
connector_snapshot[:id]
|
122
|
+
end
|
123
|
+
|
124
|
+
def index_name
|
125
|
+
connector_snapshot[:index_name]
|
126
|
+
end
|
127
|
+
|
128
|
+
def language
|
129
|
+
connector_snapshot[:language]
|
130
|
+
end
|
131
|
+
|
132
|
+
def service_type
|
133
|
+
connector_snapshot[:service_type]
|
134
|
+
end
|
135
|
+
|
136
|
+
def configuration
|
137
|
+
connector_snapshot[:configuration]
|
138
|
+
end
|
139
|
+
|
140
|
+
def filtering
|
141
|
+
connector_snapshot[:filtering]
|
142
|
+
end
|
143
|
+
|
144
|
+
def pipeline
|
145
|
+
connector_snapshot[:pipeline]
|
146
|
+
end
|
147
|
+
|
148
|
+
def connector
|
149
|
+
@connector ||= ConnectorSettings.fetch_by_id(connector_id)
|
150
|
+
end
|
151
|
+
|
152
|
+
def update_metadata(ingestion_stats = {}, connector_metadata = {})
|
153
|
+
ingestion_stats ||= {}
|
154
|
+
doc = { :last_seen => Time.now }.merge(ingestion_stats)
|
155
|
+
doc[:metadata] = connector_metadata if connector_metadata&.any?
|
156
|
+
ElasticConnectorActions.update_job_fields(id, doc)
|
157
|
+
end
|
158
|
+
|
159
|
+
def done!(ingestion_stats = {}, connector_metadata = {})
|
160
|
+
terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
|
161
|
+
end
|
162
|
+
|
163
|
+
def error!(message, ingestion_stats = {}, connector_metadata = {})
|
164
|
+
terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
|
165
|
+
end
|
166
|
+
|
167
|
+
def cancel!(ingestion_stats = {}, connector_metadata = {})
|
168
|
+
terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
|
169
|
+
end
|
170
|
+
|
171
|
+
def with_concurrency_control
|
172
|
+
response = ElasticConnectorActions.get_job(id)
|
173
|
+
|
174
|
+
yield response, response['_seq_no'], response['_primary_term']
|
175
|
+
end
|
176
|
+
|
177
|
+
def make_running!
|
178
|
+
with_concurrency_control do |es_doc, seq_no, primary_term|
|
179
|
+
now = Time.now
|
180
|
+
doc = {
|
181
|
+
status: Connectors::SyncStatus::IN_PROGRESS,
|
182
|
+
started_at: now,
|
183
|
+
last_seen: now,
|
184
|
+
worker_hostname: Socket.gethostname
|
185
|
+
}
|
186
|
+
|
187
|
+
ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
def es_source
|
192
|
+
@elasticsearch_response[:_source]
|
193
|
+
end
|
194
|
+
|
195
|
+
private
|
196
|
+
|
197
|
+
def self.fetch_jobs_by_query(query, page_size)
|
198
|
+
results = []
|
199
|
+
offset = 0
|
200
|
+
loop do
|
201
|
+
response = ElasticConnectorActions.search_jobs(query, page_size, offset)
|
202
|
+
|
203
|
+
hits = response.dig('hits', 'hits') || []
|
204
|
+
total = response.dig('hits', 'total', 'value') || 0
|
205
|
+
results += hits.map { |hit| new(hit) }
|
206
|
+
break if results.size >= total
|
207
|
+
offset += hits.size
|
208
|
+
end
|
209
|
+
|
210
|
+
results
|
211
|
+
end
|
212
|
+
|
213
|
+
def initialize(es_response)
|
214
|
+
# TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
|
215
|
+
@elasticsearch_response = es_response.with_indifferent_access
|
216
|
+
end
|
217
|
+
|
218
|
+
def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
|
219
|
+
ingestion_stats ||= {}
|
220
|
+
ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
|
221
|
+
doc = {
|
222
|
+
:last_seen => Time.now,
|
223
|
+
:completed_at => Time.now,
|
224
|
+
:status => status,
|
225
|
+
:error => error
|
226
|
+
}.merge(ingestion_stats)
|
227
|
+
doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
|
228
|
+
doc[:metadata] = connector_metadata if connector_metadata&.any?
|
229
|
+
ElasticConnectorActions.update_job_fields(id, doc)
|
230
|
+
end
|
231
|
+
|
232
|
+
def seq_no
|
233
|
+
@elasticsearch_response[:_seq_no]
|
234
|
+
end
|
235
|
+
|
236
|
+
def primary_term
|
237
|
+
@elasticsearch_response[:_primary_term]
|
238
|
+
end
|
239
|
+
end
|
240
|
+
end
|
data/lib/core/connector_settings.rb
CHANGED
@@ -49,6 +49,11 @@ module Core
|
|
49
49
|
fetch_connectors_by_query(query, page_size)
|
50
50
|
end
|
51
51
|
|
52
|
+
def self.fetch_all_connectors(page_size = DEFAULT_PAGE_SIZE)
|
53
|
+
query = { match_all: {} }
|
54
|
+
fetch_connectors_by_query(query, page_size)
|
55
|
+
end
|
56
|
+
|
52
57
|
def id
|
53
58
|
@elasticsearch_response[:_id]
|
54
59
|
end
|
@@ -130,19 +135,21 @@ module Core
|
|
130
135
|
end
|
131
136
|
|
132
137
|
def update_last_sync!(job)
|
138
|
+
# if job is nil, connector still needs to be updated, to avoid it stuck at in_progress
|
139
|
+
job_status = job&.status || Connectors::SyncStatus::ERROR
|
140
|
+
job_error = job.nil? ? 'Could\'t find the job' : job.error
|
141
|
+
job_error ||= 'unknown error' if job_status == Connectors::SyncStatus::ERROR
|
133
142
|
doc = {
|
134
|
-
:last_sync_status =>
|
143
|
+
:last_sync_status => job_status,
|
135
144
|
:last_synced => Time.now,
|
136
|
-
:last_sync_error =>
|
137
|
-
:error =>
|
145
|
+
:last_sync_error => job_error,
|
146
|
+
:error => job_error
|
138
147
|
}
|
139
|
-
|
140
|
-
if job.terminated?
|
148
|
+
if job&.terminated?
|
141
149
|
doc[:last_indexed_document_count] = job[:indexed_document_count]
|
142
150
|
doc[:last_deleted_document_count] = job[:deleted_document_count]
|
143
151
|
end
|
144
|
-
|
145
|
-
Core::ElasticConnectorActions.update_connector_fields(job.connector_id, doc)
|
152
|
+
Core::ElasticConnectorActions.update_connector_fields(id, doc)
|
146
153
|
end
|
147
154
|
|
148
155
|
private
|
data/lib/core/elastic_connector_actions.rb
CHANGED
@@ -91,6 +91,17 @@ module Core
|
|
91
91
|
)
|
92
92
|
end
|
93
93
|
|
94
|
+
def delete_jobs_by_query(query)
|
95
|
+
client.delete_by_query(
|
96
|
+
:index => Utility::Constants::JOB_INDEX,
|
97
|
+
:body => { :query => query }
|
98
|
+
)
|
99
|
+
end
|
100
|
+
|
101
|
+
def delete_indices(indices)
|
102
|
+
client.indices.delete(:index => indices, :ignore_unavailable => true)
|
103
|
+
end
|
104
|
+
|
94
105
|
def update_connector_configuration(connector_id, configuration)
|
95
106
|
update_connector_fields(connector_id, :configuration => configuration)
|
96
107
|
end
|
@@ -220,37 +231,6 @@ module Core
|
|
220
231
|
update_connector_fields(connector_id, body)
|
221
232
|
end
|
222
233
|
|
223
|
-
def update_sync(job_id, metadata)
|
224
|
-
body = {
|
225
|
-
:doc => { :last_seen => Time.now }.merge(metadata)
|
226
|
-
}
|
227
|
-
client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
|
228
|
-
end
|
229
|
-
|
230
|
-
def complete_sync(connector_id, job_id, metadata, error)
|
231
|
-
sync_status = error ? Connectors::SyncStatus::ERROR : Connectors::SyncStatus::COMPLETED
|
232
|
-
|
233
|
-
metadata ||= {}
|
234
|
-
|
235
|
-
update_connector_fields(connector_id,
|
236
|
-
:last_sync_status => sync_status,
|
237
|
-
:last_sync_error => error,
|
238
|
-
:error => error,
|
239
|
-
:last_synced => Time.now,
|
240
|
-
:last_indexed_document_count => metadata[:indexed_document_count],
|
241
|
-
:last_deleted_document_count => metadata[:deleted_document_count])
|
242
|
-
|
243
|
-
body = {
|
244
|
-
:doc => {
|
245
|
-
:status => sync_status,
|
246
|
-
:completed_at => Time.now,
|
247
|
-
:last_seen => Time.now,
|
248
|
-
:error => error
|
249
|
-
}.merge(metadata)
|
250
|
-
}
|
251
|
-
client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
|
252
|
-
end
|
253
|
-
|
254
234
|
def fetch_document_ids(index_name)
|
255
235
|
page_size = 1000
|
256
236
|
result = []
|
data/lib/core/scheduler.rb
CHANGED
@@ -90,13 +90,6 @@ module Core
|
|
90
90
|
return false
|
91
91
|
end
|
92
92
|
|
93
|
-
# We want to sync when sync never actually happened
|
94
|
-
last_synced = connector_settings[:last_synced]
|
95
|
-
if last_synced.nil? || last_synced.empty?
|
96
|
-
Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
|
97
|
-
return true
|
98
|
-
end
|
99
|
-
|
100
93
|
current_schedule = scheduling_settings[:interval]
|
101
94
|
|
102
95
|
# Don't sync if there is no actual scheduling interval
|
@@ -119,6 +112,13 @@ module Core
|
|
119
112
|
return false
|
120
113
|
end
|
121
114
|
|
115
|
+
# We want to sync when sync never actually happened
|
116
|
+
last_synced = connector_settings[:last_synced]
|
117
|
+
if last_synced.nil? || last_synced.empty?
|
118
|
+
Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
|
119
|
+
return true
|
120
|
+
end
|
121
|
+
|
122
122
|
next_trigger_time = cron_parser.next_time(Time.parse(last_synced))
|
123
123
|
|
124
124
|
# Sync if next trigger for the connector is in past
|
data/lib/utility/bulk_queue.rb
CHANGED
@@ -6,12 +6,14 @@
|
|
6
6
|
|
7
7
|
require 'json'
|
8
8
|
|
9
|
+
require 'utility/constants'
|
10
|
+
|
9
11
|
module Utility
|
10
12
|
class BulkQueue
|
11
13
|
class QueueOverflowError < StandardError; end
|
12
14
|
|
13
15
|
# 500 items or 5MB
|
14
|
-
def initialize(operation_count_threshold =
|
16
|
+
def initialize(operation_count_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_SIZE, size_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_BYTES)
|
15
17
|
@operation_count_threshold = operation_count_threshold.freeze
|
16
18
|
@size_threshold = size_threshold.freeze
|
17
19
|
|
data/lib/utility/constants.rb
CHANGED
@@ -18,5 +18,10 @@ module Utility
|
|
18
18
|
CRAWLER_SERVICE_TYPE = 'elastic-crawler'
|
19
19
|
FILTERING_RULES_FEATURE = 'filtering_rules'
|
20
20
|
FILTERING_ADVANCED_FEATURE = 'filtering_advanced_config'
|
21
|
+
|
22
|
+
# Maximum number of operations in BULK Elasticsearch operation that will ingest the data
|
23
|
+
DEFAULT_MAX_INGESTION_QUEUE_SIZE = 500
|
24
|
+
# Maximum size of either whole BULK Elasticsearch operation or one document in it
|
25
|
+
DEFAULT_MAX_INGESTION_QUEUE_BYTES = 5 * 1024 * 1024
|
21
26
|
end
|
22
27
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: connectors_utility
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 8.6.0.4.pre.20221116T024609Z
|
4
|
+
version: 8.6.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elastic
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-11-
|
11
|
+
date: 2022-11-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -106,6 +106,7 @@ files:
|
|
106
106
|
- lib/connectors/crawler/scheduler.rb
|
107
107
|
- lib/connectors/sync_status.rb
|
108
108
|
- lib/connectors_utility.rb
|
109
|
+
- lib/core/connector_job.rb
|
109
110
|
- lib/core/connector_settings.rb
|
110
111
|
- lib/core/elastic_connector_actions.rb
|
111
112
|
- lib/core/filtering/validation_status.rb
|
@@ -130,8 +131,8 @@ homepage: https://github.com/elastic/connectors-ruby
|
|
130
131
|
licenses:
|
131
132
|
- Elastic-2.0
|
132
133
|
metadata:
|
133
|
-
revision:
|
134
|
-
repository: git@github.com:elastic/
|
134
|
+
revision: 39cbb85dbae57a2c92e6e0da272d05aa24ca99a9
|
135
|
+
repository: git@github.com:elastic/connectors-ruby.git
|
135
136
|
post_install_message:
|
136
137
|
rdoc_options: []
|
137
138
|
require_paths:
|
@@ -143,9 +144,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
143
144
|
version: '0'
|
144
145
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
145
146
|
requirements:
|
146
|
-
- - "
|
147
|
+
- - ">="
|
147
148
|
- !ruby/object:Gem::Version
|
148
|
-
version:
|
149
|
+
version: '0'
|
149
150
|
requirements: []
|
150
151
|
rubygems_version: 3.0.3.1
|
151
152
|
signing_key:
|