connectors_utility 8.7.0.0.pre.20221117T004939Z → 8.7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/connectors/crawler/scheduler.rb +36 -0
- data/lib/connectors/job_trigger_method.rb +14 -0
- data/lib/connectors_utility.rb +6 -3
- data/lib/core/connector_job.rb +251 -0
- data/lib/core/connector_settings.rb +47 -19
- data/lib/core/elastic_connector_actions.rb +47 -36
- data/lib/core/scheduler.rb +67 -52
- data/lib/utility/bulk_queue.rb +1 -1
- data/lib/utility/constants.rb +0 -2
- data/lib/utility/error_monitor.rb +26 -5
- data/lib/utility/es_client.rb +4 -0
- data/lib/utility/filtering.rb +4 -0
- metadata +15 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8bde5d9fcfd7af80dd1a20bc3fdffb3e509af46fc492607fa963858aacdb79bc
|
4
|
+
data.tar.gz: b3f26fba69d08e1add58b476a37a74f3fa855790d9bcea05c7a4f5ed3b1fd9bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1eb4c63b6ae46d11b8b8e01224e1e1943a1971b7e12054978ebab97c99939fb1d0316b3ff06912d4908e8f34df482477e4e8d1f9b53c02fcebc809e27b597d2a
|
7
|
+
data.tar.gz: 2300a3a9c32ed95a1c25a54fba4737bb052caee10756887f2ac114948ec8a8ba0df066ee30420f5e3d64e0f5855df7e487f4782bc242e27ea66b592c6c60dbe6
|
@@ -22,11 +22,47 @@ module Connectors
|
|
22
22
|
[]
|
23
23
|
end
|
24
24
|
|
25
|
+
def when_triggered
|
26
|
+
loop do
|
27
|
+
connector_settings.each do |cs|
|
28
|
+
# crawler only supports :sync
|
29
|
+
if sync_triggered?(cs)
|
30
|
+
yield cs, :sync, nil
|
31
|
+
next
|
32
|
+
end
|
33
|
+
|
34
|
+
schedule_key = custom_schedule_triggered(cs)
|
35
|
+
yield cs, :sync, schedule_key if schedule_key
|
36
|
+
end
|
37
|
+
rescue *Utility::AUTHORIZATION_ERRORS => e
|
38
|
+
log_authorization_error(e)
|
39
|
+
rescue StandardError => e
|
40
|
+
log_standard_error(e)
|
41
|
+
ensure
|
42
|
+
if @is_shutting_down
|
43
|
+
break
|
44
|
+
end
|
45
|
+
sleep_for_poll_interval
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
25
49
|
private
|
26
50
|
|
27
51
|
def connector_registered?(service_type)
|
28
52
|
service_type == 'elastic-crawler'
|
29
53
|
end
|
54
|
+
|
55
|
+
# custom scheduling has no ordering, so the first-found schedule is returned
|
56
|
+
def custom_schedule_triggered(cs)
|
57
|
+
cs.custom_scheduling_settings.each do |key, custom_scheduling|
|
58
|
+
identifier = "#{cs.formatted} - #{custom_scheduling[:name]}"
|
59
|
+
if schedule_triggered?(custom_scheduling, identifier)
|
60
|
+
return key
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
nil
|
65
|
+
end
|
30
66
|
end
|
31
67
|
end
|
32
68
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
module Connectors
|
10
|
+
class JobTriggerMethod
|
11
|
+
ON_DEMAND = 'on_demand'
|
12
|
+
SCHEDULED = 'scheduled'
|
13
|
+
end
|
14
|
+
end
|
data/lib/connectors_utility.rb
CHANGED
@@ -9,8 +9,11 @@
|
|
9
9
|
require_relative 'utility'
|
10
10
|
|
11
11
|
require_relative 'connectors/connector_status'
|
12
|
+
require_relative 'connectors/crawler/scheduler'
|
13
|
+
require_relative 'connectors/job_trigger_method'
|
12
14
|
require_relative 'connectors/sync_status'
|
13
|
-
require_relative 'core/
|
15
|
+
require_relative 'core/connector_job'
|
16
|
+
require_relative 'core/connector_settings'
|
14
17
|
require_relative 'core/elastic_connector_actions'
|
15
|
-
|
16
|
-
require_relative '
|
18
|
+
require_relative 'core/filtering/validation_status'
|
19
|
+
require_relative 'core/scheduler'
|
@@ -0,0 +1,251 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
10
|
+
require 'connectors/sync_status'
|
11
|
+
require 'core/connector_settings'
|
12
|
+
require 'core/elastic_connector_actions'
|
13
|
+
require 'utility'
|
14
|
+
|
15
|
+
module Core
|
16
|
+
class ConnectorJob
|
17
|
+
DEFAULT_PAGE_SIZE = 100
|
18
|
+
IDLE_THRESHOLD = 60
|
19
|
+
|
20
|
+
def self.fetch_by_id(job_id)
|
21
|
+
es_response = ElasticConnectorActions.get_job(job_id)
|
22
|
+
return nil unless es_response[:found]
|
23
|
+
|
24
|
+
new(es_response)
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
|
28
|
+
status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
|
29
|
+
|
30
|
+
query = { bool: { must: [{ terms: status_term }] } }
|
31
|
+
|
32
|
+
return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
|
33
|
+
|
34
|
+
query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
|
35
|
+
|
36
|
+
fetch_jobs_by_query(query, page_size)
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.orphaned_jobs(connector_ids = [], page_size = DEFAULT_PAGE_SIZE)
|
40
|
+
query = { bool: { must_not: { terms: { 'connector.id': connector_ids } } } }
|
41
|
+
fetch_jobs_by_query(query, page_size)
|
42
|
+
end
|
43
|
+
|
44
|
+
def self.delete_jobs(jobs)
|
45
|
+
query = { terms: { '_id': jobs.map(&:id) } }
|
46
|
+
ElasticConnectorActions.delete_jobs_by_query(query)
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.idle_jobs(connector_id = nil, page_size = DEFAULT_PAGE_SIZE)
|
50
|
+
connector_ids = if connector_id
|
51
|
+
[connector_id]
|
52
|
+
else
|
53
|
+
ConnectorSettings.fetch_native_connectors.map(&:id)
|
54
|
+
end
|
55
|
+
query = {
|
56
|
+
bool: {
|
57
|
+
filter: [
|
58
|
+
{ terms: { 'connector.id': connector_ids } },
|
59
|
+
{ terms: { status: Connectors::SyncStatus::ACTIVE_STATUSES } },
|
60
|
+
{ range: { last_seen: { lte: "now-#{IDLE_THRESHOLD}s" } } }
|
61
|
+
]
|
62
|
+
}
|
63
|
+
}
|
64
|
+
fetch_jobs_by_query(query, page_size)
|
65
|
+
end
|
66
|
+
|
67
|
+
def self.enqueue(_connector_id)
|
68
|
+
nil
|
69
|
+
end
|
70
|
+
|
71
|
+
def id
|
72
|
+
@elasticsearch_response[:_id]
|
73
|
+
end
|
74
|
+
|
75
|
+
def [](property_name)
|
76
|
+
@elasticsearch_response[:_source][property_name]
|
77
|
+
end
|
78
|
+
|
79
|
+
def error
|
80
|
+
self[:error]
|
81
|
+
end
|
82
|
+
|
83
|
+
def status
|
84
|
+
self[:status]
|
85
|
+
end
|
86
|
+
|
87
|
+
def in_progress?
|
88
|
+
status == Connectors::SyncStatus::IN_PROGRESS
|
89
|
+
end
|
90
|
+
|
91
|
+
def canceling?
|
92
|
+
status == Connectors::SyncStatus::CANCELING
|
93
|
+
end
|
94
|
+
|
95
|
+
def suspended?
|
96
|
+
status == Connectors::SyncStatus::SUSPENDED
|
97
|
+
end
|
98
|
+
|
99
|
+
def canceled?
|
100
|
+
status == Connectors::SyncStatus::CANCELED
|
101
|
+
end
|
102
|
+
|
103
|
+
def pending?
|
104
|
+
Connectors::SyncStatus::PENDING_STATUSES.include?(status)
|
105
|
+
end
|
106
|
+
|
107
|
+
def active?
|
108
|
+
Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
|
109
|
+
end
|
110
|
+
|
111
|
+
def terminated?
|
112
|
+
Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
|
113
|
+
end
|
114
|
+
|
115
|
+
def connector_snapshot
|
116
|
+
self[:connector] || {}
|
117
|
+
end
|
118
|
+
|
119
|
+
def connector_id
|
120
|
+
connector_snapshot[:id]
|
121
|
+
end
|
122
|
+
|
123
|
+
def index_name
|
124
|
+
connector_snapshot[:index_name]
|
125
|
+
end
|
126
|
+
|
127
|
+
def language
|
128
|
+
connector_snapshot[:language]
|
129
|
+
end
|
130
|
+
|
131
|
+
def service_type
|
132
|
+
connector_snapshot[:service_type]
|
133
|
+
end
|
134
|
+
|
135
|
+
def configuration
|
136
|
+
connector_snapshot[:configuration]
|
137
|
+
end
|
138
|
+
|
139
|
+
def filtering
|
140
|
+
connector_snapshot[:filtering]
|
141
|
+
end
|
142
|
+
|
143
|
+
def pipeline
|
144
|
+
connector_snapshot[:pipeline] || {}
|
145
|
+
end
|
146
|
+
|
147
|
+
def extract_binary_content?
|
148
|
+
pipeline[:extract_binary_content]
|
149
|
+
end
|
150
|
+
|
151
|
+
def reduce_whitespace?
|
152
|
+
pipeline[:reduce_whitespace]
|
153
|
+
end
|
154
|
+
|
155
|
+
def run_ml_inference?
|
156
|
+
pipeline[:run_ml_inference]
|
157
|
+
end
|
158
|
+
|
159
|
+
def connector
|
160
|
+
@connector ||= ConnectorSettings.fetch_by_id(connector_id)
|
161
|
+
end
|
162
|
+
|
163
|
+
def update_metadata(ingestion_stats = {}, connector_metadata = {})
|
164
|
+
ingestion_stats ||= {}
|
165
|
+
doc = { :last_seen => Time.now }.merge(ingestion_stats)
|
166
|
+
doc[:metadata] = connector_metadata if connector_metadata&.any?
|
167
|
+
ElasticConnectorActions.update_job_fields(id, doc)
|
168
|
+
end
|
169
|
+
|
170
|
+
def done!(ingestion_stats = {}, connector_metadata = {})
|
171
|
+
terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
|
172
|
+
end
|
173
|
+
|
174
|
+
def error!(message, ingestion_stats = {}, connector_metadata = {})
|
175
|
+
terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
|
176
|
+
end
|
177
|
+
|
178
|
+
def cancel!(ingestion_stats = {}, connector_metadata = {})
|
179
|
+
terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
|
180
|
+
end
|
181
|
+
|
182
|
+
def with_concurrency_control
|
183
|
+
response = ElasticConnectorActions.get_job(id)
|
184
|
+
|
185
|
+
yield response, response['_seq_no'], response['_primary_term']
|
186
|
+
end
|
187
|
+
|
188
|
+
def make_running!
|
189
|
+
with_concurrency_control do |es_doc, seq_no, primary_term|
|
190
|
+
now = Time.now
|
191
|
+
doc = {
|
192
|
+
status: Connectors::SyncStatus::IN_PROGRESS,
|
193
|
+
started_at: now,
|
194
|
+
last_seen: now,
|
195
|
+
worker_hostname: Socket.gethostname
|
196
|
+
}
|
197
|
+
|
198
|
+
ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
def es_source
|
203
|
+
@elasticsearch_response[:_source]
|
204
|
+
end
|
205
|
+
|
206
|
+
private
|
207
|
+
|
208
|
+
def self.fetch_jobs_by_query(query, page_size)
|
209
|
+
results = []
|
210
|
+
offset = 0
|
211
|
+
loop do
|
212
|
+
response = ElasticConnectorActions.search_jobs(query, page_size, offset)
|
213
|
+
|
214
|
+
hits = response.dig('hits', 'hits') || []
|
215
|
+
total = response.dig('hits', 'total', 'value') || 0
|
216
|
+
results += hits.map { |hit| new(hit) }
|
217
|
+
break if results.size >= total
|
218
|
+
offset += hits.size
|
219
|
+
end
|
220
|
+
|
221
|
+
results
|
222
|
+
end
|
223
|
+
|
224
|
+
def initialize(es_response)
|
225
|
+
# TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
|
226
|
+
@elasticsearch_response = es_response.with_indifferent_access
|
227
|
+
end
|
228
|
+
|
229
|
+
def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
|
230
|
+
ingestion_stats ||= {}
|
231
|
+
ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
|
232
|
+
doc = {
|
233
|
+
:last_seen => Time.now,
|
234
|
+
:completed_at => Time.now,
|
235
|
+
:status => status,
|
236
|
+
:error => error
|
237
|
+
}.merge(ingestion_stats)
|
238
|
+
doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
|
239
|
+
doc[:metadata] = connector_metadata if connector_metadata&.any?
|
240
|
+
ElasticConnectorActions.update_job_fields(id, doc)
|
241
|
+
end
|
242
|
+
|
243
|
+
def seq_no
|
244
|
+
@elasticsearch_response[:_seq_no]
|
245
|
+
end
|
246
|
+
|
247
|
+
def primary_term
|
248
|
+
@elasticsearch_response[:_primary_term]
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
@@ -8,6 +8,7 @@
|
|
8
8
|
|
9
9
|
require 'active_support/core_ext/hash/indifferent_access'
|
10
10
|
require 'connectors/connector_status'
|
11
|
+
require 'connectors/sync_status'
|
11
12
|
require 'core/elastic_connector_actions'
|
12
13
|
require 'utility'
|
13
14
|
|
@@ -49,6 +50,11 @@ module Core
|
|
49
50
|
fetch_connectors_by_query(query, page_size)
|
50
51
|
end
|
51
52
|
|
53
|
+
def self.fetch_all_connectors(page_size = DEFAULT_PAGE_SIZE)
|
54
|
+
query = { match_all: {} }
|
55
|
+
fetch_connectors_by_query(query, page_size)
|
56
|
+
end
|
57
|
+
|
52
58
|
def id
|
53
59
|
@elasticsearch_response[:_id]
|
54
60
|
end
|
@@ -58,6 +64,24 @@ module Core
|
|
58
64
|
@elasticsearch_response[:_source][property_name]
|
59
65
|
end
|
60
66
|
|
67
|
+
def features
|
68
|
+
self[:features] || {}
|
69
|
+
end
|
70
|
+
|
71
|
+
# .dig version is the modern features way of doing things,
|
72
|
+
# Right-hand of OR operator is legacy features support
|
73
|
+
# When this is fixed with a migration, we can go ahead
|
74
|
+
def filtering_rule_feature_enabled?
|
75
|
+
!!features.dig(:sync_rules, :basic, :enabled) || !!features[:filtering_rules]
|
76
|
+
end
|
77
|
+
def filtering_advanced_config_feature_enabled?
|
78
|
+
!!features.dig(:sync_rules, :advanced, :enabled) || !!features[:filtering_advanced_config]
|
79
|
+
end
|
80
|
+
|
81
|
+
def any_filtering_feature_enabled?
|
82
|
+
filtering_rule_feature_enabled? || filtering_advanced_config_feature_enabled?
|
83
|
+
end
|
84
|
+
|
61
85
|
def index_name
|
62
86
|
self[:index_name]
|
63
87
|
end
|
@@ -82,6 +106,18 @@ module Core
|
|
82
106
|
self[:scheduling]
|
83
107
|
end
|
84
108
|
|
109
|
+
def custom_scheduling_settings
|
110
|
+
self[:custom_scheduling]
|
111
|
+
end
|
112
|
+
|
113
|
+
def sync_now?
|
114
|
+
self[:sync_now] == true
|
115
|
+
end
|
116
|
+
|
117
|
+
def last_synced
|
118
|
+
self[:last_synced]
|
119
|
+
end
|
120
|
+
|
85
121
|
def filtering
|
86
122
|
# assume for now, that first object in filtering array or a filter object itself is the only filtering object
|
87
123
|
filtering = @elasticsearch_response.dig(:_source, :filtering)
|
@@ -93,18 +129,6 @@ module Core
|
|
93
129
|
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
|
94
130
|
end
|
95
131
|
|
96
|
-
def extract_binary_content?
|
97
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
|
98
|
-
end
|
99
|
-
|
100
|
-
def reduce_whitespace?
|
101
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
|
102
|
-
end
|
103
|
-
|
104
|
-
def run_ml_inference?
|
105
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
|
106
|
-
end
|
107
|
-
|
108
132
|
def formatted
|
109
133
|
properties = ["ID: #{id}"]
|
110
134
|
properties << "Service type: #{service_type}" if service_type
|
@@ -130,19 +154,23 @@ module Core
|
|
130
154
|
end
|
131
155
|
|
132
156
|
def update_last_sync!(job)
|
157
|
+
# if job is nil, connector still needs to be updated, to avoid it stuck at in_progress
|
158
|
+
job_status = job&.status || Connectors::SyncStatus::ERROR
|
159
|
+
job_error = job.nil? ? 'Could\'t find the job' : job.error
|
160
|
+
job_error ||= 'unknown error' if job_status == Connectors::SyncStatus::ERROR
|
161
|
+
connector_status = (job_status == Connectors::SyncStatus::ERROR ? Connectors::ConnectorStatus::ERROR : Connectors::ConnectorStatus::CONNECTED)
|
133
162
|
doc = {
|
134
|
-
:last_sync_status =>
|
163
|
+
:last_sync_status => job_status,
|
135
164
|
:last_synced => Time.now,
|
136
|
-
:last_sync_error =>
|
137
|
-
:
|
165
|
+
:last_sync_error => job_error,
|
166
|
+
:status => connector_status,
|
167
|
+
:error => job_error
|
138
168
|
}
|
139
|
-
|
140
|
-
if job.terminated?
|
169
|
+
if job&.terminated?
|
141
170
|
doc[:last_indexed_document_count] = job[:indexed_document_count]
|
142
171
|
doc[:last_deleted_document_count] = job[:deleted_document_count]
|
143
172
|
end
|
144
|
-
|
145
|
-
Core::ElasticConnectorActions.update_connector_fields(job.connector_id, doc)
|
173
|
+
Core::ElasticConnectorActions.update_connector_fields(id, doc)
|
146
174
|
end
|
147
175
|
|
148
176
|
private
|
@@ -8,6 +8,7 @@
|
|
8
8
|
#
|
9
9
|
require 'active_support/core_ext/hash'
|
10
10
|
require 'connectors/connector_status'
|
11
|
+
require 'connectors/job_trigger_method'
|
11
12
|
require 'connectors/sync_status'
|
12
13
|
require 'utility'
|
13
14
|
require 'elastic-transport'
|
@@ -91,6 +92,17 @@ module Core
|
|
91
92
|
)
|
92
93
|
end
|
93
94
|
|
95
|
+
def delete_jobs_by_query(query)
|
96
|
+
client.delete_by_query(
|
97
|
+
:index => Utility::Constants::JOB_INDEX,
|
98
|
+
:body => { :query => query }
|
99
|
+
)
|
100
|
+
end
|
101
|
+
|
102
|
+
def delete_indices(indices)
|
103
|
+
client.indices.delete(:index => indices, :ignore_unavailable => true)
|
104
|
+
end
|
105
|
+
|
94
106
|
def update_connector_configuration(connector_id, configuration)
|
95
107
|
update_connector_fields(connector_id, :configuration => configuration)
|
96
108
|
end
|
@@ -145,12 +157,37 @@ module Core
|
|
145
157
|
)
|
146
158
|
end
|
147
159
|
|
148
|
-
def
|
160
|
+
def update_connector_sync_start(connector_id)
|
161
|
+
doc = connector_with_concurrency_control(connector_id)
|
162
|
+
|
163
|
+
body = {
|
164
|
+
last_sync_status: Connectors::SyncStatus::IN_PROGRESS,
|
165
|
+
last_sync_error: nil,
|
166
|
+
status: Connectors::ConnectorStatus::CONNECTED
|
167
|
+
}
|
168
|
+
|
169
|
+
update_connector_fields(
|
170
|
+
connector_id,
|
171
|
+
body,
|
172
|
+
doc[:seq_no],
|
173
|
+
doc[:primary_term]
|
174
|
+
)
|
175
|
+
end
|
176
|
+
|
177
|
+
def update_connector_custom_scheduling_last_synced(connector_id, schedule_key)
|
149
178
|
doc = connector_with_concurrency_control(connector_id)
|
150
179
|
|
180
|
+
body = {
|
181
|
+
:custom_scheduling => {
|
182
|
+
schedule_key => {
|
183
|
+
:last_synced => Time.now
|
184
|
+
}
|
185
|
+
}
|
186
|
+
}
|
187
|
+
|
151
188
|
update_connector_fields(
|
152
189
|
connector_id,
|
153
|
-
|
190
|
+
body,
|
154
191
|
doc[:seq_no],
|
155
192
|
doc[:primary_term]
|
156
193
|
)
|
@@ -178,13 +215,15 @@ module Core
|
|
178
215
|
status: Connectors::SyncStatus::PENDING,
|
179
216
|
created_at: Time.now,
|
180
217
|
last_seen: Time.now,
|
218
|
+
trigger_method: connector_settings.sync_now? ? Connectors::JobTriggerMethod::ON_DEMAND : Connectors::JobTriggerMethod::SCHEDULED,
|
181
219
|
connector: {
|
182
220
|
id: connector_settings.id,
|
183
221
|
filtering: convert_connector_filtering_to_job_filtering(connector_settings.filtering),
|
184
222
|
index_name: connector_settings.index_name,
|
185
223
|
language: connector_settings[:language],
|
186
224
|
pipeline: connector_settings[:pipeline],
|
187
|
-
service_type: connector_settings.service_type
|
225
|
+
service_type: connector_settings.service_type,
|
226
|
+
configuration: connector_settings.configuration
|
188
227
|
}
|
189
228
|
}
|
190
229
|
|
@@ -220,37 +259,6 @@ module Core
|
|
220
259
|
update_connector_fields(connector_id, body)
|
221
260
|
end
|
222
261
|
|
223
|
-
def update_sync(job_id, metadata)
|
224
|
-
body = {
|
225
|
-
:doc => { :last_seen => Time.now }.merge(metadata)
|
226
|
-
}
|
227
|
-
client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
|
228
|
-
end
|
229
|
-
|
230
|
-
def complete_sync(connector_id, job_id, metadata, error)
|
231
|
-
sync_status = error ? Connectors::SyncStatus::ERROR : Connectors::SyncStatus::COMPLETED
|
232
|
-
|
233
|
-
metadata ||= {}
|
234
|
-
|
235
|
-
update_connector_fields(connector_id,
|
236
|
-
:last_sync_status => sync_status,
|
237
|
-
:last_sync_error => error,
|
238
|
-
:error => error,
|
239
|
-
:last_synced => Time.now,
|
240
|
-
:last_indexed_document_count => metadata[:indexed_document_count],
|
241
|
-
:last_deleted_document_count => metadata[:deleted_document_count])
|
242
|
-
|
243
|
-
body = {
|
244
|
-
:doc => {
|
245
|
-
:status => sync_status,
|
246
|
-
:completed_at => Time.now,
|
247
|
-
:last_seen => Time.now,
|
248
|
-
:error => error
|
249
|
-
}.merge(metadata)
|
250
|
-
}
|
251
|
-
client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
|
252
|
-
end
|
253
|
-
|
254
262
|
def fetch_document_ids(index_name)
|
255
263
|
page_size = 1000
|
256
264
|
result = []
|
@@ -331,9 +339,11 @@ module Core
|
|
331
339
|
# Creation of connector index should be handled by Kibana, this method is only used by ftest.rb
|
332
340
|
def ensure_connectors_index_exists
|
333
341
|
mappings = {
|
342
|
+
:dynamic => false,
|
334
343
|
:properties => {
|
335
344
|
:api_key_id => { :type => :keyword },
|
336
345
|
:configuration => { :type => :object },
|
346
|
+
:custom_schedule => { :type => :object },
|
337
347
|
:description => { :type => :text },
|
338
348
|
:error => { :type => :keyword },
|
339
349
|
:features => {
|
@@ -451,6 +461,7 @@ module Core
|
|
451
461
|
# Creation of job index should be handled by Kibana, this method is only used by ftest.rb
|
452
462
|
def ensure_job_index_exists
|
453
463
|
mappings = {
|
464
|
+
:dynamic => false,
|
454
465
|
:properties => {
|
455
466
|
:cancelation_requested_at => { :type => :date },
|
456
467
|
:canceled_at => { :type => :date },
|
@@ -528,8 +539,8 @@ module Core
|
|
528
539
|
end
|
529
540
|
|
530
541
|
def document_count(index_name)
|
531
|
-
client.indices.refresh(:index => index_name)
|
532
|
-
client.count(:index => index_name)['count']
|
542
|
+
client.indices.refresh(:index => index_name, :ignore_unavailable => true)
|
543
|
+
client.count(:index => index_name, :ignore_unavailable => true)['count']
|
533
544
|
end
|
534
545
|
|
535
546
|
private
|
data/lib/core/scheduler.rb
CHANGED
@@ -44,17 +44,14 @@ module Core
|
|
44
44
|
end
|
45
45
|
end
|
46
46
|
rescue *Utility::AUTHORIZATION_ERRORS => e
|
47
|
-
|
47
|
+
log_authorization_error(e)
|
48
48
|
rescue StandardError => e
|
49
|
-
|
49
|
+
log_standard_error(e)
|
50
50
|
ensure
|
51
51
|
if @is_shutting_down
|
52
52
|
break
|
53
53
|
end
|
54
|
-
|
55
|
-
Utility::Logger.debug("Sleeping for #{@poll_interval} seconds in #{self.class}.")
|
56
|
-
sleep(@poll_interval)
|
57
|
-
end
|
54
|
+
sleep_for_poll_interval
|
58
55
|
end
|
59
56
|
end
|
60
57
|
|
@@ -78,56 +75,12 @@ module Core
|
|
78
75
|
end
|
79
76
|
|
80
77
|
# Sync when sync_now flag is true for the connector
|
81
|
-
if connector_settings
|
78
|
+
if connector_settings.sync_now?
|
82
79
|
Utility::Logger.info("#{connector_settings.formatted.capitalize} is manually triggered to sync now.")
|
83
80
|
return true
|
84
81
|
end
|
85
82
|
|
86
|
-
|
87
|
-
scheduling_settings = connector_settings.scheduling_settings
|
88
|
-
unless scheduling_settings.present? && scheduling_settings[:enabled] == true
|
89
|
-
Utility::Logger.debug("#{connector_settings.formatted.capitalize} scheduling is disabled.")
|
90
|
-
return false
|
91
|
-
end
|
92
|
-
|
93
|
-
# We want to sync when sync never actually happened
|
94
|
-
last_synced = connector_settings[:last_synced]
|
95
|
-
if last_synced.nil? || last_synced.empty?
|
96
|
-
Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
|
97
|
-
return true
|
98
|
-
end
|
99
|
-
|
100
|
-
current_schedule = scheduling_settings[:interval]
|
101
|
-
|
102
|
-
# Don't sync if there is no actual scheduling interval
|
103
|
-
if current_schedule.nil? || current_schedule.empty?
|
104
|
-
Utility::Logger.warn("No sync schedule configured for #{connector_settings.formatted}.")
|
105
|
-
return false
|
106
|
-
end
|
107
|
-
|
108
|
-
current_schedule = begin
|
109
|
-
Utility::Cron.quartz_to_crontab(current_schedule)
|
110
|
-
rescue StandardError => e
|
111
|
-
Utility::ExceptionTracking.log_exception(e, "Unable to convert quartz (#{current_schedule}) to crontab.")
|
112
|
-
return false
|
113
|
-
end
|
114
|
-
cron_parser = Fugit::Cron.parse(current_schedule)
|
115
|
-
|
116
|
-
# Don't sync if the scheduling interval is non-parsable
|
117
|
-
unless cron_parser
|
118
|
-
Utility::Logger.error("Unable to parse sync schedule for #{connector_settings.formatted}: expression #{current_schedule} is not a valid Quartz Cron definition.")
|
119
|
-
return false
|
120
|
-
end
|
121
|
-
|
122
|
-
next_trigger_time = cron_parser.next_time(Time.parse(last_synced))
|
123
|
-
|
124
|
-
# Sync if next trigger for the connector is in past
|
125
|
-
if next_trigger_time < Time.now
|
126
|
-
Utility::Logger.info("#{connector_settings.formatted.capitalize} sync is triggered by cron schedule #{current_schedule}.")
|
127
|
-
return true
|
128
|
-
end
|
129
|
-
|
130
|
-
false
|
83
|
+
schedule_triggered?(connector_settings.scheduling_settings, connector_settings.formatted)
|
131
84
|
end
|
132
85
|
|
133
86
|
def heartbeat_triggered?(connector_settings)
|
@@ -148,6 +101,12 @@ module Core
|
|
148
101
|
end
|
149
102
|
|
150
103
|
def filtering_validation_triggered?(connector_settings)
|
104
|
+
unless connector_settings.any_filtering_feature_enabled?
|
105
|
+
Utility::Logger.debug("#{connector_settings.formatted} all filtering features are disabled. Skip filtering validation.")
|
106
|
+
|
107
|
+
return false
|
108
|
+
end
|
109
|
+
|
151
110
|
filtering = connector_settings.filtering
|
152
111
|
|
153
112
|
unless filtering.present?
|
@@ -189,5 +148,61 @@ module Core
|
|
189
148
|
false
|
190
149
|
end
|
191
150
|
end
|
151
|
+
|
152
|
+
def schedule_triggered?(scheduling_settings, identifier)
|
153
|
+
# Don't sync if sync is explicitly disabled
|
154
|
+
unless scheduling_settings.present? && scheduling_settings[:enabled] == true
|
155
|
+
Utility::Logger.debug("#{identifier.capitalize} scheduling is disabled.")
|
156
|
+
return false
|
157
|
+
end
|
158
|
+
|
159
|
+
current_schedule = scheduling_settings[:interval]
|
160
|
+
|
161
|
+
# Don't sync if there is no actual scheduling interval
|
162
|
+
if current_schedule.nil? || current_schedule.empty?
|
163
|
+
Utility::Logger.warn("No sync schedule configured for #{identifier}.")
|
164
|
+
return false
|
165
|
+
end
|
166
|
+
|
167
|
+
current_schedule =
|
168
|
+
begin
|
169
|
+
Utility::Cron.quartz_to_crontab(current_schedule)
|
170
|
+
rescue StandardError => e
|
171
|
+
Utility::ExceptionTracking.log_exception(e, "Unable to convert quartz (#{current_schedule}) to crontab.")
|
172
|
+
return false
|
173
|
+
end
|
174
|
+
cron_parser = Fugit::Cron.parse(current_schedule)
|
175
|
+
|
176
|
+
# Don't sync if the scheduling interval is non-parsable
|
177
|
+
unless cron_parser
|
178
|
+
Utility::Logger.error("Unable to parse sync schedule for #{identifier}: expression #{current_schedule} is not a valid Quartz Cron definition.")
|
179
|
+
return false
|
180
|
+
end
|
181
|
+
|
182
|
+
next_trigger_time = cron_parser.next_time(Time.now)
|
183
|
+
|
184
|
+
# Sync if next trigger happens before the next poll
|
185
|
+
if next_trigger_time <= Time.now + @poll_interval
|
186
|
+
Utility::Logger.info("#{identifier.capitalize} sync is triggered by cron schedule #{current_schedule}.")
|
187
|
+
return true
|
188
|
+
end
|
189
|
+
|
190
|
+
false
|
191
|
+
end
|
192
|
+
|
193
|
+
def sleep_for_poll_interval
|
194
|
+
if @poll_interval > 0 && !@is_shutting_down
|
195
|
+
Utility::Logger.debug("Sleeping for #{@poll_interval} seconds in #{self.class}.")
|
196
|
+
sleep(@poll_interval)
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
def log_authorization_error(e)
|
201
|
+
Utility::ExceptionTracking.log_exception(e, 'Could not retrieve connectors settings due to authorization error.')
|
202
|
+
end
|
203
|
+
|
204
|
+
def log_standard_error(e)
|
205
|
+
Utility::ExceptionTracking.log_exception(e, 'Sync failed due to unexpected error.')
|
206
|
+
end
|
192
207
|
end
|
193
208
|
end
|
data/lib/utility/bulk_queue.rb
CHANGED
@@ -13,7 +13,7 @@ module Utility
|
|
13
13
|
class QueueOverflowError < StandardError; end
|
14
14
|
|
15
15
|
# 500 items or 5MB
|
16
|
-
def initialize(operation_count_threshold = Utility::Constants::
|
16
|
+
def initialize(operation_count_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_SIZE, size_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_BYTES)
|
17
17
|
@operation_count_threshold = operation_count_threshold.freeze
|
18
18
|
@size_threshold = size_threshold.freeze
|
19
19
|
|
data/lib/utility/constants.rb
CHANGED
@@ -16,8 +16,6 @@ module Utility
|
|
16
16
|
JOB_INDEX = '.elastic-connectors-sync-jobs'
|
17
17
|
CONTENT_INDEX_PREFIX = 'search-'
|
18
18
|
CRAWLER_SERVICE_TYPE = 'elastic-crawler'
|
19
|
-
FILTERING_RULES_FEATURE = 'filtering_rules'
|
20
|
-
FILTERING_ADVANCED_FEATURE = 'filtering_advanced_config'
|
21
19
|
|
22
20
|
# Maximum number of operations in BULK Elasticsearch operation that will ingest the data
|
23
21
|
DEFAULT_MAX_INGESTION_QUEUE_SIZE = 500
|
@@ -51,7 +51,7 @@ module Utility
|
|
51
51
|
def note_success
|
52
52
|
@consecutive_error_count = 0
|
53
53
|
@success_count += 1
|
54
|
-
|
54
|
+
track_window_error(false)
|
55
55
|
end
|
56
56
|
|
57
57
|
def note_error(error, id: Time.now.to_i)
|
@@ -60,10 +60,9 @@ module Utility
|
|
60
60
|
Utility::Logger.debug("Message id: #{id} - #{error_message}\n#{stack_trace}")
|
61
61
|
@total_error_count += 1
|
62
62
|
@consecutive_error_count += 1
|
63
|
-
@window_errors[@window_index] = true
|
64
63
|
@error_queue << DocumentError.new(error.class.name, error_message, stack_trace, id)
|
65
64
|
@error_queue = @error_queue.drop(1) if @error_queue.size > @error_queue_size
|
66
|
-
|
65
|
+
track_window_error(true)
|
67
66
|
@last_error = error
|
68
67
|
|
69
68
|
raise_if_necessary
|
@@ -92,10 +91,32 @@ module Utility
|
|
92
91
|
end
|
93
92
|
|
94
93
|
def num_errors_in_window
|
95
|
-
@window_errors.count(
|
94
|
+
@window_errors.count(true).to_f
|
96
95
|
end
|
97
96
|
|
98
|
-
def
|
97
|
+
def track_window_error(is_error)
|
98
|
+
# We keep the errors array of the size @window_size this way, imagine @window_size = 5
|
99
|
+
# Error array inits as falses:
|
100
|
+
# [ false, false, false, false, false ]
|
101
|
+
# Third document raises an error:
|
102
|
+
# [ false, false, true, false, false ]
|
103
|
+
# ^^^^
|
104
|
+
# 2 % 5 == 2
|
105
|
+
# Fifth document raises an error:
|
106
|
+
# [ false, false, true, false, true ]
|
107
|
+
# ^^^^
|
108
|
+
# 4 % 5 == 4
|
109
|
+
# Sixth document raises an error:
|
110
|
+
# [ true, false, true, false, true ]
|
111
|
+
# ^^^^
|
112
|
+
# 5 % 5 == 0
|
113
|
+
#
|
114
|
+
# Eigth document is successful:
|
115
|
+
# [ true, false, false, false, true ]
|
116
|
+
# ^^^^^
|
117
|
+
# 7 % 5 == 2
|
118
|
+
# And so on.
|
119
|
+
@window_errors[@window_index] = is_error
|
99
120
|
@window_index = (@window_index + 1) % @window_size
|
100
121
|
end
|
101
122
|
|
data/lib/utility/es_client.rb
CHANGED
@@ -43,6 +43,10 @@ module Utility
|
|
43
43
|
configs[:transport_options] = es_config[:transport_options] if es_config[:transport_options]
|
44
44
|
configs[:ca_fingerprint] = es_config[:ca_fingerprint] if es_config[:ca_fingerprint]
|
45
45
|
|
46
|
+
# headers
|
47
|
+
# these are necessary for cloud-hosted native connectors
|
48
|
+
configs[:headers] = es_config[:headers].to_h if es_config[:headers]
|
49
|
+
|
46
50
|
# if log or trace is activated, we use the application logger
|
47
51
|
configs[:logger] = if configs[:log] || configs[:trace]
|
48
52
|
Utility::Logger.logger
|
data/lib/utility/filtering.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: connectors_utility
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 8.7.0.0
|
4
|
+
version: 8.7.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elastic
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-02-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 5.2
|
19
|
+
version: '5.2'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 5.2
|
26
|
+
version: '5.2'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: ecs-logging
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -104,8 +104,10 @@ files:
|
|
104
104
|
- NOTICE.txt
|
105
105
|
- lib/connectors/connector_status.rb
|
106
106
|
- lib/connectors/crawler/scheduler.rb
|
107
|
+
- lib/connectors/job_trigger_method.rb
|
107
108
|
- lib/connectors/sync_status.rb
|
108
109
|
- lib/connectors_utility.rb
|
110
|
+
- lib/core/connector_job.rb
|
109
111
|
- lib/core/connector_settings.rb
|
110
112
|
- lib/core/elastic_connector_actions.rb
|
111
113
|
- lib/core/filtering/validation_status.rb
|
@@ -130,9 +132,9 @@ homepage: https://github.com/elastic/connectors-ruby
|
|
130
132
|
licenses:
|
131
133
|
- Elastic-2.0
|
132
134
|
metadata:
|
133
|
-
revision:
|
134
|
-
repository: https://github.com/elastic/connectors-ruby
|
135
|
-
post_install_message:
|
135
|
+
revision: ae6292137eef9acac1259c5e7e71a3d0e149210b
|
136
|
+
repository: https://github.com/elastic/connectors-ruby
|
137
|
+
post_install_message:
|
136
138
|
rdoc_options: []
|
137
139
|
require_paths:
|
138
140
|
- lib
|
@@ -143,12 +145,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
143
145
|
version: '0'
|
144
146
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
145
147
|
requirements:
|
146
|
-
- - "
|
148
|
+
- - ">="
|
147
149
|
- !ruby/object:Gem::Version
|
148
|
-
version:
|
150
|
+
version: '0'
|
149
151
|
requirements: []
|
150
152
|
rubygems_version: 3.0.3.1
|
151
|
-
signing_key:
|
153
|
+
signing_key:
|
152
154
|
specification_version: 4
|
153
155
|
summary: Gem containing shared Connector Services libraries
|
154
156
|
test_files: []
|