connectors_utility 8.7.0.0.pre.20221117T004939Z → 8.10.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/connectors/crawler/scheduler.rb +36 -0
- data/lib/connectors/job_trigger_method.rb +14 -0
- data/lib/connectors_utility.rb +6 -3
- data/lib/core/connector_job.rb +251 -0
- data/lib/core/connector_settings.rb +52 -20
- data/lib/core/elastic_connector_actions.rb +54 -38
- data/lib/core/scheduler.rb +67 -52
- data/lib/utility/bulk_queue.rb +1 -1
- data/lib/utility/constants.rb +0 -2
- data/lib/utility/error_monitor.rb +26 -5
- data/lib/utility/es_client.rb +4 -0
- data/lib/utility/filtering.rb +4 -0
- metadata +15 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cecf3c0cb5d8ae36980ea24589f4a7f696b44d50d7c0ee6a78fbcb178abfafb0
|
4
|
+
data.tar.gz: 205f276fdaa771be59c18aba83eb453774597fa89faa4df06c1e52f554b335ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 49e16535ac0af49175f653b674beff17f587a3af76ba95e857f45689999f21b89d221195423ecfd1e1bc50d9591cd3fb8a78d7f05c26da26578a33da7ad910da
|
7
|
+
data.tar.gz: c4e643d267a2fbfed3cae91eba61ee12c2edf08417eeebcb780ebf0eab02fa033ce62cd598058f0c203cdcec4ca2444caa17dac8279d93c9d00cb36cc2bab256
|
@@ -22,11 +22,47 @@ module Connectors
|
|
22
22
|
[]
|
23
23
|
end
|
24
24
|
|
25
|
+
def when_triggered
|
26
|
+
loop do
|
27
|
+
connector_settings.each do |cs|
|
28
|
+
# crawler only supports :sync
|
29
|
+
if sync_triggered?(cs)
|
30
|
+
yield cs, :sync, nil
|
31
|
+
next
|
32
|
+
end
|
33
|
+
|
34
|
+
schedule_key = custom_schedule_triggered(cs)
|
35
|
+
yield cs, :sync, schedule_key if schedule_key
|
36
|
+
end
|
37
|
+
rescue *Utility::AUTHORIZATION_ERRORS => e
|
38
|
+
log_authorization_error(e)
|
39
|
+
rescue StandardError => e
|
40
|
+
log_standard_error(e)
|
41
|
+
ensure
|
42
|
+
if @is_shutting_down
|
43
|
+
break
|
44
|
+
end
|
45
|
+
sleep_for_poll_interval
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
25
49
|
private
|
26
50
|
|
27
51
|
def connector_registered?(service_type)
|
28
52
|
service_type == 'elastic-crawler'
|
29
53
|
end
|
54
|
+
|
55
|
+
# custom scheduling has no ordering, so the first-found schedule is returned
|
56
|
+
def custom_schedule_triggered(cs)
|
57
|
+
cs.custom_scheduling_settings.each do |key, custom_scheduling|
|
58
|
+
identifier = "#{cs.formatted} - #{custom_scheduling[:name]}"
|
59
|
+
if schedule_triggered?(custom_scheduling, identifier)
|
60
|
+
return key
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
nil
|
65
|
+
end
|
30
66
|
end
|
31
67
|
end
|
32
68
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
module Connectors
|
10
|
+
class JobTriggerMethod
|
11
|
+
ON_DEMAND = 'on_demand'
|
12
|
+
SCHEDULED = 'scheduled'
|
13
|
+
end
|
14
|
+
end
|
data/lib/connectors_utility.rb
CHANGED
@@ -9,8 +9,11 @@
|
|
9
9
|
require_relative 'utility'
|
10
10
|
|
11
11
|
require_relative 'connectors/connector_status'
|
12
|
+
require_relative 'connectors/crawler/scheduler'
|
13
|
+
require_relative 'connectors/job_trigger_method'
|
12
14
|
require_relative 'connectors/sync_status'
|
13
|
-
require_relative 'core/
|
15
|
+
require_relative 'core/connector_job'
|
16
|
+
require_relative 'core/connector_settings'
|
14
17
|
require_relative 'core/elastic_connector_actions'
|
15
|
-
|
16
|
-
require_relative '
|
18
|
+
require_relative 'core/filtering/validation_status'
|
19
|
+
require_relative 'core/scheduler'
|
@@ -0,0 +1,251 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
10
|
+
require 'connectors/sync_status'
|
11
|
+
require 'core/connector_settings'
|
12
|
+
require 'core/elastic_connector_actions'
|
13
|
+
require 'utility'
|
14
|
+
|
15
|
+
module Core
|
16
|
+
class ConnectorJob
|
17
|
+
DEFAULT_PAGE_SIZE = 100
|
18
|
+
IDLE_THRESHOLD = 60
|
19
|
+
|
20
|
+
def self.fetch_by_id(job_id)
|
21
|
+
es_response = ElasticConnectorActions.get_job(job_id)
|
22
|
+
return nil unless es_response[:found]
|
23
|
+
|
24
|
+
new(es_response)
|
25
|
+
end
|
26
|
+
|
27
|
+
def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
|
28
|
+
status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
|
29
|
+
|
30
|
+
query = { bool: { must: [{ terms: status_term }] } }
|
31
|
+
|
32
|
+
return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
|
33
|
+
|
34
|
+
query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
|
35
|
+
|
36
|
+
fetch_jobs_by_query(query, page_size)
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.orphaned_jobs(connector_ids = [], page_size = DEFAULT_PAGE_SIZE)
|
40
|
+
query = { bool: { must_not: { terms: { 'connector.id': connector_ids } } } }
|
41
|
+
fetch_jobs_by_query(query, page_size)
|
42
|
+
end
|
43
|
+
|
44
|
+
def self.delete_jobs(jobs)
|
45
|
+
query = { terms: { '_id': jobs.map(&:id) } }
|
46
|
+
ElasticConnectorActions.delete_jobs_by_query(query)
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.idle_jobs(connector_id = nil, page_size = DEFAULT_PAGE_SIZE)
|
50
|
+
connector_ids = if connector_id
|
51
|
+
[connector_id]
|
52
|
+
else
|
53
|
+
ConnectorSettings.fetch_native_connectors.map(&:id)
|
54
|
+
end
|
55
|
+
query = {
|
56
|
+
bool: {
|
57
|
+
filter: [
|
58
|
+
{ terms: { 'connector.id': connector_ids } },
|
59
|
+
{ terms: { status: Connectors::SyncStatus::ACTIVE_STATUSES } },
|
60
|
+
{ range: { last_seen: { lte: "now-#{IDLE_THRESHOLD}s" } } }
|
61
|
+
]
|
62
|
+
}
|
63
|
+
}
|
64
|
+
fetch_jobs_by_query(query, page_size)
|
65
|
+
end
|
66
|
+
|
67
|
+
def self.enqueue(_connector_id)
|
68
|
+
nil
|
69
|
+
end
|
70
|
+
|
71
|
+
def id
|
72
|
+
@elasticsearch_response[:_id]
|
73
|
+
end
|
74
|
+
|
75
|
+
def [](property_name)
|
76
|
+
@elasticsearch_response[:_source][property_name]
|
77
|
+
end
|
78
|
+
|
79
|
+
def error
|
80
|
+
self[:error]
|
81
|
+
end
|
82
|
+
|
83
|
+
def status
|
84
|
+
self[:status]
|
85
|
+
end
|
86
|
+
|
87
|
+
def in_progress?
|
88
|
+
status == Connectors::SyncStatus::IN_PROGRESS
|
89
|
+
end
|
90
|
+
|
91
|
+
def canceling?
|
92
|
+
status == Connectors::SyncStatus::CANCELING
|
93
|
+
end
|
94
|
+
|
95
|
+
def suspended?
|
96
|
+
status == Connectors::SyncStatus::SUSPENDED
|
97
|
+
end
|
98
|
+
|
99
|
+
def canceled?
|
100
|
+
status == Connectors::SyncStatus::CANCELED
|
101
|
+
end
|
102
|
+
|
103
|
+
def pending?
|
104
|
+
Connectors::SyncStatus::PENDING_STATUSES.include?(status)
|
105
|
+
end
|
106
|
+
|
107
|
+
def active?
|
108
|
+
Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
|
109
|
+
end
|
110
|
+
|
111
|
+
def terminated?
|
112
|
+
Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
|
113
|
+
end
|
114
|
+
|
115
|
+
def connector_snapshot
|
116
|
+
self[:connector] || {}
|
117
|
+
end
|
118
|
+
|
119
|
+
def connector_id
|
120
|
+
connector_snapshot[:id]
|
121
|
+
end
|
122
|
+
|
123
|
+
def index_name
|
124
|
+
connector_snapshot[:index_name]
|
125
|
+
end
|
126
|
+
|
127
|
+
def language
|
128
|
+
connector_snapshot[:language]
|
129
|
+
end
|
130
|
+
|
131
|
+
def service_type
|
132
|
+
connector_snapshot[:service_type]
|
133
|
+
end
|
134
|
+
|
135
|
+
def configuration
|
136
|
+
connector_snapshot[:configuration]
|
137
|
+
end
|
138
|
+
|
139
|
+
def filtering
|
140
|
+
connector_snapshot[:filtering]
|
141
|
+
end
|
142
|
+
|
143
|
+
def pipeline
|
144
|
+
connector_snapshot[:pipeline] || {}
|
145
|
+
end
|
146
|
+
|
147
|
+
def extract_binary_content?
|
148
|
+
pipeline[:extract_binary_content]
|
149
|
+
end
|
150
|
+
|
151
|
+
def reduce_whitespace?
|
152
|
+
pipeline[:reduce_whitespace]
|
153
|
+
end
|
154
|
+
|
155
|
+
def run_ml_inference?
|
156
|
+
pipeline[:run_ml_inference]
|
157
|
+
end
|
158
|
+
|
159
|
+
def connector
|
160
|
+
@connector ||= ConnectorSettings.fetch_by_id(connector_id)
|
161
|
+
end
|
162
|
+
|
163
|
+
def update_metadata(ingestion_stats = {}, connector_metadata = {})
|
164
|
+
ingestion_stats ||= {}
|
165
|
+
doc = { :last_seen => Time.now }.merge(ingestion_stats)
|
166
|
+
doc[:metadata] = connector_metadata if connector_metadata&.any?
|
167
|
+
ElasticConnectorActions.update_job_fields(id, doc)
|
168
|
+
end
|
169
|
+
|
170
|
+
def done!(ingestion_stats = {}, connector_metadata = {})
|
171
|
+
terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
|
172
|
+
end
|
173
|
+
|
174
|
+
def error!(message, ingestion_stats = {}, connector_metadata = {})
|
175
|
+
terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
|
176
|
+
end
|
177
|
+
|
178
|
+
def cancel!(ingestion_stats = {}, connector_metadata = {})
|
179
|
+
terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
|
180
|
+
end
|
181
|
+
|
182
|
+
def with_concurrency_control
|
183
|
+
response = ElasticConnectorActions.get_job(id)
|
184
|
+
|
185
|
+
yield response, response['_seq_no'], response['_primary_term']
|
186
|
+
end
|
187
|
+
|
188
|
+
def make_running!
|
189
|
+
with_concurrency_control do |es_doc, seq_no, primary_term|
|
190
|
+
now = Time.now
|
191
|
+
doc = {
|
192
|
+
status: Connectors::SyncStatus::IN_PROGRESS,
|
193
|
+
started_at: now,
|
194
|
+
last_seen: now,
|
195
|
+
worker_hostname: Socket.gethostname
|
196
|
+
}
|
197
|
+
|
198
|
+
ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
def es_source
|
203
|
+
@elasticsearch_response[:_source]
|
204
|
+
end
|
205
|
+
|
206
|
+
private
|
207
|
+
|
208
|
+
def self.fetch_jobs_by_query(query, page_size)
|
209
|
+
results = []
|
210
|
+
offset = 0
|
211
|
+
loop do
|
212
|
+
response = ElasticConnectorActions.search_jobs(query, page_size, offset)
|
213
|
+
|
214
|
+
hits = response.dig('hits', 'hits') || []
|
215
|
+
total = response.dig('hits', 'total', 'value') || 0
|
216
|
+
results += hits.map { |hit| new(hit) }
|
217
|
+
break if results.size >= total
|
218
|
+
offset += hits.size
|
219
|
+
end
|
220
|
+
|
221
|
+
results
|
222
|
+
end
|
223
|
+
|
224
|
+
def initialize(es_response)
|
225
|
+
# TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
|
226
|
+
@elasticsearch_response = es_response.with_indifferent_access
|
227
|
+
end
|
228
|
+
|
229
|
+
def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
|
230
|
+
ingestion_stats ||= {}
|
231
|
+
ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
|
232
|
+
doc = {
|
233
|
+
:last_seen => Time.now,
|
234
|
+
:completed_at => Time.now,
|
235
|
+
:status => status,
|
236
|
+
:error => error
|
237
|
+
}.merge(ingestion_stats)
|
238
|
+
doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
|
239
|
+
doc[:metadata] = connector_metadata if connector_metadata&.any?
|
240
|
+
ElasticConnectorActions.update_job_fields(id, doc)
|
241
|
+
end
|
242
|
+
|
243
|
+
def seq_no
|
244
|
+
@elasticsearch_response[:_seq_no]
|
245
|
+
end
|
246
|
+
|
247
|
+
def primary_term
|
248
|
+
@elasticsearch_response[:_primary_term]
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
@@ -8,6 +8,7 @@
|
|
8
8
|
|
9
9
|
require 'active_support/core_ext/hash/indifferent_access'
|
10
10
|
require 'connectors/connector_status'
|
11
|
+
require 'connectors/sync_status'
|
11
12
|
require 'core/elastic_connector_actions'
|
12
13
|
require 'utility'
|
13
14
|
|
@@ -49,6 +50,11 @@ module Core
|
|
49
50
|
fetch_connectors_by_query(query, page_size)
|
50
51
|
end
|
51
52
|
|
53
|
+
def self.fetch_all_connectors(page_size = DEFAULT_PAGE_SIZE)
|
54
|
+
query = { match_all: {} }
|
55
|
+
fetch_connectors_by_query(query, page_size)
|
56
|
+
end
|
57
|
+
|
52
58
|
def id
|
53
59
|
@elasticsearch_response[:_id]
|
54
60
|
end
|
@@ -58,6 +64,24 @@ module Core
|
|
58
64
|
@elasticsearch_response[:_source][property_name]
|
59
65
|
end
|
60
66
|
|
67
|
+
def features
|
68
|
+
self[:features] || {}
|
69
|
+
end
|
70
|
+
|
71
|
+
# .dig version is the modern features way of doing things,
|
72
|
+
# Right-hand of OR operator is legacy features support
|
73
|
+
# When this is fixed with a migration, we can go ahead
|
74
|
+
def filtering_rule_feature_enabled?
|
75
|
+
!!features.dig(:sync_rules, :basic, :enabled) || !!features[:filtering_rules]
|
76
|
+
end
|
77
|
+
def filtering_advanced_config_feature_enabled?
|
78
|
+
!!features.dig(:sync_rules, :advanced, :enabled) || !!features[:filtering_advanced_config]
|
79
|
+
end
|
80
|
+
|
81
|
+
def any_filtering_feature_enabled?
|
82
|
+
filtering_rule_feature_enabled? || filtering_advanced_config_feature_enabled?
|
83
|
+
end
|
84
|
+
|
61
85
|
def index_name
|
62
86
|
self[:index_name]
|
63
87
|
end
|
@@ -79,30 +103,34 @@ module Core
|
|
79
103
|
end
|
80
104
|
|
81
105
|
def scheduling_settings
|
82
|
-
self[:scheduling]
|
106
|
+
self[:scheduling] || {}
|
83
107
|
end
|
84
108
|
|
85
|
-
def
|
86
|
-
|
87
|
-
|
109
|
+
def full_sync_scheduling
|
110
|
+
scheduling_settings[:full]
|
111
|
+
end
|
88
112
|
|
89
|
-
|
113
|
+
def custom_scheduling_settings
|
114
|
+
self[:custom_scheduling]
|
90
115
|
end
|
91
116
|
|
92
|
-
def
|
93
|
-
|
117
|
+
def sync_now?
|
118
|
+
self[:sync_now] == true
|
94
119
|
end
|
95
120
|
|
96
|
-
def
|
97
|
-
|
121
|
+
def last_synced
|
122
|
+
self[:last_synced]
|
98
123
|
end
|
99
124
|
|
100
|
-
def
|
101
|
-
|
125
|
+
def filtering
|
126
|
+
# assume for now, that first object in filtering array or a filter object itself is the only filtering object
|
127
|
+
filtering = @elasticsearch_response.dig(:_source, :filtering)
|
128
|
+
|
129
|
+
Utility::Filtering.extract_filter(filtering)
|
102
130
|
end
|
103
131
|
|
104
|
-
def
|
105
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :
|
132
|
+
def request_pipeline
|
133
|
+
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
|
106
134
|
end
|
107
135
|
|
108
136
|
def formatted
|
@@ -130,19 +158,23 @@ module Core
|
|
130
158
|
end
|
131
159
|
|
132
160
|
def update_last_sync!(job)
|
161
|
+
# if job is nil, connector still needs to be updated, to avoid it stuck at in_progress
|
162
|
+
job_status = job&.status || Connectors::SyncStatus::ERROR
|
163
|
+
job_error = job.nil? ? 'Could\'t find the job' : job.error
|
164
|
+
job_error ||= 'unknown error' if job_status == Connectors::SyncStatus::ERROR
|
165
|
+
connector_status = (job_status == Connectors::SyncStatus::ERROR ? Connectors::ConnectorStatus::ERROR : Connectors::ConnectorStatus::CONNECTED)
|
133
166
|
doc = {
|
134
|
-
:last_sync_status =>
|
167
|
+
:last_sync_status => job_status,
|
135
168
|
:last_synced => Time.now,
|
136
|
-
:last_sync_error =>
|
137
|
-
:
|
169
|
+
:last_sync_error => job_error,
|
170
|
+
:status => connector_status,
|
171
|
+
:error => job_error
|
138
172
|
}
|
139
|
-
|
140
|
-
if job.terminated?
|
173
|
+
if job&.terminated?
|
141
174
|
doc[:last_indexed_document_count] = job[:indexed_document_count]
|
142
175
|
doc[:last_deleted_document_count] = job[:deleted_document_count]
|
143
176
|
end
|
144
|
-
|
145
|
-
Core::ElasticConnectorActions.update_connector_fields(job.connector_id, doc)
|
177
|
+
Core::ElasticConnectorActions.update_connector_fields(id, doc)
|
146
178
|
end
|
147
179
|
|
148
180
|
private
|
@@ -8,6 +8,7 @@
|
|
8
8
|
#
|
9
9
|
require 'active_support/core_ext/hash'
|
10
10
|
require 'connectors/connector_status'
|
11
|
+
require 'connectors/job_trigger_method'
|
11
12
|
require 'connectors/sync_status'
|
12
13
|
require 'utility'
|
13
14
|
require 'elastic-transport'
|
@@ -60,9 +61,14 @@ module Core
|
|
60
61
|
|
61
62
|
def connectors_meta
|
62
63
|
# TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
|
63
|
-
alias_mappings = client.indices.get_mapping(:index => Utility::Constants::CONNECTORS_INDEX).with_indifferent_access
|
64
|
+
alias_mappings = client.indices.get_mapping(:index => Utility::Constants::CONNECTORS_INDEX, :ignore => 404).with_indifferent_access
|
64
65
|
index = get_latest_index_in_alias(Utility::Constants::CONNECTORS_INDEX, alias_mappings.keys)
|
65
|
-
alias_mappings.dig(index, 'mappings', '_meta') || {
|
66
|
+
alias_mappings.dig(index, 'mappings', '_meta') || {
|
67
|
+
:extract_binary_content => true,
|
68
|
+
:name => 'ent-search-generic-ingestion',
|
69
|
+
:reduce_whitespace => true,
|
70
|
+
:run_ml_inference => false,
|
71
|
+
}
|
66
72
|
end
|
67
73
|
|
68
74
|
def search_connectors(query, page_size, offset)
|
@@ -91,6 +97,17 @@ module Core
|
|
91
97
|
)
|
92
98
|
end
|
93
99
|
|
100
|
+
def delete_jobs_by_query(query)
|
101
|
+
client.delete_by_query(
|
102
|
+
:index => Utility::Constants::JOB_INDEX,
|
103
|
+
:body => { :query => query }
|
104
|
+
)
|
105
|
+
end
|
106
|
+
|
107
|
+
def delete_indices(indices)
|
108
|
+
client.indices.delete(:index => indices, :ignore_unavailable => true)
|
109
|
+
end
|
110
|
+
|
94
111
|
def update_connector_configuration(connector_id, configuration)
|
95
112
|
update_connector_fields(connector_id, :configuration => configuration)
|
96
113
|
end
|
@@ -145,12 +162,37 @@ module Core
|
|
145
162
|
)
|
146
163
|
end
|
147
164
|
|
148
|
-
def
|
165
|
+
def update_connector_sync_start(connector_id)
|
149
166
|
doc = connector_with_concurrency_control(connector_id)
|
150
167
|
|
168
|
+
body = {
|
169
|
+
last_sync_status: Connectors::SyncStatus::IN_PROGRESS,
|
170
|
+
last_sync_error: nil,
|
171
|
+
status: Connectors::ConnectorStatus::CONNECTED
|
172
|
+
}
|
173
|
+
|
151
174
|
update_connector_fields(
|
152
175
|
connector_id,
|
153
|
-
|
176
|
+
body,
|
177
|
+
doc[:seq_no],
|
178
|
+
doc[:primary_term]
|
179
|
+
)
|
180
|
+
end
|
181
|
+
|
182
|
+
def update_connector_custom_scheduling_last_synced(connector_id, schedule_key)
|
183
|
+
doc = connector_with_concurrency_control(connector_id)
|
184
|
+
|
185
|
+
body = {
|
186
|
+
:custom_scheduling => {
|
187
|
+
schedule_key => {
|
188
|
+
:last_synced => Time.now
|
189
|
+
}
|
190
|
+
}
|
191
|
+
}
|
192
|
+
|
193
|
+
update_connector_fields(
|
194
|
+
connector_id,
|
195
|
+
body,
|
154
196
|
doc[:seq_no],
|
155
197
|
doc[:primary_term]
|
156
198
|
)
|
@@ -178,13 +220,15 @@ module Core
|
|
178
220
|
status: Connectors::SyncStatus::PENDING,
|
179
221
|
created_at: Time.now,
|
180
222
|
last_seen: Time.now,
|
223
|
+
trigger_method: connector_settings.sync_now? ? Connectors::JobTriggerMethod::ON_DEMAND : Connectors::JobTriggerMethod::SCHEDULED,
|
181
224
|
connector: {
|
182
225
|
id: connector_settings.id,
|
183
226
|
filtering: convert_connector_filtering_to_job_filtering(connector_settings.filtering),
|
184
227
|
index_name: connector_settings.index_name,
|
185
228
|
language: connector_settings[:language],
|
186
229
|
pipeline: connector_settings[:pipeline],
|
187
|
-
service_type: connector_settings.service_type
|
230
|
+
service_type: connector_settings.service_type,
|
231
|
+
configuration: connector_settings.configuration
|
188
232
|
}
|
189
233
|
}
|
190
234
|
|
@@ -220,37 +264,6 @@ module Core
|
|
220
264
|
update_connector_fields(connector_id, body)
|
221
265
|
end
|
222
266
|
|
223
|
-
def update_sync(job_id, metadata)
|
224
|
-
body = {
|
225
|
-
:doc => { :last_seen => Time.now }.merge(metadata)
|
226
|
-
}
|
227
|
-
client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
|
228
|
-
end
|
229
|
-
|
230
|
-
def complete_sync(connector_id, job_id, metadata, error)
|
231
|
-
sync_status = error ? Connectors::SyncStatus::ERROR : Connectors::SyncStatus::COMPLETED
|
232
|
-
|
233
|
-
metadata ||= {}
|
234
|
-
|
235
|
-
update_connector_fields(connector_id,
|
236
|
-
:last_sync_status => sync_status,
|
237
|
-
:last_sync_error => error,
|
238
|
-
:error => error,
|
239
|
-
:last_synced => Time.now,
|
240
|
-
:last_indexed_document_count => metadata[:indexed_document_count],
|
241
|
-
:last_deleted_document_count => metadata[:deleted_document_count])
|
242
|
-
|
243
|
-
body = {
|
244
|
-
:doc => {
|
245
|
-
:status => sync_status,
|
246
|
-
:completed_at => Time.now,
|
247
|
-
:last_seen => Time.now,
|
248
|
-
:error => error
|
249
|
-
}.merge(metadata)
|
250
|
-
}
|
251
|
-
client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
|
252
|
-
end
|
253
|
-
|
254
267
|
def fetch_document_ids(index_name)
|
255
268
|
page_size = 1000
|
256
269
|
result = []
|
@@ -331,9 +344,11 @@ module Core
|
|
331
344
|
# Creation of connector index should be handled by Kibana, this method is only used by ftest.rb
|
332
345
|
def ensure_connectors_index_exists
|
333
346
|
mappings = {
|
347
|
+
:dynamic => false,
|
334
348
|
:properties => {
|
335
349
|
:api_key_id => { :type => :keyword },
|
336
350
|
:configuration => { :type => :object },
|
351
|
+
:custom_schedule => { :type => :object },
|
337
352
|
:description => { :type => :text },
|
338
353
|
:error => { :type => :keyword },
|
339
354
|
:features => {
|
@@ -451,6 +466,7 @@ module Core
|
|
451
466
|
# Creation of job index should be handled by Kibana, this method is only used by ftest.rb
|
452
467
|
def ensure_job_index_exists
|
453
468
|
mappings = {
|
469
|
+
:dynamic => false,
|
454
470
|
:properties => {
|
455
471
|
:cancelation_requested_at => { :type => :date },
|
456
472
|
:canceled_at => { :type => :date },
|
@@ -528,8 +544,8 @@ module Core
|
|
528
544
|
end
|
529
545
|
|
530
546
|
def document_count(index_name)
|
531
|
-
client.indices.refresh(:index => index_name)
|
532
|
-
client.count(:index => index_name)['count']
|
547
|
+
client.indices.refresh(:index => index_name, :ignore_unavailable => true)
|
548
|
+
client.count(:index => index_name, :ignore_unavailable => true)['count']
|
533
549
|
end
|
534
550
|
|
535
551
|
private
|
data/lib/core/scheduler.rb
CHANGED
@@ -44,17 +44,14 @@ module Core
|
|
44
44
|
end
|
45
45
|
end
|
46
46
|
rescue *Utility::AUTHORIZATION_ERRORS => e
|
47
|
-
|
47
|
+
log_authorization_error(e)
|
48
48
|
rescue StandardError => e
|
49
|
-
|
49
|
+
log_standard_error(e)
|
50
50
|
ensure
|
51
51
|
if @is_shutting_down
|
52
52
|
break
|
53
53
|
end
|
54
|
-
|
55
|
-
Utility::Logger.debug("Sleeping for #{@poll_interval} seconds in #{self.class}.")
|
56
|
-
sleep(@poll_interval)
|
57
|
-
end
|
54
|
+
sleep_for_poll_interval
|
58
55
|
end
|
59
56
|
end
|
60
57
|
|
@@ -78,56 +75,12 @@ module Core
|
|
78
75
|
end
|
79
76
|
|
80
77
|
# Sync when sync_now flag is true for the connector
|
81
|
-
if connector_settings
|
78
|
+
if connector_settings.sync_now?
|
82
79
|
Utility::Logger.info("#{connector_settings.formatted.capitalize} is manually triggered to sync now.")
|
83
80
|
return true
|
84
81
|
end
|
85
82
|
|
86
|
-
|
87
|
-
scheduling_settings = connector_settings.scheduling_settings
|
88
|
-
unless scheduling_settings.present? && scheduling_settings[:enabled] == true
|
89
|
-
Utility::Logger.debug("#{connector_settings.formatted.capitalize} scheduling is disabled.")
|
90
|
-
return false
|
91
|
-
end
|
92
|
-
|
93
|
-
# We want to sync when sync never actually happened
|
94
|
-
last_synced = connector_settings[:last_synced]
|
95
|
-
if last_synced.nil? || last_synced.empty?
|
96
|
-
Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
|
97
|
-
return true
|
98
|
-
end
|
99
|
-
|
100
|
-
current_schedule = scheduling_settings[:interval]
|
101
|
-
|
102
|
-
# Don't sync if there is no actual scheduling interval
|
103
|
-
if current_schedule.nil? || current_schedule.empty?
|
104
|
-
Utility::Logger.warn("No sync schedule configured for #{connector_settings.formatted}.")
|
105
|
-
return false
|
106
|
-
end
|
107
|
-
|
108
|
-
current_schedule = begin
|
109
|
-
Utility::Cron.quartz_to_crontab(current_schedule)
|
110
|
-
rescue StandardError => e
|
111
|
-
Utility::ExceptionTracking.log_exception(e, "Unable to convert quartz (#{current_schedule}) to crontab.")
|
112
|
-
return false
|
113
|
-
end
|
114
|
-
cron_parser = Fugit::Cron.parse(current_schedule)
|
115
|
-
|
116
|
-
# Don't sync if the scheduling interval is non-parsable
|
117
|
-
unless cron_parser
|
118
|
-
Utility::Logger.error("Unable to parse sync schedule for #{connector_settings.formatted}: expression #{current_schedule} is not a valid Quartz Cron definition.")
|
119
|
-
return false
|
120
|
-
end
|
121
|
-
|
122
|
-
next_trigger_time = cron_parser.next_time(Time.parse(last_synced))
|
123
|
-
|
124
|
-
# Sync if next trigger for the connector is in past
|
125
|
-
if next_trigger_time < Time.now
|
126
|
-
Utility::Logger.info("#{connector_settings.formatted.capitalize} sync is triggered by cron schedule #{current_schedule}.")
|
127
|
-
return true
|
128
|
-
end
|
129
|
-
|
130
|
-
false
|
83
|
+
schedule_triggered?(connector_settings.full_sync_scheduling, connector_settings.formatted)
|
131
84
|
end
|
132
85
|
|
133
86
|
def heartbeat_triggered?(connector_settings)
|
@@ -148,6 +101,12 @@ module Core
|
|
148
101
|
end
|
149
102
|
|
150
103
|
def filtering_validation_triggered?(connector_settings)
|
104
|
+
unless connector_settings.any_filtering_feature_enabled?
|
105
|
+
Utility::Logger.debug("#{connector_settings.formatted} all filtering features are disabled. Skip filtering validation.")
|
106
|
+
|
107
|
+
return false
|
108
|
+
end
|
109
|
+
|
151
110
|
filtering = connector_settings.filtering
|
152
111
|
|
153
112
|
unless filtering.present?
|
@@ -189,5 +148,61 @@ module Core
|
|
189
148
|
false
|
190
149
|
end
|
191
150
|
end
|
151
|
+
|
152
|
+
def schedule_triggered?(scheduling_settings, identifier)
|
153
|
+
# Don't sync if sync is explicitly disabled
|
154
|
+
unless scheduling_settings.present? && scheduling_settings[:enabled] == true
|
155
|
+
Utility::Logger.debug("#{identifier.capitalize} scheduling is disabled.")
|
156
|
+
return false
|
157
|
+
end
|
158
|
+
|
159
|
+
current_schedule = scheduling_settings[:interval]
|
160
|
+
|
161
|
+
# Don't sync if there is no actual scheduling interval
|
162
|
+
if current_schedule.nil? || current_schedule.empty?
|
163
|
+
Utility::Logger.warn("No sync schedule configured for #{identifier}.")
|
164
|
+
return false
|
165
|
+
end
|
166
|
+
|
167
|
+
current_schedule =
|
168
|
+
begin
|
169
|
+
Utility::Cron.quartz_to_crontab(current_schedule)
|
170
|
+
rescue StandardError => e
|
171
|
+
Utility::ExceptionTracking.log_exception(e, "Unable to convert quartz (#{current_schedule}) to crontab.")
|
172
|
+
return false
|
173
|
+
end
|
174
|
+
cron_parser = Fugit::Cron.parse(current_schedule)
|
175
|
+
|
176
|
+
# Don't sync if the scheduling interval is non-parsable
|
177
|
+
unless cron_parser
|
178
|
+
Utility::Logger.error("Unable to parse sync schedule for #{identifier}: expression #{current_schedule} is not a valid Quartz Cron definition.")
|
179
|
+
return false
|
180
|
+
end
|
181
|
+
|
182
|
+
next_trigger_time = cron_parser.next_time(Time.now)
|
183
|
+
|
184
|
+
# Sync if next trigger happens before the next poll
|
185
|
+
if next_trigger_time <= Time.now + @poll_interval
|
186
|
+
Utility::Logger.info("#{identifier.capitalize} sync is triggered by cron schedule #{current_schedule}.")
|
187
|
+
return true
|
188
|
+
end
|
189
|
+
|
190
|
+
false
|
191
|
+
end
|
192
|
+
|
193
|
+
def sleep_for_poll_interval
|
194
|
+
if @poll_interval > 0 && !@is_shutting_down
|
195
|
+
Utility::Logger.debug("Sleeping for #{@poll_interval} seconds in #{self.class}.")
|
196
|
+
sleep(@poll_interval)
|
197
|
+
end
|
198
|
+
end
|
199
|
+
|
200
|
+
def log_authorization_error(e)
|
201
|
+
Utility::ExceptionTracking.log_exception(e, 'Could not retrieve connectors settings due to authorization error.')
|
202
|
+
end
|
203
|
+
|
204
|
+
def log_standard_error(e)
|
205
|
+
Utility::ExceptionTracking.log_exception(e, 'Sync failed due to unexpected error.')
|
206
|
+
end
|
192
207
|
end
|
193
208
|
end
|
data/lib/utility/bulk_queue.rb
CHANGED
@@ -13,7 +13,7 @@ module Utility
|
|
13
13
|
class QueueOverflowError < StandardError; end
|
14
14
|
|
15
15
|
# 500 items or 5MB
|
16
|
-
def initialize(operation_count_threshold = Utility::Constants::
|
16
|
+
def initialize(operation_count_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_SIZE, size_threshold = Utility::Constants::DEFAULT_MAX_INGESTION_QUEUE_BYTES)
|
17
17
|
@operation_count_threshold = operation_count_threshold.freeze
|
18
18
|
@size_threshold = size_threshold.freeze
|
19
19
|
|
data/lib/utility/constants.rb
CHANGED
@@ -16,8 +16,6 @@ module Utility
|
|
16
16
|
JOB_INDEX = '.elastic-connectors-sync-jobs'
|
17
17
|
CONTENT_INDEX_PREFIX = 'search-'
|
18
18
|
CRAWLER_SERVICE_TYPE = 'elastic-crawler'
|
19
|
-
FILTERING_RULES_FEATURE = 'filtering_rules'
|
20
|
-
FILTERING_ADVANCED_FEATURE = 'filtering_advanced_config'
|
21
19
|
|
22
20
|
# Maximum number of operations in BULK Elasticsearch operation that will ingest the data
|
23
21
|
DEFAULT_MAX_INGESTION_QUEUE_SIZE = 500
|
@@ -51,7 +51,7 @@ module Utility
|
|
51
51
|
def note_success
|
52
52
|
@consecutive_error_count = 0
|
53
53
|
@success_count += 1
|
54
|
-
|
54
|
+
track_window_error(false)
|
55
55
|
end
|
56
56
|
|
57
57
|
def note_error(error, id: Time.now.to_i)
|
@@ -60,10 +60,9 @@ module Utility
|
|
60
60
|
Utility::Logger.debug("Message id: #{id} - #{error_message}\n#{stack_trace}")
|
61
61
|
@total_error_count += 1
|
62
62
|
@consecutive_error_count += 1
|
63
|
-
@window_errors[@window_index] = true
|
64
63
|
@error_queue << DocumentError.new(error.class.name, error_message, stack_trace, id)
|
65
64
|
@error_queue = @error_queue.drop(1) if @error_queue.size > @error_queue_size
|
66
|
-
|
65
|
+
track_window_error(true)
|
67
66
|
@last_error = error
|
68
67
|
|
69
68
|
raise_if_necessary
|
@@ -92,10 +91,32 @@ module Utility
|
|
92
91
|
end
|
93
92
|
|
94
93
|
def num_errors_in_window
|
95
|
-
@window_errors.count(
|
94
|
+
@window_errors.count(true).to_f
|
96
95
|
end
|
97
96
|
|
98
|
-
def
|
97
|
+
def track_window_error(is_error)
|
98
|
+
# We keep the errors array of the size @window_size this way, imagine @window_size = 5
|
99
|
+
# Error array inits as falses:
|
100
|
+
# [ false, false, false, false, false ]
|
101
|
+
# Third document raises an error:
|
102
|
+
# [ false, false, true, false, false ]
|
103
|
+
# ^^^^
|
104
|
+
# 2 % 5 == 2
|
105
|
+
# Fifth document raises an error:
|
106
|
+
# [ false, false, true, false, true ]
|
107
|
+
# ^^^^
|
108
|
+
# 4 % 5 == 4
|
109
|
+
# Sixth document raises an error:
|
110
|
+
# [ true, false, true, false, true ]
|
111
|
+
# ^^^^
|
112
|
+
# 5 % 5 == 0
|
113
|
+
#
|
114
|
+
# Eigth document is successful:
|
115
|
+
# [ true, false, false, false, true ]
|
116
|
+
# ^^^^^
|
117
|
+
# 7 % 5 == 2
|
118
|
+
# And so on.
|
119
|
+
@window_errors[@window_index] = is_error
|
99
120
|
@window_index = (@window_index + 1) % @window_size
|
100
121
|
end
|
101
122
|
|
data/lib/utility/es_client.rb
CHANGED
@@ -43,6 +43,10 @@ module Utility
|
|
43
43
|
configs[:transport_options] = es_config[:transport_options] if es_config[:transport_options]
|
44
44
|
configs[:ca_fingerprint] = es_config[:ca_fingerprint] if es_config[:ca_fingerprint]
|
45
45
|
|
46
|
+
# headers
|
47
|
+
# these are necessary for cloud-hosted native connectors
|
48
|
+
configs[:headers] = es_config[:headers].to_h if es_config[:headers]
|
49
|
+
|
46
50
|
# if log or trace is activated, we use the application logger
|
47
51
|
configs[:logger] = if configs[:log] || configs[:trace]
|
48
52
|
Utility::Logger.logger
|
data/lib/utility/filtering.rb
CHANGED
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: connectors_utility
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 8.
|
4
|
+
version: 8.10.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Elastic
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-07-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 5.2
|
19
|
+
version: '5.2'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 5.2
|
26
|
+
version: '5.2'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: ecs-logging
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -104,8 +104,10 @@ files:
|
|
104
104
|
- NOTICE.txt
|
105
105
|
- lib/connectors/connector_status.rb
|
106
106
|
- lib/connectors/crawler/scheduler.rb
|
107
|
+
- lib/connectors/job_trigger_method.rb
|
107
108
|
- lib/connectors/sync_status.rb
|
108
109
|
- lib/connectors_utility.rb
|
110
|
+
- lib/core/connector_job.rb
|
109
111
|
- lib/core/connector_settings.rb
|
110
112
|
- lib/core/elastic_connector_actions.rb
|
111
113
|
- lib/core/filtering/validation_status.rb
|
@@ -130,9 +132,9 @@ homepage: https://github.com/elastic/connectors-ruby
|
|
130
132
|
licenses:
|
131
133
|
- Elastic-2.0
|
132
134
|
metadata:
|
133
|
-
revision:
|
134
|
-
repository: https://github.com/elastic/connectors-ruby
|
135
|
-
post_install_message:
|
135
|
+
revision: f2cac87a2b02a3ed2ab6257bc8be742150304120
|
136
|
+
repository: https://github.com/elastic/connectors-ruby
|
137
|
+
post_install_message:
|
136
138
|
rdoc_options: []
|
137
139
|
require_paths:
|
138
140
|
- lib
|
@@ -143,12 +145,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
143
145
|
version: '0'
|
144
146
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
145
147
|
requirements:
|
146
|
-
- - "
|
148
|
+
- - ">="
|
147
149
|
- !ruby/object:Gem::Version
|
148
|
-
version:
|
150
|
+
version: '0'
|
149
151
|
requirements: []
|
150
152
|
rubygems_version: 3.0.3.1
|
151
|
-
signing_key:
|
153
|
+
signing_key:
|
152
154
|
specification_version: 4
|
153
155
|
summary: Gem containing shared Connector Services libraries
|
154
156
|
test_files: []
|