connectors_service 8.6.0.4.pre.20221114T233727Z → 8.6.0.4.pre.20221116T024501Z

Files changed (33)
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +4 -4
  3. data/lib/app/app.rb +4 -0
  4. data/lib/app/dispatcher.rb +30 -17
  5. data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
  6. data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
  7. data/lib/connectors/base/connector.rb +27 -5
  8. data/lib/connectors/example/connector.rb +3 -12
  9. data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
  10. data/lib/connectors/gitlab/connector.rb +3 -12
  11. data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
  12. data/lib/connectors/mongodb/connector.rb +9 -24
  13. data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
  14. data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
  15. data/lib/connectors/sync_status.rb +6 -1
  16. data/lib/connectors/tolerable_error_helper.rb +43 -0
  17. data/lib/core/connector_job.rb +96 -23
  18. data/lib/core/connector_settings.rb +29 -6
  19. data/lib/core/elastic_connector_actions.rb +77 -55
  20. data/lib/core/filtering/validation_job_runner.rb +1 -1
  21. data/lib/core/ingestion/es_sink.rb +68 -9
  22. data/lib/core/ingestion.rb +0 -1
  23. data/lib/core/jobs/consumer.rb +114 -0
  24. data/lib/core/jobs/producer.rb +26 -0
  25. data/lib/core/single_scheduler.rb +1 -1
  26. data/lib/core/sync_job_runner.rb +20 -12
  27. data/lib/core.rb +2 -0
  28. data/lib/utility/error_monitor.rb +108 -0
  29. data/lib/utility/errors.rb +0 -12
  30. data/lib/utility/logger.rb +0 -1
  31. data/lib/utility.rb +6 -0
  32. metadata +12 -3
  33. data/lib/core/ingestion/ingester.rb +0 -90

data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb
@@ -0,0 +1,292 @@
+ #
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ # or more contributor license agreements. Licensed under the Elastic License;
+ # you may not use this file except in compliance with the Elastic License.
+ #
+
+ # frozen_string_literal: true
+
+ module Connectors
+   module MongoDB
+     module AdvancedSnippet
+       # Pipeline stages: https://www.mongodb.com/docs/manual/reference/operator/aggregation-pipeline/
+       ALLOWED_PIPELINE_STAGES = %w[
+         $addFields $bucket $bucketAuto $changeStream $collStats $count $densify
+         $documents $facet $fill $geoNear $graphLookup $group $indexStats $limit
+         $listSessions $lookup $match $merge $out $planCacheStats $project $redact
+         $replaceRoot $replaceWith $sample $search $searchMeta $set $setWindowFields
+         $skip $sort $sortByCount $unionWith $unset $unwind
+       ]
+
+       # All except the $out, $merge, $geoNear, and $changeStream stages can appear multiple times in a pipeline.
+       # Source: https://www.mongodb.com/docs/manual/reference/operator/aggregation-pipeline/
+       PIPELINE_STAGES_ALLOWED_ONCE = %w[$out $merge $geoNear $changeStream]
+
+       NON_NEGATIVE_INTEGER = ->(value) { value.is_a?(Integer) && value >= 0 }
+       READ_CONCERN_LEVEL = ->(level) { %w[local available majority linearizable].include?(level) }
+       STRING_OR_DOCUMENT = ->(value) { value.is_a?(Hash) || value.is_a?(String) }
+       MUTUAL_EXCLUSIVE_FILTER = ->(fields) { fields.size <= 1 }
+
+       AGGREGATION_PIPELINE = lambda { |pipeline|
+         return false unless pipeline.is_a?(Array)
+
+         allowed_once_appearances = Set.new
+
+         pipeline.flat_map(&:keys).each do |key|
+           return false unless ALLOWED_PIPELINE_STAGES.include?(key)
+
+           if PIPELINE_STAGES_ALLOWED_ONCE.include?(key)
+             return false if allowed_once_appearances.include?(key)
+
+             allowed_once_appearances.add(key)
+           end
+         end
+
+         true
+       }
+
+       # Ruby has no 'Boolean' class
+       BOOLEAN = ->(value) { value.is_a?(TrueClass) || value.is_a?(FalseClass) }
+
+       COLLATION = {
+         :name => 'collation',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           {
+             :name => 'locale',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'caseLevel',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'caseFirst',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'strength',
+             :type => Integer,
+             :optional => true
+           },
+           {
+             :name => 'numericOrdering',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'alternate',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'maxVariable',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'backwards',
+             :type => BOOLEAN,
+             :optional => true
+           },
+         ]
+       }
+
+       CURSOR_TYPE = ->(cursor) { [:tailable, :tailable_await].include?(cursor) }
+
+       # Aggregate options: https://www.mongodb.com/docs/manual/reference/method/db.collection.aggregate/
+       AGGREGATE_OPTIONS = {
+         :name => 'options',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           {
+             :name => 'explain',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'allowDiskUse',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'cursor',
+             :type => Hash,
+             :optional => true,
+             :fields => [
+               {
+                 :name => 'batchSize',
+                 :type => NON_NEGATIVE_INTEGER
+               }
+             ]
+           },
+           {
+             :name => 'maxTimeMS',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'bypassDocumentValidation',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'readConcern',
+             :type => Hash,
+             :optional => true,
+             :fields => [
+               {
+                 :name => 'level',
+                 :type => READ_CONCERN_LEVEL
+               }
+             ]
+           },
+           COLLATION,
+           {
+             :name => 'hint',
+             :type => STRING_OR_DOCUMENT,
+             :optional => true
+           },
+           {
+             :name => 'comment',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'writeConcern',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'let',
+             :type => Hash,
+             :optional => true
+           }
+         ]
+       }
+
+       AGGREGATE_PIPELINE = {
+         :name => 'pipeline',
+         :type => AGGREGATION_PIPELINE,
+         :optional => true,
+       }
+
+       AGGREGATE = {
+         :name => 'aggregate',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           AGGREGATE_PIPELINE,
+           AGGREGATE_OPTIONS
+         ]
+       }
+
+       FIND_OPTIONS = {
+         :name => 'options',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           {
+             :name => 'allowDiskUse',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'allowPartialResults',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'batchSize',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           COLLATION,
+           {
+             :name => 'cursorType',
+             :type => CURSOR_TYPE,
+             :optional => true
+           },
+           {
+             :name => 'limit',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'maxTimeMS',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'modifiers',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'noCursorTimeout',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'oplogReplay',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'projection',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'skip',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'sort',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'let',
+             :type => Hash,
+             :optional => true
+           }
+         ]
+       }
+
+       # TODO: return true for now. Will be more involved (basically needs full query parsing or "dummy" execution against a running instance)
+       FILTER = ->(_filter) { true }
+
+       FIND_FILTER = {
+         :name => 'filter',
+         :type => FILTER
+       }
+
+       FIND = {
+         :name => 'find',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           FIND_OPTIONS,
+           FIND_FILTER
+         ]
+       }
+
+       SCHEMA = {
+         :fields => {
+           :constraints => MUTUAL_EXCLUSIVE_FILTER,
+           :values => [
+             AGGREGATE,
+             FIND
+           ]
+         }
+       }
+     end
+   end
+ end
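
For orientation, a minimal advanced snippet that this schema is intended to accept could look like the sketch below. The payload keys mirror the field names declared above, and SCHEMA's MUTUAL_EXCLUSIVE_FILTER constraint means a snippet may carry either find or aggregate, but not both. This is a hypothetical payload, not taken from the gem's tests; the actual validation entry point lives in mongo_advanced_snippet_against_schema_validator.rb (not shown in this diff).

# Hypothetical payload sketch; keys follow the schema's field names.
advanced_snippet = {
  'find' => {
    'filter' => { 'genre' => 'fantasy' },          # FILTER accepts anything for now (see TODO above)
    'options' => { 'limit' => 100, 'skip' => 0 }   # both validated by NON_NEGATIVE_INTEGER
  }
}
# An aggregate-based snippet would use the 'aggregate' key instead, with a
# 'pipeline' array checked by AGGREGATION_PIPELINE against ALLOWED_PIPELINE_STAGES.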

data/lib/connectors/sync_status.rb
@@ -26,11 +26,16 @@ module Connectors
      ERROR
    ]
 
-   PENDING_STATUES = [
+   PENDING_STATUSES = [
      PENDING,
      SUSPENDED
    ]
 
+   ACTIVE_STATUSES = [
+     IN_PROGRESS,
+     CANCELING
+   ]
+
    TERMINAL_STATUSES = [
      CANCELED,
      COMPLETED,
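
Taken together, the three constant groups partition the job lifecycle. A sketch of the resulting sets, assuming the usual connectors-protocol status strings:

Connectors::SyncStatus::PENDING_STATUSES   # waiting to be claimed: pending, suspended
Connectors::SyncStatus::ACTIVE_STATUSES    # being driven by a worker: in_progress, canceling
Connectors::SyncStatus::TERMINAL_STATUSES  # finished: canceled, completed, error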

data/lib/connectors/tolerable_error_helper.rb
@@ -0,0 +1,43 @@
+ #
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ # or more contributor license agreements. Licensed under the Elastic License;
+ # you may not use this file except in compliance with the Elastic License.
+ #
+
+ require 'utility/logger'
+ require 'utility/exception_tracking'
+ require 'utility/error_monitor'
+
+ module Connectors
+   class TolerableErrorHelper
+     def initialize(error_monitor)
+       @error_monitor = error_monitor
+     end
+
+     def yield_single_document(identifier: nil)
+       Utility::Logger.debug("Extracting single document for #{identifier}") if identifier
+       yield
+       @error_monitor.note_success
+     rescue *fatal_exception_classes => e
+       Utility::ExceptionTracking.augment_exception(e)
+       Utility::Logger.error("Encountered a fall-through error during extraction#{identifying_error_message(identifier)}: #{e.class}: #{e.message} {:message_id => #{e.id}}")
+       raise
+     rescue StandardError => e
+       Utility::ExceptionTracking.augment_exception(e)
+       Utility::Logger.warn("Encountered error during extraction#{identifying_error_message(identifier)}: #{e.class}: #{e.message} {:message_id => #{e.id}}")
+       @error_monitor.note_error(e, :id => e.id)
+     end
+
+     private
+
+     def identifying_error_message(identifier)
+       identifier.present? ? " of '#{identifier}'" : ''
+     end
+
+     def fatal_exception_classes
+       [
+         Utility::ErrorMonitor::MonitoringError
+       ]
+     end
+   end
+ end
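
A plausible call site, sketched from the API above: a connector wraps per-document extraction in yield_single_document so that an individual failure is logged and counted by the error monitor, while MonitoringError (the monitor's signal that the error threshold was exceeded, raised from note_error) falls through the fatal-class rescue and aborts the whole sync. The loop and serialize call are illustrative, not the gem's actual connector code:

helper = Connectors::TolerableErrorHelper.new(error_monitor)

raw_docs.each do |raw_doc|
  helper.yield_single_document(identifier: raw_doc[:id]) do
    yield_doc(serialize(raw_doc)) # one bad document is tolerated and counted
  end
end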

data/lib/core/connector_job.rb
@@ -15,18 +15,22 @@ module Core
    class ConnectorJob
      DEFAULT_PAGE_SIZE = 100
 
-     # Error Classes
-     class ConnectorJobNotFoundError < StandardError; end
-
      def self.fetch_by_id(job_id)
        es_response = ElasticConnectorActions.get_job(job_id)
+       return nil unless es_response[:found]
 
-       raise ConnectorJobNotFoundError.new("Connector job with id=#{job_id} was not found.") unless es_response[:found]
        new(es_response)
      end
 
-     def self.pending_jobs(page_size = DEFAULT_PAGE_SIZE)
-       query = { terms: { status: Connectors::SyncStatus::PENDING_STATUES } }
+     def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
+       status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
+
+       query = { bool: { must: [{ terms: status_term }] } }
+
+       return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
+
+       query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
+
        fetch_jobs_by_query(query, page_size)
      end
 
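The reworked pending_jobs composes both conditions under bool/must instead of a bare terms query. For pending_jobs(connectors_ids: ['id-1']), the query it builds would be the following (status strings assumed per the connectors protocol):

{
  bool: {
    must: [
      { terms: { status: %w[pending suspended] } },
      { terms: { 'connector.id' => ['id-1'] } }
    ]
  }
}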

@@ -50,6 +54,10 @@ module Core
        @elasticsearch_response[:_source][property_name]
      end
 
+     def error
+       self[:error]
+     end
+
      def status
        self[:status]
      end
@@ -62,16 +70,36 @@ module Core
        status == Connectors::SyncStatus::CANCELING
      end
 
+     def suspended?
+       status == Connectors::SyncStatus::SUSPENDED
+     end
+
+     def canceled?
+       status == Connectors::SyncStatus::CANCELED
+     end
+
+     def pending?
+       Connectors::SyncStatus::PENDING_STATUSES.include?(status)
+     end
+
+     def active?
+       Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
+     end
+
+     def terminated?
+       Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
+     end
+
      def connector_snapshot
-       self[:connector]
+       self[:connector] || {}
      end
 
      def connector_id
-       connector_snapshot[:id]
+       @elasticsearch_response[:_source][:connector][:id]
      end
 
      def index_name
-       connector_snapshot[:configuration]
+       connector_snapshot[:index_name]
      end
 
      def language
@@ -91,33 +119,51 @@ module Core
      end
 
      def pipeline
-       connector_snapshot[:pipeline]
+       @elasticsearch_response[:_source][:pipeline]
      end
 
      def connector
        @connector ||= ConnectorSettings.fetch_by_id(connector_id)
      end
 
-     def reload_connector!
-       @connector = nil
-       connector
+     def done!(ingestion_stats = {}, connector_metadata = {})
+       terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
      end
 
-     def reload
-       es_response = ElasticConnectorActions.get_job(id)
-       raise ConnectorJobNotFoundError.new("Connector job with id=#{id} was not found.") unless es_response[:found]
-       # TODO: remove the usage of with_indifferent_access. get_id method is expected to return a hash
-       @elasticsearch_response = es_response.with_indifferent_access
-       @connector = nil
+     def error!(message, ingestion_stats = {}, connector_metadata = {})
+       terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
      end
 
-     private
+     def cancel!(ingestion_stats = {}, connector_metadata = {})
+       terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
+     end
 
-     def initialize(es_response)
-       # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
-       @elasticsearch_response = es_response.with_indifferent_access
+     def with_concurrency_control
+       response = ElasticConnectorActions.get_job(id)
+
+       yield response, response['_seq_no'], response['_primary_term']
      end
 
+     def make_running!
+       with_concurrency_control do |es_doc, seq_no, primary_term|
+         now = Time.now
+         doc = {
+           status: Connectors::SyncStatus::IN_PROGRESS,
+           started_at: now,
+           last_seen: now,
+           worker_hostname: Socket.gethostname
+         }
+
+         ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
+       end
+     end
+
+     def es_source
+       @elasticsearch_response[:_source]
+     end
+
+     private
+
      def self.fetch_jobs_by_query(query, page_size)
        results = []
        offset = 0
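
make_running! leans on Elasticsearch optimistic concurrency control: it reads the job under with_concurrency_control and hands the captured _seq_no/_primary_term to update_job_fields, so a stale claim fails with a version conflict instead of silently overwriting another worker's update. A sketch of a caller handling that race; the exact conflict class is an assumption and depends on the Elasticsearch client version in use:

begin
  job.make_running!
rescue Elastic::Transport::Transport::Errors::Conflict
  # seq_no/primary_term no longer match: another worker claimed the job first.
  Utility::Logger.info("Job #{job.id} was claimed by another worker, skipping")
end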
@@ -133,5 +179,32 @@ module Core
 
        results
      end
+
+     def initialize(es_response)
+       # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
+       @elasticsearch_response = es_response.with_indifferent_access
+     end
+
+     def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
+       ingestion_stats ||= {}
+       ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
+       doc = {
+         :last_seen => Time.now,
+         :completed_at => Time.now,
+         :status => status,
+         :error => error
+       }.merge(ingestion_stats)
+       doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
+       doc[:metadata] = connector_metadata if connector_metadata&.any?
+       ElasticConnectorActions.update_job_fields(id, doc)
+     end
+
+     def seq_no
+       @elasticsearch_response[:_seq_no]
+     end
+
+     def primary_term
+       @elasticsearch_response[:_primary_term]
+     end
    end
  end
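
All three public terminal transitions funnel into the private terminate!, which stamps timing fields, merges the ingestion stats, and persists via update_job_fields. A sketch of how a sync runner might finish a job; the stat keys are illustrative (they match the counters read back in connector_settings.rb below):

stats = { :indexed_document_count => 42, :deleted_document_count => 3 }

if job.canceling?
  job.cancel!(stats)             # status canceled; terminate! also stamps canceled_at
elsif sync_error
  job.error!(sync_error, stats)  # status error; the message lands in :error
else
  job.done!(stats)               # status completed
end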

data/lib/core/connector_settings.rb
@@ -8,7 +8,6 @@
 
  require 'active_support/core_ext/hash/indifferent_access'
  require 'connectors/connector_status'
- require 'connectors/registry'
  require 'core/elastic_connector_actions'
  require 'utility'
 
@@ -24,18 +23,16 @@ module Core
 
      DEFAULT_PAGE_SIZE = 100
 
-     # Error Classes
-     class ConnectorNotFoundError < StandardError; end
-
      def self.fetch_by_id(connector_id)
        es_response = ElasticConnectorActions.get_connector(connector_id)
-       connectors_meta = ElasticConnectorActions.connectors_meta
+       return nil unless es_response[:found]
 
-       raise ConnectorNotFoundError.new("Connector with id=#{connector_id} was not found.") unless es_response[:found]
+       connectors_meta = ElasticConnectorActions.connectors_meta
        new(es_response, connectors_meta)
      end
 
      def self.fetch_native_connectors(page_size = DEFAULT_PAGE_SIZE)
+       require 'connectors/registry' unless defined?(Connectors::REGISTRY)
        query = {
          bool: {
            filter: [
@@ -122,6 +119,32 @@ module Core
        index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
      end
 
+     def ready_for_sync?
+       Connectors::REGISTRY.registered?(service_type) &&
+         valid_index_name? &&
+         connector_status_allows_sync?
+     end
+
+     def running?
+       @elasticsearch_response[:_source][:last_sync_status] == Connectors::SyncStatus::IN_PROGRESS
+     end
+
+     def update_last_sync!(job)
+       doc = {
+         :last_sync_status => job.status,
+         :last_synced => Time.now,
+         :last_sync_error => job.error,
+         :error => job.error
+       }
+
+       if job.terminated?
+         doc[:last_indexed_document_count] = job[:indexed_document_count]
+         doc[:last_deleted_document_count] = job[:deleted_document_count]
+       end
+
+       Core::ElasticConnectorActions.update_connector_fields(job.connector_id, doc)
+     end
+
      private
 
      def initialize(es_response, connectors_meta)
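
After a job reaches a terminal status, update_last_sync! mirrors the outcome onto the connector document, and the document counters are copied only when job.terminated? holds. Illustrative resulting fields for a successful sync (values are examples, status string assumed):

{
  :last_sync_status => 'completed',
  :last_synced => Time.now,
  :last_sync_error => nil,
  :error => nil,
  :last_indexed_document_count => 42,  # written only when job.terminated?
  :last_deleted_document_count => 3
}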