connectors_service 8.6.0.4.pre.20221114T233727Z → 8.6.0.4.pre.20221116T024501Z

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +4 -4
  3. data/lib/app/app.rb +4 -0
  4. data/lib/app/dispatcher.rb +30 -17
  5. data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
  6. data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
  7. data/lib/connectors/base/connector.rb +27 -5
  8. data/lib/connectors/example/connector.rb +3 -12
  9. data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
  10. data/lib/connectors/gitlab/connector.rb +3 -12
  11. data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
  12. data/lib/connectors/mongodb/connector.rb +9 -24
  13. data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
  14. data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
  15. data/lib/connectors/sync_status.rb +6 -1
  16. data/lib/connectors/tolerable_error_helper.rb +43 -0
  17. data/lib/core/connector_job.rb +96 -23
  18. data/lib/core/connector_settings.rb +29 -6
  19. data/lib/core/elastic_connector_actions.rb +77 -55
  20. data/lib/core/filtering/validation_job_runner.rb +1 -1
  21. data/lib/core/ingestion/es_sink.rb +68 -9
  22. data/lib/core/ingestion.rb +0 -1
  23. data/lib/core/jobs/consumer.rb +114 -0
  24. data/lib/core/jobs/producer.rb +26 -0
  25. data/lib/core/single_scheduler.rb +1 -1
  26. data/lib/core/sync_job_runner.rb +20 -12
  27. data/lib/core.rb +2 -0
  28. data/lib/utility/error_monitor.rb +108 -0
  29. data/lib/utility/errors.rb +0 -12
  30. data/lib/utility/logger.rb +0 -1
  31. data/lib/utility.rb +6 -0
  32. metadata +12 -3
  33. data/lib/core/ingestion/ingester.rb +0 -90
data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb
@@ -0,0 +1,292 @@
+ #
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ # or more contributor license agreements. Licensed under the Elastic License;
+ # you may not use this file except in compliance with the Elastic License.
+ #
+
+ # frozen_string_literal: true
+
+ module Connectors
+   module MongoDB
+     module AdvancedSnippet
+       # Pipeline stages: https://www.mongodb.com/docs/manual/reference/operator/aggregation-pipeline/
+       ALLOWED_PIPELINE_STAGES = %w[
+         $addFields $bucket $bucketAuto $changeStream $collStats $count $densify
+         $documents $facet $fill $geoNear $graphLookup $group $indexStats $limit
+         $listSessions $lookup $match $merge $out $planCacheStats $project $redact
+         $replaceRoot $replaceWith $sample $search $searchMeta $set $setWindowFields
+         $skip $sort $sortByCount $unionWith $unset $unwind
+       ]
+
+       # All except the $out, $merge, $geoNear, and $changeStream stages can appear multiple times in a pipeline.
+       # Source: https://www.mongodb.com/docs/manual/reference/operator/aggregation-pipeline/
+       PIPELINE_STAGES_ALLOWED_ONCE = %w[$out $merge $geoNear $changeStream]
+
+       NON_NEGATIVE_INTEGER = ->(value) { value.is_a?(Integer) && value >= 0 }
+       READ_CONCERN_LEVEL = ->(level) { %w[local available majority linearizable].include?(level) }
+       STRING_OR_DOCUMENT = ->(value) { value.is_a?(Hash) || value.is_a?(String) }
+       MUTUAL_EXCLUSIVE_FILTER = ->(fields) { fields.size <= 1 }
+
+       AGGREGATION_PIPELINE = lambda { |pipeline|
+         return false unless pipeline.is_a?(Array)
+
+         allowed_once_appearances = Set.new
+
+         pipeline.flat_map(&:keys).each do |key|
+           return false unless ALLOWED_PIPELINE_STAGES.include?(key)
+
+           if PIPELINE_STAGES_ALLOWED_ONCE.include?(key)
+             return false if allowed_once_appearances.include?(key)
+
+             allowed_once_appearances.add(key)
+           end
+         end
+
+         true
+       }
+
+       # Ruby has no 'Boolean' class
+       BOOLEAN = ->(value) { value.is_a?(TrueClass) || value.is_a?(FalseClass) }
+
+       COLLATION = {
+         :name => 'collation',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           {
+             :name => 'locale',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'caseLevel',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'caseFirst',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'strength',
+             :type => Integer,
+             :optional => true
+           },
+           {
+             :name => 'numericOrdering',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'alternate',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'maxVariable',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'backwards',
+             :type => BOOLEAN,
+             :optional => true
+           },
+         ]
+       }
+
+       CURSOR_TYPE = ->(cursor) { [:tailable, :tailable_await].include?(cursor) }
+
+       # Aggregate options: https://www.mongodb.com/docs/manual/reference/method/db.collection.aggregate/
+       AGGREGATE_OPTIONS = {
+         :name => 'options',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           {
+             :name => 'explain',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'allowDiskUse',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'cursor',
+             :type => Hash,
+             :optional => true,
+             :fields => [
+               {
+                 :name => 'batchSize',
+                 :type => NON_NEGATIVE_INTEGER
+               }
+             ]
+           },
+           {
+             :name => 'maxTimeMS',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'bypassDocumentValidation',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'readConcern',
+             :type => Hash,
+             :optional => true,
+             :fields => [
+               {
+                 :name => 'level',
+                 :type => READ_CONCERN_LEVEL
+               }
+             ]
+           },
+           COLLATION,
+           {
+             :name => 'hint',
+             :type => STRING_OR_DOCUMENT,
+             :optional => true
+           },
+           {
+             :name => 'comment',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'writeConcern',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'let',
+             :type => Hash,
+             :optional => true
+           }
+         ]
+       }
+
+       AGGREGATE_PIPELINE = {
+         :name => 'pipeline',
+         :type => AGGREGATION_PIPELINE,
+         :optional => true,
+       }
+
+       AGGREGATE = {
+         :name => 'aggregate',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           AGGREGATE_PIPELINE,
+           AGGREGATE_OPTIONS
+         ]
+       }
+
+       FIND_OPTIONS = {
+         :name => 'options',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           {
+             :name => 'allowDiskUse',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'allowPartialResults',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'batchSize',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           COLLATION,
+           {
+             :name => 'cursorType',
+             :type => CURSOR_TYPE,
+             :optional => true
+           },
+           {
+             :name => 'limit',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'maxTimeMS',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'modifiers',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'noCursorTimeout',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'oplogReplay',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'projection',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'skip',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'sort',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'let',
+             :type => Hash,
+             :optional => true
+           }
+         ]
+       }
+
+       # TODO: return true for now. Will be more involved (basically needs full query parsing or "dummy" execution against a running instance)
+       FILTER = ->(_filter) { true }
+
+       FIND_FILTER = {
+         :name => 'filter',
+         :type => FILTER
+       }
+
+       FIND = {
+         :name => 'find',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           FIND_OPTIONS,
+           FIND_FILTER
+         ]
+       }
+
+       SCHEMA = {
+         :fields => {
+           :constraints => MUTUAL_EXCLUSIVE_FILTER,
+           :values => [
+             AGGREGATE,
+             FIND
+           ]
+         }
+       }
+     end
+   end
+ end
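
For orientation, a minimal advanced snippet that the schema above would accept could look like the following. This is an illustrative sketch, not code from the gem: the field names and values are invented, and the top-level MUTUAL_EXCLUSIVE_FILTER constraint means a snippet may use 'find' or 'aggregate' but not both.

    # Hypothetical snippet, shaped to satisfy SCHEMA above.
    snippet = {
      'find' => {
        'filter' => { 'genre' => 'fiction' },   # FILTER is currently permissive (always true)
        'options' => {
          'batchSize' => 100,                   # must pass NON_NEGATIVE_INTEGER
          'limit' => 1000,                      # must pass NON_NEGATIVE_INTEGER
          'projection' => { 'title' => 1 }      # any Hash is accepted
        }
      }
    }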
data/lib/connectors/sync_status.rb
@@ -26,11 +26,16 @@ module Connectors
        ERROR
      ]
 
-     PENDING_STATUES = [
+     PENDING_STATUSES = [
        PENDING,
        SUSPENDED
      ]
 
+     ACTIVE_STATUSES = [
+       IN_PROGRESS,
+       CANCELING
+     ]
+
      TERMINAL_STATUSES = [
        CANCELED,
        COMPLETED,
data/lib/connectors/tolerable_error_helper.rb
@@ -0,0 +1,43 @@
+ #
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ # or more contributor license agreements. Licensed under the Elastic License;
+ # you may not use this file except in compliance with the Elastic License.
+ #
+
+ require 'utility/logger'
+ require 'utility/exception_tracking'
+ require 'utility/error_monitor'
+
+ module Connectors
+   class TolerableErrorHelper
+     def initialize(error_monitor)
+       @error_monitor = error_monitor
+     end
+
+     def yield_single_document(identifier: nil)
+       Utility::Logger.debug("Extracting single document for #{identifier}") if identifier
+       yield
+       @error_monitor.note_success
+     rescue *fatal_exception_classes => e
+       Utility::ExceptionTracking.augment_exception(e)
+       Utility::Logger.error("Encountered a fall-through error during extraction#{identifying_error_message(identifier)}: #{e.class}: #{e.message} {:message_id => #{e.id}}")
+       raise
+     rescue StandardError => e
+       Utility::ExceptionTracking.augment_exception(e)
+       Utility::Logger.warn("Encountered error during extraction#{identifying_error_message(identifier)}: #{e.class}: #{e.message} {:message_id => #{e.id}}")
+       @error_monitor.note_error(e, :id => e.id)
+     end
+
+     private
+
+     def identifying_error_message(identifier)
+       identifier.present? ? " of '#{identifier}'" : ''
+     end
+
+     def fatal_exception_classes
+       [
+         Utility::ErrorMonitor::MonitoringError
+       ]
+     end
+   end
+ end
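
A hedged sketch of how a connector's extraction loop might drive this helper. Everything except TolerableErrorHelper itself (the monitor construction, `documents`, and `emit`) is invented for illustration:

    monitor = Utility::ErrorMonitor.new
    helper = Connectors::TolerableErrorHelper.new(monitor)

    documents.each do |doc|
      helper.yield_single_document(identifier: doc[:id]) do
        emit(doc)  # per-document StandardErrors are logged and counted, not raised
      end
    end
    # When the monitor's error budget is exhausted it raises
    # Utility::ErrorMonitor::MonitoringError, which the helper treats as
    # fatal and re-raises after logging.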
data/lib/core/connector_job.rb
@@ -15,18 +15,22 @@ module Core
    class ConnectorJob
      DEFAULT_PAGE_SIZE = 100
 
-     # Error Classes
-     class ConnectorJobNotFoundError < StandardError; end
-
      def self.fetch_by_id(job_id)
        es_response = ElasticConnectorActions.get_job(job_id)
+       return nil unless es_response[:found]
 
-       raise ConnectorJobNotFoundError.new("Connector job with id=#{job_id} was not found.") unless es_response[:found]
        new(es_response)
      end
 
-     def self.pending_jobs(page_size = DEFAULT_PAGE_SIZE)
-       query = { terms: { status: Connectors::SyncStatus::PENDING_STATUES } }
+     def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
+       status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
+
+       query = { bool: { must: [{ terms: status_term }] } }
+
+       return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
+
+       query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
+
        fetch_jobs_by_query(query, page_size)
      end
 
@@ -50,6 +54,10 @@ module Core
        @elasticsearch_response[:_source][property_name]
      end
 
+     def error
+       self[:error]
+     end
+
      def status
        self[:status]
      end
@@ -62,16 +70,36 @@ module Core
        status == Connectors::SyncStatus::CANCELING
      end
 
+     def suspended?
+       status == Connectors::SyncStatus::SUSPENDED
+     end
+
+     def canceled?
+       status == Connectors::SyncStatus::CANCELED
+     end
+
+     def pending?
+       Connectors::SyncStatus::PENDING_STATUSES.include?(status)
+     end
+
+     def active?
+       Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
+     end
+
+     def terminated?
+       Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
+     end
+
      def connector_snapshot
-       self[:connector]
+       self[:connector] || {}
      end
 
      def connector_id
-       connector_snapshot[:id]
+       @elasticsearch_response[:_source][:connector][:id]
      end
 
      def index_name
-       connector_snapshot[:configuration]
+       connector_snapshot[:index_name]
      end
 
      def language
@@ -91,33 +119,51 @@ module Core
      end
 
      def pipeline
-       connector_snapshot[:pipeline]
+       @elasticsearch_response[:_source][:pipeline]
      end
 
      def connector
        @connector ||= ConnectorSettings.fetch_by_id(connector_id)
      end
 
-     def reload_connector!
-       @connector = nil
-       connector
+     def done!(ingestion_stats = {}, connector_metadata = {})
+       terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
      end
 
-     def reload
-       es_response = ElasticConnectorActions.get_job(id)
-       raise ConnectorJobNotFoundError.new("Connector job with id=#{id} was not found.") unless es_response[:found]
-       # TODO: remove the usage of with_indifferent_access. get_id method is expected to return a hash
-       @elasticsearch_response = es_response.with_indifferent_access
-       @connector = nil
+     def error!(message, ingestion_stats = {}, connector_metadata = {})
+       terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
      end
 
-     private
+     def cancel!(ingestion_stats = {}, connector_metadata = {})
+       terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
+     end
 
-     def initialize(es_response)
-       # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
-       @elasticsearch_response = es_response.with_indifferent_access
+     def with_concurrency_control
+       response = ElasticConnectorActions.get_job(id)
+
+       yield response, response['_seq_no'], response['_primary_term']
      end
 
+     def make_running!
+       with_concurrency_control do |es_doc, seq_no, primary_term|
+         now = Time.now
+         doc = {
+           status: Connectors::SyncStatus::IN_PROGRESS,
+           started_at: now,
+           last_seen: now,
+           worker_hostname: Socket.gethostname
+         }
+
+         ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
+       end
+     end
+
+     def es_source
+       @elasticsearch_response[:_source]
+     end
+
+     private
+
      def self.fetch_jobs_by_query(query, page_size)
        results = []
        offset = 0
@@ -133,5 +179,32 @@ module Core
 
        results
      end
+
+     def initialize(es_response)
+       # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
+       @elasticsearch_response = es_response.with_indifferent_access
+     end
+
+     def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
+       ingestion_stats ||= {}
+       ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
+       doc = {
+         :last_seen => Time.now,
+         :completed_at => Time.now,
+         :status => status,
+         :error => error
+       }.merge(ingestion_stats)
+       doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
+       doc[:metadata] = connector_metadata if connector_metadata&.any?
+       ElasticConnectorActions.update_job_fields(id, doc)
+     end
+
+     def seq_no
+       @elasticsearch_response[:_seq_no]
+     end
+
+     def primary_term
+       @elasticsearch_response[:_primary_term]
+     end
    end
  end
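
The with_concurrency_control/make_running! pair implements optimistic concurrency: the job document is read together with its `_seq_no` and `_primary_term`, and update_job_fields applies the write conditionally on both, so two workers racing to claim the same pending job cannot both succeed. A hedged usage sketch follows; the job id and the failure handling are assumptions for illustration, not code from this diff:

    job = Core::ConnectorJob.fetch_by_id('some-job-id')  # illustrative id
    begin
      job.make_running!  # conditional update; rejected on a version conflict
    rescue StandardError => e
      # Assumed behavior: if another worker claimed the job first,
      # Elasticsearch rejects the write with a 409 version conflict.
      Utility::Logger.info("Job already claimed elsewhere: #{e.message}")
    end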
data/lib/core/connector_settings.rb
@@ -8,7 +8,6 @@
 
  require 'active_support/core_ext/hash/indifferent_access'
  require 'connectors/connector_status'
- require 'connectors/registry'
  require 'core/elastic_connector_actions'
  require 'utility'
 
@@ -24,18 +23,16 @@ module Core
 
      DEFAULT_PAGE_SIZE = 100
 
-     # Error Classes
-     class ConnectorNotFoundError < StandardError; end
-
      def self.fetch_by_id(connector_id)
        es_response = ElasticConnectorActions.get_connector(connector_id)
-       connectors_meta = ElasticConnectorActions.connectors_meta
+       return nil unless es_response[:found]
 
-       raise ConnectorNotFoundError.new("Connector with id=#{connector_id} was not found.") unless es_response[:found]
+       connectors_meta = ElasticConnectorActions.connectors_meta
        new(es_response, connectors_meta)
      end
 
      def self.fetch_native_connectors(page_size = DEFAULT_PAGE_SIZE)
+       require 'connectors/registry' unless defined?(Connectors::REGISTRY)
        query = {
          bool: {
            filter: [
@@ -122,6 +119,32 @@ module Core
        index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
      end
 
+     def ready_for_sync?
+       Connectors::REGISTRY.registered?(service_type) &&
+         valid_index_name? &&
+         connector_status_allows_sync?
+     end
+
+     def running?
+       @elasticsearch_response[:_source][:last_sync_status] == Connectors::SyncStatus::IN_PROGRESS
+     end
+
+     def update_last_sync!(job)
+       doc = {
+         :last_sync_status => job.status,
+         :last_synced => Time.now,
+         :last_sync_error => job.error,
+         :error => job.error
+       }
+
+       if job.terminated?
+         doc[:last_indexed_document_count] = job[:indexed_document_count]
+         doc[:last_deleted_document_count] = job[:deleted_document_count]
+       end
+
+       Core::ElasticConnectorActions.update_connector_fields(job.connector_id, doc)
+     end
+
      private
 
      def initialize(es_response, connectors_meta)
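
For a successfully completed job, update_last_sync! would write a connector document along these lines (a sketch: the counts are invented, and the status string assumes the protocol's lowercase values):

    {
      :last_sync_status => 'completed',
      :last_synced => Time.now,
      :last_sync_error => nil,
      :error => nil,
      :last_indexed_document_count => 1204,
      :last_deleted_document_count => 7
    }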