connectors_service 8.6.0.3 → 8.6.0.4.pre.20221104T200814Z

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b550a78dba7e4cd4502b6eea4c187253e6bb77ab944815b59809d92c7ccc23bb
4
- data.tar.gz: dbe8c32e1da94ed40777af57a84ecac883a15cab71aa164534157314bbcbfcff
3
+ metadata.gz: 23b78b5c5228f761b884af1bfd75b163ad912dc53331ce5a64cd3edb8f45ef86
4
+ data.tar.gz: 5c74e06f315ab9af4161e88fac626e84b6d2789c0b5b080c713e780a4560ef97
5
5
  SHA512:
6
- metadata.gz: 92ef83e3bc94107b1cb11b4454760b17c217b61beb555b55d01ddb7b0758372c3f04ed1acc153e406f20c7abb2df5a3540c2b93733b48eafcc99bd752e7d6759
7
- data.tar.gz: a7eade1996d683fcf47e072704d566479538249d9711e8c0d2019fbb7bd15c382c61cd35adc6ee52f989a75c1fa6d7a6fe1330a04801fb46952aa307f2e93ac5
6
+ metadata.gz: 75295f0d26061a977089e999190f4974f4929ca92c25c09e7ecbbd1f0117476f758b7f200b719461bdd9e7b9e6193df455d3e52bcec0634fc85a7b512721ab6a
7
+ data.tar.gz: fa12af986f72d10081245e782a9152a887512e62f48ad525af59e59a4ef72238727e7bfd521f516fd452a2e74a1101c5bd1965f1e9b97f7b236b8ae2ccf626ff
@@ -1,11 +1,10 @@
1
1
  # general metadata
2
- version: 8.6.0.3
3
- repository: https://github.com/elastic/connectors-ruby.git
4
- revision: aa2faf8cc993a26980441adffe97d62fdaf5aa5c
2
+ version: 8.6.0.4-20221104T200814Z
3
+ repository: git@github.com:elastic/ent-search-connectors.git
4
+ revision: 2051b3907639a1fe2ae68efdc33c06cf12d38383
5
5
  elasticsearch:
6
- cloud_id: CHANGEME
7
6
  hosts: http://localhost:9200
8
- api_key: CHANGEME
7
+ api_key: OW1FalJJUUI1clBtUVh5RVo1QmU6QVp5LV9pU3RRUXFYb2VVYnlCRWNZdw==
9
8
  retry_on_failure: 3
10
9
  request_timeout: 120
11
10
  disable_warnings: true
@@ -16,10 +15,10 @@ thread_pool:
16
15
  max_threads: 5
17
16
  max_queue: 100
18
17
  log_level: info
19
- ecs_logging: true
18
+ ecs_logging: false
20
19
  poll_interval: 3
21
20
  termination_timeout: 60
22
21
  heartbeat_interval: 1800
23
- native_mode: true
24
- connector_id: CHANGEME
25
- service_type: CHANGEME
22
+ native_mode: false
23
+ connector_id: 9WEjRIQB5rPmQXyEWJB2
24
+ service_type: example
data/lib/app/config.rb CHANGED
@@ -35,6 +35,8 @@ puts "Parsing #{CONFIG_FILE} configuration file."
35
35
  optional(:disable_warnings).value(:bool?)
36
36
  optional(:trace).value(:bool?)
37
37
  optional(:log).value(:bool?)
38
+ optional(:ca_fingerprint).value(:string)
39
+ optional(:transport_options).value(:hash)
38
40
  end
39
41
 
40
42
  optional(:thread_pool).hash do
@@ -84,10 +84,14 @@ module App
84
84
  def start_sync_task(connector_settings)
85
85
  start_heartbeat_task(connector_settings)
86
86
  pool.post do
87
- Utility::Logger.info("Starting a sync job for #{connector_settings.formatted}...")
87
+ Utility::Logger.info("Initiating a sync job for #{connector_settings.formatted}...")
88
88
  Core::ElasticConnectorActions.ensure_content_index_exists(connector_settings.index_name)
89
89
  job_runner = Core::SyncJobRunner.new(connector_settings)
90
90
  job_runner.execute
91
+ rescue Core::JobAlreadyRunningError
92
+ Utility::Logger.info("Sync job for #{connector_settings.formatted} is already running, skipping.")
93
+ rescue Core::ConnectorVersionChangedError => e
94
+ Utility::Logger.info("Could not start the job because #{connector_settings.formatted} has been updated externally. Message: #{e.message}")
91
95
  rescue StandardError => e
92
96
  Utility::ExceptionTracking.log_exception(e, "Sync job for #{connector_settings.formatted} failed due to unexpected error.")
93
97
  end
@@ -75,6 +75,10 @@ module App
75
75
  else
76
76
  raise UnhealthyCluster, "Unexpected cluster status: #{response['status']}"
77
77
  end
78
+ rescue *Utility::AUTHORIZATION_ERRORS => e
79
+ Utility::ExceptionTracking.log_exception(e)
80
+
81
+ fail_check!("Elasticsearch returned 'Unauthorized' response. Check your authentication details. Terminating...")
78
82
  rescue *App::RETRYABLE_CONNECTION_ERRORS => e
79
83
  Utility::Logger.warn('Could not connect to Elasticsearch. Make sure it is running and healthy.')
80
84
  Utility::Logger.debug("Error: #{e.full_message}")
@@ -8,9 +8,9 @@
8
8
 
9
9
  require 'bson'
10
10
  require 'core/output_sink'
11
- require 'utility/exception_tracking'
12
- require 'utility/errors'
11
+ require 'utility'
13
12
  require 'app/config'
13
+ require 'active_support/core_ext/hash/indifferent_access'
14
14
 
15
15
  module Connectors
16
16
  module Base
@@ -19,6 +19,11 @@ module Connectors
19
19
  raise 'Not implemented for this connector'
20
20
  end
21
21
 
22
+ # Used as a framework util method, don't override
23
+ def self.configurable_fields_indifferent_access
24
+ configurable_fields.with_indifferent_access
25
+ end
26
+
22
27
  def self.configurable_fields
23
28
  {}
24
29
  end
@@ -27,8 +32,16 @@ module Connectors
27
32
  raise 'Not implemented for this connector'
28
33
  end
29
34
 
30
- def initialize(configuration: {})
35
+ attr_reader :rules, :advanced_filter_config
36
+
37
+ def initialize(configuration: {}, job_description: {})
31
38
  @configuration = configuration.dup || {}
39
+ @job_description = job_description&.dup || {}
40
+
41
+ filter = get_filter(@job_description[:filtering])
42
+
43
+ @rules = Utility::Common.return_if_present(filter[:rules], [])
44
+ @advanced_filter_config = Utility::Common.return_if_present(filter[:advanced_config], {})
32
45
  end
33
46
 
34
47
  def yield_documents; end
@@ -52,6 +65,19 @@ module Connectors
52
65
  Utility::ExceptionTracking.log_exception(e, "Connector for service #{self.class.service_type} failed the health check for 3rd-party service.")
53
66
  false
54
67
  end
68
+
69
+ def filtering_present?
70
+ @advanced_filter_config.present? || @rules.present?
71
+ end
72
+
73
+ private
74
+
75
+ def get_filter(filtering)
76
+ # assume for now that the first object in the filtering array, or the filter object itself, is the only filtering object
77
+ filter = filtering.is_a?(Array) ? filtering[0] : filtering
78
+
79
+ filter.present? ? filter : {}
80
+ end
55
81
  end
56
82
  end
57
83
  end
@@ -8,11 +8,11 @@
8
8
 
9
9
  module Connectors
10
10
  class ConnectorStatus
11
- CREATED = 'created'
11
+ CREATED = 'created'
12
12
  NEEDS_CONFIGURATION = 'needs_configuration'
13
- CONFIGURED = 'configured'
14
- CONNECTED = 'connected'
15
- ERROR = 'error'
13
+ CONFIGURED = 'configured'
14
+ CONNECTED = 'connected'
15
+ ERROR = 'error'
16
16
 
17
17
  STATUSES = [
18
18
  CREATED,
@@ -20,16 +20,21 @@ module Connectors
20
20
  'Example Connector'
21
21
  end
22
22
 
23
+ # Field 'Foo' won't have a default value. Field 'Bar' will have the default value 'Value'.
23
24
  def self.configurable_fields
24
25
  {
25
26
  'foo' => {
26
27
  'label' => 'Foo',
27
28
  'value' => nil
29
+ },
30
+ :bar => {
31
+ :label => 'Bar',
32
+ :value => 'Value'
28
33
  }
29
34
  }
30
35
  end
31
36
 
32
- def initialize(configuration: {})
37
+ def initialize(configuration: {}, job_description: {})
33
38
  super
34
39
  end
35
40
 
@@ -42,16 +47,35 @@ module Connectors
42
47
 
43
48
  def yield_documents
44
49
  attachments = [
45
- File.open('./lib/connectors/example/example_attachments/first_attachment.txt'),
46
- File.open('./lib/connectors/example/example_attachments/second_attachment.txt'),
47
- File.open('./lib/connectors/example/example_attachments/third_attachment.txt')
50
+ load_attachment('first_attachment.txt'),
51
+ load_attachment('second_attachment.txt'),
52
+ load_attachment('third_attachment.txt'),
48
53
  ]
49
54
 
50
55
  attachments.each_with_index do |att, index|
51
56
  data = { id: (index + 1).to_s, name: "example document #{index + 1}", _attachment: File.read(att) }
57
+
58
+ # Uncomment one of these two lines to simulate longer running sync jobs
59
+ #
60
+ # sleep(rand(10..60).seconds)
61
+ # sleep(rand(1..10).minutes)
62
+
52
63
  yield data
53
64
  end
54
65
  end
66
+
67
+ private
68
+
69
+ def load_attachment(path)
70
+ attachment_dir = "#{File.dirname(__FILE__)}/attachments"
71
+ attachment_path = "#{attachment_dir}/#{path}"
72
+
73
+ unless File.exist?(attachment_path)
74
+ raise "Attachment at location '#{attachment_path}' doesn't exist. Attachments should be located under #{attachment_dir}"
75
+ end
76
+
77
+ File.open(attachment_path)
78
+ end
55
79
  end
56
80
  end
57
81
  end
@@ -36,7 +36,7 @@ module Connectors
36
36
  }
37
37
  end
38
38
 
39
- def initialize(configuration: {})
39
+ def initialize(configuration: {}, job_description: {})
40
40
  super
41
41
 
42
42
  @extractor = Connectors::GitLab::Extractor.new(
@@ -6,13 +6,18 @@
6
6
 
7
7
  # frozen_string_literal: true
8
8
 
9
- require 'active_support/core_ext/hash/indifferent_access'
10
9
  require 'connectors/base/connector'
11
10
  require 'mongo'
11
+ require 'utility'
12
12
 
13
13
  module Connectors
14
14
  module MongoDB
15
15
  class Connector < Connectors::Base::Connector
16
+
17
+ ALLOWED_TOP_LEVEL_FILTER_KEYS = %w[find aggregate]
18
+
19
+ PAGE_SIZE = 100
20
+
16
21
  def self.service_type
17
22
  'mongodb'
18
23
  end
@@ -44,7 +49,7 @@ module Connectors
44
49
  }
45
50
  end
46
51
 
47
- def initialize(configuration: {})
52
+ def initialize(configuration: {}, job_description: {})
48
53
  super
49
54
 
50
55
  @host = configuration.dig(:host, :value)
@@ -56,17 +61,108 @@ module Connectors
56
61
  end
57
62
 
58
63
  def yield_documents
64
+ check_filtering
65
+
59
66
  with_client do |client|
60
- client[@collection].find.each do |document|
61
- doc = document.with_indifferent_access
67
+ # We do paging using skip().limit() here to make Ruby recycle the memory for each page pulled from the server after it's not needed any more.
68
+ # This gives us more control on the usage of the memory (we can adjust PAGE_SIZE constant for that to decrease max memory consumption).
69
+ # It's done due to the fact that usage of .find.each leads to memory leaks or overuse of memory - the whole result set seems to stay in memory
70
+ # during the sync. Sometimes (not 100% sure) it even leads to a real leak, when the memory for these objects is never recycled.
71
+ cursor, options = create_db_cursor_on_collection(client[@collection])
72
+ skip = 0
73
+
74
+ found_overall = 0
75
+
76
+ # if no overall limit is specified by filtering, use Float::INFINITY so that ingestion is never cut short (found_overall is only increased,
77
+ # thus can never reach Float::INFINITY)
78
+ overall_limit = Float::INFINITY
79
+
80
+ if options.present?
81
+ # there could be a skip parameter defined for filtering
82
+ skip = options.fetch(:skip, skip)
83
+ # there could be a limit parameter defined for filtering -> used for an overall limit (not a page limit, which was introduced for memory optimization)
84
+ overall_limit = options.fetch(:limit, overall_limit)
85
+ end
86
+
87
+ overall_limit_reached = false
88
+
89
+ loop do
90
+ found_in_page = 0
91
+
92
+ view = cursor.skip(skip).limit(PAGE_SIZE)
93
+ view.each do |document|
94
+ yield serialize(document)
95
+
96
+ found_in_page += 1
97
+ found_overall += 1
98
+
99
+ overall_limit_reached = found_overall >= overall_limit && overall_limit != Float::INFINITY
100
+
101
+ break if overall_limit_reached
102
+ end
103
+
104
+ page_was_empty = found_in_page == 0
105
+
106
+ break if page_was_empty || overall_limit_reached
62
107
 
63
- yield serialize(doc)
108
+ skip += PAGE_SIZE
64
109
  end
65
110
  end
66
111
  end
67
112
 
68
113
  private
69
114
 
115
+ def create_db_cursor_on_collection(collection)
116
+ return create_find_cursor(collection) if @advanced_filter_config[:find].present?
117
+
118
+ return create_aggregate_cursor(collection) if @advanced_filter_config[:aggregate].present?
119
+
120
+ collection.find
121
+ end
122
+
123
+ def check_filtering
124
+ return unless filtering_present?
125
+
126
+ check_find_and_aggregate
127
+ end
128
+
129
+ def check_find_and_aggregate
130
+ if @advanced_filter_config.keys.size != 1
131
+ invalid_keys_msg = "Only one of #{ALLOWED_TOP_LEVEL_FILTER_KEYS} is allowed in the filtering object. Keys present: '#{@advanced_filter_config.keys}'."
132
+ raise Utility::InvalidFilterConfigError.new(invalid_keys_msg)
133
+ end
134
+ end
135
+
136
+ def create_aggregate_cursor(collection)
137
+ aggregate = @advanced_filter_config[:aggregate]
138
+
139
+ pipeline = aggregate[:pipeline]
140
+ options = extract_options(aggregate)
141
+
142
+ if !pipeline.nil? && pipeline.empty? && !options.present?
143
+ Utility::Logger.warn('\'Aggregate\' was specified with an empty pipeline and empty options.')
144
+ end
145
+
146
+ [collection.aggregate(pipeline, options), options]
147
+ end
148
+
149
+ def create_find_cursor(collection)
150
+ find = @advanced_filter_config[:find]
151
+
152
+ filter = find[:filter]
153
+ options = extract_options(find)
154
+
155
+ if !filter.nil? && filter.empty? && !options.present?
156
+ Utility::Logger.warn('\'Find\' was specified with an empty filter and empty options.')
157
+ end
158
+
159
+ [collection.find(filter, options), options]
160
+ end
161
+
162
+ def extract_options(mongodb_function)
163
+ mongodb_function[:options].present? ? mongodb_function[:options] : {}
164
+ end
165
+
70
166
  def do_health_check
71
167
  with_client do |_client|
72
168
  Utility::Logger.debug("Mongo at #{@host}/#{@database} looks healthy.")
@@ -76,34 +172,43 @@ module Connectors
76
172
  def with_client
77
173
  raise "Invalid value for 'Direct connection' : #{@direct_connection}." unless %w[true false].include?(@direct_connection.to_s.strip.downcase)
78
174
 
79
- client = if @user.present? || @password.present?
80
- Mongo::Client.new(
81
- @host,
82
- database: @database,
83
- direct_connection: to_boolean(@direct_connection),
84
- user: @user,
85
- password: @password
86
- )
87
- else
88
- Mongo::Client.new(
89
- @host,
90
- database: @database,
91
- direct_connection: to_boolean(@direct_connection)
92
- )
93
- end
94
-
95
- begin
96
- Utility::Logger.debug("Existing Databases #{client.database_names}")
97
- Utility::Logger.debug('Existing Collections:')
98
-
99
- client.collections.each { |coll| Utility::Logger.debug(coll.name) }
175
+ args = {
176
+ database: @database,
177
+ direct_connection: to_boolean(@direct_connection)
178
+ }
179
+
180
+ if @user.present? || @password.present?
181
+ args[:user] = @user
182
+ args[:password] = @password
183
+ end
184
+
185
+ Mongo::Client.new(@host, args) do |client|
186
+ databases = client.database_names
187
+
188
+ Utility::Logger.debug("Existing Databases: #{databases}")
189
+ check_database_exists!(databases, @database)
190
+
191
+ collections = client.database.collection_names
192
+
193
+ Utility::Logger.debug("Existing Collections: #{collections}")
194
+ check_collection_exists!(collections, @database, @collection)
100
195
 
101
196
  yield client
102
- ensure
103
- client.close
104
197
  end
105
198
  end
106
199
 
200
+ def check_database_exists!(databases, database)
201
+ return if databases.include?(database)
202
+
203
+ raise "Database (#{database}) does not exist. Existing databases: #{databases.join(', ')}"
204
+ end
205
+
206
+ def check_collection_exists!(collections, database, collection)
207
+ return if collections.include?(collection)
208
+
209
+ raise "Collection (#{collection}) does not exist within database '#{database}'. Existing collections: #{collections.join(', ')}"
210
+ end
211
+
107
212
  def serialize(mongodb_document)
108
213
  # This is some lazy serialization here.
109
214
  # Problem: MongoDB has its own format of things - e.g. ids are Bson::ObjectId, which when serialized to JSON
@@ -120,11 +225,10 @@ module Connectors
120
225
  mongodb_document.map { |v| serialize(v) }
121
226
  when Hash
122
227
  mongodb_document.map do |key, value|
123
- remapped_key = key.to_sym == :_id ? :id : key.to_sym
124
-
228
+ key = 'id' if key == '_id'
125
229
  remapped_value = serialize(value)
126
- [remapped_key, remapped_value]
127
- end.to_h.with_indifferent_access
230
+ [key, remapped_value]
231
+ end.to_h
128
232
  else
129
233
  mongodb_document
130
234
  end
@@ -24,10 +24,10 @@ module Connectors
24
24
  @connectors[name]
25
25
  end
26
26
 
27
- def connector(name, configuration)
27
+ def connector(name, configuration, job_description: {})
28
28
  klass = connector_class(name)
29
29
  if klass.present?
30
- return klass.new(configuration: configuration)
30
+ return klass.new(configuration: configuration, job_description: job_description)
31
31
  end
32
32
  raise "Connector #{name} is not yet registered. You need to register it before use"
33
33
  end
@@ -8,14 +8,33 @@
8
8
 
9
9
  module Connectors
10
10
  class SyncStatus
11
- COMPLETED = 'completed'
11
+ PENDING = 'pending'
12
12
  IN_PROGRESS = 'in_progress'
13
- FAILED = 'failed'
13
+ CANCELING = 'canceling'
14
+ CANCELED = 'canceled'
15
+ SUSPENDED = 'suspended'
16
+ COMPLETED = 'completed'
17
+ ERROR = 'error'
14
18
 
15
19
  STATUSES = [
16
- COMPLETED,
20
+ PENDING,
17
21
  IN_PROGRESS,
18
- FAILED
22
+ CANCELING,
23
+ CANCELED,
24
+ SUSPENDED,
25
+ COMPLETED,
26
+ ERROR
27
+ ]
28
+
29
+ PENDING_STATUES = [
30
+ PENDING,
31
+ SUSPENDED
32
+ ]
33
+
34
+ TERMINAL_STATUSES = [
35
+ CANCELED,
36
+ COMPLETED,
37
+ ERROR
19
38
  ]
20
39
  end
21
40
  end
@@ -23,7 +23,7 @@ module Core
23
23
  Utility::Logger.error("Couldn't find connector for service type #{connector_settings.service_type || service_type}")
24
24
  return
25
25
  end
26
- configuration = connector_class.configurable_fields
26
+ configuration = connector_class.configurable_fields_indifferent_access
27
27
  doc = {
28
28
  :configuration => configuration
29
29
  }
@@ -19,6 +19,8 @@ module Core
19
19
  DEFAULT_REDUCE_WHITESPACE = true
20
20
  DEFAULT_RUN_ML_INFERENCE = true
21
21
 
22
+ DEFAULT_FILTERING = {}
23
+
22
24
  DEFAULT_PAGE_SIZE = 100
23
25
 
24
26
  # Error Classes
@@ -80,20 +82,24 @@ module Core
80
82
  self[:scheduling]
81
83
  end
82
84
 
85
+ def filtering
86
+ Utility::Common.return_if_present(@elasticsearch_response[:filtering], DEFAULT_FILTERING)
87
+ end
88
+
83
89
  def request_pipeline
84
- return_if_present(@elasticsearch_response.dig(:pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
90
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
85
91
  end
86
92
 
87
93
  def extract_binary_content?
88
- return_if_present(@elasticsearch_response.dig(:pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
94
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
89
95
  end
90
96
 
91
97
  def reduce_whitespace?
92
- return_if_present(@elasticsearch_response.dig(:pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
98
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
93
99
  end
94
100
 
95
101
  def run_ml_inference?
96
- return_if_present(@elasticsearch_response.dig(:pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
102
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
97
103
  end
98
104
 
99
105
  def formatted
@@ -110,8 +116,6 @@ module Core
110
116
  index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
111
117
  end
112
118
 
113
- private
114
-
115
119
  def self.fetch_connectors_by_query(query, page_size)
116
120
  connectors_meta = ElasticConnectorActions.connectors_meta
117
121
 
@@ -120,8 +124,8 @@ module Core
120
124
  loop do
121
125
  response = ElasticConnectorActions.search_connectors(query, page_size, offset)
122
126
 
123
- hits = response['hits']['hits']
124
- total = response['hits']['total']['value']
127
+ hits = response.dig('hits', 'hits') || []
128
+ total = response.dig('hits', 'total', 'value') || 0
125
129
  results += hits.map do |hit|
126
130
  Core::ConnectorSettings.new(hit, connectors_meta)
127
131
  end
@@ -132,11 +136,5 @@ module Core
132
136
  results
133
137
  end
134
138
 
135
- def return_if_present(*args)
136
- args.each do |arg|
137
- return arg unless arg.nil?
138
- end
139
- nil
140
- end
141
139
  end
142
140
  end
@@ -10,8 +10,21 @@ require 'active_support/core_ext/hash'
10
10
  require 'connectors/connector_status'
11
11
  require 'connectors/sync_status'
12
12
  require 'utility'
13
+ require 'elastic-transport'
13
14
 
14
15
  module Core
16
+ class JobAlreadyRunningError < StandardError
17
+ def initialize(connector_id)
18
+ super("Sync job for connector '#{connector_id}' is already running.")
19
+ end
20
+ end
21
+
22
+ class ConnectorVersionChangedError < StandardError
23
+ def initialize(connector_id, seq_no, primary_term)
24
+ super("Version conflict: seq_no [#{seq_no}] and primary_term [#{primary_term}] do not match for connector '#{connector_id}'.")
25
+ end
26
+ end
27
+
15
28
  class ElasticConnectorActions
16
29
  class << self
17
30
 
@@ -72,20 +85,53 @@ module Core
72
85
  end
73
86
 
74
87
  def claim_job(connector_id)
75
- update_connector_fields(connector_id,
76
- :sync_now => false,
77
- :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
78
- :last_synced => Time.now)
88
+ seq_no = nil
89
+ primary_term = nil
90
+ sync_in_progress = false
91
+ connector_record = client.get(
92
+ :index => Utility::Constants::CONNECTORS_INDEX,
93
+ :id => connector_id,
94
+ :ignore => 404,
95
+ :refresh => true
96
+ ).tap do |response|
97
+ seq_no = response['_seq_no']
98
+ primary_term = response['_primary_term']
99
+ sync_in_progress = response.dig('_source', 'last_sync_status') == Connectors::SyncStatus::IN_PROGRESS
100
+ end
101
+ if sync_in_progress
102
+ raise JobAlreadyRunningError.new(connector_id)
103
+ end
104
+ update_connector_fields(
105
+ connector_id,
106
+ { :sync_now => false,
107
+ :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
108
+ :last_synced => Time.now },
109
+ seq_no,
110
+ primary_term
111
+ )
79
112
 
80
113
  body = {
81
114
  :connector_id => connector_id,
82
115
  :status => Connectors::SyncStatus::IN_PROGRESS,
83
116
  :worker_hostname => Socket.gethostname,
84
- :created_at => Time.now
117
+ :created_at => Time.now,
118
+ :filtering => convert_connector_filtering_to_job_filtering(connector_record.dig('_source', 'filtering'))
85
119
  }
86
- job = client.index(:index => Utility::Constants::JOB_INDEX, :body => body)
87
120
 
88
- job['_id']
121
+ client.index(:index => Utility::Constants::JOB_INDEX, :body => body)
122
+ end
123
+
124
+ def convert_connector_filtering_to_job_filtering(connector_filtering)
125
+ return [] unless connector_filtering
126
+ connector_filtering = [connector_filtering] unless connector_filtering.is_a?(Array)
127
+ connector_filtering.each_with_object([]) do |filtering_domain, job_filtering|
128
+ job_filtering << {
129
+ 'domain' => filtering_domain['domain'],
130
+ 'rules' => filtering_domain.dig('active', 'rules'),
131
+ 'advanced_snippet' => filtering_domain.dig('active', 'advanced_snippet'),
132
+ 'warnings' => [] # TODO: in https://github.com/elastic/enterprise-search-team/issues/3174
133
+ }
134
+ end
89
135
  end
90
136
 
91
137
  def update_connector_status(connector_id, status, error_message = nil)
@@ -100,7 +146,7 @@ module Core
100
146
  end
101
147
 
102
148
  def complete_sync(connector_id, job_id, status)
103
- sync_status = status[:error] ? Connectors::SyncStatus::FAILED : Connectors::SyncStatus::COMPLETED
149
+ sync_status = status[:error] ? Connectors::SyncStatus::ERROR : Connectors::SyncStatus::COMPLETED
104
150
 
105
151
  update_connector_fields(connector_id,
106
152
  :last_sync_status => sync_status,
@@ -136,7 +182,7 @@ module Core
136
182
  }
137
183
  loop do
138
184
  response = client.search(:body => body)
139
- hits = response['hits']['hits']
185
+ hits = response.dig('hits', 'hits') || []
140
186
 
141
187
  ids = hits.map { |h| h['_id'] }
142
188
  result += ids
@@ -242,15 +288,29 @@ module Core
242
288
  ensure_index_exists("#{Utility::Constants::JOB_INDEX}-v1", system_index_body(:alias_name => Utility::Constants::JOB_INDEX, :mappings => mappings))
243
289
  end
244
290
 
245
- def update_connector_fields(connector_id, doc = {})
291
+ def update_connector_fields(connector_id, doc = {}, seq_no = nil, primary_term = nil)
246
292
  return if doc.empty?
247
- client.update(
293
+ update_args = {
248
294
  :index => Utility::Constants::CONNECTORS_INDEX,
249
295
  :id => connector_id,
250
296
  :body => { :doc => doc },
251
297
  :refresh => true,
252
298
  :retry_on_conflict => 3
253
- )
299
+ }
300
+ # seq_no and primary_term are used for optimistic concurrency control
301
+ # see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
302
+ if seq_no && primary_term
303
+ update_args[:if_seq_no] = seq_no
304
+ update_args[:if_primary_term] = primary_term
305
+ update_args.delete(:retry_on_conflict)
306
+ end
307
+ begin
308
+ client.update(update_args)
309
+ rescue Elastic::Transport::Transport::Errors::Conflict
310
+ # VersionConflictException
311
+ # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
312
+ raise ConnectorVersionChangedError.new(connector_id, seq_no, primary_term)
313
+ end
254
314
  end
255
315
 
256
316
  private
@@ -16,6 +16,9 @@ module Core
16
16
  class NativeScheduler < Core::Scheduler
17
17
  def connector_settings
18
18
  Core::ConnectorSettings.fetch_native_connectors || []
19
+ rescue *Utility::AUTHORIZATION_ERRORS => e
20
+ # should be handled by the general scheduler
21
+ raise e
19
22
  rescue StandardError => e
20
23
  Utility::ExceptionTracking.log_exception(e, 'Could not retrieve native connectors due to unexpected error.')
21
24
  []
@@ -9,6 +9,7 @@
9
9
  require 'time'
10
10
  require 'fugit'
11
11
  require 'core/connector_settings'
12
+ require 'core/elastic_connector_actions'
12
13
  require 'utility/cron'
13
14
  require 'utility/logger'
14
15
  require 'utility/exception_tracking'
@@ -41,6 +42,8 @@ module Core
41
42
  if @is_shutting_down
42
43
  break
43
44
  end
45
+ rescue *Utility::AUTHORIZATION_ERRORS => e
46
+ Utility::ExceptionTracking.log_exception(e, 'Could not retrieve connectors settings due to authorization error.')
44
47
  rescue StandardError => e
45
48
  Utility::ExceptionTracking.log_exception(e, 'Sync failed due to unexpected error.')
46
49
  ensure
@@ -21,6 +21,9 @@ module Core
21
21
  def connector_settings
22
22
  connector_settings = Core::ConnectorSettings.fetch_by_id(@connector_id)
23
23
  [connector_settings]
24
+ rescue *Utility::AUTHORIZATION_ERRORS => e
25
+ # should be handled by the general scheduler
26
+ raise e
24
27
  rescue StandardError => e
25
28
  Utility::ExceptionTracking.log_exception(e, "Could not retrieve the connector by id #{@connector_id} due to unexpected error.")
26
29
  []
@@ -23,7 +23,7 @@ module Core
23
23
  @connector_settings = connector_settings
24
24
  @sink = Core::OutputSink::EsSink.new(connector_settings.index_name, @connector_settings.request_pipeline)
25
25
  @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
26
- @connector_instance = Connectors::REGISTRY.connector(connector_settings.service_type, connector_settings.configuration)
26
+ @sync_finished = false
27
27
  @status = {
28
28
  :indexed_document_count => 0,
29
29
  :deleted_document_count => 0,
@@ -39,9 +39,10 @@ module Core
39
39
  private
40
40
 
41
41
  def do_sync!
42
- Utility::Logger.info("Starting sync for connector #{@connector_settings.id}.")
42
+ Utility::Logger.info("Claiming a sync job for connector #{@connector_settings.id}.")
43
43
 
44
- job_id = ElasticConnectorActions.claim_job(@connector_settings.id)
44
+ job_description = ElasticConnectorActions.claim_job(@connector_settings.id)
45
+ job_id = job_description['_id']
45
46
 
46
47
  unless job_id.present?
47
48
  Utility::Logger.error("Failed to claim the job for #{@connector_settings.id}. Please check the logs for the cause of this error.")
@@ -51,17 +52,19 @@ module Core
51
52
  begin
52
53
  Utility::Logger.debug("Successfully claimed job for connector #{@connector_settings.id}.")
53
54
 
54
- @connector_instance.do_health_check!
55
+ connector_instance = Connectors::REGISTRY.connector(@connector_settings.service_type, @connector_settings.configuration, job_description: job_description)
56
+
57
+ connector_instance.do_health_check!
55
58
 
56
59
  incoming_ids = []
57
60
  existing_ids = ElasticConnectorActions.fetch_document_ids(@connector_settings.index_name)
58
61
 
59
62
  Utility::Logger.debug("#{existing_ids.size} documents are present in index #{@connector_settings.index_name}.")
60
63
 
61
- @connector_instance.yield_documents do |document|
64
+ connector_instance.yield_documents do |document|
62
65
  document = add_ingest_metadata(document)
63
66
  @sink.ingest(document)
64
- incoming_ids << document[:id]
67
+ incoming_ids << document['id']
65
68
  @status[:indexed_document_count] += 1
66
69
  end
67
70
 
@@ -75,6 +78,10 @@ module Core
75
78
  end
76
79
 
77
80
  @sink.flush
81
+
82
+ # We use this mechanism for checking whether an interrupt (or something else that led to the thread not finishing)
83
+ # occurred, as most of the time the main execution thread is interrupted and we miss this Signal/Exception here
84
+ @sync_finished = true
78
85
  rescue StandardError => e
79
86
  @status[:error] = e.message
80
87
  Utility::ExceptionTracking.log_exception(e)
@@ -83,10 +90,15 @@ module Core
83
90
  Utility::Logger.info("Upserted #{@status[:indexed_document_count]} documents into #{@connector_settings.index_name}.")
84
91
  Utility::Logger.info("Deleted #{@status[:deleted_document_count]} documents into #{@connector_settings.index_name}.")
85
92
 
93
+ # Make sure to not override a previous error message
94
+ if !@sync_finished && @status[:error].nil?
95
+ @status[:error] = 'Sync thread didn\'t finish execution. Check connector logs for more details.'
96
+ end
97
+
86
98
  ElasticConnectorActions.complete_sync(@connector_settings.id, job_id, @status.dup)
87
99
 
88
100
  if @status[:error]
89
- Utility::Logger.info("Failed to sync for connector #{@connector_settings.id} with error #{@status[:error]}.")
101
+ Utility::Logger.info("Failed to sync for connector #{@connector_settings.id} with error '#{@status[:error]}'.")
90
102
  else
91
103
  Utility::Logger.info("Successfully synced for connector #{@connector_settings.id}.")
92
104
  end
@@ -4,10 +4,17 @@
4
4
  # you may not use this file except in compliance with the Elastic License.
5
5
  #
6
6
 
7
- module ConnectorsApp
8
- module Errors
9
- INVALID_API_KEY = 'INVALID_API_KEY'
10
- UNSUPPORTED_AUTH_SCHEME = 'UNSUPPORTED_AUTH_SCHEME'
11
- INTERNAL_SERVER_ERROR = 'INTERNAL_SERVER_ERROR'
7
+ # frozen_string_literal: true
8
+
9
+ module Utility
10
+ class Common
11
+ class << self
12
+ def return_if_present(*args)
13
+ args.each do |arg|
14
+ return arg unless arg.nil?
15
+ end
16
+ nil
17
+ end
18
+ end
12
19
  end
13
20
  end
@@ -5,6 +5,7 @@
5
5
  #
6
6
 
7
7
  require 'active_support/core_ext/string'
8
+ require 'elasticsearch'
8
9
 
9
10
  module Utility
10
11
  class DocumentError
@@ -31,6 +32,8 @@ module Utility
31
32
  end
32
33
 
33
34
  class ClientError < StandardError; end
35
+
36
+ class InvalidFilterConfigError < StandardError; end
34
37
  class EvictionWithNoProgressError < StandardError; end
35
38
  class EvictionError < StandardError
36
39
  attr_accessor :cursors
@@ -89,6 +92,7 @@ module Utility
89
92
  class InvalidTokenError < StandardError; end
90
93
  class TokenRefreshFailedError < StandardError; end
91
94
  class ConnectorNotAvailableError < StandardError; end
95
+ class AuthorizationError < StandardError; end
92
96
 
93
97
  # For when we want to explicitly set a #cause but can't
94
98
  class ExplicitlyCausedError < StandardError
@@ -124,6 +128,7 @@ module Utility
124
128
  end
125
129
  end
126
130
 
131
+ AUTHORIZATION_ERRORS = [Elastic::Transport::Transport::Errors::Unauthorized]
127
132
  INTERNAL_SERVER_ERROR = Utility::Error.new(500, 'INTERNAL_SERVER_ERROR', 'Internal server error')
128
133
  INVALID_API_KEY = Utility::Error.new(401, 'INVALID_API_KEY', 'Invalid API key')
129
134
  UNSUPPORTED_AUTH_SCHEME = Utility::Error.new(401, 'UNSUPPORTED_AUTH_SCHEME', 'Unsupported authorization scheme')
@@ -20,8 +20,8 @@ module Utility
20
20
  attr_reader :cause
21
21
  end
22
22
 
23
- def initialize(es_config)
24
- super(connection_configs(es_config))
23
+ def initialize(es_config, &block)
24
+ super(connection_configs(es_config), &block)
25
25
  end
26
26
 
27
27
  def connection_configs(es_config)
@@ -39,6 +39,10 @@ module Utility
39
39
  configs[:log] = es_config[:log] || false
40
40
  configs[:trace] = es_config[:trace] || false
41
41
 
42
+ # transport options
43
+ configs[:transport_options] = es_config[:transport_options] if es_config[:transport_options]
44
+ configs[:ca_fingerprint] = es_config[:ca_fingerprint] if es_config[:ca_fingerprint]
45
+
42
46
  # if log or trace is activated, we use the application logger
43
47
  configs[:logger] = if configs[:log] || configs[:trace]
44
48
  Utility::Logger.logger
data/lib/utility.rb CHANGED
@@ -6,6 +6,7 @@
6
6
 
7
7
  require 'utility/constants'
8
8
  require 'utility/cron'
9
+ require 'utility/common'
9
10
  require 'utility/errors'
10
11
  require 'utility/es_client'
11
12
  require 'utility/environment'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: connectors_service
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.6.0.3
4
+ version: 8.6.0.4.pre.20221104T200814Z
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elastic
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-10-03 00:00:00.000000000 Z
11
+ date: 2022-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -198,14 +198,14 @@ dependencies:
198
198
  requirements:
199
199
  - - "~>"
200
200
  - !ruby/object:Gem::Version
201
- version: 8.4.0
201
+ version: 8.5.0
202
202
  type: :runtime
203
203
  prerelease: false
204
204
  version_requirements: !ruby/object:Gem::Requirement
205
205
  requirements:
206
206
  - - "~>"
207
207
  - !ruby/object:Gem::Version
208
- version: 8.4.0
208
+ version: 8.5.0
209
209
  - !ruby/object:Gem::Dependency
210
210
  name: faraday
211
211
  requirement: !ruby/object:Gem::Requirement
@@ -400,10 +400,10 @@ files:
400
400
  - lib/connectors/base/custom_client.rb
401
401
  - lib/connectors/connector_status.rb
402
402
  - lib/connectors/crawler/scheduler.rb
403
+ - lib/connectors/example/attachments/first_attachment.txt
404
+ - lib/connectors/example/attachments/second_attachment.txt
405
+ - lib/connectors/example/attachments/third_attachment.txt
403
406
  - lib/connectors/example/connector.rb
404
- - lib/connectors/example/example_attachments/first_attachment.txt
405
- - lib/connectors/example/example_attachments/second_attachment.txt
406
- - lib/connectors/example/example_attachments/third_attachment.txt
407
407
  - lib/connectors/gitlab/adapter.rb
408
408
  - lib/connectors/gitlab/connector.rb
409
409
  - lib/connectors/gitlab/custom_client.rb
@@ -411,7 +411,6 @@ files:
411
411
  - lib/connectors/mongodb/connector.rb
412
412
  - lib/connectors/registry.rb
413
413
  - lib/connectors/sync_status.rb
414
- - lib/connectors_app/\
415
414
  - lib/connectors_service.rb
416
415
  - lib/connectors_utility.rb
417
416
  - lib/core.rb
@@ -433,6 +432,7 @@ files:
433
432
  - lib/stubs/connectors/stats.rb
434
433
  - lib/stubs/service_type.rb
435
434
  - lib/utility.rb
435
+ - lib/utility/common.rb
436
436
  - lib/utility/constants.rb
437
437
  - lib/utility/cron.rb
438
438
  - lib/utility/elasticsearch/index/language_data.yml
@@ -451,7 +451,7 @@ homepage: https://github.com/elastic/connectors-ruby
451
451
  licenses:
452
452
  - Elastic-2.0
453
453
  metadata: {}
454
- post_install_message:
454
+ post_install_message:
455
455
  rdoc_options: []
456
456
  require_paths:
457
457
  - lib
@@ -462,12 +462,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
462
462
  version: '0'
463
463
  required_rubygems_version: !ruby/object:Gem::Requirement
464
464
  requirements:
465
- - - ">="
465
+ - - ">"
466
466
  - !ruby/object:Gem::Version
467
- version: '0'
467
+ version: 1.3.1
468
468
  requirements: []
469
469
  rubygems_version: 3.0.3.1
470
- signing_key:
470
+ signing_key:
471
471
  specification_version: 4
472
472
  summary: Gem containing Elastic connectors service
473
473
  test_files: []