connectors_service 8.6.0.3 → 8.6.0.4.pre.20221104T200814Z

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b550a78dba7e4cd4502b6eea4c187253e6bb77ab944815b59809d92c7ccc23bb
4
- data.tar.gz: dbe8c32e1da94ed40777af57a84ecac883a15cab71aa164534157314bbcbfcff
3
+ metadata.gz: 23b78b5c5228f761b884af1bfd75b163ad912dc53331ce5a64cd3edb8f45ef86
4
+ data.tar.gz: 5c74e06f315ab9af4161e88fac626e84b6d2789c0b5b080c713e780a4560ef97
5
5
  SHA512:
6
- metadata.gz: 92ef83e3bc94107b1cb11b4454760b17c217b61beb555b55d01ddb7b0758372c3f04ed1acc153e406f20c7abb2df5a3540c2b93733b48eafcc99bd752e7d6759
7
- data.tar.gz: a7eade1996d683fcf47e072704d566479538249d9711e8c0d2019fbb7bd15c382c61cd35adc6ee52f989a75c1fa6d7a6fe1330a04801fb46952aa307f2e93ac5
6
+ metadata.gz: 75295f0d26061a977089e999190f4974f4929ca92c25c09e7ecbbd1f0117476f758b7f200b719461bdd9e7b9e6193df455d3e52bcec0634fc85a7b512721ab6a
7
+ data.tar.gz: fa12af986f72d10081245e782a9152a887512e62f48ad525af59e59a4ef72238727e7bfd521f516fd452a2e74a1101c5bd1965f1e9b97f7b236b8ae2ccf626ff
@@ -1,11 +1,10 @@
1
1
  # general metadata
2
- version: 8.6.0.3
3
- repository: https://github.com/elastic/connectors-ruby.git
4
- revision: aa2faf8cc993a26980441adffe97d62fdaf5aa5c
2
+ version: 8.6.0.4-20221104T200814Z
3
+ repository: git@github.com:elastic/ent-search-connectors.git
4
+ revision: 2051b3907639a1fe2ae68efdc33c06cf12d38383
5
5
  elasticsearch:
6
- cloud_id: CHANGEME
7
6
  hosts: http://localhost:9200
8
- api_key: CHANGEME
7
+ api_key: OW1FalJJUUI1clBtUVh5RVo1QmU6QVp5LV9pU3RRUXFYb2VVYnlCRWNZdw==
9
8
  retry_on_failure: 3
10
9
  request_timeout: 120
11
10
  disable_warnings: true
@@ -16,10 +15,10 @@ thread_pool:
16
15
  max_threads: 5
17
16
  max_queue: 100
18
17
  log_level: info
19
- ecs_logging: true
18
+ ecs_logging: false
20
19
  poll_interval: 3
21
20
  termination_timeout: 60
22
21
  heartbeat_interval: 1800
23
- native_mode: true
24
- connector_id: CHANGEME
25
- service_type: CHANGEME
22
+ native_mode: false
23
+ connector_id: 9WEjRIQB5rPmQXyEWJB2
24
+ service_type: example
data/lib/app/config.rb CHANGED
@@ -35,6 +35,8 @@ puts "Parsing #{CONFIG_FILE} configuration file."
35
35
  optional(:disable_warnings).value(:bool?)
36
36
  optional(:trace).value(:bool?)
37
37
  optional(:log).value(:bool?)
38
+ optional(:ca_fingerprint).value(:string)
39
+ optional(:transport_options).value(:hash)
38
40
  end
39
41
 
40
42
  optional(:thread_pool).hash do
@@ -84,10 +84,14 @@ module App
84
84
  def start_sync_task(connector_settings)
85
85
  start_heartbeat_task(connector_settings)
86
86
  pool.post do
87
- Utility::Logger.info("Starting a sync job for #{connector_settings.formatted}...")
87
+ Utility::Logger.info("Initiating a sync job for #{connector_settings.formatted}...")
88
88
  Core::ElasticConnectorActions.ensure_content_index_exists(connector_settings.index_name)
89
89
  job_runner = Core::SyncJobRunner.new(connector_settings)
90
90
  job_runner.execute
91
+ rescue Core::JobAlreadyRunningError
92
+ Utility::Logger.info("Sync job for #{connector_settings.formatted} is already running, skipping.")
93
+ rescue Core::ConnectorVersionChangedError => e
94
+ Utility::Logger.info("Could not start the job because #{connector_settings.formatted} has been updated externally. Message: #{e.message}")
91
95
  rescue StandardError => e
92
96
  Utility::ExceptionTracking.log_exception(e, "Sync job for #{connector_settings.formatted} failed due to unexpected error.")
93
97
  end
@@ -75,6 +75,10 @@ module App
75
75
  else
76
76
  raise UnhealthyCluster, "Unexpected cluster status: #{response['status']}"
77
77
  end
78
+ rescue *Utility::AUTHORIZATION_ERRORS => e
79
+ Utility::ExceptionTracking.log_exception(e)
80
+
81
+ fail_check!("Elasticsearch returned 'Unauthorized' response. Check your authentication details. Terminating...")
78
82
  rescue *App::RETRYABLE_CONNECTION_ERRORS => e
79
83
  Utility::Logger.warn('Could not connect to Elasticsearch. Make sure it is running and healthy.')
80
84
  Utility::Logger.debug("Error: #{e.full_message}")
@@ -8,9 +8,9 @@
8
8
 
9
9
  require 'bson'
10
10
  require 'core/output_sink'
11
- require 'utility/exception_tracking'
12
- require 'utility/errors'
11
+ require 'utility'
13
12
  require 'app/config'
13
+ require 'active_support/core_ext/hash/indifferent_access'
14
14
 
15
15
  module Connectors
16
16
  module Base
@@ -19,6 +19,11 @@ module Connectors
19
19
  raise 'Not implemented for this connector'
20
20
  end
21
21
 
22
+ # Used as a framework util method, don't override
23
+ def self.configurable_fields_indifferent_access
24
+ configurable_fields.with_indifferent_access
25
+ end
26
+
22
27
  def self.configurable_fields
23
28
  {}
24
29
  end
@@ -27,8 +32,16 @@ module Connectors
27
32
  raise 'Not implemented for this connector'
28
33
  end
29
34
 
30
- def initialize(configuration: {})
35
+ attr_reader :rules, :advanced_filter_config
36
+
37
+ def initialize(configuration: {}, job_description: {})
31
38
  @configuration = configuration.dup || {}
39
+ @job_description = job_description&.dup || {}
40
+
41
+ filter = get_filter(@job_description[:filtering])
42
+
43
+ @rules = Utility::Common.return_if_present(filter[:rules], [])
44
+ @advanced_filter_config = Utility::Common.return_if_present(filter[:advanced_config], {})
32
45
  end
33
46
 
34
47
  def yield_documents; end
@@ -52,6 +65,19 @@ module Connectors
52
65
  Utility::ExceptionTracking.log_exception(e, "Connector for service #{self.class.service_type} failed the health check for 3rd-party service.")
53
66
  false
54
67
  end
68
+
69
+ def filtering_present?
70
+ @advanced_filter_config.present? || @rules.present?
71
+ end
72
+
73
+ private
74
+
75
+ def get_filter(filtering)
76
+ # assume for now that the first object in the filtering array, or the filter object itself, is the only filtering object
77
+ filter = filtering.is_a?(Array) ? filtering[0] : filtering
78
+
79
+ filter.present? ? filter : {}
80
+ end
55
81
  end
56
82
  end
57
83
  end
@@ -8,11 +8,11 @@
8
8
 
9
9
  module Connectors
10
10
  class ConnectorStatus
11
- CREATED = 'created'
11
+ CREATED = 'created'
12
12
  NEEDS_CONFIGURATION = 'needs_configuration'
13
- CONFIGURED = 'configured'
14
- CONNECTED = 'connected'
15
- ERROR = 'error'
13
+ CONFIGURED = 'configured'
14
+ CONNECTED = 'connected'
15
+ ERROR = 'error'
16
16
 
17
17
  STATUSES = [
18
18
  CREATED,
@@ -20,16 +20,21 @@ module Connectors
20
20
  'Example Connector'
21
21
  end
22
22
 
23
+ # Field 'Foo' won't have a default value. Field 'Bar' will have the default value 'Value'.
23
24
  def self.configurable_fields
24
25
  {
25
26
  'foo' => {
26
27
  'label' => 'Foo',
27
28
  'value' => nil
29
+ },
30
+ :bar => {
31
+ :label => 'Bar',
32
+ :value => 'Value'
28
33
  }
29
34
  }
30
35
  end
31
36
 
32
- def initialize(configuration: {})
37
+ def initialize(configuration: {}, job_description: {})
33
38
  super
34
39
  end
35
40
 
@@ -42,16 +47,35 @@ module Connectors
42
47
 
43
48
  def yield_documents
44
49
  attachments = [
45
- File.open('./lib/connectors/example/example_attachments/first_attachment.txt'),
46
- File.open('./lib/connectors/example/example_attachments/second_attachment.txt'),
47
- File.open('./lib/connectors/example/example_attachments/third_attachment.txt')
50
+ load_attachment('first_attachment.txt'),
51
+ load_attachment('second_attachment.txt'),
52
+ load_attachment('third_attachment.txt'),
48
53
  ]
49
54
 
50
55
  attachments.each_with_index do |att, index|
51
56
  data = { id: (index + 1).to_s, name: "example document #{index + 1}", _attachment: File.read(att) }
57
+
58
+ # Uncomment one of these two lines to simulate longer running sync jobs
59
+ #
60
+ # sleep(rand(10..60).seconds)
61
+ # sleep(rand(1..10).minutes)
62
+
52
63
  yield data
53
64
  end
54
65
  end
66
+
67
+ private
68
+
69
+ def load_attachment(path)
70
+ attachment_dir = "#{File.dirname(__FILE__)}/attachments"
71
+ attachment_path = "#{attachment_dir}/#{path}"
72
+
73
+ unless File.exist?(attachment_path)
74
+ raise "Attachment at location '#{attachment_path}' doesn't exist. Attachments should be located under #{attachment_dir}"
75
+ end
76
+
77
+ File.open(attachment_path)
78
+ end
55
79
  end
56
80
  end
57
81
  end
@@ -36,7 +36,7 @@ module Connectors
36
36
  }
37
37
  end
38
38
 
39
- def initialize(configuration: {})
39
+ def initialize(configuration: {}, job_description: {})
40
40
  super
41
41
 
42
42
  @extractor = Connectors::GitLab::Extractor.new(
@@ -6,13 +6,18 @@
6
6
 
7
7
  # frozen_string_literal: true
8
8
 
9
- require 'active_support/core_ext/hash/indifferent_access'
10
9
  require 'connectors/base/connector'
11
10
  require 'mongo'
11
+ require 'utility'
12
12
 
13
13
  module Connectors
14
14
  module MongoDB
15
15
  class Connector < Connectors::Base::Connector
16
+
17
+ ALLOWED_TOP_LEVEL_FILTER_KEYS = %w[find aggregate]
18
+
19
+ PAGE_SIZE = 100
20
+
16
21
  def self.service_type
17
22
  'mongodb'
18
23
  end
@@ -44,7 +49,7 @@ module Connectors
44
49
  }
45
50
  end
46
51
 
47
- def initialize(configuration: {})
52
+ def initialize(configuration: {}, job_description: {})
48
53
  super
49
54
 
50
55
  @host = configuration.dig(:host, :value)
@@ -56,17 +61,108 @@ module Connectors
56
61
  end
57
62
 
58
63
  def yield_documents
64
+ check_filtering
65
+
59
66
  with_client do |client|
60
- client[@collection].find.each do |document|
61
- doc = document.with_indifferent_access
67
+ # We do paging using skip().limit() here to make Ruby recycle the memory for each page pulled from the server after it's not needed any more.
68
+ # This gives us more control on the usage of the memory (we can adjust PAGE_SIZE constant for that to decrease max memory consumption).
69
+ # It's done due to the fact that usage of .find.each leads to memory leaks or overuse of memory - the whole result set seems to stay in memory
70
+ # during the sync. Sometimes (not 100% sure) it even leads to a real leak, when the memory for these objects is never recycled.
71
+ cursor, options = create_db_cursor_on_collection(client[@collection])
72
+ skip = 0
73
+
74
+ found_overall = 0
75
+
76
+ # if no overall limit is specified by filtering, use Float::INFINITY so that ingestion is never cut short (found_overall is a finite counter that is only increased,
77
+ # thus it can never reach Float::INFINITY)
78
+ overall_limit = Float::INFINITY
79
+
80
+ if options.present?
81
+ # there could be a skip parameter defined for filtering
82
+ skip = options.fetch(:skip, skip)
83
+ # there could be a limit parameter defined for filtering -> used for an overall limit (not a page limit, which was introduced for memory optimization)
84
+ overall_limit = options.fetch(:limit, overall_limit)
85
+ end
86
+
87
+ overall_limit_reached = false
88
+
89
+ loop do
90
+ found_in_page = 0
91
+
92
+ view = cursor.skip(skip).limit(PAGE_SIZE)
93
+ view.each do |document|
94
+ yield serialize(document)
95
+
96
+ found_in_page += 1
97
+ found_overall += 1
98
+
99
+ overall_limit_reached = found_overall >= overall_limit && overall_limit != Float::INFINITY
100
+
101
+ break if overall_limit_reached
102
+ end
103
+
104
+ page_was_empty = found_in_page == 0
105
+
106
+ break if page_was_empty || overall_limit_reached
62
107
 
63
- yield serialize(doc)
108
+ skip += PAGE_SIZE
64
109
  end
65
110
  end
66
111
  end
67
112
 
68
113
  private
69
114
 
115
+ def create_db_cursor_on_collection(collection)
116
+ return create_find_cursor(collection) if @advanced_filter_config[:find].present?
117
+
118
+ return create_aggregate_cursor(collection) if @advanced_filter_config[:aggregate].present?
119
+
120
+ collection.find
121
+ end
122
+
123
+ def check_filtering
124
+ return unless filtering_present?
125
+
126
+ check_find_and_aggregate
127
+ end
128
+
129
+ def check_find_and_aggregate
130
+ if @advanced_filter_config.keys.size != 1
131
+ invalid_keys_msg = "Only one of #{ALLOWED_TOP_LEVEL_FILTER_KEYS} is allowed in the filtering object. Keys present: '#{@advanced_filter_config.keys}'."
132
+ raise Utility::InvalidFilterConfigError.new(invalid_keys_msg)
133
+ end
134
+ end
135
+
136
+ def create_aggregate_cursor(collection)
137
+ aggregate = @advanced_filter_config[:aggregate]
138
+
139
+ pipeline = aggregate[:pipeline]
140
+ options = extract_options(aggregate)
141
+
142
+ if !pipeline.nil? && pipeline.empty? && !options.present?
143
+ Utility::Logger.warn('\'Aggregate\' was specified with an empty pipeline and empty options.')
144
+ end
145
+
146
+ [collection.aggregate(pipeline, options), options]
147
+ end
148
+
149
+ def create_find_cursor(collection)
150
+ find = @advanced_filter_config[:find]
151
+
152
+ filter = find[:filter]
153
+ options = extract_options(find)
154
+
155
+ if !filter.nil? && filter.empty? && !options.present?
156
+ Utility::Logger.warn('\'Find\' was specified with an empty filter and empty options.')
157
+ end
158
+
159
+ [collection.find(filter, options), options]
160
+ end
161
+
162
+ def extract_options(mongodb_function)
163
+ mongodb_function[:options].present? ? mongodb_function[:options] : {}
164
+ end
165
+
70
166
  def do_health_check
71
167
  with_client do |_client|
72
168
  Utility::Logger.debug("Mongo at #{@host}/#{@database} looks healthy.")
@@ -76,34 +172,43 @@ module Connectors
76
172
  def with_client
77
173
  raise "Invalid value for 'Direct connection' : #{@direct_connection}." unless %w[true false].include?(@direct_connection.to_s.strip.downcase)
78
174
 
79
- client = if @user.present? || @password.present?
80
- Mongo::Client.new(
81
- @host,
82
- database: @database,
83
- direct_connection: to_boolean(@direct_connection),
84
- user: @user,
85
- password: @password
86
- )
87
- else
88
- Mongo::Client.new(
89
- @host,
90
- database: @database,
91
- direct_connection: to_boolean(@direct_connection)
92
- )
93
- end
94
-
95
- begin
96
- Utility::Logger.debug("Existing Databases #{client.database_names}")
97
- Utility::Logger.debug('Existing Collections:')
98
-
99
- client.collections.each { |coll| Utility::Logger.debug(coll.name) }
175
+ args = {
176
+ database: @database,
177
+ direct_connection: to_boolean(@direct_connection)
178
+ }
179
+
180
+ if @user.present? || @password.present?
181
+ args[:user] = @user
182
+ args[:password] = @password
183
+ end
184
+
185
+ Mongo::Client.new(@host, args) do |client|
186
+ databases = client.database_names
187
+
188
+ Utility::Logger.debug("Existing Databases: #{databases}")
189
+ check_database_exists!(databases, @database)
190
+
191
+ collections = client.database.collection_names
192
+
193
+ Utility::Logger.debug("Existing Collections: #{collections}")
194
+ check_collection_exists!(collections, @database, @collection)
100
195
 
101
196
  yield client
102
- ensure
103
- client.close
104
197
  end
105
198
  end
106
199
 
200
+ def check_database_exists!(databases, database)
201
+ return if databases.include?(database)
202
+
203
+ raise "Database (#{database}) does not exist. Existing databases: #{databases.join(', ')}"
204
+ end
205
+
206
+ def check_collection_exists!(collections, database, collection)
207
+ return if collections.include?(collection)
208
+
209
+ raise "Collection (#{collection}) does not exist within database '#{database}'. Existing collections: #{collections.join(', ')}"
210
+ end
211
+
107
212
  def serialize(mongodb_document)
108
213
  # This is some lazy serialization here.
109
214
  # Problem: MongoDB has its own format of things - e.g. ids are Bson::ObjectId, which when serialized to JSON
@@ -120,11 +225,10 @@ module Connectors
120
225
  mongodb_document.map { |v| serialize(v) }
121
226
  when Hash
122
227
  mongodb_document.map do |key, value|
123
- remapped_key = key.to_sym == :_id ? :id : key.to_sym
124
-
228
+ key = 'id' if key == '_id'
125
229
  remapped_value = serialize(value)
126
- [remapped_key, remapped_value]
127
- end.to_h.with_indifferent_access
230
+ [key, remapped_value]
231
+ end.to_h
128
232
  else
129
233
  mongodb_document
130
234
  end
@@ -24,10 +24,10 @@ module Connectors
24
24
  @connectors[name]
25
25
  end
26
26
 
27
- def connector(name, configuration)
27
+ def connector(name, configuration, job_description: {})
28
28
  klass = connector_class(name)
29
29
  if klass.present?
30
- return klass.new(configuration: configuration)
30
+ return klass.new(configuration: configuration, job_description: job_description)
31
31
  end
32
32
  raise "Connector #{name} is not yet registered. You need to register it before use"
33
33
  end
@@ -8,14 +8,33 @@
8
8
 
9
9
  module Connectors
10
10
  class SyncStatus
11
- COMPLETED = 'completed'
11
+ PENDING = 'pending'
12
12
  IN_PROGRESS = 'in_progress'
13
- FAILED = 'failed'
13
+ CANCELING = 'canceling'
14
+ CANCELED = 'canceled'
15
+ SUSPENDED = 'suspended'
16
+ COMPLETED = 'completed'
17
+ ERROR = 'error'
14
18
 
15
19
  STATUSES = [
16
- COMPLETED,
20
+ PENDING,
17
21
  IN_PROGRESS,
18
- FAILED
22
+ CANCELING,
23
+ CANCELED,
24
+ SUSPENDED,
25
+ COMPLETED,
26
+ ERROR
27
+ ]
28
+
29
+ PENDING_STATUES = [
30
+ PENDING,
31
+ SUSPENDED
32
+ ]
33
+
34
+ TERMINAL_STATUSES = [
35
+ CANCELED,
36
+ COMPLETED,
37
+ ERROR
19
38
  ]
20
39
  end
21
40
  end
@@ -23,7 +23,7 @@ module Core
23
23
  Utility::Logger.error("Couldn't find connector for service type #{connector_settings.service_type || service_type}")
24
24
  return
25
25
  end
26
- configuration = connector_class.configurable_fields
26
+ configuration = connector_class.configurable_fields_indifferent_access
27
27
  doc = {
28
28
  :configuration => configuration
29
29
  }
@@ -19,6 +19,8 @@ module Core
19
19
  DEFAULT_REDUCE_WHITESPACE = true
20
20
  DEFAULT_RUN_ML_INFERENCE = true
21
21
 
22
+ DEFAULT_FILTERING = {}
23
+
22
24
  DEFAULT_PAGE_SIZE = 100
23
25
 
24
26
  # Error Classes
@@ -80,20 +82,24 @@ module Core
80
82
  self[:scheduling]
81
83
  end
82
84
 
85
+ def filtering
86
+ Utility::Common.return_if_present(@elasticsearch_response[:filtering], DEFAULT_FILTERING)
87
+ end
88
+
83
89
  def request_pipeline
84
- return_if_present(@elasticsearch_response.dig(:pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
90
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
85
91
  end
86
92
 
87
93
  def extract_binary_content?
88
- return_if_present(@elasticsearch_response.dig(:pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
94
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
89
95
  end
90
96
 
91
97
  def reduce_whitespace?
92
- return_if_present(@elasticsearch_response.dig(:pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
98
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
93
99
  end
94
100
 
95
101
  def run_ml_inference?
96
- return_if_present(@elasticsearch_response.dig(:pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
102
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
97
103
  end
98
104
 
99
105
  def formatted
@@ -110,8 +116,6 @@ module Core
110
116
  index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
111
117
  end
112
118
 
113
- private
114
-
115
119
  def self.fetch_connectors_by_query(query, page_size)
116
120
  connectors_meta = ElasticConnectorActions.connectors_meta
117
121
 
@@ -120,8 +124,8 @@ module Core
120
124
  loop do
121
125
  response = ElasticConnectorActions.search_connectors(query, page_size, offset)
122
126
 
123
- hits = response['hits']['hits']
124
- total = response['hits']['total']['value']
127
+ hits = response.dig('hits', 'hits') || []
128
+ total = response.dig('hits', 'total', 'value') || 0
125
129
  results += hits.map do |hit|
126
130
  Core::ConnectorSettings.new(hit, connectors_meta)
127
131
  end
@@ -132,11 +136,5 @@ module Core
132
136
  results
133
137
  end
134
138
 
135
- def return_if_present(*args)
136
- args.each do |arg|
137
- return arg unless arg.nil?
138
- end
139
- nil
140
- end
141
139
  end
142
140
  end
@@ -10,8 +10,21 @@ require 'active_support/core_ext/hash'
10
10
  require 'connectors/connector_status'
11
11
  require 'connectors/sync_status'
12
12
  require 'utility'
13
+ require 'elastic-transport'
13
14
 
14
15
  module Core
16
+ class JobAlreadyRunningError < StandardError
17
+ def initialize(connector_id)
18
+ super("Sync job for connector '#{connector_id}' is already running.")
19
+ end
20
+ end
21
+
22
+ class ConnectorVersionChangedError < StandardError
23
+ def initialize(connector_id, seq_no, primary_term)
24
+ super("Version conflict: seq_no [#{seq_no}] and primary_term [#{primary_term}] do not match for connector '#{connector_id}'.")
25
+ end
26
+ end
27
+
15
28
  class ElasticConnectorActions
16
29
  class << self
17
30
 
@@ -72,20 +85,53 @@ module Core
72
85
  end
73
86
 
74
87
  def claim_job(connector_id)
75
- update_connector_fields(connector_id,
76
- :sync_now => false,
77
- :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
78
- :last_synced => Time.now)
88
+ seq_no = nil
89
+ primary_term = nil
90
+ sync_in_progress = false
91
+ connector_record = client.get(
92
+ :index => Utility::Constants::CONNECTORS_INDEX,
93
+ :id => connector_id,
94
+ :ignore => 404,
95
+ :refresh => true
96
+ ).tap do |response|
97
+ seq_no = response['_seq_no']
98
+ primary_term = response['_primary_term']
99
+ sync_in_progress = response.dig('_source', 'last_sync_status') == Connectors::SyncStatus::IN_PROGRESS
100
+ end
101
+ if sync_in_progress
102
+ raise JobAlreadyRunningError.new(connector_id)
103
+ end
104
+ update_connector_fields(
105
+ connector_id,
106
+ { :sync_now => false,
107
+ :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
108
+ :last_synced => Time.now },
109
+ seq_no,
110
+ primary_term
111
+ )
79
112
 
80
113
  body = {
81
114
  :connector_id => connector_id,
82
115
  :status => Connectors::SyncStatus::IN_PROGRESS,
83
116
  :worker_hostname => Socket.gethostname,
84
- :created_at => Time.now
117
+ :created_at => Time.now,
118
+ :filtering => convert_connector_filtering_to_job_filtering(connector_record.dig('_source', 'filtering'))
85
119
  }
86
- job = client.index(:index => Utility::Constants::JOB_INDEX, :body => body)
87
120
 
88
- job['_id']
121
+ client.index(:index => Utility::Constants::JOB_INDEX, :body => body)
122
+ end
123
+
124
+ def convert_connector_filtering_to_job_filtering(connector_filtering)
125
+ return [] unless connector_filtering
126
+ connector_filtering = [connector_filtering] unless connector_filtering.is_a?(Array)
127
+ connector_filtering.each_with_object([]) do |filtering_domain, job_filtering|
128
+ job_filtering << {
129
+ 'domain' => filtering_domain['domain'],
130
+ 'rules' => filtering_domain.dig('active', 'rules'),
131
+ 'advanced_snippet' => filtering_domain.dig('active', 'advanced_snippet'),
132
+ 'warnings' => [] # TODO: in https://github.com/elastic/enterprise-search-team/issues/3174
133
+ }
134
+ end
89
135
  end
90
136
 
91
137
  def update_connector_status(connector_id, status, error_message = nil)
@@ -100,7 +146,7 @@ module Core
100
146
  end
101
147
 
102
148
  def complete_sync(connector_id, job_id, status)
103
- sync_status = status[:error] ? Connectors::SyncStatus::FAILED : Connectors::SyncStatus::COMPLETED
149
+ sync_status = status[:error] ? Connectors::SyncStatus::ERROR : Connectors::SyncStatus::COMPLETED
104
150
 
105
151
  update_connector_fields(connector_id,
106
152
  :last_sync_status => sync_status,
@@ -136,7 +182,7 @@ module Core
136
182
  }
137
183
  loop do
138
184
  response = client.search(:body => body)
139
- hits = response['hits']['hits']
185
+ hits = response.dig('hits', 'hits') || []
140
186
 
141
187
  ids = hits.map { |h| h['_id'] }
142
188
  result += ids
@@ -242,15 +288,29 @@ module Core
242
288
  ensure_index_exists("#{Utility::Constants::JOB_INDEX}-v1", system_index_body(:alias_name => Utility::Constants::JOB_INDEX, :mappings => mappings))
243
289
  end
244
290
 
245
- def update_connector_fields(connector_id, doc = {})
291
+ def update_connector_fields(connector_id, doc = {}, seq_no = nil, primary_term = nil)
246
292
  return if doc.empty?
247
- client.update(
293
+ update_args = {
248
294
  :index => Utility::Constants::CONNECTORS_INDEX,
249
295
  :id => connector_id,
250
296
  :body => { :doc => doc },
251
297
  :refresh => true,
252
298
  :retry_on_conflict => 3
253
- )
299
+ }
300
+ # seq_no and primary_term are used for optimistic concurrency control
301
+ # see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
302
+ if seq_no && primary_term
303
+ update_args[:if_seq_no] = seq_no
304
+ update_args[:if_primary_term] = primary_term
305
+ update_args.delete(:retry_on_conflict)
306
+ end
307
+ begin
308
+ client.update(update_args)
309
+ rescue Elastic::Transport::Transport::Errors::Conflict
310
+ # VersionConflictException
311
+ # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
312
+ raise ConnectorVersionChangedError.new(connector_id, seq_no, primary_term)
313
+ end
254
314
  end
255
315
 
256
316
  private
@@ -16,6 +16,9 @@ module Core
16
16
  class NativeScheduler < Core::Scheduler
17
17
  def connector_settings
18
18
  Core::ConnectorSettings.fetch_native_connectors || []
19
+ rescue *Utility::AUTHORIZATION_ERRORS => e
20
+ # should be handled by the general scheduler
21
+ raise e
19
22
  rescue StandardError => e
20
23
  Utility::ExceptionTracking.log_exception(e, 'Could not retrieve native connectors due to unexpected error.')
21
24
  []
@@ -9,6 +9,7 @@
9
9
  require 'time'
10
10
  require 'fugit'
11
11
  require 'core/connector_settings'
12
+ require 'core/elastic_connector_actions'
12
13
  require 'utility/cron'
13
14
  require 'utility/logger'
14
15
  require 'utility/exception_tracking'
@@ -41,6 +42,8 @@ module Core
41
42
  if @is_shutting_down
42
43
  break
43
44
  end
45
+ rescue *Utility::AUTHORIZATION_ERRORS => e
46
+ Utility::ExceptionTracking.log_exception(e, 'Could not retrieve connectors settings due to authorization error.')
44
47
  rescue StandardError => e
45
48
  Utility::ExceptionTracking.log_exception(e, 'Sync failed due to unexpected error.')
46
49
  ensure
@@ -21,6 +21,9 @@ module Core
21
21
  def connector_settings
22
22
  connector_settings = Core::ConnectorSettings.fetch_by_id(@connector_id)
23
23
  [connector_settings]
24
+ rescue *Utility::AUTHORIZATION_ERRORS => e
25
+ # should be handled by the general scheduler
26
+ raise e
24
27
  rescue StandardError => e
25
28
  Utility::ExceptionTracking.log_exception(e, "Could not retrieve the connector by id #{@connector_id} due to unexpected error.")
26
29
  []
@@ -23,7 +23,7 @@ module Core
23
23
  @connector_settings = connector_settings
24
24
  @sink = Core::OutputSink::EsSink.new(connector_settings.index_name, @connector_settings.request_pipeline)
25
25
  @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
26
- @connector_instance = Connectors::REGISTRY.connector(connector_settings.service_type, connector_settings.configuration)
26
+ @sync_finished = false
27
27
  @status = {
28
28
  :indexed_document_count => 0,
29
29
  :deleted_document_count => 0,
@@ -39,9 +39,10 @@ module Core
39
39
  private
40
40
 
41
41
  def do_sync!
42
- Utility::Logger.info("Starting sync for connector #{@connector_settings.id}.")
42
+ Utility::Logger.info("Claiming a sync job for connector #{@connector_settings.id}.")
43
43
 
44
- job_id = ElasticConnectorActions.claim_job(@connector_settings.id)
44
+ job_description = ElasticConnectorActions.claim_job(@connector_settings.id)
45
+ job_id = job_description['_id']
45
46
 
46
47
  unless job_id.present?
47
48
  Utility::Logger.error("Failed to claim the job for #{@connector_settings.id}. Please check the logs for the cause of this error.")
@@ -51,17 +52,19 @@ module Core
51
52
  begin
52
53
  Utility::Logger.debug("Successfully claimed job for connector #{@connector_settings.id}.")
53
54
 
54
- @connector_instance.do_health_check!
55
+ connector_instance = Connectors::REGISTRY.connector(@connector_settings.service_type, @connector_settings.configuration, job_description: job_description)
56
+
57
+ connector_instance.do_health_check!
55
58
 
56
59
  incoming_ids = []
57
60
  existing_ids = ElasticConnectorActions.fetch_document_ids(@connector_settings.index_name)
58
61
 
59
62
  Utility::Logger.debug("#{existing_ids.size} documents are present in index #{@connector_settings.index_name}.")
60
63
 
61
- @connector_instance.yield_documents do |document|
64
+ connector_instance.yield_documents do |document|
62
65
  document = add_ingest_metadata(document)
63
66
  @sink.ingest(document)
64
- incoming_ids << document[:id]
67
+ incoming_ids << document['id']
65
68
  @status[:indexed_document_count] += 1
66
69
  end
67
70
 
@@ -75,6 +78,10 @@ module Core
75
78
  end
76
79
 
77
80
  @sink.flush
81
+
82
+ # We use this mechanism to check whether an interrupt (or something else that led to the thread not finishing)
83
+ # occurred as most of the time the main execution thread is interrupted and we miss this Signal/Exception here
84
+ @sync_finished = true
78
85
  rescue StandardError => e
79
86
  @status[:error] = e.message
80
87
  Utility::ExceptionTracking.log_exception(e)
@@ -83,10 +90,15 @@ module Core
83
90
  Utility::Logger.info("Upserted #{@status[:indexed_document_count]} documents into #{@connector_settings.index_name}.")
84
91
  Utility::Logger.info("Deleted #{@status[:deleted_document_count]} documents into #{@connector_settings.index_name}.")
85
92
 
93
+ # Make sure to not override a previous error message
94
+ if !@sync_finished && @status[:error].nil?
95
+ @status[:error] = 'Sync thread didn\'t finish execution. Check connector logs for more details.'
96
+ end
97
+
86
98
  ElasticConnectorActions.complete_sync(@connector_settings.id, job_id, @status.dup)
87
99
 
88
100
  if @status[:error]
89
- Utility::Logger.info("Failed to sync for connector #{@connector_settings.id} with error #{@status[:error]}.")
101
+ Utility::Logger.info("Failed to sync for connector #{@connector_settings.id} with error '#{@status[:error]}'.")
90
102
  else
91
103
  Utility::Logger.info("Successfully synced for connector #{@connector_settings.id}.")
92
104
  end
@@ -4,10 +4,17 @@
4
4
  # you may not use this file except in compliance with the Elastic License.
5
5
  #
6
6
 
7
- module ConnectorsApp
8
- module Errors
9
- INVALID_API_KEY = 'INVALID_API_KEY'
10
- UNSUPPORTED_AUTH_SCHEME = 'UNSUPPORTED_AUTH_SCHEME'
11
- INTERNAL_SERVER_ERROR = 'INTERNAL_SERVER_ERROR'
7
+ # frozen_string_literal: true
8
+
9
+ module Utility
10
+ class Common
11
+ class << self
12
+ def return_if_present(*args)
13
+ args.each do |arg|
14
+ return arg unless arg.nil?
15
+ end
16
+ nil
17
+ end
18
+ end
12
19
  end
13
20
  end
@@ -5,6 +5,7 @@
5
5
  #
6
6
 
7
7
  require 'active_support/core_ext/string'
8
+ require 'elasticsearch'
8
9
 
9
10
  module Utility
10
11
  class DocumentError
@@ -31,6 +32,8 @@ module Utility
31
32
  end
32
33
 
33
34
  class ClientError < StandardError; end
35
+
36
+ class InvalidFilterConfigError < StandardError; end
34
37
  class EvictionWithNoProgressError < StandardError; end
35
38
  class EvictionError < StandardError
36
39
  attr_accessor :cursors
@@ -89,6 +92,7 @@ module Utility
89
92
  class InvalidTokenError < StandardError; end
90
93
  class TokenRefreshFailedError < StandardError; end
91
94
  class ConnectorNotAvailableError < StandardError; end
95
+ class AuthorizationError < StandardError; end
92
96
 
93
97
  # For when we want to explicitly set a #cause but can't
94
98
  class ExplicitlyCausedError < StandardError
@@ -124,6 +128,7 @@ module Utility
124
128
  end
125
129
  end
126
130
 
131
+ AUTHORIZATION_ERRORS = [Elastic::Transport::Transport::Errors::Unauthorized]
127
132
  INTERNAL_SERVER_ERROR = Utility::Error.new(500, 'INTERNAL_SERVER_ERROR', 'Internal server error')
128
133
  INVALID_API_KEY = Utility::Error.new(401, 'INVALID_API_KEY', 'Invalid API key')
129
134
  UNSUPPORTED_AUTH_SCHEME = Utility::Error.new(401, 'UNSUPPORTED_AUTH_SCHEME', 'Unsupported authorization scheme')
@@ -20,8 +20,8 @@ module Utility
20
20
  attr_reader :cause
21
21
  end
22
22
 
23
- def initialize(es_config)
24
- super(connection_configs(es_config))
23
+ def initialize(es_config, &block)
24
+ super(connection_configs(es_config), &block)
25
25
  end
26
26
 
27
27
  def connection_configs(es_config)
@@ -39,6 +39,10 @@ module Utility
39
39
  configs[:log] = es_config[:log] || false
40
40
  configs[:trace] = es_config[:trace] || false
41
41
 
42
+ # transport options
43
+ configs[:transport_options] = es_config[:transport_options] if es_config[:transport_options]
44
+ configs[:ca_fingerprint] = es_config[:ca_fingerprint] if es_config[:ca_fingerprint]
45
+
42
46
  # if log or trace is activated, we use the application logger
43
47
  configs[:logger] = if configs[:log] || configs[:trace]
44
48
  Utility::Logger.logger
data/lib/utility.rb CHANGED
@@ -6,6 +6,7 @@
6
6
 
7
7
  require 'utility/constants'
8
8
  require 'utility/cron'
9
+ require 'utility/common'
9
10
  require 'utility/errors'
10
11
  require 'utility/es_client'
11
12
  require 'utility/environment'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: connectors_service
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.6.0.3
4
+ version: 8.6.0.4.pre.20221104T200814Z
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elastic
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-10-03 00:00:00.000000000 Z
11
+ date: 2022-11-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -198,14 +198,14 @@ dependencies:
198
198
  requirements:
199
199
  - - "~>"
200
200
  - !ruby/object:Gem::Version
201
- version: 8.4.0
201
+ version: 8.5.0
202
202
  type: :runtime
203
203
  prerelease: false
204
204
  version_requirements: !ruby/object:Gem::Requirement
205
205
  requirements:
206
206
  - - "~>"
207
207
  - !ruby/object:Gem::Version
208
- version: 8.4.0
208
+ version: 8.5.0
209
209
  - !ruby/object:Gem::Dependency
210
210
  name: faraday
211
211
  requirement: !ruby/object:Gem::Requirement
@@ -400,10 +400,10 @@ files:
400
400
  - lib/connectors/base/custom_client.rb
401
401
  - lib/connectors/connector_status.rb
402
402
  - lib/connectors/crawler/scheduler.rb
403
+ - lib/connectors/example/attachments/first_attachment.txt
404
+ - lib/connectors/example/attachments/second_attachment.txt
405
+ - lib/connectors/example/attachments/third_attachment.txt
403
406
  - lib/connectors/example/connector.rb
404
- - lib/connectors/example/example_attachments/first_attachment.txt
405
- - lib/connectors/example/example_attachments/second_attachment.txt
406
- - lib/connectors/example/example_attachments/third_attachment.txt
407
407
  - lib/connectors/gitlab/adapter.rb
408
408
  - lib/connectors/gitlab/connector.rb
409
409
  - lib/connectors/gitlab/custom_client.rb
@@ -411,7 +411,6 @@ files:
411
411
  - lib/connectors/mongodb/connector.rb
412
412
  - lib/connectors/registry.rb
413
413
  - lib/connectors/sync_status.rb
414
- - lib/connectors_app/\
415
414
  - lib/connectors_service.rb
416
415
  - lib/connectors_utility.rb
417
416
  - lib/core.rb
@@ -433,6 +432,7 @@ files:
433
432
  - lib/stubs/connectors/stats.rb
434
433
  - lib/stubs/service_type.rb
435
434
  - lib/utility.rb
435
+ - lib/utility/common.rb
436
436
  - lib/utility/constants.rb
437
437
  - lib/utility/cron.rb
438
438
  - lib/utility/elasticsearch/index/language_data.yml
@@ -451,7 +451,7 @@ homepage: https://github.com/elastic/connectors-ruby
451
451
  licenses:
452
452
  - Elastic-2.0
453
453
  metadata: {}
454
- post_install_message:
454
+ post_install_message:
455
455
  rdoc_options: []
456
456
  require_paths:
457
457
  - lib
@@ -462,12 +462,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
462
462
  version: '0'
463
463
  required_rubygems_version: !ruby/object:Gem::Requirement
464
464
  requirements:
465
- - - ">="
465
+ - - ">"
466
466
  - !ruby/object:Gem::Version
467
- version: '0'
467
+ version: 1.3.1
468
468
  requirements: []
469
469
  rubygems_version: 3.0.3.1
470
- signing_key:
470
+ signing_key:
471
471
  specification_version: 4
472
472
  summary: Gem containing Elastic connectors service
473
473
  test_files: []