connectors_service 8.6.0.4.pre.20221116T024501Z → 8.6.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49):
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +6 -6
  3. data/lib/app/app.rb +0 -4
  4. data/lib/app/dispatcher.rb +17 -42
  5. data/lib/app/preflight_check.rb +0 -11
  6. data/lib/connectors/base/connector.rb +14 -43
  7. data/lib/connectors/example/connector.rb +0 -6
  8. data/lib/connectors/gitlab/connector.rb +1 -6
  9. data/lib/connectors/mongodb/connector.rb +43 -47
  10. data/lib/connectors/sync_status.rb +1 -6
  11. data/lib/core/configuration.rb +1 -3
  12. data/lib/core/connector_settings.rb +16 -52
  13. data/lib/core/elastic_connector_actions.rb +59 -320
  14. data/lib/core/output_sink/base_sink.rb +33 -0
  15. data/lib/core/output_sink/combined_sink.rb +38 -0
  16. data/lib/core/output_sink/console_sink.rb +51 -0
  17. data/lib/core/output_sink/es_sink.rb +74 -0
  18. data/lib/core/{ingestion.rb → output_sink.rb} +5 -1
  19. data/lib/core/scheduler.rb +10 -40
  20. data/lib/core/single_scheduler.rb +1 -1
  21. data/lib/core/sync_job_runner.rb +16 -72
  22. data/lib/core.rb +0 -4
  23. data/lib/utility/constants.rb +0 -2
  24. data/lib/utility/errors.rb +12 -0
  25. data/lib/utility/logger.rb +1 -1
  26. data/lib/utility.rb +4 -11
  27. metadata +9 -27
  28. data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +0 -173
  29. data/lib/connectors/base/advanced_snippet_validator.rb +0 -34
  30. data/lib/connectors/base/simple_rules_parser.rb +0 -42
  31. data/lib/connectors/example/example_advanced_snippet_validator.rb +0 -35
  32. data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +0 -35
  33. data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +0 -22
  34. data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +0 -292
  35. data/lib/connectors/mongodb/mongo_rules_parser.rb +0 -81
  36. data/lib/connectors/tolerable_error_helper.rb +0 -43
  37. data/lib/core/connector_job.rb +0 -210
  38. data/lib/core/filtering/post_process_engine.rb +0 -39
  39. data/lib/core/filtering/post_process_result.rb +0 -27
  40. data/lib/core/filtering/simple_rule.rb +0 -141
  41. data/lib/core/filtering/validation_job_runner.rb +0 -53
  42. data/lib/core/filtering/validation_status.rb +0 -17
  43. data/lib/core/filtering.rb +0 -17
  44. data/lib/core/ingestion/es_sink.rb +0 -118
  45. data/lib/core/jobs/consumer.rb +0 -114
  46. data/lib/core/jobs/producer.rb +0 -26
  47. data/lib/utility/bulk_queue.rb +0 -85
  48. data/lib/utility/error_monitor.rb +0 -108
  49. data/lib/utility/filtering.rb +0 -22
@@ -1,210 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'active_support/core_ext/hash/indifferent_access'
10
- require 'connectors/sync_status'
11
- require 'core/elastic_connector_actions'
12
- require 'utility'
13
-
14
- module Core
15
- class ConnectorJob
16
- DEFAULT_PAGE_SIZE = 100
17
-
18
- def self.fetch_by_id(job_id)
19
- es_response = ElasticConnectorActions.get_job(job_id)
20
- return nil unless es_response[:found]
21
-
22
- new(es_response)
23
- end
24
-
25
- def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
26
- status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
27
-
28
- query = { bool: { must: [{ terms: status_term }] } }
29
-
30
- return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
31
-
32
- query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
33
-
34
- fetch_jobs_by_query(query, page_size)
35
- end
36
-
37
- def self.orphaned_jobs(_page_size = DEFAULT_PAGE_SIZE)
38
- []
39
- end
40
-
41
- def self.stuck_jobs(_page_size = DEFAULT_PAGE_SIZE)
42
- []
43
- end
44
-
45
- def self.enqueue(_connector_id)
46
- nil
47
- end
48
-
49
- def id
50
- @elasticsearch_response[:_id]
51
- end
52
-
53
- def [](property_name)
54
- @elasticsearch_response[:_source][property_name]
55
- end
56
-
57
- def error
58
- self[:error]
59
- end
60
-
61
- def status
62
- self[:status]
63
- end
64
-
65
- def in_progress?
66
- status == Connectors::SyncStatus::IN_PROGRESS
67
- end
68
-
69
- def canceling?
70
- status == Connectors::SyncStatus::CANCELING
71
- end
72
-
73
- def suspended?
74
- status == Connectors::SyncStatus::SUSPENDED
75
- end
76
-
77
- def canceled?
78
- status == Connectors::SyncStatus::CANCELED
79
- end
80
-
81
- def pending?
82
- Connectors::SyncStatus::PENDING_STATUSES.include?(status)
83
- end
84
-
85
- def active?
86
- Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
87
- end
88
-
89
- def terminated?
90
- Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
91
- end
92
-
93
- def connector_snapshot
94
- self[:connector] || {}
95
- end
96
-
97
- def connector_id
98
- @elasticsearch_response[:_source][:connector][:id]
99
- end
100
-
101
- def index_name
102
- connector_snapshot[:index_name]
103
- end
104
-
105
- def language
106
- connector_snapshot[:language]
107
- end
108
-
109
- def service_type
110
- connector_snapshot[:service_type]
111
- end
112
-
113
- def configuration
114
- connector_snapshot[:configuration]
115
- end
116
-
117
- def filtering
118
- Utility::Filtering.extract_filter(connector_snapshot[:filtering])
119
- end
120
-
121
- def pipeline
122
- @elasticsearch_response[:_source][:pipeline]
123
- end
124
-
125
- def connector
126
- @connector ||= ConnectorSettings.fetch_by_id(connector_id)
127
- end
128
-
129
- def done!(ingestion_stats = {}, connector_metadata = {})
130
- terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
131
- end
132
-
133
- def error!(message, ingestion_stats = {}, connector_metadata = {})
134
- terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
135
- end
136
-
137
- def cancel!(ingestion_stats = {}, connector_metadata = {})
138
- terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
139
- end
140
-
141
- def with_concurrency_control
142
- response = ElasticConnectorActions.get_job(id)
143
-
144
- yield response, response['_seq_no'], response['_primary_term']
145
- end
146
-
147
- def make_running!
148
- with_concurrency_control do |es_doc, seq_no, primary_term|
149
- now = Time.now
150
- doc = {
151
- status: Connectors::SyncStatus::IN_PROGRESS,
152
- started_at: now,
153
- last_seen: now,
154
- worker_hostname: Socket.gethostname
155
- }
156
-
157
- ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
158
- end
159
- end
160
-
161
- def es_source
162
- @elasticsearch_response[:_source]
163
- end
164
-
165
- private
166
-
167
- def self.fetch_jobs_by_query(query, page_size)
168
- results = []
169
- offset = 0
170
- loop do
171
- response = ElasticConnectorActions.search_jobs(query, page_size, offset)
172
-
173
- hits = response.dig('hits', 'hits') || []
174
- total = response.dig('hits', 'total', 'value') || 0
175
- results += hits.map { |hit| new(hit) }
176
- break if results.size >= total
177
- offset += hits.size
178
- end
179
-
180
- results
181
- end
182
-
183
- def initialize(es_response)
184
- # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
185
- @elasticsearch_response = es_response.with_indifferent_access
186
- end
187
-
188
- def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
189
- ingestion_stats ||= {}
190
- ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
191
- doc = {
192
- :last_seen => Time.now,
193
- :completed_at => Time.now,
194
- :status => status,
195
- :error => error
196
- }.merge(ingestion_stats)
197
- doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
198
- doc[:metadata] = connector_metadata if connector_metadata&.any?
199
- ElasticConnectorActions.update_job_fields(id, doc)
200
- end
201
-
202
- def seq_no
203
- @elasticsearch_response[:_seq_no]
204
- end
205
-
206
- def primary_term
207
- @elasticsearch_response[:_primary_term]
208
- end
209
- end
210
- end
@@ -1,39 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'core/filtering'
10
- require 'utility/filtering'
11
-
12
- module Core
13
- module Filtering
14
- class PostProcessEngine
15
- attr_reader :rules
16
-
17
- def initialize(job_description)
18
- @rules = ordered_rules(job_description.dig('connector', 'filtering'))
19
- end
20
-
21
- def process(document)
22
- @rules.each do |rule|
23
- if rule.match?(document.stringify_keys)
24
- return PostProcessResult.new(document, rule)
25
- end
26
- end
27
- PostProcessResult.new(document, SimpleRule::DEFAULT_RULE)
28
- end
29
-
30
- private
31
-
32
- def ordered_rules(job_filtering)
33
- job_rules = Utility::Filtering.extract_filter(job_filtering)['rules']
34
- sorted_rules = job_rules.sort_by { |rule| rule['order'] }.reject { |rule| rule['id'] == Core::Filtering::SimpleRule::DEFAULT_RULE_ID }
35
- sorted_rules.each_with_object([]) { |rule, output| output << SimpleRule.new(rule) }
36
- end
37
- end
38
- end
39
- end
@@ -1,27 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'utility/logger'
10
-
11
- module Core
12
- module Filtering
13
- class PostProcessResult
14
- attr_reader :document, :matching_rule
15
-
16
- def initialize(document, matching_rule)
17
- @document = document
18
- @matching_rule = matching_rule
19
- Utility::Logger.debug("Document '#{document['id']}' matched filtering rule: #{matching_rule.id}. It will be #{matching_rule.policy}d")
20
- end
21
-
22
- def is_include?
23
- matching_rule.is_include?
24
- end
25
- end
26
- end
27
- end
@@ -1,141 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'utility/logger'
10
-
11
- module Core
12
- module Filtering
13
- class SimpleRule
14
- DEFAULT_RULE_ID = 'DEFAULT'
15
-
16
- class Policy
17
- INCLUDE = 'include'
18
- EXCLUDE = 'exclude'
19
- end
20
-
21
- class Rule
22
- REGEX = 'regex'
23
- EQUALS = 'equals'
24
- STARTS_WITH = 'starts_with'
25
- ENDS_WITH = 'ends_with'
26
- CONTAINS = 'contains'
27
- LESS_THAN = '<'
28
- GREATER_THAN = '>'
29
- end
30
-
31
- attr_reader :policy, :field, :rule, :value, :id
32
-
33
- def initialize(rule_hash)
34
- @policy = rule_hash.fetch('policy')
35
- @field = rule_hash.fetch('field')
36
- @rule = rule_hash.fetch('rule')
37
- @value = rule_hash.fetch('value')
38
- @id = rule_hash.fetch('id')
39
- @rule_hash = rule_hash
40
- rescue KeyError => e
41
- raise "#{e.key} is required"
42
- end
43
-
44
- def self.from_args(id, policy, field, rule, value)
45
- SimpleRule.new(
46
- {
47
- 'id' => id,
48
- 'policy' => policy,
49
- 'field' => field,
50
- 'rule' => rule,
51
- 'value' => value
52
- }
53
- )
54
- end
55
-
56
- DEFAULT_RULE = SimpleRule.new(
57
- 'policy' => 'include',
58
- 'field' => '_',
59
- 'rule' => 'regex',
60
- 'value' => '.*',
61
- 'id' => SimpleRule::DEFAULT_RULE_ID
62
- )
63
-
64
- def match?(document)
65
- return true if id == DEFAULT_RULE_ID
66
- doc_value = document[field]
67
- return false if doc_value.nil?
68
- coerced_value = coerce(doc_value)
69
- case rule
70
- when Rule::EQUALS
71
- case coerced_value
72
- when Integer
73
- doc_value == coerced_value
74
- when DateTime, Time
75
- doc_value.to_s == coerced_value.to_s
76
- else
77
- doc_value.to_s == coerced_value
78
- end
79
- when Rule::STARTS_WITH
80
- doc_value.to_s.start_with?(value)
81
- when Rule::ENDS_WITH
82
- doc_value.to_s.end_with?(value)
83
- when Rule::CONTAINS
84
- doc_value.to_s.include?(value)
85
- when Rule::REGEX
86
- doc_value.to_s.match(/#{value}/)
87
- when Rule::LESS_THAN
88
- doc_value < coerced_value
89
- when Rule::GREATER_THAN
90
- doc_value > coerced_value
91
- else
92
- false
93
- end
94
- end
95
-
96
- def coerce(doc_value)
97
- case doc_value
98
- when String
99
- value.to_s
100
- when Integer
101
- value.to_i
102
- when DateTime, Time
103
- to_date(value)
104
- when TrueClass, FalseClass # Ruby doesn't have a Boolean type, TIL
105
- to_bool(value).to_s
106
- else
107
- value.to_s
108
- end
109
- rescue StandardError => e
110
- Utility::Logger.debug("Failed to coerce value '#{value}' (#{value.class}) based on document value '#{doc_value}' (#{doc_value.class}) due to error: #{e.class}: #{e.message}")
111
- value.to_s
112
- end
113
-
114
- def is_include?
115
- policy == Policy::INCLUDE
116
- end
117
-
118
- def is_exclude?
119
- policy == Policy::EXCLUDE
120
- end
121
-
122
- def to_h
123
- @rule_hash
124
- end
125
-
126
- private
127
-
128
- def to_bool(str)
129
- return true if str == true || str =~ (/^(true|t|yes|y|on|1)$/i)
130
- return false if str == false || str.blank? || str =~ (/^(false|f|no|n|off|0)$/i)
131
- raise ArgumentError.new("invalid value for Boolean: \"#{str}\"")
132
- end
133
-
134
- def to_date(str)
135
- DateTime.parse(str)
136
- rescue ArgumentError
137
- Time.at(str.to_i) # try with it as an int string of millis
138
- end
139
- end
140
- end
141
- end
@@ -1,53 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'connectors/connector_status'
10
- require 'connectors/registry'
11
-
12
- module Core
13
- module Filtering
14
- DEFAULT_DOMAIN = 'DEFAULT'
15
-
16
- class ValidationJobRunner
17
- def initialize(connector_settings)
18
- @connector_settings = connector_settings
19
- @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
20
- @validation_finished = false
21
- @status = { :error => nil }
22
- end
23
-
24
- def execute
25
- Utility::Logger.info("Starting a validation job for connector #{@connector_settings.id}.")
26
-
27
- validation_result = @connector_class.validate_filtering(@connector_settings.filtering[:draft])
28
-
29
- # currently only used for connectors -> DEFAULT domain can be assumed (will be changed with the integration of crawler)
30
- ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_result })
31
-
32
- @validation_finished = true
33
- rescue StandardError => e
34
- Utility::ExceptionTracking.log_exception(e)
35
- validation_failed_result = { :state => Core::Filtering::ValidationStatus::INVALID,
36
- :errors => [
37
- { :ids => [], :messages => ['Unknown problem occurred while validating, see logs for details.'] }
38
- ] }
39
- ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_failed_result })
40
- ensure
41
- if !@validation_finished && !@status[:error].present?
42
- @status[:error] = 'Validation thread did not finish execution. Check connector logs for more details.'
43
- end
44
-
45
- if @status[:error]
46
- Utility::Logger.warn("Failed to validate filtering for connector #{@connector_settings.id} with error '#{@status[:error]}'.")
47
- else
48
- Utility::Logger.info("Successfully validated filtering for connector #{@connector_settings.id}.")
49
- end
50
- end
51
- end
52
- end
53
- end
@@ -1,17 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- module Core
10
- module Filtering
11
- class ValidationStatus
12
- INVALID = 'invalid'
13
- VALID = 'valid'
14
- EDITED = 'edited'
15
- end
16
- end
17
- end
@@ -1,17 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'core/filtering/post_process_engine'
10
- require 'core/filtering/post_process_result'
11
- require 'core/filtering/simple_rule'
12
- require 'core/filtering/validation_job_runner'
13
- require 'core/filtering/validation_status'
14
-
15
- module Core::Filtering
16
- DEFAULT_DOMAIN = 'DEFAULT'
17
- end
@@ -1,118 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'app/config'
10
- require 'utility/bulk_queue'
11
- require 'utility/es_client'
12
- require 'utility/logger'
13
- require 'elasticsearch/api'
14
- #
15
- # This class is responsible for sending the data to the data storage.
16
- # While we don't actually allow to output our data anywhere except
17
- # Elasticsearch, we still want to be able to do so sometime in future.
18
- #
19
- # This class should stay simple and any change to the class should be careful
20
- # with the thought of introducing other sinks in future.
21
- module Core
22
- module Ingestion
23
- class EsSink
24
- def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new, max_allowed_document_size = 5 * 1024 * 1024)
25
- @client = Utility::EsClient.new(App::Config[:elasticsearch])
26
- @index_name = index_name
27
- @request_pipeline = request_pipeline
28
- @operation_queue = bulk_queue
29
-
30
- @max_allowed_document_size = max_allowed_document_size
31
-
32
- @queued = {
33
- :indexed_document_count => 0,
34
- :deleted_document_count => 0,
35
- :indexed_document_volume => 0
36
- }
37
-
38
- @completed = {
39
- :indexed_document_count => 0,
40
- :deleted_document_count => 0,
41
- :indexed_document_volume => 0
42
- }
43
- end
44
-
45
- def ingest(document)
46
- if document.nil? || document.empty?
47
- Utility::Logger.warn('Connector attempted to ingest an empty document, skipping')
48
- return
49
- end
50
-
51
- id = document['id']
52
- serialized_document = serialize(document)
53
-
54
- document_size = serialized_document.bytesize
55
-
56
- if @max_allowed_document_size > 0 && document_size > @max_allowed_document_size
57
- Utility::Logger.warn("Connector attempted to ingest too large document with id=#{document['id']} [#{document_size}/#{@max_allowed_document_size}], skipping the document.")
58
- return
59
- end
60
-
61
- index_op = serialize({ 'index' => { '_index' => @index_name, '_id' => id } })
62
-
63
- flush unless @operation_queue.will_fit?(index_op, serialized_document)
64
-
65
- @operation_queue.add(
66
- index_op,
67
- serialized_document
68
- )
69
-
70
- @queued[:indexed_document_count] += 1
71
- @queued[:indexed_document_volume] += document_size
72
- end
73
-
74
- def ingest_multiple(documents)
75
- documents.each { |doc| ingest(doc) }
76
- end
77
-
78
- def delete(id)
79
- return if id.nil?
80
-
81
- delete_op = serialize({ 'delete' => { '_index' => @index_name, '_id' => id } })
82
- flush unless @operation_queue.will_fit?(delete_op)
83
-
84
- @operation_queue.add(delete_op)
85
- @queued[:deleted_document_count] += 1
86
- end
87
-
88
- def delete_multiple(ids)
89
- ids.each { |id| delete(id) }
90
- end
91
-
92
- def flush
93
- data = @operation_queue.pop_all
94
- return if data.empty?
95
-
96
- @client.bulk(:body => data, :pipeline => @request_pipeline)
97
-
98
- @completed[:indexed_document_count] += @queued[:indexed_document_count]
99
- @completed[:deleted_document_count] += @queued[:deleted_document_count]
100
- @completed[:indexed_document_volume] += @queued[:indexed_document_volume]
101
-
102
- @queued[:indexed_document_count] = 0
103
- @queued[:deleted_document_count] = 0
104
- @queued[:indexed_document_volume] = 0
105
- end
106
-
107
- def ingestion_stats
108
- @completed.dup
109
- end
110
-
111
- private
112
-
113
- def serialize(document)
114
- Elasticsearch::API.serializer.dump(document)
115
- end
116
- end
117
- end
118
- end