connectors_service 8.6.0.4.pre.20221116T024501Z → 8.6.0.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +6 -6
  3. data/lib/app/app.rb +0 -4
  4. data/lib/app/dispatcher.rb +17 -42
  5. data/lib/app/preflight_check.rb +0 -11
  6. data/lib/connectors/base/connector.rb +14 -43
  7. data/lib/connectors/example/connector.rb +0 -6
  8. data/lib/connectors/gitlab/connector.rb +1 -6
  9. data/lib/connectors/mongodb/connector.rb +43 -47
  10. data/lib/connectors/sync_status.rb +1 -6
  11. data/lib/core/configuration.rb +1 -3
  12. data/lib/core/connector_settings.rb +16 -52
  13. data/lib/core/elastic_connector_actions.rb +59 -320
  14. data/lib/core/output_sink/base_sink.rb +33 -0
  15. data/lib/core/output_sink/combined_sink.rb +38 -0
  16. data/lib/core/output_sink/console_sink.rb +51 -0
  17. data/lib/core/output_sink/es_sink.rb +74 -0
  18. data/lib/core/{ingestion.rb → output_sink.rb} +5 -1
  19. data/lib/core/scheduler.rb +10 -40
  20. data/lib/core/single_scheduler.rb +1 -1
  21. data/lib/core/sync_job_runner.rb +16 -72
  22. data/lib/core.rb +0 -4
  23. data/lib/utility/constants.rb +0 -2
  24. data/lib/utility/errors.rb +12 -0
  25. data/lib/utility/logger.rb +1 -1
  26. data/lib/utility.rb +4 -11
  27. metadata +9 -27
  28. data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +0 -173
  29. data/lib/connectors/base/advanced_snippet_validator.rb +0 -34
  30. data/lib/connectors/base/simple_rules_parser.rb +0 -42
  31. data/lib/connectors/example/example_advanced_snippet_validator.rb +0 -35
  32. data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +0 -35
  33. data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +0 -22
  34. data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +0 -292
  35. data/lib/connectors/mongodb/mongo_rules_parser.rb +0 -81
  36. data/lib/connectors/tolerable_error_helper.rb +0 -43
  37. data/lib/core/connector_job.rb +0 -210
  38. data/lib/core/filtering/post_process_engine.rb +0 -39
  39. data/lib/core/filtering/post_process_result.rb +0 -27
  40. data/lib/core/filtering/simple_rule.rb +0 -141
  41. data/lib/core/filtering/validation_job_runner.rb +0 -53
  42. data/lib/core/filtering/validation_status.rb +0 -17
  43. data/lib/core/filtering.rb +0 -17
  44. data/lib/core/ingestion/es_sink.rb +0 -118
  45. data/lib/core/jobs/consumer.rb +0 -114
  46. data/lib/core/jobs/producer.rb +0 -26
  47. data/lib/utility/bulk_queue.rb +0 -85
  48. data/lib/utility/error_monitor.rb +0 -108
  49. data/lib/utility/filtering.rb +0 -22
@@ -1,210 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'active_support/core_ext/hash/indifferent_access'
10
- require 'connectors/sync_status'
11
- require 'core/elastic_connector_actions'
12
- require 'utility'
13
-
14
module Core
  # Read-only view over one job document as returned by Elasticsearch, plus the
  # state transitions (running / done / error / canceled) that are written back
  # through ElasticConnectorActions.
  class ConnectorJob
    DEFAULT_PAGE_SIZE = 100

    # Loads a single job by its document id.
    #
    # @param job_id [String]
    # @return [ConnectorJob, nil] nil when the response is not flagged as found
    def self.fetch_by_id(job_id)
      es_response = ElasticConnectorActions.get_job(job_id)
      return nil unless es_response[:found]

      new(es_response)
    end

    # Fetches every job whose status is one of SyncStatus::PENDING_STATUSES,
    # optionally restricted to the given connector ids.
    #
    # @param connectors_ids [Array] an empty list means "all connectors"
    # @param page_size [Integer] number of hits requested per search call
    # @return [Array<ConnectorJob>]
    def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
      filters = [{ terms: { status: Connectors::SyncStatus::PENDING_STATUSES } }]
      filters << { terms: { 'connector.id' => connectors_ids } } unless connectors_ids.empty?

      fetch_jobs_by_query({ bool: { must: filters } }, page_size)
    end

    # Placeholder: always returns an empty list.
    def self.orphaned_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: always returns an empty list.
    def self.stuck_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: does nothing and returns nil.
    def self.enqueue(_connector_id)
      nil
    end

    # Pages through the job search until all reported hits are collected.
    #
    # NOTE: this method used to sit below `private`, but `private` has no effect
    # on `def self.` methods, so it has always been publicly callable. It is kept
    # public to avoid breaking any caller that relies on that.
    #
    # @param query [Hash] Elasticsearch query body
    # @param page_size [Integer]
    # @return [Array<ConnectorJob>]
    def self.fetch_jobs_by_query(query, page_size)
      results = []
      offset = 0
      loop do
        response = ElasticConnectorActions.search_jobs(query, page_size, offset)

        hits = response.dig('hits', 'hits') || []
        total = response.dig('hits', 'total', 'value') || 0
        results.concat(hits.map { |hit| new(hit) })

        # An empty page means there is nothing more to read, even if the
        # reported total is larger; without this guard the loop never ends.
        break if hits.empty? || results.size >= total

        offset += hits.size
      end

      results
    end

    def id
      @elasticsearch_response[:_id]
    end

    # Reads a property from the document's _source.
    def [](property_name)
      @elasticsearch_response[:_source][property_name]
    end

    def error
      self[:error]
    end

    def status
      self[:status]
    end

    def in_progress?
      status == Connectors::SyncStatus::IN_PROGRESS
    end

    def canceling?
      status == Connectors::SyncStatus::CANCELING
    end

    def suspended?
      status == Connectors::SyncStatus::SUSPENDED
    end

    def canceled?
      status == Connectors::SyncStatus::CANCELED
    end

    def pending?
      Connectors::SyncStatus::PENDING_STATUSES.include?(status)
    end

    def active?
      Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
    end

    def terminated?
      Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
    end

    # The connector data stored on the job itself; an empty hash when absent.
    def connector_snapshot
      self[:connector] || {}
    end

    # Goes through connector_snapshot so a job without a connector section
    # yields nil instead of raising NoMethodError.
    def connector_id
      connector_snapshot[:id]
    end

    def index_name
      connector_snapshot[:index_name]
    end

    def language
      connector_snapshot[:language]
    end

    def service_type
      connector_snapshot[:service_type]
    end

    def configuration
      connector_snapshot[:configuration]
    end

    def filtering
      Utility::Filtering.extract_filter(connector_snapshot[:filtering])
    end

    def pipeline
      self[:pipeline]
    end

    # Lazily loads (and memoizes) the connector settings referenced by this job.
    def connector
      @connector ||= ConnectorSettings.fetch_by_id(connector_id)
    end

    def done!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
    end

    def error!(message, ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
    end

    def cancel!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
    end

    # Re-reads the job and yields the fresh response together with its
    # _seq_no and _primary_term values.
    def with_concurrency_control
      response = ElasticConnectorActions.get_job(id)

      yield response, response['_seq_no'], response['_primary_term']
    end

    # Marks the job as in progress, passing the freshly read sequence number
    # and primary term along with the update.
    def make_running!
      with_concurrency_control do |es_doc, seq_no, primary_term|
        now = Time.now
        doc = {
          status: Connectors::SyncStatus::IN_PROGRESS,
          started_at: now,
          last_seen: now,
          worker_hostname: Socket.gethostname
        }

        ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
      end
    end

    def es_source
      @elasticsearch_response[:_source]
    end

    private

    def initialize(es_response)
      # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
      @elasticsearch_response = es_response.with_indifferent_access
    end

    # Writes the terminal state of the job.
    #
    # The caller's ingestion_stats hash is merged into a new document rather
    # than modified, and one timestamp is shared by all time fields so they
    # cannot drift apart.
    def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
      now = Time.now
      doc = {
        :last_seen => now,
        :completed_at => now,
        :status => status,
        :error => error
      }.merge(ingestion_stats || {})
      doc[:total_document_count] = ElasticConnectorActions.document_count(index_name)
      doc[:canceled_at] = now if status == Connectors::SyncStatus::CANCELED
      doc[:metadata] = connector_metadata if connector_metadata&.any?
      ElasticConnectorActions.update_job_fields(id, doc)
    end

    def seq_no
      @elasticsearch_response[:_seq_no]
    end

    def primary_term
      @elasticsearch_response[:_primary_term]
    end
  end
end
@@ -1,39 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'core/filtering'
10
- require 'utility/filtering'
11
-
12
module Core
  module Filtering
    # Applies a job's filtering rules to documents: the first rule (in ascending
    # 'order') that matches a document decides the result; when none matches,
    # SimpleRule::DEFAULT_RULE is used.
    class PostProcessEngine
      attr_reader :rules

      # @param job_description [Hash] job data; the rules are read from
      #   job_description['connector']['filtering']
      def initialize(job_description)
        @rules = ordered_rules(job_description.dig('connector', 'filtering'))
      end

      # @param document [Hash] the document to evaluate
      # @return [PostProcessResult] the original document paired with the
      #   first matching rule, or with the default rule
      def process(document)
        # The stringified view does not depend on the rule, so build it once
        # instead of once per rule.
        stringified = document.stringify_keys
        matching_rule = @rules.find { |rule| rule.match?(stringified) }

        PostProcessResult.new(document, matching_rule || SimpleRule::DEFAULT_RULE)
      end

      private

      # Builds SimpleRule objects from the extracted filter, dropping the rule
      # with the DEFAULT id and sorting the rest by their 'order' value.
      # A filter without a 'rules' entry yields an empty list.
      def ordered_rules(job_filtering)
        job_rules = Utility::Filtering.extract_filter(job_filtering)['rules'] || []

        job_rules
          .reject { |rule| rule['id'] == Core::Filtering::SimpleRule::DEFAULT_RULE_ID }
          .sort_by { |rule| rule['order'] }
          .map { |rule| SimpleRule.new(rule) }
      end
    end
  end
end
@@ -1,27 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'utility/logger'
10
-
11
module Core
  module Filtering
    # Pairs a document with the filtering rule that matched it. Creating a
    # result emits a debug log line naming the document id, the rule id and
    # the rule's policy.
    class PostProcessResult
      attr_reader :document, :matching_rule

      # @param document [Hash] the evaluated document (its 'id' is logged)
      # @param matching_rule [#id, #policy, #is_include?] the rule that matched
      def initialize(document, matching_rule)
        @document, @matching_rule = document, matching_rule

        Utility::Logger.debug(
          "Document '#{document['id']}' matched filtering rule: #{matching_rule.id}. It will be #{matching_rule.policy}d"
        )
      end

      # Delegates to the matching rule.
      def is_include?
        @matching_rule.is_include?
      end
    end
  end
end
@@ -1,141 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'utility/logger'
10
-
11
module Core
  module Filtering
    # A single include/exclude rule: compares one document field against a
    # configured value using the configured operator.
    class SimpleRule
      DEFAULT_RULE_ID = 'DEFAULT'

      class Policy
        INCLUDE = 'include'
        EXCLUDE = 'exclude'
      end

      class Rule
        REGEX = 'regex'
        EQUALS = 'equals'
        STARTS_WITH = 'starts_with'
        ENDS_WITH = 'ends_with'
        CONTAINS = 'contains'
        LESS_THAN = '<'
        GREATER_THAN = '>'
      end

      # Whole-string (\A..\z) patterns for the textual boolean forms accepted by to_bool.
      TRUE_PATTERN = /\A(true|t|yes|y|on|1)\z/i
      FALSE_PATTERN = /\A(false|f|no|n|off|0)\z/i
      private_constant :TRUE_PATTERN, :FALSE_PATTERN

      attr_reader :policy, :field, :rule, :value, :id

      # @param rule_hash [Hash] must contain the string keys 'policy', 'field',
      #   'rule', 'value' and 'id'
      # @raise [RuntimeError] "<key> is required" when one of them is missing
      def initialize(rule_hash)
        @policy = rule_hash.fetch('policy')
        @field = rule_hash.fetch('field')
        @rule = rule_hash.fetch('rule')
        @value = rule_hash.fetch('value')
        @id = rule_hash.fetch('id')
        @rule_hash = rule_hash
      rescue KeyError => e
        raise "#{e.key} is required"
      end

      # Convenience constructor taking the five attributes positionally.
      def self.from_args(id, policy, field, rule, value)
        SimpleRule.new(
          'id' => id,
          'policy' => policy,
          'field' => field,
          'rule' => rule,
          'value' => value
        )
      end

      DEFAULT_RULE = SimpleRule.new(
        'policy' => 'include',
        'field' => '_',
        'rule' => 'regex',
        'value' => '.*',
        'id' => SimpleRule::DEFAULT_RULE_ID
      )

      # Tells whether the document satisfies this rule.
      #
      # The rule with the DEFAULT id matches everything. A document without a
      # value for the field never matches. Values that cannot be ordered against
      # each other (e.g. an Array against a String) do not match `<` / `>`
      # instead of raising.
      #
      # @param document [Hash] document with string keys
      # @return [Boolean]
      def match?(document)
        return true if id == DEFAULT_RULE_ID

        doc_value = document[field]
        return false if doc_value.nil?

        coerced_value = coerce(doc_value)
        case rule
        when Rule::EQUALS
          equal_values?(doc_value, coerced_value)
        when Rule::STARTS_WITH
          doc_value.to_s.start_with?(value.to_s)
        when Rule::ENDS_WITH
          doc_value.to_s.end_with?(value.to_s)
        when Rule::CONTAINS
          doc_value.to_s.include?(value.to_s)
        when Rule::REGEX
          doc_value.to_s.match?(/#{value}/)
        when Rule::LESS_THAN
          ordering(doc_value, coerced_value)&.negative? || false
        when Rule::GREATER_THAN
          ordering(doc_value, coerced_value)&.positive? || false
        else
          false
        end
      end

      # Converts the configured value to the type of the document value so the
      # two can be compared. Falls back to the value's string form (and logs at
      # debug level) when the conversion fails.
      def coerce(doc_value)
        case doc_value
        when String
          value.to_s
        when Integer
          value.to_i
        when Float
          value.to_f
        when DateTime, Time
          to_date(value)
        when TrueClass, FalseClass # Ruby doesn't have a Boolean type, TIL
          to_bool(value).to_s
        else
          value.to_s
        end
      rescue StandardError => e
        Utility::Logger.debug("Failed to coerce value '#{value}' (#{value.class}) based on document value '#{doc_value}' (#{doc_value.class}) due to error: #{e.class}: #{e.message}")
        value.to_s
      end

      def is_include?
        policy == Policy::INCLUDE
      end

      def is_exclude?
        policy == Policy::EXCLUDE
      end

      # @return [Hash] the hash this rule was built from
      def to_h
        @rule_hash
      end

      private

      def temporal?(candidate)
        candidate.is_a?(Time) || candidate.is_a?(DateTime)
      end

      # Equality for the `equals` operator.
      # Two temporal values are compared as instants at whole-second precision,
      # so a Time document value can equal a DateTime parsed from the rule
      # (their #to_s formats differ, which made a string comparison always fail).
      def equal_values?(doc_value, coerced_value)
        if temporal?(doc_value) && temporal?(coerced_value)
          doc_value.to_time.to_i == coerced_value.to_time.to_i
        elsif coerced_value.is_a?(Numeric)
          doc_value == coerced_value
        else
          doc_value.to_s == coerced_value
        end
      end

      # Three-way comparison used by `<` and `>`.
      # @return [Integer, nil] nil when the two values cannot be ordered
      def ordering(doc_value, coerced_value)
        if temporal?(doc_value) && temporal?(coerced_value)
          doc_value.to_time <=> coerced_value.to_time
        else
          doc_value <=> coerced_value
        end
      end

      # Accepts true/false and the usual textual forms (case-insensitive,
      # surrounding whitespace ignored); a blank value counts as false.
      # @raise [ArgumentError] for anything else
      def to_bool(str)
        return str if str == true || str == false

        normalized = str.to_s.strip
        return true if normalized.match?(TRUE_PATTERN)
        return false if normalized.empty? || normalized.match?(FALSE_PATTERN)

        raise ArgumentError, "invalid value for Boolean: \"#{str}\""
      end

      def to_date(str)
        DateTime.parse(str)
      rescue ArgumentError
        # Not a parseable date string: fall back to treating it as an integer
        # epoch. NOTE(review): Time.at interprets the number as seconds, not
        # milliseconds - confirm which unit rule authors are expected to use.
        Time.at(str.to_i)
      end
    end
  end
end
@@ -1,53 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'connectors/connector_status'
10
- require 'connectors/registry'
11
-
12
module Core
  module Filtering
    # core/filtering.rb defines the same constant; the guard avoids an
    # "already initialized constant" warning whichever file is loaded first.
    DEFAULT_DOMAIN = 'DEFAULT' unless const_defined?(:DEFAULT_DOMAIN, false)

    # Validates the draft filtering of one connector and stores the outcome
    # through ElasticConnectorActions.update_filtering_validation.
    class ValidationJobRunner
      # @param connector_settings [#id, #service_type, #filtering]
      def initialize(connector_settings)
        @connector_settings = connector_settings
        @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
        @validation_finished = false
        @status = { :error => nil }
      end

      # Runs the validation. A StandardError raised while validating is logged,
      # recorded as the job error and stored as an INVALID validation result;
      # it is not re-raised.
      def execute
        Utility::Logger.info("Starting a validation job for connector #{@connector_settings.id}.")

        validation_result = @connector_class.validate_filtering(@connector_settings.filtering[:draft])

        # currently only used for connectors -> DEFAULT domain can be assumed (will be changed with the integration of crawler)
        ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_result })

        @validation_finished = true
      rescue StandardError => e
        # Remember the real cause first, so the summary log in `ensure` reports
        # it instead of the generic "did not finish" message.
        @status[:error] = e.message
        Utility::ExceptionTracking.log_exception(e)
        validation_failed_result = { :state => Core::Filtering::ValidationStatus::INVALID,
                                     :errors => [
                                       { :ids => [], :messages => ['Unknown problem occurred while validating, see logs for details.'] }
                                     ] }
        ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_failed_result })
      ensure
        # Reached without a recorded error only when execution was cut short by
        # something other than a StandardError (or the error had no message).
        if !@validation_finished && @status[:error].to_s.strip.empty?
          @status[:error] = 'Validation thread did not finish execution. Check connector logs for more details.'
        end

        if @status[:error]
          Utility::Logger.warn("Failed to validate filtering for connector #{@connector_settings.id} with error '#{@status[:error]}'.")
        else
          Utility::Logger.info("Successfully validated filtering for connector #{@connector_settings.id}.")
        end
      end
    end
  end
end
@@ -1,17 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
module Core
  module Filtering
    # String constants naming the possible states of a filtering validation.
    class ValidationStatus
      VALID = 'valid'
      INVALID = 'invalid'
      EDITED = 'edited'
    end
  end
end
@@ -1,17 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'core/filtering/post_process_engine'
10
- require 'core/filtering/post_process_result'
11
- require 'core/filtering/simple_rule'
12
- require 'core/filtering/validation_job_runner'
13
- require 'core/filtering/validation_status'
14
-
15
module Core
  module Filtering
    # core/filtering/validation_job_runner (required by this file) defines the
    # same constant with the same value. Assigning it again would print an
    # "already initialized constant" warning, so only define it when missing.
    DEFAULT_DOMAIN = 'DEFAULT' unless const_defined?(:DEFAULT_DOMAIN, false)
  end
end
@@ -1,118 +0,0 @@
1
- #
2
- # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
- # or more contributor license agreements. Licensed under the Elastic License;
4
- # you may not use this file except in compliance with the Elastic License.
5
- #
6
-
7
- # frozen_string_literal: true
8
-
9
- require 'app/config'
10
- require 'utility/bulk_queue'
11
- require 'utility/es_client'
12
- require 'utility/logger'
13
- require 'elasticsearch/api'
14
#
# This class is responsible for sending the data to the data storage.
# Elasticsearch is currently the only supported destination, but we still
# want to be able to add other outputs in future.
#
# This class should stay simple, and any change to it should be made with the
# possible introduction of other sinks in mind.
module Core
  module Ingestion
    class EsSink
      # @param index_name [String] index that receives every operation
      # @param request_pipeline [String, nil] passed as :pipeline to each bulk request
      # @param bulk_queue [#will_fit?, #add, #pop_all] buffer for serialized operations
      # @param max_allowed_document_size [Integer] serialized size limit in bytes;
      #   a value of 0 or less disables the check
      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new, max_allowed_document_size = 5 * 1024 * 1024)
        @client = Utility::EsClient.new(App::Config[:elasticsearch])
        @index_name = index_name
        @request_pipeline = request_pipeline
        @operation_queue = bulk_queue

        @max_allowed_document_size = max_allowed_document_size

        # @queued counts what sits in the queue, @completed what has been sent.
        @queued = empty_stats
        @completed = empty_stats
      end

      # Queues one document for indexing. Empty documents and documents whose
      # serialized form exceeds the size limit are skipped with a warning.
      def ingest(document)
        if document.nil? || document.empty?
          Utility::Logger.warn('Connector attempted to ingest an empty document, skipping')
          return
        end

        id = document['id']
        serialized_document = serialize(document)

        document_size = serialized_document.bytesize

        if @max_allowed_document_size > 0 && document_size > @max_allowed_document_size
          Utility::Logger.warn("Connector attempted to ingest too large document with id=#{id} [#{document_size}/#{@max_allowed_document_size}], skipping the document.")
          return
        end

        index_op = serialize({ 'index' => { '_index' => @index_name, '_id' => id } })

        # Make room first when the queue reports the operation would not fit.
        flush unless @operation_queue.will_fit?(index_op, serialized_document)

        @operation_queue.add(
          index_op,
          serialized_document
        )

        @queued[:indexed_document_count] += 1
        @queued[:indexed_document_volume] += document_size
      end

      def ingest_multiple(documents)
        documents.each { |doc| ingest(doc) }
      end

      # Queues the deletion of the document with the given id; nil ids are ignored.
      def delete(id)
        return if id.nil?

        delete_op = serialize({ 'delete' => { '_index' => @index_name, '_id' => id } })
        flush unless @operation_queue.will_fit?(delete_op)

        @operation_queue.add(delete_op)
        @queued[:deleted_document_count] += 1
      end

      def delete_multiple(ids)
        ids.each { |id| delete(id) }
      end

      # Sends everything currently queued in one bulk request.
      #
      # The queued counters describe exactly the operations that were just
      # popped from the queue, so they are cleared whether or not the request
      # succeeds; they are added to the completed stats only on success.
      # An error raised by the client still propagates to the caller.
      def flush
        data = @operation_queue.pop_all
        return if data.empty?

        begin
          @client.bulk(:body => data, :pipeline => @request_pipeline)

          @completed.each_key { |stat| @completed[stat] += @queued[stat] }
        ensure
          @queued = empty_stats
        end
      end

      # @return [Hash] a copy of the stats for operations already sent
      def ingestion_stats
        @completed.dup
      end

      private

      def empty_stats
        {
          :indexed_document_count => 0,
          :deleted_document_count => 0,
          :indexed_document_volume => 0
        }
      end

      def serialize(document)
        Elasticsearch::API.serializer.dump(document)
      end
    end
  end
end