connectors_service 8.6.0.4.pre.20221116T024501Z → 8.6.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/config/connectors.yml +6 -6
- data/lib/app/app.rb +0 -4
- data/lib/app/dispatcher.rb +17 -42
- data/lib/app/preflight_check.rb +0 -11
- data/lib/connectors/base/connector.rb +14 -43
- data/lib/connectors/example/connector.rb +0 -6
- data/lib/connectors/gitlab/connector.rb +1 -6
- data/lib/connectors/mongodb/connector.rb +43 -47
- data/lib/connectors/sync_status.rb +1 -6
- data/lib/core/configuration.rb +1 -3
- data/lib/core/connector_settings.rb +16 -52
- data/lib/core/elastic_connector_actions.rb +59 -320
- data/lib/core/output_sink/base_sink.rb +33 -0
- data/lib/core/output_sink/combined_sink.rb +38 -0
- data/lib/core/output_sink/console_sink.rb +51 -0
- data/lib/core/output_sink/es_sink.rb +74 -0
- data/lib/core/{ingestion.rb → output_sink.rb} +5 -1
- data/lib/core/scheduler.rb +10 -40
- data/lib/core/single_scheduler.rb +1 -1
- data/lib/core/sync_job_runner.rb +16 -72
- data/lib/core.rb +0 -4
- data/lib/utility/constants.rb +0 -2
- data/lib/utility/errors.rb +12 -0
- data/lib/utility/logger.rb +1 -1
- data/lib/utility.rb +4 -11
- metadata +9 -27
- data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +0 -173
- data/lib/connectors/base/advanced_snippet_validator.rb +0 -34
- data/lib/connectors/base/simple_rules_parser.rb +0 -42
- data/lib/connectors/example/example_advanced_snippet_validator.rb +0 -35
- data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +0 -35
- data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +0 -22
- data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +0 -292
- data/lib/connectors/mongodb/mongo_rules_parser.rb +0 -81
- data/lib/connectors/tolerable_error_helper.rb +0 -43
- data/lib/core/connector_job.rb +0 -210
- data/lib/core/filtering/post_process_engine.rb +0 -39
- data/lib/core/filtering/post_process_result.rb +0 -27
- data/lib/core/filtering/simple_rule.rb +0 -141
- data/lib/core/filtering/validation_job_runner.rb +0 -53
- data/lib/core/filtering/validation_status.rb +0 -17
- data/lib/core/filtering.rb +0 -17
- data/lib/core/ingestion/es_sink.rb +0 -118
- data/lib/core/jobs/consumer.rb +0 -114
- data/lib/core/jobs/producer.rb +0 -26
- data/lib/utility/bulk_queue.rb +0 -85
- data/lib/utility/error_monitor.rb +0 -108
- data/lib/utility/filtering.rb +0 -22
data/lib/core/connector_job.rb
DELETED
@@ -1,210 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
-
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
-
# you may not use this file except in compliance with the Elastic License.
|
5
|
-
#
|
6
|
-
|
7
|
-
# frozen_string_literal: true
|
8
|
-
|
9
|
-
require 'active_support/core_ext/hash/indifferent_access'
|
10
|
-
require 'connectors/sync_status'
|
11
|
-
require 'core/elastic_connector_actions'
|
12
|
-
require 'utility'
|
13
|
-
|
14
|
-
module Core
|
15
|
-
# Read/update wrapper around one sync-job document returned by
# ElasticConnectorActions. Instances are built from a raw Elasticsearch
# response (a `get` result or a search hit) and expose the job's status,
# its connector snapshot and helpers that write state transitions back.
class ConnectorJob
  DEFAULT_PAGE_SIZE = 100

  # Loads a job by id.
  #
  # @param job_id [String]
  # @return [ConnectorJob, nil] nil when the response is not flagged as found
  def self.fetch_by_id(job_id)
    es_response = ElasticConnectorActions.get_job(job_id)
    return nil unless es_response[:found]

    new(es_response)
  end

  # Returns all jobs whose status is one of the pending statuses, optionally
  # restricted to the given connector ids.
  #
  # @param connectors_ids [Array<String>] empty means "any connector"
  # @param page_size [Integer] number of hits requested per search call
  # @return [Array<ConnectorJob>]
  def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
    must = [{ terms: { status: Connectors::SyncStatus::PENDING_STATUSES } }]
    must << { terms: { 'connector.id' => connectors_ids } } unless connectors_ids.empty?

    fetch_jobs_by_query({ bool: { must: must } }, page_size)
  end

  # Placeholder: always returns an empty list.
  def self.orphaned_jobs(_page_size = DEFAULT_PAGE_SIZE)
    []
  end

  # Placeholder: always returns an empty list.
  def self.stuck_jobs(_page_size = DEFAULT_PAGE_SIZE)
    []
  end

  # Placeholder: always returns nil.
  def self.enqueue(_connector_id)
    nil
  end

  # Pages through ElasticConnectorActions.search_jobs and wraps every hit.
  #
  # Intended as an internal helper. It stays callable from outside because it
  # always was: a bare `private` has no effect on `def self.` methods.
  #
  # @param query [Hash] Elasticsearch query body
  # @param page_size [Integer]
  # @return [Array<ConnectorJob>]
  def self.fetch_jobs_by_query(query, page_size)
    results = []
    offset = 0
    loop do
      response = ElasticConnectorActions.search_jobs(query, page_size, offset)

      hits = response.dig('hits', 'hits') || []
      total = response.dig('hits', 'total', 'value') || 0
      results.concat(hits.map { |hit| new(hit) })
      # An empty page means nothing more can be read, even if the reported
      # total is larger; without this check the offset would stop advancing
      # and the loop would never end.
      break if hits.empty? || results.size >= total

      offset += hits.size
    end

    results
  end

  def id
    @elasticsearch_response[:_id]
  end

  # Reads a field from the job's `_source`.
  def [](property_name)
    @elasticsearch_response[:_source][property_name]
  end

  def error
    self[:error]
  end

  def status
    self[:status]
  end

  def in_progress?
    status == Connectors::SyncStatus::IN_PROGRESS
  end

  def canceling?
    status == Connectors::SyncStatus::CANCELING
  end

  def suspended?
    status == Connectors::SyncStatus::SUSPENDED
  end

  def canceled?
    status == Connectors::SyncStatus::CANCELED
  end

  def pending?
    Connectors::SyncStatus::PENDING_STATUSES.include?(status)
  end

  def active?
    Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
  end

  def terminated?
    Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
  end

  # The `connector` object stored on the job, or an empty hash when absent.
  def connector_snapshot
    self[:connector] || {}
  end

  # Goes through connector_snapshot so a job without a connector snapshot
  # yields nil, in line with index_name/language/service_type.
  def connector_id
    connector_snapshot[:id]
  end

  def index_name
    connector_snapshot[:index_name]
  end

  def language
    connector_snapshot[:language]
  end

  def service_type
    connector_snapshot[:service_type]
  end

  def configuration
    connector_snapshot[:configuration]
  end

  def filtering
    Utility::Filtering.extract_filter(connector_snapshot[:filtering])
  end

  def pipeline
    self[:pipeline]
  end

  # Lazily fetches (and memoizes) the live connector settings for this job.
  def connector
    @connector ||= ConnectorSettings.fetch_by_id(connector_id)
  end

  # Marks the job as completed.
  def done!(ingestion_stats = {}, connector_metadata = {})
    terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
  end

  # Marks the job as failed with the given message.
  def error!(message, ingestion_stats = {}, connector_metadata = {})
    terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
  end

  # Marks the job as canceled.
  def cancel!(ingestion_stats = {}, connector_metadata = {})
    terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
  end

  # Re-reads the job and yields the fresh document together with its
  # `_seq_no` and `_primary_term`, so the block can pass them on to an update.
  def with_concurrency_control
    response = ElasticConnectorActions.get_job(id)

    yield response, response['_seq_no'], response['_primary_term']
  end

  # Moves the job to IN_PROGRESS and stamps start time and worker hostname.
  def make_running!
    with_concurrency_control do |es_doc, seq_no, primary_term|
      now = Time.now
      doc = {
        status: Connectors::SyncStatus::IN_PROGRESS,
        started_at: now,
        last_seen: now,
        worker_hostname: Socket.gethostname
      }

      ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
    end
  end

  def es_source
    @elasticsearch_response[:_source]
  end

  private

  def initialize(es_response)
    # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
    @elasticsearch_response = es_response.with_indifferent_access
  end

  # Writes the terminal state of the job.
  #
  # The caller's ingestion_stats hash is left untouched, and a single
  # timestamp is used so last_seen/completed_at/canceled_at agree.
  def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
    now = Time.now
    doc = {
      :last_seen => now,
      :completed_at => now,
      :status => status,
      :error => error
    }.merge(ingestion_stats || {})
    doc[:total_document_count] = ElasticConnectorActions.document_count(index_name)
    doc[:canceled_at] = now if status == Connectors::SyncStatus::CANCELED
    doc[:metadata] = connector_metadata if connector_metadata&.any?
    ElasticConnectorActions.update_job_fields(id, doc)
  end

  def seq_no
    @elasticsearch_response[:_seq_no]
  end

  def primary_term
    @elasticsearch_response[:_primary_term]
  end
end
|
210
|
-
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
-
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
-
# you may not use this file except in compliance with the Elastic License.
|
5
|
-
#
|
6
|
-
|
7
|
-
# frozen_string_literal: true
|
8
|
-
|
9
|
-
require 'core/filtering'
|
10
|
-
require 'utility/filtering'
|
11
|
-
|
12
|
-
module Core
|
13
|
-
module Filtering
|
14
|
-
# Applies the ordered simple rules taken from a job description to a
# document and reports which rule matched first.
class PostProcessEngine
  attr_reader :rules

  # @param job_description [Hash] job document; rules are read from
  #   `connector.filtering` (string keys)
  def initialize(job_description)
    @rules = ordered_rules(job_description.dig('connector', 'filtering'))
  end

  # Finds the first rule matching the document, falling back to the default
  # rule when none matches.
  #
  # @param document [Hash]
  # @return [PostProcessResult]
  def process(document)
    return PostProcessResult.new(document, SimpleRule::DEFAULT_RULE) if @rules.empty?

    # Stringify once instead of once per rule.
    stringified = document.stringify_keys
    matching_rule = @rules.find { |rule| rule.match?(stringified) } || SimpleRule::DEFAULT_RULE
    PostProcessResult.new(document, matching_rule)
  end

  private

  # Builds SimpleRule objects sorted by their 'order' field, leaving out the
  # default rule (it is applied implicitly as the fallback in #process).
  # A filter without a 'rules' entry is treated as having no rules.
  def ordered_rules(job_filtering)
    job_rules = Utility::Filtering.extract_filter(job_filtering)['rules'] || []

    job_rules
      .reject { |rule| rule['id'] == SimpleRule::DEFAULT_RULE_ID }
      .sort_by { |rule| rule['order'] }
      .map { |rule| SimpleRule.new(rule) }
  end
end
|
38
|
-
end
|
39
|
-
end
|
@@ -1,27 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
-
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
-
# you may not use this file except in compliance with the Elastic License.
|
5
|
-
#
|
6
|
-
|
7
|
-
# frozen_string_literal: true
|
8
|
-
|
9
|
-
require 'utility/logger'
|
10
|
-
|
11
|
-
module Core
|
12
|
-
module Filtering
|
13
|
-
# Pairs a document with the filtering rule that matched it.
# Creating an instance writes one debug log line describing the outcome.
class PostProcessResult
  attr_reader :document, :matching_rule

  # @param document [Hash] the processed document; its 'id' is used for logging
  # @param matching_rule [#id, #policy, #is_include?] the rule that matched
  def initialize(document, matching_rule)
    @matching_rule = matching_rule
    @document = document

    # The policy ("include"/"exclude") is turned into a past participle by
    # appending a "d".
    message = "Document '#{@document['id']}' matched filtering rule: #{@matching_rule.id}. It will be #{@matching_rule.policy}d"
    Utility::Logger.debug(message)
  end

  # @return [Boolean] whatever the matching rule reports for is_include?
  def is_include?
    @matching_rule.is_include?
  end
end
|
26
|
-
end
|
27
|
-
end
|
@@ -1,141 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
-
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
-
# you may not use this file except in compliance with the Elastic License.
|
5
|
-
#
|
6
|
-
|
7
|
-
# frozen_string_literal: true
|
8
|
-
|
9
|
-
require 'utility/logger'
|
10
|
-
|
11
|
-
module Core
|
12
|
-
module Filtering
|
13
|
-
# A single filtering rule: "documents whose <field> <rule> <value> are
# included/excluded". Rule values arrive as configuration data and are
# coerced to the type of the document value before comparison.
class SimpleRule
  DEFAULT_RULE_ID = 'DEFAULT'

  # Whole-string patterns (\A/\z rather than ^/$, which match per line).
  TRUTHY_PATTERN = /\A(true|t|yes|y|on|1)\z/i.freeze
  FALSY_PATTERN = /\A(false|f|no|n|off|0)\z/i.freeze
  private_constant :TRUTHY_PATTERN, :FALSY_PATTERN

  class Policy
    INCLUDE = 'include'
    EXCLUDE = 'exclude'
  end

  class Rule
    REGEX = 'regex'
    EQUALS = 'equals'
    STARTS_WITH = 'starts_with'
    ENDS_WITH = 'ends_with'
    CONTAINS = 'contains'
    LESS_THAN = '<'
    GREATER_THAN = '>'
  end

  attr_reader :policy, :field, :rule, :value, :id

  # @param rule_hash [Hash] string-keyed hash with 'policy', 'field', 'rule',
  #   'value' and 'id'
  # @raise [RuntimeError] "<key> is required" when one of the keys is missing
  def initialize(rule_hash)
    @policy = rule_hash.fetch('policy')
    @field = rule_hash.fetch('field')
    @rule = rule_hash.fetch('rule')
    @value = rule_hash.fetch('value')
    @id = rule_hash.fetch('id')
    @rule_hash = rule_hash
  rescue KeyError => e
    raise "#{e.key} is required"
  end

  # Convenience constructor taking the five attributes positionally.
  def self.from_args(id, policy, field, rule, value)
    new(
      'id' => id,
      'policy' => policy,
      'field' => field,
      'rule' => rule,
      'value' => value
    )
  end

  DEFAULT_RULE = SimpleRule.new(
    'policy' => 'include',
    'field' => '_',
    'rule' => 'regex',
    'value' => '.*',
    'id' => SimpleRule::DEFAULT_RULE_ID
  )

  # Tells whether the document satisfies this rule.
  #
  # The default rule matches everything; a document lacking the rule's field
  # never matches. Unknown rule types never match.
  #
  # @param document [Hash] document with string keys
  # @return [Boolean]
  def match?(document)
    return true if id == DEFAULT_RULE_ID

    doc_value = document[field]
    return false if doc_value.nil?

    coerced_value = coerce(doc_value)
    case rule
    when Rule::EQUALS then equal_values?(doc_value, coerced_value)
    # value.to_s: the rule value may be a number, which String#start_with?
    # and friends reject with a TypeError.
    when Rule::STARTS_WITH then doc_value.to_s.start_with?(value.to_s)
    when Rule::ENDS_WITH then doc_value.to_s.end_with?(value.to_s)
    when Rule::CONTAINS then doc_value.to_s.include?(value.to_s)
    # NOTE(review): the pattern comes straight from the rule configuration;
    # an invalid pattern raises RegexpError here.
    when Rule::REGEX then doc_value.to_s.match?(/#{value}/)
    when Rule::LESS_THAN
      left, right = comparable_pair(doc_value, coerced_value)
      left < right
    when Rule::GREATER_THAN
      left, right = comparable_pair(doc_value, coerced_value)
      left > right
    else
      false
    end
  end

  # Converts the rule's value to the type of the given document value.
  # Falls back to the value's string form (and logs at debug level) when the
  # conversion fails.
  #
  # @param doc_value [Object] value found in the document
  # @return [Object] the rule value as String, Integer, Float, DateTime/Time
  #   or "true"/"false"
  def coerce(doc_value)
    case doc_value
    when String
      value.to_s
    when Integer
      value.to_i
    when Float
      value.to_f
    when DateTime, Time
      to_date(value)
    when TrueClass, FalseClass # Ruby doesn't have a Boolean type, TIL
      to_bool(value).to_s
    else
      value.to_s
    end
  rescue StandardError => e
    Utility::Logger.debug("Failed to coerce value '#{value}' (#{value.class}) based on document value '#{doc_value}' (#{doc_value.class}) due to error: #{e.class}: #{e.message}")
    value.to_s
  end

  def is_include?
    policy == Policy::INCLUDE
  end

  def is_exclude?
    policy == Policy::EXCLUDE
  end

  # @return [Hash] the hash this rule was built from
  def to_h
    @rule_hash
  end

  private

  # Equality for the EQUALS rule. Numbers compare by value, dates by instant
  # (second resolution), everything else by string form.
  def equal_values?(doc_value, coerced_value)
    case coerced_value
    when Integer, Float
      doc_value == coerced_value
    when DateTime, Time
      if temporal?(doc_value)
        # Time#to_s and DateTime#to_s use different formats, so comparing
        # the strings would never match a Time against a parsed DateTime.
        doc_value.to_time.to_i == coerced_value.to_time.to_i
      else
        doc_value.to_s == coerced_value.to_s
      end
    else
      doc_value.to_s == coerced_value
    end
  end

  # Returns both operands as Time when both are date-like, so Time and
  # DateTime values can be ordered against each other; otherwise returns them
  # unchanged.
  def comparable_pair(doc_value, coerced_value)
    return [doc_value, coerced_value] unless temporal?(doc_value) && temporal?(coerced_value)

    [doc_value.to_time, coerced_value.to_time]
  end

  def temporal?(candidate)
    candidate.is_a?(DateTime) || candidate.is_a?(Time)
  end

  # Parses the usual textual spellings of a boolean.
  #
  # @raise [ArgumentError] when the value is neither blank nor a known spelling
  def to_bool(str)
    text = str.to_s # the rule value is not necessarily a String
    return true if str == true || text.match?(TRUTHY_PATTERN)
    return false if str == false || str.blank? || text.match?(FALSY_PATTERN)

    raise ArgumentError, "invalid value for Boolean: \"#{str}\""
  end

  # Parses a date string; when it cannot be parsed, treats it as an integer
  # timestamp instead.
  def to_date(str)
    DateTime.parse(str)
  rescue ArgumentError
    # NOTE(review): Time.at reads its argument as seconds since the epoch,
    # while the original comment here spoke of millis — confirm the unit.
    Time.at(str.to_i)
  end
end
|
140
|
-
end
|
141
|
-
end
|
@@ -1,53 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
-
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
-
# you may not use this file except in compliance with the Elastic License.
|
5
|
-
#
|
6
|
-
|
7
|
-
# frozen_string_literal: true
|
8
|
-
|
9
|
-
require 'connectors/connector_status'
|
10
|
-
require 'connectors/registry'
|
11
|
-
|
12
|
-
module Core
|
13
|
-
module Filtering
  # Guarded because core/filtering.rb defines the same constant; assigning it
  # twice produces an "already initialized constant" warning.
  DEFAULT_DOMAIN = 'DEFAULT' unless const_defined?(:DEFAULT_DOMAIN, false)

  # Validates the draft filtering of one connector and stores the validation
  # result on the connector document.
  class ValidationJobRunner
    # @param connector_settings [#id, #service_type, #filtering] settings of
    #   the connector whose draft filtering is validated
    def initialize(connector_settings)
      @connector_settings = connector_settings
      @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
      @validation_finished = false
      @status = { :error => nil }
    end

    # Runs the validation. When validation raises, an INVALID result with a
    # generic message is stored instead and the exception is logged; the
    # outcome is always reported in the log.
    def execute
      Utility::Logger.info("Starting a validation job for connector #{@connector_settings.id}.")

      validation_result = @connector_class.validate_filtering(@connector_settings.filtering[:draft])

      # currently only used for connectors -> DEFAULT domain can be assumed (will be changed with the integration of crawler)
      ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_result })

      @validation_finished = true
    rescue StandardError => e
      Utility::ExceptionTracking.log_exception(e)
      # Keep the real cause so the warning below does not claim that the
      # validation "did not finish" when it actually failed with an error.
      @status[:error] = e.message
      validation_failed_result = {
        :state => Core::Filtering::ValidationStatus::INVALID,
        :errors => [
          { :ids => [], :messages => ['Unknown problem occurred while validating, see logs for details.'] }
        ]
      }
      ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_failed_result })
    ensure
      # Reached without success and without a recorded error: execution was
      # cut short by something other than a StandardError.
      if !@validation_finished && !@status[:error].present?
        @status[:error] = 'Validation thread did not finish execution. Check connector logs for more details.'
      end

      if @status[:error]
        Utility::Logger.warn("Failed to validate filtering for connector #{@connector_settings.id} with error '#{@status[:error]}'.")
      else
        Utility::Logger.info("Successfully validated filtering for connector #{@connector_settings.id}.")
      end
    end
  end
end
|
53
|
-
end
|
@@ -1,17 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
-
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
-
# you may not use this file except in compliance with the Elastic License.
|
5
|
-
#
|
6
|
-
|
7
|
-
# frozen_string_literal: true
|
8
|
-
|
9
|
-
module Core
|
10
|
-
module Filtering
|
11
|
-
# String values used for the `state` of a filtering validation result.
class ValidationStatus
  EDITED = 'edited'
  VALID = 'valid'
  INVALID = 'invalid'
end
|
16
|
-
end
|
17
|
-
end
|
data/lib/core/filtering.rb
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
-
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
-
# you may not use this file except in compliance with the Elastic License.
|
5
|
-
#
|
6
|
-
|
7
|
-
# frozen_string_literal: true
|
8
|
-
|
9
|
-
require 'core/filtering/post_process_engine'
|
10
|
-
require 'core/filtering/post_process_result'
|
11
|
-
require 'core/filtering/simple_rule'
|
12
|
-
require 'core/filtering/validation_job_runner'
|
13
|
-
require 'core/filtering/validation_status'
|
14
|
-
|
15
|
-
# Nested form so the namespace is opened (or created) level by level instead
# of requiring `Core` to exist already.
module Core
  module Filtering
    # core/filtering/validation_job_runner.rb (required above) defines the
    # same constant; the guard avoids an "already initialized constant"
    # warning.
    DEFAULT_DOMAIN = 'DEFAULT' unless const_defined?(:DEFAULT_DOMAIN, false)
  end
end
|
@@ -1,118 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
-
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
-
# you may not use this file except in compliance with the Elastic License.
|
5
|
-
#
|
6
|
-
|
7
|
-
# frozen_string_literal: true
|
8
|
-
|
9
|
-
require 'app/config'
|
10
|
-
require 'utility/bulk_queue'
|
11
|
-
require 'utility/es_client'
|
12
|
-
require 'utility/logger'
|
13
|
-
require 'elasticsearch/api'
|
14
|
-
#
|
15
|
-
# This class is responsible for sending the data to the data storage.
|
16
|
-
# While we don't actually allow to output our data anywhere except
|
17
|
-
# Elasticsearch, we still want to be able to do so sometime in future.
|
18
|
-
#
|
19
|
-
# This class should stay simple and any change to the class should be careful
|
20
|
-
# with the thought of introducing other sinks in future.
|
21
|
-
module Core
|
22
|
-
module Ingestion
|
23
|
-
# Buffers index/delete operations in a bulk queue and sends them to
# Elasticsearch through the bulk API, keeping counters for what has been
# queued and what has been flushed.
class EsSink
  # @param index_name [String] target index of every operation
  # @param request_pipeline [String] pipeline passed along with bulk requests
  # @param bulk_queue [#will_fit?, #add, #pop_all] buffer for serialized operations
  # @param max_allowed_document_size [Integer] serialized size limit in bytes;
  #   a value of 0 or less disables the check
  def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new, max_allowed_document_size = 5 * 1024 * 1024)
    @client = Utility::EsClient.new(App::Config[:elasticsearch])
    @index_name = index_name
    @request_pipeline = request_pipeline
    @operation_queue = bulk_queue
    @max_allowed_document_size = max_allowed_document_size

    # Counters for operations waiting in the queue / already sent.
    @queued = zeroed_counters
    @completed = zeroed_counters
  end

  # Queues a document for indexing. Empty documents and documents whose
  # serialized form exceeds the size limit are skipped with a warning.
  def ingest(document)
    if document.nil? || document.empty?
      Utility::Logger.warn('Connector attempted to ingest an empty document, skipping')
      return
    end

    doc_id = document['id']
    payload = serialize(document)
    payload_size = payload.bytesize

    too_large = @max_allowed_document_size > 0 && payload_size > @max_allowed_document_size
    if too_large
      Utility::Logger.warn("Connector attempted to ingest too large document with id=#{document['id']} [#{payload_size}/#{@max_allowed_document_size}], skipping the document.")
      return
    end

    action = serialize({ 'index' => { '_index' => @index_name, '_id' => doc_id } })

    # Make room first when the pair would not fit into the queue.
    flush unless @operation_queue.will_fit?(action, payload)
    @operation_queue.add(action, payload)

    @queued[:indexed_document_count] += 1
    @queued[:indexed_document_volume] += payload_size
  end

  def ingest_multiple(documents)
    documents.each do |document|
      ingest(document)
    end
  end

  # Queues a delete operation for the given document id; nil ids are ignored.
  def delete(id)
    return if id.nil?

    action = serialize({ 'delete' => { '_index' => @index_name, '_id' => id } })
    flush unless @operation_queue.will_fit?(action)
    @operation_queue.add(action)

    @queued[:deleted_document_count] += 1
  end

  def delete_multiple(ids)
    ids.each do |doc_id|
      delete(doc_id)
    end
  end

  # Sends everything in the queue as one bulk request and moves the queued
  # counters over to the completed ones. Does nothing when the queue is empty.
  def flush
    operations = @operation_queue.pop_all
    return if operations.empty?

    @client.bulk(:body => operations, :pipeline => @request_pipeline)

    @completed.merge!(@queued) { |_name, done, waiting| done + waiting }
    @queued[:indexed_document_count] = @queued[:deleted_document_count] = @queued[:indexed_document_volume] = 0
  end

  # @return [Hash] copy of the counters for operations already flushed
  def ingestion_stats
    @completed.dup
  end

  private

  def zeroed_counters
    {
      :indexed_document_count => 0,
      :deleted_document_count => 0,
      :indexed_document_volume => 0
    }
  end

  def serialize(document)
    Elasticsearch::API.serializer.dump(document)
  end
end
|
117
|
-
end
|
118
|
-
end
|