connectors_service 8.6.0.4 → 8.7.0.0.pre.20221117T010623Z
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/config/connectors.yml +9 -8
- data/lib/app/app.rb +4 -0
- data/lib/app/config.rb +3 -0
- data/lib/app/dispatcher.rb +44 -17
- data/lib/app/preflight_check.rb +11 -0
- data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
- data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
- data/lib/connectors/base/connector.rb +43 -14
- data/lib/connectors/base/simple_rules_parser.rb +42 -0
- data/lib/connectors/example/connector.rb +6 -0
- data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
- data/lib/connectors/gitlab/connector.rb +6 -1
- data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
- data/lib/connectors/mongodb/connector.rb +47 -43
- data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
- data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
- data/lib/connectors/mongodb/mongo_rules_parser.rb +81 -0
- data/lib/connectors/sync_status.rb +6 -1
- data/lib/connectors/tolerable_error_helper.rb +43 -0
- data/lib/connectors_app/// +13 -0
- data/lib/core/configuration.rb +3 -1
- data/lib/core/connector_job.rb +210 -0
- data/lib/core/connector_settings.rb +52 -16
- data/lib/core/elastic_connector_actions.rb +320 -59
- data/lib/core/filtering/post_process_engine.rb +39 -0
- data/lib/core/filtering/post_process_result.rb +27 -0
- data/lib/core/filtering/simple_rule.rb +141 -0
- data/lib/core/filtering/validation_job_runner.rb +53 -0
- data/lib/core/filtering/validation_status.rb +17 -0
- data/lib/core/filtering.rb +17 -0
- data/lib/core/ingestion/es_sink.rb +118 -0
- data/lib/core/{output_sink.rb → ingestion.rb} +1 -5
- data/lib/core/jobs/consumer.rb +132 -0
- data/lib/core/jobs/producer.rb +26 -0
- data/lib/core/scheduler.rb +40 -10
- data/lib/core/single_scheduler.rb +1 -1
- data/lib/core/sync_job_runner.rb +80 -16
- data/lib/core.rb +4 -0
- data/lib/utility/bulk_queue.rb +87 -0
- data/lib/utility/constants.rb +7 -0
- data/lib/utility/error_monitor.rb +108 -0
- data/lib/utility/errors.rb +0 -12
- data/lib/utility/filtering.rb +22 -0
- data/lib/utility/logger.rb +1 -1
- data/lib/utility.rb +11 -4
- metadata +31 -12
- data/lib/core/output_sink/base_sink.rb +0 -33
- data/lib/core/output_sink/combined_sink.rb +0 -38
- data/lib/core/output_sink/console_sink.rb +0 -51
- data/lib/core/output_sink/es_sink.rb +0 -74
@@ -0,0 +1,210 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
10
|
+
require 'connectors/sync_status'
|
11
|
+
require 'core/elastic_connector_actions'
|
12
|
+
require 'utility'
|
13
|
+
|
14
|
+
module Core
  # In-memory view of one sync-job document fetched from Elasticsearch.
  #
  # An instance wraps the raw Elasticsearch document (`_id`, `_source`, ...)
  # and exposes the job status, the connector snapshot stored on the job, and
  # the state transitions that write the job back through
  # ElasticConnectorActions.
  class ConnectorJob
    DEFAULT_PAGE_SIZE = 100

    # Loads a single job by id.
    #
    # @param job_id id handed to ElasticConnectorActions.get_job
    # @return [ConnectorJob, nil] nil when the response is not flagged `found`
    def self.fetch_by_id(job_id)
      es_response = ElasticConnectorActions.get_job(job_id)
      return nil unless es_response[:found]

      new(es_response)
    end

    # Returns every job whose status is one of SyncStatus::PENDING_STATUSES,
    # optionally restricted to the given connectors.
    #
    # @param connectors_ids [Array] connector ids to restrict to; empty means no restriction
    # @param page_size [Integer] page size used for each search request
    # @return [Array<ConnectorJob>]
    def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
      status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
      query = { bool: { must: [{ terms: status_term }] } }
      query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } } unless connectors_ids.empty?

      fetch_jobs_by_query(query, page_size)
    end

    # Placeholder: currently always returns an empty list.
    def self.orphaned_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: currently always returns an empty list.
    def self.stuck_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: currently does nothing and returns nil.
    def self.enqueue(_connector_id)
      nil
    end

    # Pages through ElasticConnectorActions.search_jobs and wraps every hit.
    #
    # NOTE: a bare `private` does not apply to `def self.` methods, so this
    # helper has always been callable from outside the class. It is kept that
    # way on purpose so existing callers keep working.
    #
    # @param query [Hash] Elasticsearch query body
    # @param page_size [Integer] number of hits requested per page
    # @return [Array<ConnectorJob>]
    def self.fetch_jobs_by_query(query, page_size)
      results = []
      offset = 0
      loop do
        response = ElasticConnectorActions.search_jobs(query, page_size, offset)

        hits = response.dig('hits', 'hits') || []
        total = response.dig('hits', 'total', 'value') || 0
        results.concat(hits.map { |hit| new(hit) })
        # An empty page means the offset can no longer advance; stop even if the
        # reported total was never reached, otherwise this loop would never end.
        break if hits.empty? || results.size >= total

        offset += hits.size
      end

      results
    end

    # @return the `_id` of the wrapped Elasticsearch document
    def id
      @elasticsearch_response[:_id]
    end

    # Reads a top-level property from the job's `_source`.
    def [](property_name)
      @elasticsearch_response[:_source][property_name]
    end

    def error
      self[:error]
    end

    def status
      self[:status]
    end

    def in_progress?
      status == Connectors::SyncStatus::IN_PROGRESS
    end

    def canceling?
      status == Connectors::SyncStatus::CANCELING
    end

    def suspended?
      status == Connectors::SyncStatus::SUSPENDED
    end

    def canceled?
      status == Connectors::SyncStatus::CANCELED
    end

    def pending?
      Connectors::SyncStatus::PENDING_STATUSES.include?(status)
    end

    def active?
      Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
    end

    def terminated?
      Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
    end

    # @return the `connector` object stored on the job, or an empty hash when absent
    def connector_snapshot
      self[:connector] || {}
    end

    # Goes through connector_snapshot so a job without a `connector` object
    # yields nil instead of raising NoMethodError.
    def connector_id
      connector_snapshot[:id]
    end

    def index_name
      connector_snapshot[:index_name]
    end

    def language
      connector_snapshot[:language]
    end

    def service_type
      connector_snapshot[:service_type]
    end

    def configuration
      connector_snapshot[:configuration]
    end

    # @return the result of Utility::Filtering.extract_filter applied to the snapshot's filtering
    def filtering
      Utility::Filtering.extract_filter(connector_snapshot[:filtering])
    end

    def pipeline
      self[:pipeline]
    end

    # Lazily fetches, and memoizes, the settings of the connector this job belongs to.
    def connector
      @connector ||= ConnectorSettings.fetch_by_id(connector_id)
    end

    # Marks the job as completed.
    def done!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
    end

    # Marks the job as failed with the given error message.
    def error!(message, ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
    end

    # Marks the job as canceled.
    def cancel!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
    end

    # Re-reads the job and yields the fresh document together with its
    # `_seq_no` and `_primary_term`, so the caller can pass them along with
    # its write.
    def with_concurrency_control
      response = ElasticConnectorActions.get_job(id)

      yield response, response['_seq_no'], response['_primary_term']
    end

    # Moves the job to IN_PROGRESS and records start time and worker hostname.
    #
    # NOTE(review): uses Socket, which this file does not require itself;
    # presumably it is loaded elsewhere - confirm.
    def make_running!
      with_concurrency_control do |es_doc, seq_no, primary_term|
        now = Time.now
        doc = {
          status: Connectors::SyncStatus::IN_PROGRESS,
          started_at: now,
          last_seen: now,
          worker_hostname: Socket.gethostname
        }

        ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
      end
    end

    # @return the `_source` of the wrapped Elasticsearch document
    def es_source
      @elasticsearch_response[:_source]
    end

    private

    def initialize(es_response)
      # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
      @elasticsearch_response = es_response.with_indifferent_access
    end

    # Writes the terminal state of the job.
    #
    # @param status terminal status to store
    # @param error error message to store, or nil
    # @param ingestion_stats [Hash, nil] extra fields merged into the update; not modified
    # @param connector_metadata [Hash, nil] stored under `metadata` when non-empty
    def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
      # Work on a copy so the caller's hash is left untouched.
      stats = (ingestion_stats || {}).merge(
        total_document_count: ElasticConnectorActions.document_count(index_name)
      )
      # One timestamp keeps last_seen / completed_at / canceled_at consistent.
      now = Time.now
      doc = {
        last_seen: now,
        completed_at: now,
        status: status,
        error: error
      }.merge(stats)
      doc[:canceled_at] = now if status == Connectors::SyncStatus::CANCELED
      doc[:metadata] = connector_metadata if connector_metadata&.any?
      ElasticConnectorActions.update_job_fields(id, doc)
    end

    def seq_no
      @elasticsearch_response[:_seq_no]
    end

    def primary_term
      @elasticsearch_response[:_primary_term]
    end
  end
end
|
@@ -23,24 +23,24 @@ module Core
|
|
23
23
|
|
24
24
|
DEFAULT_PAGE_SIZE = 100
|
25
25
|
|
26
|
-
# Error Classes
|
27
|
-
class ConnectorNotFoundError < StandardError; end
|
28
|
-
|
29
26
|
def self.fetch_by_id(connector_id)
|
30
27
|
es_response = ElasticConnectorActions.get_connector(connector_id)
|
31
|
-
|
28
|
+
return nil unless es_response[:found]
|
32
29
|
|
33
|
-
|
30
|
+
connectors_meta = ElasticConnectorActions.connectors_meta
|
34
31
|
new(es_response, connectors_meta)
|
35
32
|
end
|
36
33
|
|
37
|
-
def initialize(es_response, connectors_meta)
|
38
|
-
@elasticsearch_response = es_response.with_indifferent_access
|
39
|
-
@connectors_meta = connectors_meta.with_indifferent_access
|
40
|
-
end
|
41
|
-
|
42
34
|
def self.fetch_native_connectors(page_size = DEFAULT_PAGE_SIZE)
|
43
|
-
|
35
|
+
require 'connectors/registry' unless defined?(Connectors::REGISTRY)
|
36
|
+
query = {
|
37
|
+
bool: {
|
38
|
+
filter: [
|
39
|
+
{ term: { is_native: true } },
|
40
|
+
{ terms: { service_type: Connectors::REGISTRY.registered_connectors } }
|
41
|
+
]
|
42
|
+
}
|
43
|
+
}
|
44
44
|
fetch_connectors_by_query(query, page_size)
|
45
45
|
end
|
46
46
|
|
@@ -83,23 +83,26 @@ module Core
|
|
83
83
|
end
|
84
84
|
|
85
85
|
def filtering
|
86
|
-
|
86
|
+
# assume for now, that first object in filtering array or a filter object itself is the only filtering object
|
87
|
+
filtering = @elasticsearch_response.dig(:_source, :filtering)
|
88
|
+
|
89
|
+
Utility::Filtering.extract_filter(filtering)
|
87
90
|
end
|
88
91
|
|
89
92
|
def request_pipeline
|
90
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
|
93
|
+
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
|
91
94
|
end
|
92
95
|
|
93
96
|
def extract_binary_content?
|
94
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
|
97
|
+
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
|
95
98
|
end
|
96
99
|
|
97
100
|
def reduce_whitespace?
|
98
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
|
101
|
+
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
|
99
102
|
end
|
100
103
|
|
101
104
|
def run_ml_inference?
|
102
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
|
105
|
+
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
|
103
106
|
end
|
104
107
|
|
105
108
|
def formatted
|
@@ -116,6 +119,39 @@ module Core
|
|
116
119
|
index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
|
117
120
|
end
|
118
121
|
|
122
|
+
# Tells whether a sync may be started for this connector: the service type must
# be registered, the index name must be valid and the connector status must
# allow syncing. Checks run in that order and stop at the first falsy result,
# which is returned unchanged.
def ready_for_sync?
  registered = Connectors::REGISTRY.registered?(service_type)
  return registered unless registered

  index_ok = valid_index_name?
  return index_ok unless index_ok

  connector_status_allows_sync?
end
|
127
|
+
|
128
|
+
# Tells whether the connector's last recorded sync status is IN_PROGRESS.
#
# Reads the value with `dig`, like the other `_source` readers in this class,
# so a response without `_source` yields false instead of raising
# NoMethodError.
#
# @return [Boolean]
def running?
  @elasticsearch_response.dig(:_source, :last_sync_status) == Connectors::SyncStatus::IN_PROGRESS
end
|
131
|
+
|
132
|
+
# Copies the outcome of the given job onto the connector document identified by
# the job's connector id: last sync status, time and error, plus the indexed and
# deleted document counts once the job has terminated.
#
# @param job object responding to status, error, terminated?, [] and connector_id
def update_last_sync!(job)
  fields = {
    last_sync_status: job.status,
    last_synced: Time.now,
    last_sync_error: job.error,
    error: job.error
  }

  # Document counts are only recorded for terminated jobs.
  if job.terminated?
    fields.merge!(
      last_indexed_document_count: job[:indexed_document_count],
      last_deleted_document_count: job[:deleted_document_count]
    )
  end

  Core::ElasticConnectorActions.update_connector_fields(job.connector_id, fields)
end
|
147
|
+
|
148
|
+
private
|
149
|
+
|
150
|
+
# Stores the raw connector document and the connectors meta, both converted
# with `with_indifferent_access`.
#
# @param es_response object responding to with_indifferent_access (the connector document)
# @param connectors_meta object responding to with_indifferent_access (the connectors meta)
def initialize(es_response, connectors_meta)
  @elasticsearch_response, @connectors_meta =
    [es_response, connectors_meta].map(&:with_indifferent_access)
end
|
154
|
+
|
119
155
|
def self.fetch_connectors_by_query(query, page_size)
|
120
156
|
connectors_meta = ElasticConnectorActions.connectors_meta
|
121
157
|
|