connectors_service 8.6.0.4 → 8.7.0.0.pre.20221117T010623Z
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/config/connectors.yml +9 -8
- data/lib/app/app.rb +4 -0
- data/lib/app/config.rb +3 -0
- data/lib/app/dispatcher.rb +44 -17
- data/lib/app/preflight_check.rb +11 -0
- data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
- data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
- data/lib/connectors/base/connector.rb +43 -14
- data/lib/connectors/base/simple_rules_parser.rb +42 -0
- data/lib/connectors/example/connector.rb +6 -0
- data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
- data/lib/connectors/gitlab/connector.rb +6 -1
- data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
- data/lib/connectors/mongodb/connector.rb +47 -43
- data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
- data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
- data/lib/connectors/mongodb/mongo_rules_parser.rb +81 -0
- data/lib/connectors/sync_status.rb +6 -1
- data/lib/connectors/tolerable_error_helper.rb +43 -0
- data/lib/connectors_app/// +13 -0
- data/lib/core/configuration.rb +3 -1
- data/lib/core/connector_job.rb +210 -0
- data/lib/core/connector_settings.rb +52 -16
- data/lib/core/elastic_connector_actions.rb +320 -59
- data/lib/core/filtering/post_process_engine.rb +39 -0
- data/lib/core/filtering/post_process_result.rb +27 -0
- data/lib/core/filtering/simple_rule.rb +141 -0
- data/lib/core/filtering/validation_job_runner.rb +53 -0
- data/lib/core/filtering/validation_status.rb +17 -0
- data/lib/core/filtering.rb +17 -0
- data/lib/core/ingestion/es_sink.rb +118 -0
- data/lib/core/{output_sink.rb → ingestion.rb} +1 -5
- data/lib/core/jobs/consumer.rb +132 -0
- data/lib/core/jobs/producer.rb +26 -0
- data/lib/core/scheduler.rb +40 -10
- data/lib/core/single_scheduler.rb +1 -1
- data/lib/core/sync_job_runner.rb +80 -16
- data/lib/core.rb +4 -0
- data/lib/utility/bulk_queue.rb +87 -0
- data/lib/utility/constants.rb +7 -0
- data/lib/utility/error_monitor.rb +108 -0
- data/lib/utility/errors.rb +0 -12
- data/lib/utility/filtering.rb +22 -0
- data/lib/utility/logger.rb +1 -1
- data/lib/utility.rb +11 -4
- metadata +31 -12
- data/lib/core/output_sink/base_sink.rb +0 -33
- data/lib/core/output_sink/combined_sink.rb +0 -38
- data/lib/core/output_sink/console_sink.rb +0 -51
- data/lib/core/output_sink/es_sink.rb +0 -74
@@ -0,0 +1,210 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
10
|
+
require 'connectors/sync_status'
|
11
|
+
require 'core/elastic_connector_actions'
|
12
|
+
require 'utility'
|
13
|
+
|
14
|
+
require 'socket' # provides Socket.gethostname, used by ConnectorJob#make_running!

module Core
  # Wrapper around a single sync job document as handed back by
  # ElasticConnectorActions: a hash exposing :_id and :_source. It offers
  # readers for the job fields and the connector snapshot stored on the job,
  # plus helpers that write status transitions back through
  # ElasticConnectorActions.
  class ConnectorJob
    DEFAULT_PAGE_SIZE = 100

    # Loads one job by document id.
    #
    # @param job_id id passed to ElasticConnectorActions.get_job
    # @return [ConnectorJob, nil] nil when the response is not flagged :found
    def self.fetch_by_id(job_id)
      es_response = ElasticConnectorActions.get_job(job_id)
      return nil unless es_response[:found]

      new(es_response)
    end

    # Returns every job whose status is one of SyncStatus::PENDING_STATUSES,
    # optionally restricted to the given connector ids.
    #
    # @param connectors_ids [Array] connector ids to filter on; empty means no filter
    # @param page_size [Integer] number of hits requested per search call
    # @return [Array<ConnectorJob>]
    def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
      must = [{ terms: { status: Connectors::SyncStatus::PENDING_STATUSES } }]
      must << { terms: { 'connector.id' => connectors_ids } } unless connectors_ids.empty?

      fetch_jobs_by_query({ bool: { must: must } }, page_size)
    end

    # Placeholder: currently always returns an empty list.
    def self.orphaned_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: currently always returns an empty list.
    def self.stuck_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: currently does nothing and returns nil.
    def self.enqueue(_connector_id)
      nil
    end

    # Pages through ElasticConnectorActions.search_jobs and wraps every hit.
    #
    # NOTE: a `private` keyword does not apply to `def self.` methods, so this
    # method has always been publicly callable; it stays public here so that
    # existing callers keep working.
    #
    # @param query [Hash] query body forwarded to search_jobs
    # @param page_size [Integer] number of hits requested per call
    # @return [Array<ConnectorJob>]
    def self.fetch_jobs_by_query(query, page_size)
      results = []
      offset = 0

      loop do
        response = ElasticConnectorActions.search_jobs(query, page_size, offset)
        hits = response.dig('hits', 'hits') || []
        total = response.dig('hits', 'total', 'value') || 0

        results.concat(hits.map { |hit| new(hit) })

        # An empty page cannot advance the offset. Stop even if the reported
        # total has not been reached, otherwise the same request would be
        # repeated forever.
        break if hits.empty? || results.size >= total

        offset += hits.size
      end

      results
    end

    # @return the :_id of the wrapped document
    def id
      @elasticsearch_response[:_id]
    end

    # Reads a field from the document's :_source.
    def [](property_name)
      es_source[property_name]
    end

    def error
      self[:error]
    end

    def status
      self[:status]
    end

    def in_progress?
      status == Connectors::SyncStatus::IN_PROGRESS
    end

    def canceling?
      status == Connectors::SyncStatus::CANCELING
    end

    def suspended?
      status == Connectors::SyncStatus::SUSPENDED
    end

    def canceled?
      status == Connectors::SyncStatus::CANCELED
    end

    def pending?
      Connectors::SyncStatus::PENDING_STATUSES.include?(status)
    end

    def active?
      Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
    end

    def terminated?
      Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
    end

    # @return the :connector field of the job, or an empty hash when absent
    def connector_snapshot
      self[:connector] || {}
    end

    # Goes through #connector_snapshot so that a job without a connector
    # snapshot yields nil instead of raising NoMethodError.
    def connector_id
      connector_snapshot[:id]
    end

    def index_name
      connector_snapshot[:index_name]
    end

    def language
      connector_snapshot[:language]
    end

    def service_type
      connector_snapshot[:service_type]
    end

    def configuration
      connector_snapshot[:configuration]
    end

    def filtering
      Utility::Filtering.extract_filter(connector_snapshot[:filtering])
    end

    def pipeline
      self[:pipeline]
    end

    # Memoized lookup of the connector settings referenced by this job.
    def connector
      @connector ||= ConnectorSettings.fetch_by_id(connector_id)
    end

    # Marks the job as completed.
    def done!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
    end

    # Marks the job as failed with the given error message.
    def error!(message, ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
    end

    # Marks the job as canceled.
    def cancel!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
    end

    # Re-reads the job and yields the fresh document together with its
    # '_seq_no' and '_primary_term' values, so the block can forward them
    # with its update.
    def with_concurrency_control
      response = ElasticConnectorActions.get_job(id)

      yield response, response['_seq_no'], response['_primary_term']
    end

    # Moves the job to IN_PROGRESS and records start time and worker hostname.
    def make_running!
      with_concurrency_control do |es_doc, seq_no, primary_term|
        now = Time.now
        doc = {
          status: Connectors::SyncStatus::IN_PROGRESS,
          started_at: now,
          last_seen: now,
          worker_hostname: Socket.gethostname
        }

        ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
      end
    end

    # @return the :_source of the wrapped document
    def es_source
      @elasticsearch_response[:_source]
    end

    private

    def initialize(es_response)
      # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
      @elasticsearch_response = es_response.with_indifferent_access
    end

    # Writes a terminal state for the job.
    #
    # @param status one of the Connectors::SyncStatus values
    # @param error error message stored on the job (nil for none)
    # @param ingestion_stats [Hash, nil] merged on top of the status fields; not modified
    # @param connector_metadata [Hash, nil] stored under :metadata when non-empty
    def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
      # One timestamp for every field, so the recorded times cannot drift apart.
      now = Time.now

      # Build a new hash instead of writing into the caller's object.
      stats = (ingestion_stats || {}).merge(
        total_document_count: ElasticConnectorActions.document_count(index_name)
      )

      doc = {
        last_seen: now,
        completed_at: now,
        status: status,
        error: error
      }.merge(stats)
      doc[:canceled_at] = now if status == Connectors::SyncStatus::CANCELED
      doc[:metadata] = connector_metadata if connector_metadata&.any?

      ElasticConnectorActions.update_job_fields(id, doc)
    end

    def seq_no
      @elasticsearch_response[:_seq_no]
    end

    def primary_term
      @elasticsearch_response[:_primary_term]
    end
  end
end
|
@@ -23,24 +23,24 @@ module Core
|
|
23
23
|
|
24
24
|
DEFAULT_PAGE_SIZE = 100
|
25
25
|
|
26
|
-
# Error Classes
|
27
|
-
class ConnectorNotFoundError < StandardError; end
|
28
|
-
|
29
26
|
def self.fetch_by_id(connector_id)
|
30
27
|
es_response = ElasticConnectorActions.get_connector(connector_id)
|
31
|
-
|
28
|
+
return nil unless es_response[:found]
|
32
29
|
|
33
|
-
|
30
|
+
connectors_meta = ElasticConnectorActions.connectors_meta
|
34
31
|
new(es_response, connectors_meta)
|
35
32
|
end
|
36
33
|
|
37
|
-
def initialize(es_response, connectors_meta)
|
38
|
-
@elasticsearch_response = es_response.with_indifferent_access
|
39
|
-
@connectors_meta = connectors_meta.with_indifferent_access
|
40
|
-
end
|
41
|
-
|
42
34
|
def self.fetch_native_connectors(page_size = DEFAULT_PAGE_SIZE)
|
43
|
-
|
35
|
+
require 'connectors/registry' unless defined?(Connectors::REGISTRY)
|
36
|
+
query = {
|
37
|
+
bool: {
|
38
|
+
filter: [
|
39
|
+
{ term: { is_native: true } },
|
40
|
+
{ terms: { service_type: Connectors::REGISTRY.registered_connectors } }
|
41
|
+
]
|
42
|
+
}
|
43
|
+
}
|
44
44
|
fetch_connectors_by_query(query, page_size)
|
45
45
|
end
|
46
46
|
|
@@ -83,23 +83,26 @@ module Core
|
|
83
83
|
end
|
84
84
|
|
85
85
|
def filtering
|
86
|
-
|
86
|
+
# Assume for now that the first object in the filtering array (or the filter object itself) is the only filtering object.
|
87
|
+
filtering = @elasticsearch_response.dig(:_source, :filtering)
|
88
|
+
|
89
|
+
Utility::Filtering.extract_filter(filtering)
|
87
90
|
end
|
88
91
|
|
89
92
|
def request_pipeline
|
90
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
|
93
|
+
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
|
91
94
|
end
|
92
95
|
|
93
96
|
def extract_binary_content?
|
94
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
|
97
|
+
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
|
95
98
|
end
|
96
99
|
|
97
100
|
def reduce_whitespace?
|
98
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
|
101
|
+
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
|
99
102
|
end
|
100
103
|
|
101
104
|
def run_ml_inference?
|
102
|
-
Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
|
105
|
+
Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
|
103
106
|
end
|
104
107
|
|
105
108
|
def formatted
|
@@ -116,6 +119,39 @@ module Core
|
|
116
119
|
index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
|
117
120
|
end
|
118
121
|
|
122
|
+
# A connector may be synced only when its service type is registered in
# Connectors::REGISTRY, its index name is valid, and its status allows a sync.
# The checks short-circuit in that order.
def ready_for_sync?
  registered = Connectors::REGISTRY.registered?(service_type)

  registered && valid_index_name? && connector_status_allows_sync?
end
|
127
|
+
|
128
|
+
# True when the connector document's last_sync_status is IN_PROGRESS.
# Uses dig, like the other :_source readers of this class, so a response
# without :_source yields false instead of raising NoMethodError.
def running?
  @elasticsearch_response.dig(:_source, :last_sync_status) == Connectors::SyncStatus::IN_PROGRESS
end
|
131
|
+
|
132
|
+
# Copies the outcome of +job+ onto the connector document identified by
# job.connector_id: status, current time and error. The indexed and deleted
# document counts are only added once the job reports itself as terminated.
def update_last_sync!(job)
  fields = {
    last_sync_status: job.status,
    last_synced: Time.now,
    last_sync_error: job.error,
    error: job.error
  }

  if job.terminated?
    fields[:last_indexed_document_count] = job[:indexed_document_count]
    fields[:last_deleted_document_count] = job[:deleted_document_count]
  end

  Core::ElasticConnectorActions.update_connector_fields(job.connector_id, fields)
end
|
147
|
+
|
148
|
+
private
|
149
|
+
|
150
|
+
# Stores the connector document and the connectors meta information, both
# converted with with_indifferent_access.
def initialize(es_response, connectors_meta)
  @elasticsearch_response, @connectors_meta =
    [es_response, connectors_meta].map(&:with_indifferent_access)
end
|
154
|
+
|
119
155
|
def self.fetch_connectors_by_query(query, page_size)
|
120
156
|
connectors_meta = ElasticConnectorActions.connectors_meta
|
121
157
|
|