connectors_service 8.6.0.4 → 8.7.0.0.pre.20221117T004928Z

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51)
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +9 -8
  3. data/lib/app/app.rb +4 -0
  4. data/lib/app/config.rb +3 -0
  5. data/lib/app/dispatcher.rb +44 -17
  6. data/lib/app/preflight_check.rb +11 -0
  7. data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
  8. data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
  9. data/lib/connectors/base/connector.rb +43 -14
  10. data/lib/connectors/base/simple_rules_parser.rb +42 -0
  11. data/lib/connectors/example/connector.rb +6 -0
  12. data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
  13. data/lib/connectors/gitlab/connector.rb +6 -1
  14. data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
  15. data/lib/connectors/mongodb/connector.rb +47 -43
  16. data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
  17. data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
  18. data/lib/connectors/mongodb/mongo_rules_parser.rb +81 -0
  19. data/lib/connectors/sync_status.rb +6 -1
  20. data/lib/connectors/tolerable_error_helper.rb +43 -0
  21. data/lib/connectors_app/// +13 -0
  22. data/lib/core/configuration.rb +3 -1
  23. data/lib/core/connector_job.rb +210 -0
  24. data/lib/core/connector_settings.rb +52 -16
  25. data/lib/core/elastic_connector_actions.rb +320 -59
  26. data/lib/core/filtering/post_process_engine.rb +39 -0
  27. data/lib/core/filtering/post_process_result.rb +27 -0
  28. data/lib/core/filtering/simple_rule.rb +141 -0
  29. data/lib/core/filtering/validation_job_runner.rb +53 -0
  30. data/lib/core/filtering/validation_status.rb +17 -0
  31. data/lib/core/filtering.rb +17 -0
  32. data/lib/core/ingestion/es_sink.rb +118 -0
  33. data/lib/core/{output_sink.rb → ingestion.rb} +1 -5
  34. data/lib/core/jobs/consumer.rb +132 -0
  35. data/lib/core/jobs/producer.rb +26 -0
  36. data/lib/core/scheduler.rb +40 -10
  37. data/lib/core/single_scheduler.rb +1 -1
  38. data/lib/core/sync_job_runner.rb +80 -16
  39. data/lib/core.rb +4 -0
  40. data/lib/utility/bulk_queue.rb +87 -0
  41. data/lib/utility/constants.rb +7 -0
  42. data/lib/utility/error_monitor.rb +108 -0
  43. data/lib/utility/errors.rb +0 -12
  44. data/lib/utility/filtering.rb +22 -0
  45. data/lib/utility/logger.rb +1 -1
  46. data/lib/utility.rb +11 -4
  47. metadata +31 -12
  48. data/lib/core/output_sink/base_sink.rb +0 -33
  49. data/lib/core/output_sink/combined_sink.rb +0 -38
  50. data/lib/core/output_sink/console_sink.rb +0 -51
  51. data/lib/core/output_sink/es_sink.rb +0 -74
@@ -0,0 +1,210 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'active_support/core_ext/hash/indifferent_access'
10
+ require 'connectors/sync_status'
11
+ require 'core/elastic_connector_actions'
12
+ require 'utility'
13
+
14
+ module Core
15
+ class ConnectorJob
16
+ DEFAULT_PAGE_SIZE = 100
17
+
18
+ def self.fetch_by_id(job_id)
19
+ es_response = ElasticConnectorActions.get_job(job_id)
20
+ return nil unless es_response[:found]
21
+
22
+ new(es_response)
23
+ end
24
+
25
+ def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
26
+ status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
27
+
28
+ query = { bool: { must: [{ terms: status_term }] } }
29
+
30
+ return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
31
+
32
+ query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
33
+
34
+ fetch_jobs_by_query(query, page_size)
35
+ end
36
+
37
+ def self.orphaned_jobs(_page_size = DEFAULT_PAGE_SIZE)
38
+ []
39
+ end
40
+
41
+ def self.stuck_jobs(_page_size = DEFAULT_PAGE_SIZE)
42
+ []
43
+ end
44
+
45
+ def self.enqueue(_connector_id)
46
+ nil
47
+ end
48
+
49
+ def id
50
+ @elasticsearch_response[:_id]
51
+ end
52
+
53
+ def [](property_name)
54
+ @elasticsearch_response[:_source][property_name]
55
+ end
56
+
57
+ def error
58
+ self[:error]
59
+ end
60
+
61
+ def status
62
+ self[:status]
63
+ end
64
+
65
+ def in_progress?
66
+ status == Connectors::SyncStatus::IN_PROGRESS
67
+ end
68
+
69
+ def canceling?
70
+ status == Connectors::SyncStatus::CANCELING
71
+ end
72
+
73
+ def suspended?
74
+ status == Connectors::SyncStatus::SUSPENDED
75
+ end
76
+
77
+ def canceled?
78
+ status == Connectors::SyncStatus::CANCELED
79
+ end
80
+
81
+ def pending?
82
+ Connectors::SyncStatus::PENDING_STATUSES.include?(status)
83
+ end
84
+
85
+ def active?
86
+ Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
87
+ end
88
+
89
+ def terminated?
90
+ Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
91
+ end
92
+
93
+ def connector_snapshot
94
+ self[:connector] || {}
95
+ end
96
+
97
+ def connector_id
98
+ @elasticsearch_response[:_source][:connector][:id]
99
+ end
100
+
101
+ def index_name
102
+ connector_snapshot[:index_name]
103
+ end
104
+
105
+ def language
106
+ connector_snapshot[:language]
107
+ end
108
+
109
+ def service_type
110
+ connector_snapshot[:service_type]
111
+ end
112
+
113
+ def configuration
114
+ connector_snapshot[:configuration]
115
+ end
116
+
117
+ def filtering
118
+ Utility::Filtering.extract_filter(connector_snapshot[:filtering])
119
+ end
120
+
121
+ def pipeline
122
+ @elasticsearch_response[:_source][:pipeline]
123
+ end
124
+
125
+ def connector
126
+ @connector ||= ConnectorSettings.fetch_by_id(connector_id)
127
+ end
128
+
129
+ def done!(ingestion_stats = {}, connector_metadata = {})
130
+ terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
131
+ end
132
+
133
+ def error!(message, ingestion_stats = {}, connector_metadata = {})
134
+ terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
135
+ end
136
+
137
+ def cancel!(ingestion_stats = {}, connector_metadata = {})
138
+ terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
139
+ end
140
+
141
+ def with_concurrency_control
142
+ response = ElasticConnectorActions.get_job(id)
143
+
144
+ yield response, response['_seq_no'], response['_primary_term']
145
+ end
146
+
147
+ def make_running!
148
+ with_concurrency_control do |es_doc, seq_no, primary_term|
149
+ now = Time.now
150
+ doc = {
151
+ status: Connectors::SyncStatus::IN_PROGRESS,
152
+ started_at: now,
153
+ last_seen: now,
154
+ worker_hostname: Socket.gethostname
155
+ }
156
+
157
+ ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
158
+ end
159
+ end
160
+
161
+ def es_source
162
+ @elasticsearch_response[:_source]
163
+ end
164
+
165
+ private
166
+
167
+ def self.fetch_jobs_by_query(query, page_size)
168
+ results = []
169
+ offset = 0
170
+ loop do
171
+ response = ElasticConnectorActions.search_jobs(query, page_size, offset)
172
+
173
+ hits = response.dig('hits', 'hits') || []
174
+ total = response.dig('hits', 'total', 'value') || 0
175
+ results += hits.map { |hit| new(hit) }
176
+ break if results.size >= total
177
+ offset += hits.size
178
+ end
179
+
180
+ results
181
+ end
182
+
183
+ def initialize(es_response)
184
+ # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
185
+ @elasticsearch_response = es_response.with_indifferent_access
186
+ end
187
+
188
+ def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
189
+ ingestion_stats ||= {}
190
+ ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
191
+ doc = {
192
+ :last_seen => Time.now,
193
+ :completed_at => Time.now,
194
+ :status => status,
195
+ :error => error
196
+ }.merge(ingestion_stats)
197
+ doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
198
+ doc[:metadata] = connector_metadata if connector_metadata&.any?
199
+ ElasticConnectorActions.update_job_fields(id, doc)
200
+ end
201
+
202
+ def seq_no
203
+ @elasticsearch_response[:_seq_no]
204
+ end
205
+
206
+ def primary_term
207
+ @elasticsearch_response[:_primary_term]
208
+ end
209
+ end
210
+ end
@@ -23,24 +23,24 @@ module Core
23
23
 
24
24
  DEFAULT_PAGE_SIZE = 100
25
25
 
26
- # Error Classes
27
- class ConnectorNotFoundError < StandardError; end
28
-
29
26
  def self.fetch_by_id(connector_id)
30
27
  es_response = ElasticConnectorActions.get_connector(connector_id)
31
- connectors_meta = ElasticConnectorActions.connectors_meta
28
+ return nil unless es_response[:found]
32
29
 
33
- raise ConnectorNotFoundError.new("Connector with id=#{connector_id} was not found.") unless es_response[:found]
30
+ connectors_meta = ElasticConnectorActions.connectors_meta
34
31
  new(es_response, connectors_meta)
35
32
  end
36
33
 
37
- def initialize(es_response, connectors_meta)
38
- @elasticsearch_response = es_response.with_indifferent_access
39
- @connectors_meta = connectors_meta.with_indifferent_access
40
- end
41
-
42
34
  def self.fetch_native_connectors(page_size = DEFAULT_PAGE_SIZE)
43
- query = { term: { is_native: true } }
35
+ require 'connectors/registry' unless defined?(Connectors::REGISTRY)
36
+ query = {
37
+ bool: {
38
+ filter: [
39
+ { term: { is_native: true } },
40
+ { terms: { service_type: Connectors::REGISTRY.registered_connectors } }
41
+ ]
42
+ }
43
+ }
44
44
  fetch_connectors_by_query(query, page_size)
45
45
  end
46
46
 
@@ -83,23 +83,26 @@ module Core
83
83
  end
84
84
 
85
85
  def filtering
86
- Utility::Common.return_if_present(@elasticsearch_response[:filtering], DEFAULT_FILTERING)
86
+ # assume for now, that first object in filtering array or a filter object itself is the only filtering object
87
+ filtering = @elasticsearch_response.dig(:_source, :filtering)
88
+
89
+ Utility::Filtering.extract_filter(filtering)
87
90
  end
88
91
 
89
92
  def request_pipeline
90
- Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
93
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
91
94
  end
92
95
 
93
96
  def extract_binary_content?
94
- Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
97
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
95
98
  end
96
99
 
97
100
  def reduce_whitespace?
98
- Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
101
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
99
102
  end
100
103
 
101
104
  def run_ml_inference?
102
- Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
105
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
103
106
  end
104
107
 
105
108
  def formatted
@@ -116,6 +119,39 @@ module Core
116
119
  index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
117
120
  end
118
121
 
122
+ def ready_for_sync?
123
+ Connectors::REGISTRY.registered?(service_type) &&
124
+ valid_index_name? &&
125
+ connector_status_allows_sync?
126
+ end
127
+
128
+ def running?
129
+ @elasticsearch_response[:_source][:last_sync_status] == Connectors::SyncStatus::IN_PROGRESS
130
+ end
131
+
132
+ def update_last_sync!(job)
133
+ doc = {
134
+ :last_sync_status => job.status,
135
+ :last_synced => Time.now,
136
+ :last_sync_error => job.error,
137
+ :error => job.error
138
+ }
139
+
140
+ if job.terminated?
141
+ doc[:last_indexed_document_count] = job[:indexed_document_count]
142
+ doc[:last_deleted_document_count] = job[:deleted_document_count]
143
+ end
144
+
145
+ Core::ElasticConnectorActions.update_connector_fields(job.connector_id, doc)
146
+ end
147
+
148
+ private
149
+
150
+ def initialize(es_response, connectors_meta)
151
+ @elasticsearch_response = es_response.with_indifferent_access
152
+ @connectors_meta = connectors_meta.with_indifferent_access
153
+ end
154
+
119
155
  def self.fetch_connectors_by_query(query, page_size)
120
156
  connectors_meta = ElasticConnectorActions.connectors_meta
121
157