connectors_service 8.6.0.4 → 8.7.0.0.pre.20221117T004928Z

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +9 -8
  3. data/lib/app/app.rb +4 -0
  4. data/lib/app/config.rb +3 -0
  5. data/lib/app/dispatcher.rb +44 -17
  6. data/lib/app/preflight_check.rb +11 -0
  7. data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
  8. data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
  9. data/lib/connectors/base/connector.rb +43 -14
  10. data/lib/connectors/base/simple_rules_parser.rb +42 -0
  11. data/lib/connectors/example/connector.rb +6 -0
  12. data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
  13. data/lib/connectors/gitlab/connector.rb +6 -1
  14. data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
  15. data/lib/connectors/mongodb/connector.rb +47 -43
  16. data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
  17. data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
  18. data/lib/connectors/mongodb/mongo_rules_parser.rb +81 -0
  19. data/lib/connectors/sync_status.rb +6 -1
  20. data/lib/connectors/tolerable_error_helper.rb +43 -0
  21. data/lib/connectors_app/// +13 -0
  22. data/lib/core/configuration.rb +3 -1
  23. data/lib/core/connector_job.rb +210 -0
  24. data/lib/core/connector_settings.rb +52 -16
  25. data/lib/core/elastic_connector_actions.rb +320 -59
  26. data/lib/core/filtering/post_process_engine.rb +39 -0
  27. data/lib/core/filtering/post_process_result.rb +27 -0
  28. data/lib/core/filtering/simple_rule.rb +141 -0
  29. data/lib/core/filtering/validation_job_runner.rb +53 -0
  30. data/lib/core/filtering/validation_status.rb +17 -0
  31. data/lib/core/filtering.rb +17 -0
  32. data/lib/core/ingestion/es_sink.rb +118 -0
  33. data/lib/core/{output_sink.rb → ingestion.rb} +1 -5
  34. data/lib/core/jobs/consumer.rb +132 -0
  35. data/lib/core/jobs/producer.rb +26 -0
  36. data/lib/core/scheduler.rb +40 -10
  37. data/lib/core/single_scheduler.rb +1 -1
  38. data/lib/core/sync_job_runner.rb +80 -16
  39. data/lib/core.rb +4 -0
  40. data/lib/utility/bulk_queue.rb +87 -0
  41. data/lib/utility/constants.rb +7 -0
  42. data/lib/utility/error_monitor.rb +108 -0
  43. data/lib/utility/errors.rb +0 -12
  44. data/lib/utility/filtering.rb +22 -0
  45. data/lib/utility/logger.rb +1 -1
  46. data/lib/utility.rb +11 -4
  47. metadata +31 -12
  48. data/lib/core/output_sink/base_sink.rb +0 -33
  49. data/lib/core/output_sink/combined_sink.rb +0 -38
  50. data/lib/core/output_sink/console_sink.rb +0 -51
  51. data/lib/core/output_sink/es_sink.rb +0 -74
@@ -0,0 +1,210 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'active_support/core_ext/hash/indifferent_access'
10
+ require 'connectors/sync_status'
11
+ require 'core/elastic_connector_actions'
12
+ require 'utility'
13
+
14
module Core
  # Wrapper around one sync-job document as returned by ElasticConnectorActions
  # (either a single `get_job` response or one search hit).
  #
  # Exposes the job's fields and status predicates, and writes state
  # transitions (running / completed / error / canceled) back through
  # ElasticConnectorActions.update_job_fields.
  class ConnectorJob
    DEFAULT_PAGE_SIZE = 100

    # Loads a single job by its document id.
    #
    # @param job_id [String] id of the job document
    # @return [ConnectorJob, nil] nil when the response is not marked as found
    def self.fetch_by_id(job_id)
      es_response = ElasticConnectorActions.get_job(job_id)
      return nil unless es_response[:found]

      new(es_response)
    end

    # Returns every job whose status is one of SyncStatus::PENDING_STATUSES,
    # optionally restricted to the given connectors.
    #
    # @param connectors_ids [Array<String>] when non-empty, only jobs whose
    #   `connector.id` is in this list are returned
    # @param page_size [Integer] number of jobs requested per search call
    # @return [Array<ConnectorJob>]
    def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
      must_clauses = [{ terms: { status: Connectors::SyncStatus::PENDING_STATUSES } }]
      must_clauses << { terms: { 'connector.id' => connectors_ids } } unless connectors_ids.empty?

      fetch_jobs_by_query({ bool: { must: must_clauses } }, page_size)
    end

    # Placeholder: currently always returns an empty list.
    def self.orphaned_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: currently always returns an empty list.
    def self.stuck_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: currently does nothing and returns nil.
    def self.enqueue(_connector_id)
      nil
    end

    # Pages through ElasticConnectorActions.search_jobs and wraps every hit.
    #
    # Intended as an internal helper for the query methods above. It remains
    # publicly callable, because `private` does not apply to `def self.` methods.
    #
    # @param query [Hash] search query passed through unchanged
    # @param page_size [Integer] page size passed through unchanged
    # @return [Array<ConnectorJob>]
    def self.fetch_jobs_by_query(query, page_size)
      results = []
      offset = 0
      loop do
        response = ElasticConnectorActions.search_jobs(query, page_size, offset)

        hits = response.dig('hits', 'hits') || []
        total = response.dig('hits', 'total', 'value') || 0
        results.concat(hits.map { |hit| new(hit) })

        # An empty page means there is nothing more to read, even if the
        # reported total is larger (e.g. documents changed between pages).
        # Without this guard the offset would never advance and the loop
        # would never end.
        break if hits.empty? || results.size >= total

        offset += hits.size
      end

      results
    end

    # @return [String] the job document id
    def id
      @elasticsearch_response[:_id]
    end

    # Reads a field from the job document's `_source`.
    def [](property_name)
      es_source[property_name]
    end

    def error
      self[:error]
    end

    def status
      self[:status]
    end

    def in_progress?
      status == Connectors::SyncStatus::IN_PROGRESS
    end

    def canceling?
      status == Connectors::SyncStatus::CANCELING
    end

    def suspended?
      status == Connectors::SyncStatus::SUSPENDED
    end

    def canceled?
      status == Connectors::SyncStatus::CANCELED
    end

    def pending?
      Connectors::SyncStatus::PENDING_STATUSES.include?(status)
    end

    def active?
      Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
    end

    def terminated?
      Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
    end

    # The `connector` object stored on the job document, or an empty hash
    # when the document has none.
    def connector_snapshot
      self[:connector] || {}
    end

    # Id of the connector this job belongs to, read from the snapshot.
    # Returns nil (instead of raising) when the job has no connector snapshot,
    # like the other snapshot-backed readers below.
    def connector_id
      connector_snapshot[:id]
    end

    def index_name
      connector_snapshot[:index_name]
    end

    def language
      connector_snapshot[:language]
    end

    def service_type
      connector_snapshot[:service_type]
    end

    def configuration
      connector_snapshot[:configuration]
    end

    # Filtering taken from the connector snapshot, passed through
    # Utility::Filtering.extract_filter.
    def filtering
      Utility::Filtering.extract_filter(connector_snapshot[:filtering])
    end

    def pipeline
      self[:pipeline]
    end

    # Connector settings for #connector_id, fetched on first use and memoized
    # once a non-nil value has been returned.
    def connector
      @connector ||= ConnectorSettings.fetch_by_id(connector_id)
    end

    # Marks the job as completed.
    def done!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
    end

    # Marks the job as failed with the given error message.
    def error!(message, ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
    end

    # Marks the job as canceled.
    def cancel!(ingestion_stats = {}, connector_metadata = {})
      terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
    end

    # Re-reads the job document and yields it together with its `_seq_no` and
    # `_primary_term`, so the block can pass them along with its write.
    #
    # @yieldparam es_doc the freshly fetched job document
    # @yieldparam seq_no the document's `_seq_no`
    # @yieldparam primary_term the document's `_primary_term`
    def with_concurrency_control
      response = ElasticConnectorActions.get_job(id)

      yield response, response['_seq_no'], response['_primary_term']
    end

    # Moves the job to IN_PROGRESS, stamping start time, last-seen time and
    # the hostname of the current machine.
    #
    # NOTE(review): nothing visible in this file requires 'socket' explicitly;
    # confirm it is loaded transitively before this method runs.
    def make_running!
      with_concurrency_control do |es_doc, seq_no, primary_term|
        now = Time.now
        doc = {
          status: Connectors::SyncStatus::IN_PROGRESS,
          started_at: now,
          last_seen: now,
          worker_hostname: Socket.gethostname
        }

        ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
      end
    end

    # @return the `_source` of the job document
    def es_source
      @elasticsearch_response[:_source]
    end

    private

    def initialize(es_response)
      # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
      @elasticsearch_response = es_response.with_indifferent_access
    end

    # Writes a terminal state for the job.
    #
    # @param status one of the Connectors::SyncStatus values
    # @param error [String, nil] error message stored on the job
    # @param ingestion_stats [Hash, nil] extra fields merged into the update;
    #   the caller's hash is left untouched
    # @param connector_metadata [Hash, nil] stored under `metadata` when non-empty
    def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
      # One timestamp for every field so they cannot drift apart.
      now = Time.now
      # Build a new hash rather than writing into the argument: callers may
      # reuse (or freeze) the stats hash they pass in.
      stats = (ingestion_stats || {}).merge(
        total_document_count: ElasticConnectorActions.document_count(index_name)
      )

      doc = {
        last_seen: now,
        completed_at: now,
        status: status,
        error: error
      }.merge(stats)
      doc[:canceled_at] = now if status == Connectors::SyncStatus::CANCELED
      doc[:metadata] = connector_metadata if connector_metadata&.any?

      ElasticConnectorActions.update_job_fields(id, doc)
    end

    def seq_no
      @elasticsearch_response[:_seq_no]
    end

    def primary_term
      @elasticsearch_response[:_primary_term]
    end
  end
end
@@ -23,24 +23,24 @@ module Core
23
23
 
24
24
  DEFAULT_PAGE_SIZE = 100
25
25
 
26
- # Error Classes
27
- class ConnectorNotFoundError < StandardError; end
28
-
29
26
  def self.fetch_by_id(connector_id)
30
27
  es_response = ElasticConnectorActions.get_connector(connector_id)
31
- connectors_meta = ElasticConnectorActions.connectors_meta
28
+ return nil unless es_response[:found]
32
29
 
33
- raise ConnectorNotFoundError.new("Connector with id=#{connector_id} was not found.") unless es_response[:found]
30
+ connectors_meta = ElasticConnectorActions.connectors_meta
34
31
  new(es_response, connectors_meta)
35
32
  end
36
33
 
37
- def initialize(es_response, connectors_meta)
38
- @elasticsearch_response = es_response.with_indifferent_access
39
- @connectors_meta = connectors_meta.with_indifferent_access
40
- end
41
-
42
34
  def self.fetch_native_connectors(page_size = DEFAULT_PAGE_SIZE)
43
- query = { term: { is_native: true } }
35
+ require 'connectors/registry' unless defined?(Connectors::REGISTRY)
36
+ query = {
37
+ bool: {
38
+ filter: [
39
+ { term: { is_native: true } },
40
+ { terms: { service_type: Connectors::REGISTRY.registered_connectors } }
41
+ ]
42
+ }
43
+ }
44
44
  fetch_connectors_by_query(query, page_size)
45
45
  end
46
46
 
@@ -83,23 +83,26 @@ module Core
83
83
  end
84
84
 
85
85
  def filtering
86
- Utility::Common.return_if_present(@elasticsearch_response[:filtering], DEFAULT_FILTERING)
86
+ # assume for now, that first object in filtering array or a filter object itself is the only filtering object
87
+ filtering = @elasticsearch_response.dig(:_source, :filtering)
88
+
89
+ Utility::Filtering.extract_filter(filtering)
87
90
  end
88
91
 
89
92
  def request_pipeline
90
- Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
93
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
91
94
  end
92
95
 
93
96
  def extract_binary_content?
94
- Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
97
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
95
98
  end
96
99
 
97
100
  def reduce_whitespace?
98
- Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
101
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
99
102
  end
100
103
 
101
104
  def run_ml_inference?
102
- Utility::Common.return_if_present(@elasticsearch_response.dig(:pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
105
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
103
106
  end
104
107
 
105
108
  def formatted
@@ -116,6 +119,39 @@ module Core
116
119
  index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
117
120
  end
118
121
 
122
# A connector can be synced when its service type is known to the registry,
# its index name is valid, and its status allows a sync. The checks run in
# that order and stop at the first one that fails.
def ready_for_sync?
  registered = Connectors::REGISTRY.registered?(service_type)

  registered && valid_index_name? && connector_status_allows_sync?
end
127
+
128
# True when the connector document's last_sync_status is IN_PROGRESS.
def running?
  last_status = @elasticsearch_response[:_source][:last_sync_status]

  last_status == Connectors::SyncStatus::IN_PROGRESS
end
131
+
132
# Copies the outcome of a sync job onto the connector document.
#
# Status, error and the current time are always written; the indexed/deleted
# document counts are added only once the job reports itself as terminated.
# The update is addressed to the connector id carried by the job.
#
# @param job the sync job whose result should be recorded
def update_last_sync!(job)
  fields = {
    last_sync_status: job.status,
    last_synced: Time.now,
    last_sync_error: job.error,
    error: job.error
  }

  if job.terminated?
    fields.merge!(
      last_indexed_document_count: job[:indexed_document_count],
      last_deleted_document_count: job[:deleted_document_count]
    )
  end

  Core::ElasticConnectorActions.update_connector_fields(job.connector_id, fields)
end
147
+
148
+ private
149
+
150
# Stores the connector document and the connectors metadata, each converted
# with with_indifferent_access.
#
# @param es_response the connector document as returned by the search/get call
# @param connectors_meta the connectors metadata used for pipeline defaults
def initialize(es_response, connectors_meta)
  @elasticsearch_response, @connectors_meta =
    es_response.with_indifferent_access, connectors_meta.with_indifferent_access
end
154
+
119
155
  def self.fetch_connectors_by_query(query, page_size)
120
156
  connectors_meta = ElasticConnectorActions.connectors_meta
121
157