connectors_service 8.6.0.3 → 8.6.0.4.pre.20221114T233727Z

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +9 -10
  3. data/lib/app/config.rb +2 -0
  4. data/lib/app/dispatcher.rb +17 -1
  5. data/lib/app/preflight_check.rb +15 -0
  6. data/lib/connectors/base/connector.rb +37 -4
  7. data/lib/connectors/base/simple_rules_parser.rb +42 -0
  8. data/lib/connectors/connector_status.rb +4 -4
  9. data/lib/connectors/example/{example_attachments → attachments}/first_attachment.txt +0 -0
  10. data/lib/connectors/example/{example_attachments → attachments}/second_attachment.txt +0 -0
  11. data/lib/connectors/example/{example_attachments → attachments}/third_attachment.txt +0 -0
  12. data/lib/connectors/example/connector.rb +43 -4
  13. data/lib/connectors/gitlab/connector.rb +16 -2
  14. data/lib/connectors/mongodb/connector.rb +173 -50
  15. data/lib/connectors/mongodb/mongo_rules_parser.rb +81 -0
  16. data/lib/connectors/registry.rb +2 -2
  17. data/lib/connectors/sync_status.rb +23 -4
  18. data/lib/core/configuration.rb +4 -2
  19. data/lib/core/connector_job.rb +137 -0
  20. data/lib/core/connector_settings.rb +29 -18
  21. data/lib/core/elastic_connector_actions.rb +331 -32
  22. data/lib/core/filtering/post_process_engine.rb +39 -0
  23. data/lib/core/filtering/post_process_result.rb +27 -0
  24. data/lib/core/filtering/simple_rule.rb +141 -0
  25. data/lib/core/filtering/validation_job_runner.rb +53 -0
  26. data/lib/{connectors_app/// → core/filtering/validation_status.rb} +9 -5
  27. data/lib/core/filtering.rb +17 -0
  28. data/lib/core/ingestion/es_sink.rb +59 -0
  29. data/lib/core/ingestion/ingester.rb +90 -0
  30. data/lib/core/{output_sink.rb → ingestion.rb} +2 -5
  31. data/lib/core/native_scheduler.rb +3 -0
  32. data/lib/core/scheduler.rb +43 -10
  33. data/lib/core/single_scheduler.rb +3 -0
  34. data/lib/core/sync_job_runner.rb +78 -18
  35. data/lib/core.rb +2 -0
  36. data/lib/utility/bulk_queue.rb +85 -0
  37. data/lib/utility/common.rb +20 -0
  38. data/lib/utility/constants.rb +2 -0
  39. data/lib/utility/errors.rb +5 -0
  40. data/lib/utility/es_client.rb +6 -2
  41. data/lib/utility/filtering.rb +22 -0
  42. data/lib/utility/logger.rb +2 -1
  43. data/lib/utility.rb +5 -3
  44. metadata +27 -18
  45. data/lib/core/output_sink/base_sink.rb +0 -33
  46. data/lib/core/output_sink/combined_sink.rb +0 -38
  47. data/lib/core/output_sink/console_sink.rb +0 -51
  48. data/lib/core/output_sink/es_sink.rb +0 -74
@@ -0,0 +1,81 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'active_support/core_ext/object'
10
+ require 'connectors/base/simple_rules_parser'
11
+ require 'core/filtering/simple_rule'
12
+
13
module Connectors
  module MongoDB
    # Translates simple filtering rules into MongoDB-style query hashes.
    #
    # Each rule becomes a one-entry Hash keyed by the rule's field; the entry's
    # value is either the raw rule value, a Regexp, or an operator hash such as
    # { '$gt' => value }.
    class MongoRulesParser < Connectors::Base::SimpleRulesParser
      # Builds the query fragment for a single rule.
      #
      # @param rule [#field, #value, #rule, #is_include?] the rule to translate
      # @return [Hash] query fragment keyed by the rule's field
      # @raise [RuntimeError] if the value or the field is blank, or if the
      #   rule's operator is not one of the handled SimpleRule::Rule constants
      def parse_rule(rule)
        field = rule.field
        value = rule.value
        # The value is validated before the field, so a rule missing both
        # reports the missing value.
        raise "value is required for field: #{field}" unless value.present?
        raise "field is required for rule: #{rule}" unless field.present?

        operator = rule.rule
        case operator
        when Core::Filtering::SimpleRule::Rule::EQUALS then parse_equals(rule)
        when Core::Filtering::SimpleRule::Rule::GREATER_THAN then parse_greater_than(rule)
        when Core::Filtering::SimpleRule::Rule::LESS_THAN then parse_less_than(rule)
        when Core::Filtering::SimpleRule::Rule::REGEX then parse_regex(rule)
        else
          raise "Unknown operator: #{operator}"
        end
      end

      # Combines already-parsed fragments into one query.
      #
      # @param rules [Array<Hash>] parsed fragments
      # @return [Hash] {} when there are none, the fragment itself when there is
      #   exactly one, otherwise all of them wrapped under '$and'
      def merge_rules(rules)
        case rules.size
        when 0 then {}
        when 1 then rules[0]
        else { '$and' => rules }
        end
      end

      private

      # Include: plain equality on the field. Exclude: '$ne'.
      def parse_equals(rule)
        return { rule.field => rule.value } if rule.is_include?

        { rule.field => { '$ne' => rule.value } }
      end

      # Include: '$gt'. Exclude: the complementary '$lte'.
      def parse_greater_than(rule)
        return { rule.field => { '$gt' => rule.value } } if rule.is_include?

        { rule.field => { '$lte' => rule.value } }
      end

      # Include: '$lt'. Exclude: the complementary '$gte'.
      def parse_less_than(rule)
        return { rule.field => { '$lt' => rule.value } } if rule.is_include?

        { rule.field => { '$gte' => rule.value } }
      end

      # Include: the field matched against a Regexp interpolated from the rule
      # value. Exclude: the same Regexp wrapped in '$not'.
      def parse_regex(rule)
        return { rule.field => /#{rule.value}/ } if rule.is_include?

        { rule.field => { '$not' => /#{rule.value}/ } }
      end
    end
  end
end
@@ -24,10 +24,10 @@ module Connectors
24
24
  @connectors[name]
25
25
  end
26
26
 
27
- def connector(name, configuration)
27
+ def connector(name, configuration, job_description: {})
28
28
  klass = connector_class(name)
29
29
  if klass.present?
30
- return klass.new(configuration: configuration)
30
+ return klass.new(configuration: configuration, job_description: job_description)
31
31
  end
32
32
  raise "Connector #{name} is not yet registered. You need to register it before use"
33
33
  end
@@ -8,14 +8,33 @@
8
8
 
9
9
  module Connectors
10
10
  class SyncStatus
11
- COMPLETED = 'completed'
11
+ PENDING = 'pending'
12
12
  IN_PROGRESS = 'in_progress'
13
- FAILED = 'failed'
13
+ CANCELING = 'canceling'
14
+ CANCELED = 'canceled'
15
+ SUSPENDED = 'suspended'
16
+ COMPLETED = 'completed'
17
+ ERROR = 'error'
14
18
 
15
19
  STATUSES = [
16
- COMPLETED,
20
+ PENDING,
17
21
  IN_PROGRESS,
18
- FAILED
22
+ CANCELING,
23
+ CANCELED,
24
+ SUSPENDED,
25
+ COMPLETED,
26
+ ERROR
27
+ ]
28
+
29
+ PENDING_STATUES = [
30
+ PENDING,
31
+ SUSPENDED
32
+ ]
33
+
34
+ TERMINAL_STATUSES = [
35
+ CANCELED,
36
+ COMPLETED,
37
+ ERROR
19
38
  ]
20
39
  end
21
40
  end
@@ -23,9 +23,11 @@ module Core
23
23
  Utility::Logger.error("Couldn't find connector for service type #{connector_settings.service_type || service_type}")
24
24
  return
25
25
  end
26
- configuration = connector_class.configurable_fields
26
+ configuration = connector_class.configurable_fields_indifferent_access
27
+ features = connector_class.kibana_features.each_with_object({}) { |feature, hsh| hsh[feature] = true }
27
28
  doc = {
28
- :configuration => configuration
29
+ :configuration => configuration,
30
+ :features => features
29
31
  }
30
32
 
31
33
  doc[:service_type] = service_type if service_type && connector_settings.needs_service_type?
@@ -0,0 +1,137 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'active_support/core_ext/hash/indifferent_access'
10
+ require 'connectors/sync_status'
11
+ require 'core/elastic_connector_actions'
12
+ require 'utility'
13
+
14
module Core
  # Read-only view over a connector job document fetched from Elasticsearch.
  #
  # Wraps the raw response hash (its :_id and :_source) and exposes the job's
  # status together with the connector snapshot stored under :connector.
  class ConnectorJob
    DEFAULT_PAGE_SIZE = 100

    # Raised when a job lookup response is not flagged as found.
    class ConnectorJobNotFoundError < StandardError; end

    # Loads a single job by id.
    #
    # @param job_id [Object] id passed to ElasticConnectorActions.get_job
    # @return [ConnectorJob]
    # @raise [ConnectorJobNotFoundError] when the response's :found is falsy
    def self.fetch_by_id(job_id)
      es_response = ElasticConnectorActions.get_job(job_id)
      raise ConnectorJobNotFoundError, "Connector job with id=#{job_id} was not found." unless es_response[:found]

      new(es_response)
    end

    # Fetches every job whose status is one of the pending statuses.
    #
    # @param page_size [Integer] number of jobs requested per search call
    # @return [Array<ConnectorJob>]
    def self.pending_jobs(page_size = DEFAULT_PAGE_SIZE)
      # NOTE: the constant is spelled PENDING_STATUES (sic) in Connectors::SyncStatus.
      query = { terms: { status: Connectors::SyncStatus::PENDING_STATUES } }
      fetch_jobs_by_query(query, page_size)
    end

    # Placeholder: currently always returns an empty list.
    def self.orphaned_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: currently always returns an empty list.
    def self.stuck_jobs(_page_size = DEFAULT_PAGE_SIZE)
      []
    end

    # Placeholder: currently does nothing and returns nil.
    def self.enqueue(_connector_id)
      nil
    end

    # Pages through ElasticConnectorActions.search_jobs until the reported total
    # has been collected.
    #
    # NOTE: this was originally declared below a bare `private`, which has no
    # effect on `def self.` methods, so it has always been publicly callable.
    # It stays public here to keep the interface unchanged.
    #
    # @param query [Hash] query forwarded to search_jobs
    # @param page_size [Integer] number of jobs requested per search call
    # @return [Array<ConnectorJob>]
    def self.fetch_jobs_by_query(query, page_size)
      results = []
      offset = 0
      loop do
        response = ElasticConnectorActions.search_jobs(query, page_size, offset)

        hits = response.dig('hits', 'hits') || []
        total = response.dig('hits', 'total', 'value') || 0
        results.concat(hits.map { |hit| new(hit) })
        # An empty page means no further progress is possible; stop even if the
        # reported total was never reached, otherwise the loop would never end.
        break if hits.empty? || results.size >= total

        offset += hits.size
      end

      results
    end

    # @return [Object] the :_id of the wrapped response
    def id
      @elasticsearch_response[:_id]
    end

    # Reads a property from the wrapped response's :_source.
    def [](property_name)
      @elasticsearch_response[:_source][property_name]
    end

    def status
      self[:status]
    end

    def in_progress?
      status == Connectors::SyncStatus::IN_PROGRESS
    end

    def canceling?
      status == Connectors::SyncStatus::CANCELING
    end

    # @return [Object] the value stored under :connector in the job source
    def connector_snapshot
      self[:connector]
    end

    def connector_id
      connector_snapshot[:id]
    end

    # Reads :index_name from the connector snapshot. (This previously returned
    # the snapshot's :configuration, duplicating #configuration.)
    def index_name
      connector_snapshot[:index_name]
    end

    def language
      connector_snapshot[:language]
    end

    def service_type
      connector_snapshot[:service_type]
    end

    def configuration
      connector_snapshot[:configuration]
    end

    # Passes the snapshot's :filtering value through Utility::Filtering.extract_filter.
    def filtering
      Utility::Filtering.extract_filter(connector_snapshot[:filtering])
    end

    def pipeline
      connector_snapshot[:pipeline]
    end

    # Connector settings for this job, fetched via ConnectorSettings.fetch_by_id
    # on first use and memoized afterwards.
    def connector
      @connector ||= ConnectorSettings.fetch_by_id(connector_id)
    end

    # Drops the memoized connector settings and fetches them again.
    def reload_connector!
      @connector = nil
      connector
    end

    # Re-fetches this job and replaces the wrapped response; also drops the
    # memoized connector settings.
    #
    # @return [nil]
    # @raise [ConnectorJobNotFoundError] when the response's :found is falsy
    def reload
      es_response = ElasticConnectorActions.get_job(id)
      raise ConnectorJobNotFoundError, "Connector job with id=#{id} was not found." unless es_response[:found]

      # TODO: remove the usage of with_indifferent_access. get_job is expected to return a hash
      @elasticsearch_response = es_response.with_indifferent_access
      @connector = nil
    end

    private

    # @param es_response [Hash] raw response/hit containing :_id and :_source
    def initialize(es_response)
      # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
      @elasticsearch_response = es_response.with_indifferent_access
    end
  end
end
@@ -8,6 +8,7 @@
8
8
 
9
9
  require 'active_support/core_ext/hash/indifferent_access'
10
10
  require 'connectors/connector_status'
11
+ require 'connectors/registry'
11
12
  require 'core/elastic_connector_actions'
12
13
  require 'utility'
13
14
 
@@ -19,6 +20,8 @@ module Core
19
20
  DEFAULT_REDUCE_WHITESPACE = true
20
21
  DEFAULT_RUN_ML_INFERENCE = true
21
22
 
23
+ DEFAULT_FILTERING = {}
24
+
22
25
  DEFAULT_PAGE_SIZE = 100
23
26
 
24
27
  # Error Classes
@@ -32,13 +35,15 @@ module Core
32
35
  new(es_response, connectors_meta)
33
36
  end
34
37
 
35
- def initialize(es_response, connectors_meta)
36
- @elasticsearch_response = es_response.with_indifferent_access
37
- @connectors_meta = connectors_meta.with_indifferent_access
38
- end
39
-
40
38
  def self.fetch_native_connectors(page_size = DEFAULT_PAGE_SIZE)
41
- query = { term: { is_native: true } }
39
+ query = {
40
+ bool: {
41
+ filter: [
42
+ { term: { is_native: true } },
43
+ { terms: { service_type: Connectors::REGISTRY.registered_connectors } }
44
+ ]
45
+ }
46
+ }
42
47
  fetch_connectors_by_query(query, page_size)
43
48
  end
44
49
 
@@ -80,20 +85,27 @@ module Core
80
85
  self[:scheduling]
81
86
  end
82
87
 
88
+ def filtering
89
+ # assume for now, that first object in filtering array or a filter object itself is the only filtering object
90
+ filtering = @elasticsearch_response.dig(:_source, :filtering)
91
+
92
+ Utility::Filtering.extract_filter(filtering)
93
+ end
94
+
83
95
  def request_pipeline
84
- return_if_present(@elasticsearch_response.dig(:pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
96
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :name), @connectors_meta.dig(:pipeline, :default_name), DEFAULT_REQUEST_PIPELINE)
85
97
  end
86
98
 
87
99
  def extract_binary_content?
88
- return_if_present(@elasticsearch_response.dig(:pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
100
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :extract_binary_content), @connectors_meta.dig(:pipeline, :default_extract_binary_content), DEFAULT_EXTRACT_BINARY_CONTENT)
89
101
  end
90
102
 
91
103
  def reduce_whitespace?
92
- return_if_present(@elasticsearch_response.dig(:pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
104
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :reduce_whitespace), @connectors_meta.dig(:pipeline, :default_reduce_whitespace), DEFAULT_REDUCE_WHITESPACE)
93
105
  end
94
106
 
95
107
  def run_ml_inference?
96
- return_if_present(@elasticsearch_response.dig(:pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
108
+ Utility::Common.return_if_present(@elasticsearch_response.dig(:_source, :pipeline, :run_ml_inference), @connectors_meta.dig(:pipeline, :default_run_ml_inference), DEFAULT_RUN_ML_INFERENCE)
97
109
  end
98
110
 
99
111
  def formatted
@@ -112,6 +124,11 @@ module Core
112
124
 
113
125
  private
114
126
 
127
+ def initialize(es_response, connectors_meta)
128
+ @elasticsearch_response = es_response.with_indifferent_access
129
+ @connectors_meta = connectors_meta.with_indifferent_access
130
+ end
131
+
115
132
  def self.fetch_connectors_by_query(query, page_size)
116
133
  connectors_meta = ElasticConnectorActions.connectors_meta
117
134
 
@@ -120,8 +137,8 @@ module Core
120
137
  loop do
121
138
  response = ElasticConnectorActions.search_connectors(query, page_size, offset)
122
139
 
123
- hits = response['hits']['hits']
124
- total = response['hits']['total']['value']
140
+ hits = response.dig('hits', 'hits') || []
141
+ total = response.dig('hits', 'total', 'value') || 0
125
142
  results += hits.map do |hit|
126
143
  Core::ConnectorSettings.new(hit, connectors_meta)
127
144
  end
@@ -132,11 +149,5 @@ module Core
132
149
  results
133
150
  end
134
151
 
135
- def return_if_present(*args)
136
- args.each do |arg|
137
- return arg unless arg.nil?
138
- end
139
- nil
140
- end
141
152
  end
142
153
  end