connectors_service 8.6.0.3 → 8.6.0.4.pre.20221114T233727Z
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/config/connectors.yml +9 -10
- data/lib/app/config.rb +2 -0
- data/lib/app/dispatcher.rb +17 -1
- data/lib/app/preflight_check.rb +15 -0
- data/lib/connectors/base/connector.rb +37 -4
- data/lib/connectors/base/simple_rules_parser.rb +42 -0
- data/lib/connectors/connector_status.rb +4 -4
- data/lib/connectors/example/{example_attachments → attachments}/first_attachment.txt +0 -0
- data/lib/connectors/example/{example_attachments → attachments}/second_attachment.txt +0 -0
- data/lib/connectors/example/{example_attachments → attachments}/third_attachment.txt +0 -0
- data/lib/connectors/example/connector.rb +43 -4
- data/lib/connectors/gitlab/connector.rb +16 -2
- data/lib/connectors/mongodb/connector.rb +173 -50
- data/lib/connectors/mongodb/mongo_rules_parser.rb +81 -0
- data/lib/connectors/registry.rb +2 -2
- data/lib/connectors/sync_status.rb +23 -4
- data/lib/core/configuration.rb +4 -2
- data/lib/core/connector_job.rb +137 -0
- data/lib/core/connector_settings.rb +29 -18
- data/lib/core/elastic_connector_actions.rb +331 -32
- data/lib/core/filtering/post_process_engine.rb +39 -0
- data/lib/core/filtering/post_process_result.rb +27 -0
- data/lib/core/filtering/simple_rule.rb +141 -0
- data/lib/core/filtering/validation_job_runner.rb +53 -0
- data/lib/{connectors_app/// → core/filtering/validation_status.rb} +9 -5
- data/lib/core/filtering.rb +17 -0
- data/lib/core/ingestion/es_sink.rb +59 -0
- data/lib/core/ingestion/ingester.rb +90 -0
- data/lib/core/{output_sink.rb → ingestion.rb} +2 -5
- data/lib/core/native_scheduler.rb +3 -0
- data/lib/core/scheduler.rb +43 -10
- data/lib/core/single_scheduler.rb +3 -0
- data/lib/core/sync_job_runner.rb +78 -18
- data/lib/core.rb +2 -0
- data/lib/utility/bulk_queue.rb +85 -0
- data/lib/utility/common.rb +20 -0
- data/lib/utility/constants.rb +2 -0
- data/lib/utility/errors.rb +5 -0
- data/lib/utility/es_client.rb +6 -2
- data/lib/utility/filtering.rb +22 -0
- data/lib/utility/logger.rb +2 -1
- data/lib/utility.rb +5 -3
- metadata +27 -18
- data/lib/core/output_sink/base_sink.rb +0 -33
- data/lib/core/output_sink/combined_sink.rb +0 -38
- data/lib/core/output_sink/console_sink.rb +0 -51
- data/lib/core/output_sink/es_sink.rb +0 -74
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8f69f05260d34b07ce34d569ce7c41fdd10349b33121823697ebbb6a4ebf9206
|
4
|
+
data.tar.gz: a2957118c80d0e2bc9ea6a8046307485c11e4f809efb01cabbb96e341dc947c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 63775eded9d9953b41950edd7ca86176200c4ae7510564f6f7995c336d6e78bbe40d494cb0a152a984b14b45055e7b7847a2779888d70905d4956b8c78d4bda1
|
7
|
+
data.tar.gz: 52b00d122ef43fc5afa0b4cb50bbe428111e7fe2cb7cee437dd2d2b6b32516cbd01c5803b0a11609a5021e85605a2e6bd2b30973807df3cae1420864e2fcb185
|
data/config/connectors.yml
CHANGED
@@ -1,11 +1,10 @@
|
|
1
1
|
# general metadata
|
2
|
-
version: 8.6.0.
|
3
|
-
repository:
|
4
|
-
revision:
|
2
|
+
version: 8.6.0.4-20221114T233727Z
|
3
|
+
repository: git@github.com:elastic/ent-search-connectors.git
|
4
|
+
revision: f506d5e5ebedfb0c6058d347d8ce22adc42e2cc0
|
5
5
|
elasticsearch:
|
6
|
-
cloud_id: CHANGEME
|
7
6
|
hosts: http://localhost:9200
|
8
|
-
api_key:
|
7
|
+
api_key: WXNYeWQ0UUJ4Y3ZQV3ctbjVibnU6REx4eE8tbFhUMU94N2JoU2hIeVFMQQ==
|
9
8
|
retry_on_failure: 3
|
10
9
|
request_timeout: 120
|
11
10
|
disable_warnings: true
|
@@ -15,11 +14,11 @@ thread_pool:
|
|
15
14
|
min_threads: 0
|
16
15
|
max_threads: 5
|
17
16
|
max_queue: 100
|
18
|
-
log_level:
|
19
|
-
ecs_logging:
|
17
|
+
log_level: debug
|
18
|
+
ecs_logging: false
|
20
19
|
poll_interval: 3
|
21
20
|
termination_timeout: 60
|
22
21
|
heartbeat_interval: 1800
|
23
|
-
native_mode:
|
24
|
-
connector_id:
|
25
|
-
service_type:
|
22
|
+
native_mode: false
|
23
|
+
connector_id: YcXyd4QBxcvPWw-n2bkA
|
24
|
+
service_type: mongodb
|
data/lib/app/config.rb
CHANGED
@@ -35,6 +35,8 @@ puts "Parsing #{CONFIG_FILE} configuration file."
|
|
35
35
|
optional(:disable_warnings).value(:bool?)
|
36
36
|
optional(:trace).value(:bool?)
|
37
37
|
optional(:log).value(:bool?)
|
38
|
+
optional(:ca_fingerprint).value(:string)
|
39
|
+
optional(:transport_options).value(:hash)
|
38
40
|
end
|
39
41
|
|
40
42
|
optional(:thread_pool).hash do
|
data/lib/app/dispatcher.rb
CHANGED
@@ -73,6 +73,8 @@ module App
|
|
73
73
|
start_heartbeat_task(connector_settings)
|
74
74
|
when :configuration
|
75
75
|
start_configuration_task(connector_settings)
|
76
|
+
when :filter_validation
|
77
|
+
start_filter_validation_task(connector_settings)
|
76
78
|
else
|
77
79
|
Utility::Logger.error("Unknown task type: #{task}. Skipping...")
|
78
80
|
end
|
@@ -84,10 +86,14 @@ module App
|
|
84
86
|
def start_sync_task(connector_settings)
|
85
87
|
start_heartbeat_task(connector_settings)
|
86
88
|
pool.post do
|
87
|
-
Utility::Logger.info("
|
89
|
+
Utility::Logger.info("Initiating a sync job for #{connector_settings.formatted}...")
|
88
90
|
Core::ElasticConnectorActions.ensure_content_index_exists(connector_settings.index_name)
|
89
91
|
job_runner = Core::SyncJobRunner.new(connector_settings)
|
90
92
|
job_runner.execute
|
93
|
+
rescue Core::JobAlreadyRunningError
|
94
|
+
Utility::Logger.info("Sync job for #{connector_settings.formatted} is already running, skipping.")
|
95
|
+
rescue Core::ConnectorVersionChangedError => e
|
96
|
+
Utility::Logger.info("Could not start the job because #{connector_settings.formatted} has been updated externally. Message: #{e.message}")
|
91
97
|
rescue StandardError => e
|
92
98
|
Utility::ExceptionTracking.log_exception(e, "Sync job for #{connector_settings.formatted} failed due to unexpected error.")
|
93
99
|
end
|
@@ -116,6 +122,16 @@ module App
|
|
116
122
|
Utility::ExceptionTracking.log_exception(e, "Configuration task for #{connector_settings.formatted} failed due to unexpected error.")
|
117
123
|
end
|
118
124
|
end
|
125
|
+
|
126
|
+
def start_filter_validation_task(connector_settings)
|
127
|
+
pool.post do
|
128
|
+
Utility::Logger.info("Validating filters for #{connector_settings.formatted}...")
|
129
|
+
validation_job_runner = Core::Filtering::ValidationJobRunner.new(connector_settings)
|
130
|
+
validation_job_runner.execute
|
131
|
+
rescue StandardError => e
|
132
|
+
Utility::ExceptionTracking.log_exception(e, "Filter validation task for #{connector_settings.formatted} failed due to unexpected error.")
|
133
|
+
end
|
134
|
+
end
|
119
135
|
end
|
120
136
|
end
|
121
137
|
end
|
data/lib/app/preflight_check.rb
CHANGED
@@ -23,6 +23,7 @@ module App
|
|
23
23
|
check_es_connection!
|
24
24
|
check_es_version!
|
25
25
|
check_system_indices!
|
26
|
+
check_single_connector!
|
26
27
|
end
|
27
28
|
|
28
29
|
private
|
@@ -59,6 +60,16 @@ module App
|
|
59
60
|
)
|
60
61
|
end
|
61
62
|
|
63
|
+
#-------------------------------------------------------------------------------------------------
|
64
|
+
# Ensures the connector is supported when running in non-native mode
|
65
|
+
def check_single_connector!
|
66
|
+
if App::Config.native_mode
|
67
|
+
Utility::Logger.info('Skip single connector check for native mode.')
|
68
|
+
elsif !Connectors::REGISTRY.registered?(App::Config.service_type)
|
69
|
+
fail_check!("The service type #{App::Config.service_type} is not supported. Terminating...")
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
62
73
|
def check_es_connection_with_retries!(retry_interval:, retry_timeout:)
|
63
74
|
started_at = Time.now
|
64
75
|
|
@@ -75,6 +86,10 @@ module App
|
|
75
86
|
else
|
76
87
|
raise UnhealthyCluster, "Unexpected cluster status: #{response['status']}"
|
77
88
|
end
|
89
|
+
rescue *Utility::AUTHORIZATION_ERRORS => e
|
90
|
+
Utility::ExceptionTracking.log_exception(e)
|
91
|
+
|
92
|
+
fail_check!("Elasticsearch returned 'Unauthorized' response. Check your authentication details. Terminating...")
|
78
93
|
rescue *App::RETRYABLE_CONNECTION_ERRORS => e
|
79
94
|
Utility::Logger.warn('Could not connect to Elasticsearch. Make sure it is running and healthy.')
|
80
95
|
Utility::Logger.debug("Error: #{e.full_message}")
|
@@ -7,10 +7,11 @@
|
|
7
7
|
# frozen_string_literal: true
|
8
8
|
|
9
9
|
require 'bson'
|
10
|
-
require 'core/
|
11
|
-
require 'utility
|
12
|
-
require 'utility/
|
10
|
+
require 'core/ingestion'
|
11
|
+
require 'utility'
|
12
|
+
require 'utility/filtering'
|
13
13
|
require 'app/config'
|
14
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
14
15
|
|
15
16
|
module Connectors
|
16
17
|
module Base
|
@@ -19,6 +20,11 @@ module Connectors
|
|
19
20
|
raise 'Not implemented for this connector'
|
20
21
|
end
|
21
22
|
|
23
|
+
# Used as a framework util method, don't override
|
24
|
+
def self.configurable_fields_indifferent_access
|
25
|
+
configurable_fields.with_indifferent_access
|
26
|
+
end
|
27
|
+
|
22
28
|
def self.configurable_fields
|
23
29
|
{}
|
24
30
|
end
|
@@ -27,8 +33,27 @@ module Connectors
|
|
27
33
|
raise 'Not implemented for this connector'
|
28
34
|
end
|
29
35
|
|
30
|
-
def
|
36
|
+
def self.kibana_features
|
37
|
+
[
|
38
|
+
Utility::Constants::FILTERING_RULES_FEATURE,
|
39
|
+
Utility::Constants::FILTERING_ADVANCED_FEATURE
|
40
|
+
]
|
41
|
+
end
|
42
|
+
|
43
|
+
def self.validate_filtering(_filtering = {})
|
44
|
+
raise 'Not implemented for this connector'
|
45
|
+
end
|
46
|
+
|
47
|
+
attr_reader :rules, :advanced_filter_config
|
48
|
+
|
49
|
+
def initialize(configuration: {}, job_description: {})
|
31
50
|
@configuration = configuration.dup || {}
|
51
|
+
@job_description = job_description&.dup || {}
|
52
|
+
|
53
|
+
filtering = Utility::Filtering.extract_filter(@job_description.dig(:connector, :filtering))
|
54
|
+
|
55
|
+
@rules = filtering[:rules] || []
|
56
|
+
@advanced_filter_config = filtering[:advanced_snippet] || {}
|
32
57
|
end
|
33
58
|
|
34
59
|
def yield_documents; end
|
@@ -52,6 +77,14 @@ module Connectors
|
|
52
77
|
Utility::ExceptionTracking.log_exception(e, "Connector for service #{self.class.service_type} failed the health check for 3rd-party service.")
|
53
78
|
false
|
54
79
|
end
|
80
|
+
|
81
|
+
def filtering_present?
|
82
|
+
@advanced_filter_config.present? && !@advanced_filter_config.empty? || @rules.present?
|
83
|
+
end
|
84
|
+
|
85
|
+
def metadata
|
86
|
+
{}
|
87
|
+
end
|
55
88
|
end
|
56
89
|
end
|
57
90
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
# frozen_string_literal: true
|
7
|
+
|
8
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
9
|
+
require 'active_support/core_ext/object/blank'
|
10
|
+
require 'core/filtering/simple_rule'
|
11
|
+
|
12
|
+
module Connectors
|
13
|
+
module Base
|
14
|
+
class SimpleRulesParser
|
15
|
+
def initialize(rules)
|
16
|
+
@rules = (rules || []).map(&:with_indifferent_access).filter { |r| r[:id] != 'DEFAULT' }.sort_by { |r| r[:order] }
|
17
|
+
end
|
18
|
+
|
19
|
+
def parse
|
20
|
+
merge_rules(@rules.map do |rule_hash|
|
21
|
+
rule = Core::Filtering::SimpleRule.new(rule_hash)
|
22
|
+
unless rule.is_include? || rule.is_exclude?
|
23
|
+
raise "Unknown policy: #{rule.policy}"
|
24
|
+
end
|
25
|
+
parse_rule(rule)
|
26
|
+
end)
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
# merge all rules into a filter object or array
|
32
|
+
# in a base case, does no transformations
|
33
|
+
def merge_rules(rules)
|
34
|
+
rules || []
|
35
|
+
end
|
36
|
+
|
37
|
+
def parse_rule(_rule)
|
38
|
+
raise 'Not implemented'
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -8,11 +8,11 @@
|
|
8
8
|
|
9
9
|
module Connectors
|
10
10
|
class ConnectorStatus
|
11
|
-
CREATED
|
11
|
+
CREATED = 'created'
|
12
12
|
NEEDS_CONFIGURATION = 'needs_configuration'
|
13
|
-
CONFIGURED
|
14
|
-
CONNECTED
|
15
|
-
ERROR
|
13
|
+
CONFIGURED = 'configured'
|
14
|
+
CONNECTED = 'connected'
|
15
|
+
ERROR = 'error'
|
16
16
|
|
17
17
|
STATUSES = [
|
18
18
|
CREATED,
|
File without changes
|
File without changes
|
File without changes
|
@@ -7,6 +7,7 @@
|
|
7
7
|
# frozen_string_literal: true
|
8
8
|
|
9
9
|
require 'connectors/base/connector'
|
10
|
+
require 'core/filtering/validation_status'
|
10
11
|
require 'utility'
|
11
12
|
|
12
13
|
module Connectors
|
@@ -20,16 +21,21 @@ module Connectors
|
|
20
21
|
'Example Connector'
|
21
22
|
end
|
22
23
|
|
24
|
+
# Field 'Foo' won't have a default value. Field 'Bar' will have the default value 'Value'.
|
23
25
|
def self.configurable_fields
|
24
26
|
{
|
25
27
|
'foo' => {
|
26
28
|
'label' => 'Foo',
|
27
29
|
'value' => nil
|
30
|
+
},
|
31
|
+
:bar => {
|
32
|
+
:label => 'Bar',
|
33
|
+
:value => 'Value'
|
28
34
|
}
|
29
35
|
}
|
30
36
|
end
|
31
37
|
|
32
|
-
def initialize(configuration: {})
|
38
|
+
def initialize(configuration: {}, job_description: {})
|
33
39
|
super
|
34
40
|
end
|
35
41
|
|
@@ -40,18 +46,51 @@ module Connectors
|
|
40
46
|
# raise 'something went wrong'
|
41
47
|
end
|
42
48
|
|
49
|
+
def self.validate_filtering(filtering = {})
|
50
|
+
# TODO: real filtering validation will follow later
|
51
|
+
errors = [
|
52
|
+
{
|
53
|
+
:ids => ['missing-implementation'],
|
54
|
+
:messages => ['Filtering is not implemented yet for the example connector']
|
55
|
+
}
|
56
|
+
]
|
57
|
+
|
58
|
+
return { :state => Core::Filtering::ValidationStatus::INVALID, :errors => errors } if filtering.present?
|
59
|
+
|
60
|
+
{ :state => Core::Filtering::ValidationStatus::VALID, :errors => [] }
|
61
|
+
end
|
62
|
+
|
43
63
|
def yield_documents
|
44
64
|
attachments = [
|
45
|
-
|
46
|
-
|
47
|
-
|
65
|
+
load_attachment('first_attachment.txt'),
|
66
|
+
load_attachment('second_attachment.txt'),
|
67
|
+
load_attachment('third_attachment.txt'),
|
48
68
|
]
|
49
69
|
|
50
70
|
attachments.each_with_index do |att, index|
|
51
71
|
data = { id: (index + 1).to_s, name: "example document #{index + 1}", _attachment: File.read(att) }
|
72
|
+
|
73
|
+
# Uncomment one of these two lines to simulate longer running sync jobs
|
74
|
+
#
|
75
|
+
# sleep(rand(10..60).seconds)
|
76
|
+
# sleep(rand(1..10).minutes)
|
77
|
+
|
52
78
|
yield data
|
53
79
|
end
|
54
80
|
end
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
def load_attachment(path)
|
85
|
+
attachment_dir = "#{File.dirname(__FILE__)}/attachments"
|
86
|
+
attachment_path = "#{attachment_dir}/#{path}"
|
87
|
+
|
88
|
+
unless File.exist?(attachment_path)
|
89
|
+
raise "Attachment at location '#{attachment_path}' doesn't exist. Attachments should be located under #{attachment_dir}"
|
90
|
+
end
|
91
|
+
|
92
|
+
File.open(attachment_path)
|
93
|
+
end
|
55
94
|
end
|
56
95
|
end
|
57
96
|
end
|
@@ -11,7 +11,7 @@ require 'connectors/base/connector'
|
|
11
11
|
require 'connectors/gitlab/extractor'
|
12
12
|
require 'connectors/gitlab/custom_client'
|
13
13
|
require 'connectors/gitlab/adapter'
|
14
|
-
require 'core/
|
14
|
+
require 'core/ingestion'
|
15
15
|
|
16
16
|
module Connectors
|
17
17
|
module GitLab
|
@@ -36,7 +36,21 @@ module Connectors
|
|
36
36
|
}
|
37
37
|
end
|
38
38
|
|
39
|
-
def
|
39
|
+
def self.validate_filtering(filtering = {})
|
40
|
+
# TODO: real filtering validation will follow later
|
41
|
+
errors = [
|
42
|
+
{
|
43
|
+
:ids => ['missing-implementation'],
|
44
|
+
:messages => ['Filtering is not implemented yet for the GitLab connector']
|
45
|
+
}
|
46
|
+
]
|
47
|
+
|
48
|
+
return { :state => Core::Filtering::ValidationStatus::INVALID, :errors => errors } if filtering.present?
|
49
|
+
|
50
|
+
{ :state => Core::Filtering::ValidationStatus::VALID, :errors => [] }
|
51
|
+
end
|
52
|
+
|
53
|
+
def initialize(configuration: {}, job_description: {})
|
40
54
|
super
|
41
55
|
|
42
56
|
@extractor = Connectors::GitLab::Extractor.new(
|
@@ -6,13 +6,20 @@
|
|
6
6
|
|
7
7
|
# frozen_string_literal: true
|
8
8
|
|
9
|
-
require 'active_support/core_ext/hash/indifferent_access'
|
10
9
|
require 'connectors/base/connector'
|
10
|
+
require 'core/filtering/validation_status'
|
11
|
+
require 'connectors/mongodb/mongo_rules_parser'
|
11
12
|
require 'mongo'
|
13
|
+
require 'utility'
|
12
14
|
|
13
15
|
module Connectors
|
14
16
|
module MongoDB
|
15
17
|
class Connector < Connectors::Base::Connector
|
18
|
+
|
19
|
+
ALLOWED_TOP_LEVEL_FILTER_KEYS = %w[find aggregate]
|
20
|
+
|
21
|
+
PAGE_SIZE = 100
|
22
|
+
|
16
23
|
def self.service_type
|
17
24
|
'mongodb'
|
18
25
|
end
|
@@ -23,28 +30,47 @@ module Connectors
|
|
23
30
|
|
24
31
|
def self.configurable_fields
|
25
32
|
{
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
33
|
+
:host => {
|
34
|
+
:label => 'Server Hostname'
|
35
|
+
},
|
36
|
+
:user => {
|
37
|
+
:label => 'Username'
|
38
|
+
},
|
39
|
+
:password => {
|
40
|
+
:label => 'Password'
|
41
|
+
},
|
42
|
+
:database => {
|
43
|
+
:label => 'Database'
|
44
|
+
},
|
45
|
+
:collection => {
|
46
|
+
:label => 'Collection'
|
47
|
+
},
|
48
|
+
:direct_connection => {
|
49
|
+
:label => 'Direct connection? (true/false)'
|
50
|
+
}
|
44
51
|
}
|
45
52
|
end
|
46
53
|
|
47
|
-
def
|
54
|
+
def self.validate_filtering(filtering = {})
|
55
|
+
valid_filtering = { :state => Core::Filtering::ValidationStatus::VALID, :errors => [] }
|
56
|
+
|
57
|
+
return valid_filtering unless filtering.present?
|
58
|
+
|
59
|
+
filter = Utility::Filtering.extract_filter(filtering)
|
60
|
+
|
61
|
+
advanced_filter_config = filter[:advanced_snippet] || {}
|
62
|
+
filter_keys = advanced_filter_config&.keys
|
63
|
+
|
64
|
+
if !filter_keys&.empty? && (filter_keys.size != 1 || !ALLOWED_TOP_LEVEL_FILTER_KEYS.include?(filter_keys[0]&.to_s))
|
65
|
+
return { :state => Core::Filtering::ValidationStatus::INVALID,
|
66
|
+
:errors => [{ :ids => ['wrong-keys'],
|
67
|
+
:messages => ["Only one of #{ALLOWED_TOP_LEVEL_FILTER_KEYS} is allowed in the filtering object. Keys present: '#{filter_keys}'."] }] }
|
68
|
+
end
|
69
|
+
|
70
|
+
valid_filtering
|
71
|
+
end
|
72
|
+
|
73
|
+
def initialize(configuration: {}, job_description: {})
|
48
74
|
super
|
49
75
|
|
50
76
|
@host = configuration.dig(:host, :value)
|
@@ -57,16 +83,105 @@ module Connectors
|
|
57
83
|
|
58
84
|
def yield_documents
|
59
85
|
with_client do |client|
|
60
|
-
|
61
|
-
|
86
|
+
# We do paging using skip().limit() here to make Ruby recycle the memory for each page pulled from the server after it's not needed any more.
|
87
|
+
# This gives us more control on the usage of the memory (we can adjust PAGE_SIZE constant for that to decrease max memory consumption).
|
88
|
+
# It's done due to the fact that usage of .find.each leads to memory leaks or overuse of memory - the whole result set seems to stay in memory
|
89
|
+
# during the sync. Sometimes (not 100% sure) it even leads to a real leak, when the memory for these objects is never recycled.
|
90
|
+
cursor, options = create_db_cursor_on_collection(client[@collection])
|
91
|
+
skip = 0
|
92
|
+
|
93
|
+
found_overall = 0
|
94
|
+
|
95
|
+
# if no overall limit is specified by filtering, use Float::INFINITY so that ingestion is never cut short (found_overall is only increased,
|
96
|
+
# thus can never reach Float::INFINITY)
|
97
|
+
overall_limit = Float::INFINITY
|
98
|
+
|
99
|
+
if options.present?
|
100
|
+
# there could be a skip parameter defined for filtering
|
101
|
+
skip = options.fetch(:skip, skip)
|
102
|
+
# there could be a limit parameter defined for filtering -> used for an overall limit (not a page limit, which was introduced for memory optimization)
|
103
|
+
overall_limit = options.fetch(:limit, overall_limit)
|
104
|
+
end
|
62
105
|
|
63
|
-
|
106
|
+
overall_limit_reached = false
|
107
|
+
|
108
|
+
loop do
|
109
|
+
found_in_page = 0
|
110
|
+
|
111
|
+
Utility::Logger.info("Requesting #{PAGE_SIZE} documents from MongoDB (Starting at #{skip})")
|
112
|
+
view = cursor.skip(skip).limit(PAGE_SIZE)
|
113
|
+
view.each do |document|
|
114
|
+
yield serialize(document)
|
115
|
+
|
116
|
+
found_in_page += 1
|
117
|
+
found_overall += 1
|
118
|
+
|
119
|
+
overall_limit_reached = found_overall >= overall_limit && overall_limit != Float::INFINITY
|
120
|
+
|
121
|
+
break if overall_limit_reached
|
122
|
+
end
|
123
|
+
|
124
|
+
page_was_empty = found_in_page == 0
|
125
|
+
|
126
|
+
break if page_was_empty || overall_limit_reached
|
127
|
+
|
128
|
+
skip += PAGE_SIZE
|
64
129
|
end
|
65
130
|
end
|
66
131
|
end
|
67
132
|
|
68
133
|
private
|
69
134
|
|
135
|
+
def create_db_cursor_on_collection(collection)
|
136
|
+
return create_find_cursor(collection) if @advanced_filter_config[:find].present?
|
137
|
+
|
138
|
+
return create_aggregate_cursor(collection) if @advanced_filter_config[:aggregate].present?
|
139
|
+
|
140
|
+
return create_simple_rules_cursor(collection) if @rules.present?
|
141
|
+
|
142
|
+
collection.find
|
143
|
+
end
|
144
|
+
|
145
|
+
def create_aggregate_cursor(collection)
|
146
|
+
aggregate = @advanced_filter_config[:aggregate]
|
147
|
+
|
148
|
+
pipeline = aggregate[:pipeline]
|
149
|
+
options = extract_options(aggregate)
|
150
|
+
|
151
|
+
if !pipeline.nil? && pipeline.empty? && !options.present?
|
152
|
+
Utility::Logger.warn('\'Aggregate\' was specified with an empty pipeline and empty options.')
|
153
|
+
end
|
154
|
+
|
155
|
+
[collection.aggregate(pipeline, options), options]
|
156
|
+
end
|
157
|
+
|
158
|
+
def create_find_cursor(collection)
|
159
|
+
find = @advanced_filter_config[:find]
|
160
|
+
|
161
|
+
filter = find[:filter]
|
162
|
+
options = extract_options(find)
|
163
|
+
|
164
|
+
if !filter.nil? && filter.empty? && !options.present?
|
165
|
+
Utility::Logger.warn('\'Find\' was specified with an empty filter and empty options.')
|
166
|
+
end
|
167
|
+
|
168
|
+
[collection.find(filter, options), options]
|
169
|
+
end
|
170
|
+
|
171
|
+
def create_simple_rules_cursor(collection)
|
172
|
+
filter = {}
|
173
|
+
if @rules.present?
|
174
|
+
parser = MongoRulesParser.new(@rules)
|
175
|
+
filter = parser.parse
|
176
|
+
end
|
177
|
+
Utility::Logger.info("Filtering with simple rules filter: #{filter}")
|
178
|
+
filter.present? ? collection.find(filter) : collection.find
|
179
|
+
end
|
180
|
+
|
181
|
+
def extract_options(mongodb_function)
|
182
|
+
mongodb_function[:options].present? ? mongodb_function[:options] : {}
|
183
|
+
end
|
184
|
+
|
70
185
|
def do_health_check
|
71
186
|
with_client do |_client|
|
72
187
|
Utility::Logger.debug("Mongo at #{@host}/#{@database} looks healthy.")
|
@@ -76,34 +191,43 @@ module Connectors
|
|
76
191
|
def with_client
|
77
192
|
raise "Invalid value for 'Direct connection' : #{@direct_connection}." unless %w[true false].include?(@direct_connection.to_s.strip.downcase)
|
78
193
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
Utility::Logger.debug(
|
98
|
-
|
99
|
-
client.collections.each { |coll| Utility::Logger.debug(coll.name) }
|
194
|
+
args = {
|
195
|
+
database: @database,
|
196
|
+
direct_connection: to_boolean(@direct_connection)
|
197
|
+
}
|
198
|
+
|
199
|
+
if @user.present? || @password.present?
|
200
|
+
args[:user] = @user
|
201
|
+
args[:password] = @password
|
202
|
+
end
|
203
|
+
|
204
|
+
Mongo::Client.new(@host, args) do |client|
|
205
|
+
databases = client.database_names
|
206
|
+
|
207
|
+
Utility::Logger.debug("Existing Databases: #{databases}")
|
208
|
+
check_database_exists!(databases, @database)
|
209
|
+
|
210
|
+
collections = client.database.collection_names
|
211
|
+
|
212
|
+
Utility::Logger.debug("Existing Collections: #{collections}")
|
213
|
+
check_collection_exists!(collections, @database, @collection)
|
100
214
|
|
101
215
|
yield client
|
102
|
-
ensure
|
103
|
-
client.close
|
104
216
|
end
|
105
217
|
end
|
106
218
|
|
219
|
+
def check_database_exists!(databases, database)
|
220
|
+
return if databases.include?(database)
|
221
|
+
|
222
|
+
raise "Database (#{database}) does not exist. Existing databases: #{databases.join(', ')}"
|
223
|
+
end
|
224
|
+
|
225
|
+
def check_collection_exists!(collections, database, collection)
|
226
|
+
return if collections.include?(collection)
|
227
|
+
|
228
|
+
raise "Collection (#{collection}) does not exist within database '#{database}'. Existing collections: #{collections.join(', ')}"
|
229
|
+
end
|
230
|
+
|
107
231
|
def serialize(mongodb_document)
|
108
232
|
# This is some lazy serialization here.
|
109
233
|
# Problem: MongoDB has its own format of things - e.g. ids are Bson::ObjectId, which when serialized to JSON
|
@@ -120,11 +244,10 @@ module Connectors
|
|
120
244
|
mongodb_document.map { |v| serialize(v) }
|
121
245
|
when Hash
|
122
246
|
mongodb_document.map do |key, value|
|
123
|
-
|
124
|
-
|
247
|
+
key = 'id' if key == '_id'
|
125
248
|
remapped_value = serialize(value)
|
126
|
-
[
|
127
|
-
end.to_h
|
249
|
+
[key, remapped_value]
|
250
|
+
end.to_h
|
128
251
|
else
|
129
252
|
mongodb_document
|
130
253
|
end
|