connectors_service 8.6.0.3 → 8.6.0.4.pre.20221114T233727Z
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/config/connectors.yml +9 -10
- data/lib/app/config.rb +2 -0
- data/lib/app/dispatcher.rb +17 -1
- data/lib/app/preflight_check.rb +15 -0
- data/lib/connectors/base/connector.rb +37 -4
- data/lib/connectors/base/simple_rules_parser.rb +42 -0
- data/lib/connectors/connector_status.rb +4 -4
- data/lib/connectors/example/{example_attachments → attachments}/first_attachment.txt +0 -0
- data/lib/connectors/example/{example_attachments → attachments}/second_attachment.txt +0 -0
- data/lib/connectors/example/{example_attachments → attachments}/third_attachment.txt +0 -0
- data/lib/connectors/example/connector.rb +43 -4
- data/lib/connectors/gitlab/connector.rb +16 -2
- data/lib/connectors/mongodb/connector.rb +173 -50
- data/lib/connectors/mongodb/mongo_rules_parser.rb +81 -0
- data/lib/connectors/registry.rb +2 -2
- data/lib/connectors/sync_status.rb +23 -4
- data/lib/core/configuration.rb +4 -2
- data/lib/core/connector_job.rb +137 -0
- data/lib/core/connector_settings.rb +29 -18
- data/lib/core/elastic_connector_actions.rb +331 -32
- data/lib/core/filtering/post_process_engine.rb +39 -0
- data/lib/core/filtering/post_process_result.rb +27 -0
- data/lib/core/filtering/simple_rule.rb +141 -0
- data/lib/core/filtering/validation_job_runner.rb +53 -0
- data/lib/{connectors_app/// → core/filtering/validation_status.rb} +9 -5
- data/lib/core/filtering.rb +17 -0
- data/lib/core/ingestion/es_sink.rb +59 -0
- data/lib/core/ingestion/ingester.rb +90 -0
- data/lib/core/{output_sink.rb → ingestion.rb} +2 -5
- data/lib/core/native_scheduler.rb +3 -0
- data/lib/core/scheduler.rb +43 -10
- data/lib/core/single_scheduler.rb +3 -0
- data/lib/core/sync_job_runner.rb +78 -18
- data/lib/core.rb +2 -0
- data/lib/utility/bulk_queue.rb +85 -0
- data/lib/utility/common.rb +20 -0
- data/lib/utility/constants.rb +2 -0
- data/lib/utility/errors.rb +5 -0
- data/lib/utility/es_client.rb +6 -2
- data/lib/utility/filtering.rb +22 -0
- data/lib/utility/logger.rb +2 -1
- data/lib/utility.rb +5 -3
- metadata +27 -18
- data/lib/core/output_sink/base_sink.rb +0 -33
- data/lib/core/output_sink/combined_sink.rb +0 -38
- data/lib/core/output_sink/console_sink.rb +0 -51
- data/lib/core/output_sink/es_sink.rb +0 -74
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8f69f05260d34b07ce34d569ce7c41fdd10349b33121823697ebbb6a4ebf9206
|
4
|
+
data.tar.gz: a2957118c80d0e2bc9ea6a8046307485c11e4f809efb01cabbb96e341dc947c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 63775eded9d9953b41950edd7ca86176200c4ae7510564f6f7995c336d6e78bbe40d494cb0a152a984b14b45055e7b7847a2779888d70905d4956b8c78d4bda1
|
7
|
+
data.tar.gz: 52b00d122ef43fc5afa0b4cb50bbe428111e7fe2cb7cee437dd2d2b6b32516cbd01c5803b0a11609a5021e85605a2e6bd2b30973807df3cae1420864e2fcb185
|
data/config/connectors.yml
CHANGED
@@ -1,11 +1,10 @@
|
|
1
1
|
# general metadata
|
2
|
-
version: 8.6.0.
|
3
|
-
repository:
|
4
|
-
revision:
|
2
|
+
version: 8.6.0.4-20221114T233727Z
|
3
|
+
repository: git@github.com:elastic/ent-search-connectors.git
|
4
|
+
revision: f506d5e5ebedfb0c6058d347d8ce22adc42e2cc0
|
5
5
|
elasticsearch:
|
6
|
-
cloud_id: CHANGEME
|
7
6
|
hosts: http://localhost:9200
|
8
|
-
api_key:
|
7
|
+
api_key: WXNYeWQ0UUJ4Y3ZQV3ctbjVibnU6REx4eE8tbFhUMU94N2JoU2hIeVFMQQ==
|
9
8
|
retry_on_failure: 3
|
10
9
|
request_timeout: 120
|
11
10
|
disable_warnings: true
|
@@ -15,11 +14,11 @@ thread_pool:
|
|
15
14
|
min_threads: 0
|
16
15
|
max_threads: 5
|
17
16
|
max_queue: 100
|
18
|
-
log_level:
|
19
|
-
ecs_logging:
|
17
|
+
log_level: debug
|
18
|
+
ecs_logging: false
|
20
19
|
poll_interval: 3
|
21
20
|
termination_timeout: 60
|
22
21
|
heartbeat_interval: 1800
|
23
|
-
native_mode:
|
24
|
-
connector_id:
|
25
|
-
service_type:
|
22
|
+
native_mode: false
|
23
|
+
connector_id: YcXyd4QBxcvPWw-n2bkA
|
24
|
+
service_type: mongodb
|
data/lib/app/config.rb
CHANGED
@@ -35,6 +35,8 @@ puts "Parsing #{CONFIG_FILE} configuration file."
|
|
35
35
|
optional(:disable_warnings).value(:bool?)
|
36
36
|
optional(:trace).value(:bool?)
|
37
37
|
optional(:log).value(:bool?)
|
38
|
+
optional(:ca_fingerprint).value(:string)
|
39
|
+
optional(:transport_options).value(:hash)
|
38
40
|
end
|
39
41
|
|
40
42
|
optional(:thread_pool).hash do
|
data/lib/app/dispatcher.rb
CHANGED
@@ -73,6 +73,8 @@ module App
|
|
73
73
|
start_heartbeat_task(connector_settings)
|
74
74
|
when :configuration
|
75
75
|
start_configuration_task(connector_settings)
|
76
|
+
when :filter_validation
|
77
|
+
start_filter_validation_task(connector_settings)
|
76
78
|
else
|
77
79
|
Utility::Logger.error("Unknown task type: #{task}. Skipping...")
|
78
80
|
end
|
@@ -84,10 +86,14 @@ module App
|
|
84
86
|
def start_sync_task(connector_settings)
|
85
87
|
start_heartbeat_task(connector_settings)
|
86
88
|
pool.post do
|
87
|
-
Utility::Logger.info("
|
89
|
+
Utility::Logger.info("Initiating a sync job for #{connector_settings.formatted}...")
|
88
90
|
Core::ElasticConnectorActions.ensure_content_index_exists(connector_settings.index_name)
|
89
91
|
job_runner = Core::SyncJobRunner.new(connector_settings)
|
90
92
|
job_runner.execute
|
93
|
+
rescue Core::JobAlreadyRunningError
|
94
|
+
Utility::Logger.info("Sync job for #{connector_settings.formatted} is already running, skipping.")
|
95
|
+
rescue Core::ConnectorVersionChangedError => e
|
96
|
+
Utility::Logger.info("Could not start the job because #{connector_settings.formatted} has been updated externally. Message: #{e.message}")
|
91
97
|
rescue StandardError => e
|
92
98
|
Utility::ExceptionTracking.log_exception(e, "Sync job for #{connector_settings.formatted} failed due to unexpected error.")
|
93
99
|
end
|
@@ -116,6 +122,16 @@ module App
|
|
116
122
|
Utility::ExceptionTracking.log_exception(e, "Configuration task for #{connector_settings.formatted} failed due to unexpected error.")
|
117
123
|
end
|
118
124
|
end
|
125
|
+
|
126
|
+
def start_filter_validation_task(connector_settings)
|
127
|
+
pool.post do
|
128
|
+
Utility::Logger.info("Validating filters for #{connector_settings.formatted}...")
|
129
|
+
validation_job_runner = Core::Filtering::ValidationJobRunner.new(connector_settings)
|
130
|
+
validation_job_runner.execute
|
131
|
+
rescue StandardError => e
|
132
|
+
Utility::ExceptionTracking.log_exception(e, "Filter validation task for #{connector_settings.formatted} failed due to unexpected error.")
|
133
|
+
end
|
134
|
+
end
|
119
135
|
end
|
120
136
|
end
|
121
137
|
end
|
data/lib/app/preflight_check.rb
CHANGED
@@ -23,6 +23,7 @@ module App
|
|
23
23
|
check_es_connection!
|
24
24
|
check_es_version!
|
25
25
|
check_system_indices!
|
26
|
+
check_single_connector!
|
26
27
|
end
|
27
28
|
|
28
29
|
private
|
@@ -59,6 +60,16 @@ module App
|
|
59
60
|
)
|
60
61
|
end
|
61
62
|
|
63
|
+
#-------------------------------------------------------------------------------------------------
|
64
|
+
# Ensures the connector is supported when running in non-native mode
|
65
|
+
def check_single_connector!
|
66
|
+
if App::Config.native_mode
|
67
|
+
Utility::Logger.info('Skip single connector check for native mode.')
|
68
|
+
elsif !Connectors::REGISTRY.registered?(App::Config.service_type)
|
69
|
+
fail_check!("The service type #{App::Config.service_type} is not supported. Terminating...")
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
62
73
|
def check_es_connection_with_retries!(retry_interval:, retry_timeout:)
|
63
74
|
started_at = Time.now
|
64
75
|
|
@@ -75,6 +86,10 @@ module App
|
|
75
86
|
else
|
76
87
|
raise UnhealthyCluster, "Unexpected cluster status: #{response['status']}"
|
77
88
|
end
|
89
|
+
rescue *Utility::AUTHORIZATION_ERRORS => e
|
90
|
+
Utility::ExceptionTracking.log_exception(e)
|
91
|
+
|
92
|
+
fail_check!("Elasticsearch returned 'Unauthorized' response. Check your authentication details. Terminating...")
|
78
93
|
rescue *App::RETRYABLE_CONNECTION_ERRORS => e
|
79
94
|
Utility::Logger.warn('Could not connect to Elasticsearch. Make sure it is running and healthy.')
|
80
95
|
Utility::Logger.debug("Error: #{e.full_message}")
|
@@ -7,10 +7,11 @@
|
|
7
7
|
# frozen_string_literal: true
|
8
8
|
|
9
9
|
require 'bson'
|
10
|
-
require 'core/
|
11
|
-
require 'utility
|
12
|
-
require 'utility/
|
10
|
+
require 'core/ingestion'
|
11
|
+
require 'utility'
|
12
|
+
require 'utility/filtering'
|
13
13
|
require 'app/config'
|
14
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
14
15
|
|
15
16
|
module Connectors
|
16
17
|
module Base
|
@@ -19,6 +20,11 @@ module Connectors
|
|
19
20
|
raise 'Not implemented for this connector'
|
20
21
|
end
|
21
22
|
|
23
|
+
# Used as a framework util method, don't override
|
24
|
+
def self.configurable_fields_indifferent_access
|
25
|
+
configurable_fields.with_indifferent_access
|
26
|
+
end
|
27
|
+
|
22
28
|
def self.configurable_fields
|
23
29
|
{}
|
24
30
|
end
|
@@ -27,8 +33,27 @@ module Connectors
|
|
27
33
|
raise 'Not implemented for this connector'
|
28
34
|
end
|
29
35
|
|
30
|
-
def
|
36
|
+
def self.kibana_features
|
37
|
+
[
|
38
|
+
Utility::Constants::FILTERING_RULES_FEATURE,
|
39
|
+
Utility::Constants::FILTERING_ADVANCED_FEATURE
|
40
|
+
]
|
41
|
+
end
|
42
|
+
|
43
|
+
def self.validate_filtering(_filtering = {})
|
44
|
+
raise 'Not implemented for this connector'
|
45
|
+
end
|
46
|
+
|
47
|
+
attr_reader :rules, :advanced_filter_config
|
48
|
+
|
49
|
+
def initialize(configuration: {}, job_description: {})
|
31
50
|
@configuration = configuration.dup || {}
|
51
|
+
@job_description = job_description&.dup || {}
|
52
|
+
|
53
|
+
filtering = Utility::Filtering.extract_filter(@job_description.dig(:connector, :filtering))
|
54
|
+
|
55
|
+
@rules = filtering[:rules] || []
|
56
|
+
@advanced_filter_config = filtering[:advanced_snippet] || {}
|
32
57
|
end
|
33
58
|
|
34
59
|
def yield_documents; end
|
@@ -52,6 +77,14 @@ module Connectors
|
|
52
77
|
Utility::ExceptionTracking.log_exception(e, "Connector for service #{self.class.service_type} failed the health check for 3rd-party service.")
|
53
78
|
false
|
54
79
|
end
|
80
|
+
|
81
|
+
def filtering_present?
|
82
|
+
@advanced_filter_config.present? && !@advanced_filter_config.empty? || @rules.present?
|
83
|
+
end
|
84
|
+
|
85
|
+
def metadata
|
86
|
+
{}
|
87
|
+
end
|
55
88
|
end
|
56
89
|
end
|
57
90
|
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
# frozen_string_literal: true
|
7
|
+
|
8
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
9
|
+
require 'active_support/core_ext/object/blank'
|
10
|
+
require 'core/filtering/simple_rule'
|
11
|
+
|
12
|
+
module Connectors
|
13
|
+
module Base
|
14
|
+
class SimpleRulesParser
|
15
|
+
def initialize(rules)
|
16
|
+
@rules = (rules || []).map(&:with_indifferent_access).filter { |r| r[:id] != 'DEFAULT' }.sort_by { |r| r[:order] }
|
17
|
+
end
|
18
|
+
|
19
|
+
def parse
|
20
|
+
merge_rules(@rules.map do |rule_hash|
|
21
|
+
rule = Core::Filtering::SimpleRule.new(rule_hash)
|
22
|
+
unless rule.is_include? || rule.is_exclude?
|
23
|
+
raise "Unknown policy: #{rule.policy}"
|
24
|
+
end
|
25
|
+
parse_rule(rule)
|
26
|
+
end)
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
# merge all rules into a filter object or array
|
32
|
+
# in a base case, does no transformations
|
33
|
+
def merge_rules(rules)
|
34
|
+
rules || []
|
35
|
+
end
|
36
|
+
|
37
|
+
def parse_rule(_rule)
|
38
|
+
raise 'Not implemented'
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -8,11 +8,11 @@
|
|
8
8
|
|
9
9
|
module Connectors
|
10
10
|
class ConnectorStatus
|
11
|
-
CREATED
|
11
|
+
CREATED = 'created'
|
12
12
|
NEEDS_CONFIGURATION = 'needs_configuration'
|
13
|
-
CONFIGURED
|
14
|
-
CONNECTED
|
15
|
-
ERROR
|
13
|
+
CONFIGURED = 'configured'
|
14
|
+
CONNECTED = 'connected'
|
15
|
+
ERROR = 'error'
|
16
16
|
|
17
17
|
STATUSES = [
|
18
18
|
CREATED,
|
File without changes
|
File without changes
|
File without changes
|
@@ -7,6 +7,7 @@
|
|
7
7
|
# frozen_string_literal: true
|
8
8
|
|
9
9
|
require 'connectors/base/connector'
|
10
|
+
require 'core/filtering/validation_status'
|
10
11
|
require 'utility'
|
11
12
|
|
12
13
|
module Connectors
|
@@ -20,16 +21,21 @@ module Connectors
|
|
20
21
|
'Example Connector'
|
21
22
|
end
|
22
23
|
|
24
|
+
# Field 'Foo' won't have a default value. Field 'Bar' will have the default value 'Value'.
|
23
25
|
def self.configurable_fields
|
24
26
|
{
|
25
27
|
'foo' => {
|
26
28
|
'label' => 'Foo',
|
27
29
|
'value' => nil
|
30
|
+
},
|
31
|
+
:bar => {
|
32
|
+
:label => 'Bar',
|
33
|
+
:value => 'Value'
|
28
34
|
}
|
29
35
|
}
|
30
36
|
end
|
31
37
|
|
32
|
-
def initialize(configuration: {})
|
38
|
+
def initialize(configuration: {}, job_description: {})
|
33
39
|
super
|
34
40
|
end
|
35
41
|
|
@@ -40,18 +46,51 @@ module Connectors
|
|
40
46
|
# raise 'something went wrong'
|
41
47
|
end
|
42
48
|
|
49
|
+
def self.validate_filtering(filtering = {})
|
50
|
+
# TODO: real filtering validation will follow later
|
51
|
+
errors = [
|
52
|
+
{
|
53
|
+
:ids => ['missing-implementation'],
|
54
|
+
:messages => ['Filtering is not implemented yet for the example connector']
|
55
|
+
}
|
56
|
+
]
|
57
|
+
|
58
|
+
return { :state => Core::Filtering::ValidationStatus::INVALID, :errors => errors } if filtering.present?
|
59
|
+
|
60
|
+
{ :state => Core::Filtering::ValidationStatus::VALID, :errors => [] }
|
61
|
+
end
|
62
|
+
|
43
63
|
def yield_documents
|
44
64
|
attachments = [
|
45
|
-
|
46
|
-
|
47
|
-
|
65
|
+
load_attachment('first_attachment.txt'),
|
66
|
+
load_attachment('second_attachment.txt'),
|
67
|
+
load_attachment('third_attachment.txt'),
|
48
68
|
]
|
49
69
|
|
50
70
|
attachments.each_with_index do |att, index|
|
51
71
|
data = { id: (index + 1).to_s, name: "example document #{index + 1}", _attachment: File.read(att) }
|
72
|
+
|
73
|
+
# Uncomment one of these two lines to simulate longer running sync jobs
|
74
|
+
#
|
75
|
+
# sleep(rand(10..60).seconds)
|
76
|
+
# sleep(rand(1..10).minutes)
|
77
|
+
|
52
78
|
yield data
|
53
79
|
end
|
54
80
|
end
|
81
|
+
|
82
|
+
private
|
83
|
+
|
84
|
+
def load_attachment(path)
|
85
|
+
attachment_dir = "#{File.dirname(__FILE__)}/attachments"
|
86
|
+
attachment_path = "#{attachment_dir}/#{path}"
|
87
|
+
|
88
|
+
unless File.exist?(attachment_path)
|
89
|
+
raise "Attachment at location '#{attachment_path}' doesn't exist. Attachments should be located under #{attachment_dir}"
|
90
|
+
end
|
91
|
+
|
92
|
+
File.open(attachment_path)
|
93
|
+
end
|
55
94
|
end
|
56
95
|
end
|
57
96
|
end
|
@@ -11,7 +11,7 @@ require 'connectors/base/connector'
|
|
11
11
|
require 'connectors/gitlab/extractor'
|
12
12
|
require 'connectors/gitlab/custom_client'
|
13
13
|
require 'connectors/gitlab/adapter'
|
14
|
-
require 'core/
|
14
|
+
require 'core/ingestion'
|
15
15
|
|
16
16
|
module Connectors
|
17
17
|
module GitLab
|
@@ -36,7 +36,21 @@ module Connectors
|
|
36
36
|
}
|
37
37
|
end
|
38
38
|
|
39
|
-
def
|
39
|
+
def self.validate_filtering(filtering = {})
|
40
|
+
# TODO: real filtering validation will follow later
|
41
|
+
errors = [
|
42
|
+
{
|
43
|
+
:ids => ['missing-implementation'],
|
44
|
+
:messages => ['Filtering is not implemented yet for the GitLab connector']
|
45
|
+
}
|
46
|
+
]
|
47
|
+
|
48
|
+
return { :state => Core::Filtering::ValidationStatus::INVALID, :errors => errors } if filtering.present?
|
49
|
+
|
50
|
+
{ :state => Core::Filtering::ValidationStatus::VALID, :errors => [] }
|
51
|
+
end
|
52
|
+
|
53
|
+
def initialize(configuration: {}, job_description: {})
|
40
54
|
super
|
41
55
|
|
42
56
|
@extractor = Connectors::GitLab::Extractor.new(
|
@@ -6,13 +6,20 @@
|
|
6
6
|
|
7
7
|
# frozen_string_literal: true
|
8
8
|
|
9
|
-
require 'active_support/core_ext/hash/indifferent_access'
|
10
9
|
require 'connectors/base/connector'
|
10
|
+
require 'core/filtering/validation_status'
|
11
|
+
require 'connectors/mongodb/mongo_rules_parser'
|
11
12
|
require 'mongo'
|
13
|
+
require 'utility'
|
12
14
|
|
13
15
|
module Connectors
|
14
16
|
module MongoDB
|
15
17
|
class Connector < Connectors::Base::Connector
|
18
|
+
|
19
|
+
ALLOWED_TOP_LEVEL_FILTER_KEYS = %w[find aggregate]
|
20
|
+
|
21
|
+
PAGE_SIZE = 100
|
22
|
+
|
16
23
|
def self.service_type
|
17
24
|
'mongodb'
|
18
25
|
end
|
@@ -23,28 +30,47 @@ module Connectors
|
|
23
30
|
|
24
31
|
def self.configurable_fields
|
25
32
|
{
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
33
|
+
:host => {
|
34
|
+
:label => 'Server Hostname'
|
35
|
+
},
|
36
|
+
:user => {
|
37
|
+
:label => 'Username'
|
38
|
+
},
|
39
|
+
:password => {
|
40
|
+
:label => 'Password'
|
41
|
+
},
|
42
|
+
:database => {
|
43
|
+
:label => 'Database'
|
44
|
+
},
|
45
|
+
:collection => {
|
46
|
+
:label => 'Collection'
|
47
|
+
},
|
48
|
+
:direct_connection => {
|
49
|
+
:label => 'Direct connection? (true/false)'
|
50
|
+
}
|
44
51
|
}
|
45
52
|
end
|
46
53
|
|
47
|
-
def
|
54
|
+
def self.validate_filtering(filtering = {})
|
55
|
+
valid_filtering = { :state => Core::Filtering::ValidationStatus::VALID, :errors => [] }
|
56
|
+
|
57
|
+
return valid_filtering unless filtering.present?
|
58
|
+
|
59
|
+
filter = Utility::Filtering.extract_filter(filtering)
|
60
|
+
|
61
|
+
advanced_filter_config = filter[:advanced_snippet] || {}
|
62
|
+
filter_keys = advanced_filter_config&.keys
|
63
|
+
|
64
|
+
if !filter_keys&.empty? && (filter_keys.size != 1 || !ALLOWED_TOP_LEVEL_FILTER_KEYS.include?(filter_keys[0]&.to_s))
|
65
|
+
return { :state => Core::Filtering::ValidationStatus::INVALID,
|
66
|
+
:errors => [{ :ids => ['wrong-keys'],
|
67
|
+
:messages => ["Only one of #{ALLOWED_TOP_LEVEL_FILTER_KEYS} is allowed in the filtering object. Keys present: '#{filter_keys}'."] }] }
|
68
|
+
end
|
69
|
+
|
70
|
+
valid_filtering
|
71
|
+
end
|
72
|
+
|
73
|
+
def initialize(configuration: {}, job_description: {})
|
48
74
|
super
|
49
75
|
|
50
76
|
@host = configuration.dig(:host, :value)
|
@@ -57,16 +83,105 @@ module Connectors
|
|
57
83
|
|
58
84
|
def yield_documents
|
59
85
|
with_client do |client|
|
60
|
-
|
61
|
-
|
86
|
+
# We do paging using skip().limit() here to make Ruby recycle the memory for each page pulled from the server after it's not needed any more.
|
87
|
+
# This gives us more control on the usage of the memory (we can adjust PAGE_SIZE constant for that to decrease max memory consumption).
|
88
|
+
# It's done due to the fact that usage of .find.each leads to memory leaks or overuse of memory - the whole result set seems to stay in memory
|
89
|
+
# during the sync. Sometimes (not 100% sure) it even leads to a real leak, when the memory for these objects is never recycled.
|
90
|
+
cursor, options = create_db_cursor_on_collection(client[@collection])
|
91
|
+
skip = 0
|
92
|
+
|
93
|
+
found_overall = 0
|
94
|
+
|
95
|
+
# if no overall limit is specified by filtering, use Float::INFINITY as a sentinel so ingestion is never cut short (found_overall is a finite counter,
|
96
|
+
# thus can never reach it)
|
97
|
+
overall_limit = Float::INFINITY
|
98
|
+
|
99
|
+
if options.present?
|
100
|
+
# there could be a skip parameter defined for filtering
|
101
|
+
skip = options.fetch(:skip, skip)
|
102
|
+
# there could be a limit parameter defined for filtering -> used for an overall limit (not a page limit, which was introduced for memory optimization)
|
103
|
+
overall_limit = options.fetch(:limit, overall_limit)
|
104
|
+
end
|
62
105
|
|
63
|
-
|
106
|
+
overall_limit_reached = false
|
107
|
+
|
108
|
+
loop do
|
109
|
+
found_in_page = 0
|
110
|
+
|
111
|
+
Utility::Logger.info("Requesting #{PAGE_SIZE} documents from MongoDB (Starting at #{skip})")
|
112
|
+
view = cursor.skip(skip).limit(PAGE_SIZE)
|
113
|
+
view.each do |document|
|
114
|
+
yield serialize(document)
|
115
|
+
|
116
|
+
found_in_page += 1
|
117
|
+
found_overall += 1
|
118
|
+
|
119
|
+
overall_limit_reached = found_overall >= overall_limit && overall_limit != Float::INFINITY
|
120
|
+
|
121
|
+
break if overall_limit_reached
|
122
|
+
end
|
123
|
+
|
124
|
+
page_was_empty = found_in_page == 0
|
125
|
+
|
126
|
+
break if page_was_empty || overall_limit_reached
|
127
|
+
|
128
|
+
skip += PAGE_SIZE
|
64
129
|
end
|
65
130
|
end
|
66
131
|
end
|
67
132
|
|
68
133
|
private
|
69
134
|
|
135
|
+
def create_db_cursor_on_collection(collection)
|
136
|
+
return create_find_cursor(collection) if @advanced_filter_config[:find].present?
|
137
|
+
|
138
|
+
return create_aggregate_cursor(collection) if @advanced_filter_config[:aggregate].present?
|
139
|
+
|
140
|
+
return create_simple_rules_cursor(collection) if @rules.present?
|
141
|
+
|
142
|
+
collection.find
|
143
|
+
end
|
144
|
+
|
145
|
+
def create_aggregate_cursor(collection)
|
146
|
+
aggregate = @advanced_filter_config[:aggregate]
|
147
|
+
|
148
|
+
pipeline = aggregate[:pipeline]
|
149
|
+
options = extract_options(aggregate)
|
150
|
+
|
151
|
+
if !pipeline.nil? && pipeline.empty? && !options.present?
|
152
|
+
Utility::Logger.warn('\'Aggregate\' was specified with an empty pipeline and empty options.')
|
153
|
+
end
|
154
|
+
|
155
|
+
[collection.aggregate(pipeline, options), options]
|
156
|
+
end
|
157
|
+
|
158
|
+
def create_find_cursor(collection)
|
159
|
+
find = @advanced_filter_config[:find]
|
160
|
+
|
161
|
+
filter = find[:filter]
|
162
|
+
options = extract_options(find)
|
163
|
+
|
164
|
+
if !filter.nil? && filter.empty? && !options.present?
|
165
|
+
Utility::Logger.warn('\'Find\' was specified with an empty filter and empty options.')
|
166
|
+
end
|
167
|
+
|
168
|
+
[collection.find(filter, options), options]
|
169
|
+
end
|
170
|
+
|
171
|
+
def create_simple_rules_cursor(collection)
|
172
|
+
filter = {}
|
173
|
+
if @rules.present?
|
174
|
+
parser = MongoRulesParser.new(@rules)
|
175
|
+
filter = parser.parse
|
176
|
+
end
|
177
|
+
Utility::Logger.info("Filtering with simple rules filter: #{filter}")
|
178
|
+
filter.present? ? collection.find(filter) : collection.find
|
179
|
+
end
|
180
|
+
|
181
|
+
def extract_options(mongodb_function)
|
182
|
+
mongodb_function[:options].present? ? mongodb_function[:options] : {}
|
183
|
+
end
|
184
|
+
|
70
185
|
def do_health_check
|
71
186
|
with_client do |_client|
|
72
187
|
Utility::Logger.debug("Mongo at #{@host}/#{@database} looks healthy.")
|
@@ -76,34 +191,43 @@ module Connectors
|
|
76
191
|
def with_client
|
77
192
|
raise "Invalid value for 'Direct connection' : #{@direct_connection}." unless %w[true false].include?(@direct_connection.to_s.strip.downcase)
|
78
193
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
Utility::Logger.debug(
|
98
|
-
|
99
|
-
client.collections.each { |coll| Utility::Logger.debug(coll.name) }
|
194
|
+
args = {
|
195
|
+
database: @database,
|
196
|
+
direct_connection: to_boolean(@direct_connection)
|
197
|
+
}
|
198
|
+
|
199
|
+
if @user.present? || @password.present?
|
200
|
+
args[:user] = @user
|
201
|
+
args[:password] = @password
|
202
|
+
end
|
203
|
+
|
204
|
+
Mongo::Client.new(@host, args) do |client|
|
205
|
+
databases = client.database_names
|
206
|
+
|
207
|
+
Utility::Logger.debug("Existing Databases: #{databases}")
|
208
|
+
check_database_exists!(databases, @database)
|
209
|
+
|
210
|
+
collections = client.database.collection_names
|
211
|
+
|
212
|
+
Utility::Logger.debug("Existing Collections: #{collections}")
|
213
|
+
check_collection_exists!(collections, @database, @collection)
|
100
214
|
|
101
215
|
yield client
|
102
|
-
ensure
|
103
|
-
client.close
|
104
216
|
end
|
105
217
|
end
|
106
218
|
|
219
|
+
def check_database_exists!(databases, database)
|
220
|
+
return if databases.include?(database)
|
221
|
+
|
222
|
+
raise "Database (#{database}) does not exist. Existing databases: #{databases.join(', ')}"
|
223
|
+
end
|
224
|
+
|
225
|
+
def check_collection_exists!(collections, database, collection)
|
226
|
+
return if collections.include?(collection)
|
227
|
+
|
228
|
+
raise "Collection (#{collection}) does not exist within database '#{database}'. Existing collections: #{collections.join(', ')}"
|
229
|
+
end
|
230
|
+
|
107
231
|
def serialize(mongodb_document)
|
108
232
|
# This is some lazy serialization here.
|
109
233
|
# Problem: MongoDB has its own format of things - e.g. ids are Bson::ObjectId, which when serialized to JSON
|
@@ -120,11 +244,10 @@ module Connectors
|
|
120
244
|
mongodb_document.map { |v| serialize(v) }
|
121
245
|
when Hash
|
122
246
|
mongodb_document.map do |key, value|
|
123
|
-
|
124
|
-
|
247
|
+
key = 'id' if key == '_id'
|
125
248
|
remapped_value = serialize(value)
|
126
|
-
[
|
127
|
-
end.to_h
|
249
|
+
[key, remapped_value]
|
250
|
+
end.to_h
|
128
251
|
else
|
129
252
|
mongodb_document
|
130
253
|
end
|