connectors_service 8.5.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +93 -0
- data/NOTICE.txt +2 -0
- data/bin/connectors_service +4 -0
- data/bin/list_connectors +4 -0
- data/config/connectors.yml +25 -0
- data/lib/app/app.rb +25 -0
- data/lib/app/config.rb +132 -0
- data/lib/app/console_app.rb +278 -0
- data/lib/app/dispatcher.rb +121 -0
- data/lib/app/menu.rb +104 -0
- data/lib/app/preflight_check.rb +134 -0
- data/lib/app/version.rb +10 -0
- data/lib/connectors/base/adapter.rb +119 -0
- data/lib/connectors/base/connector.rb +57 -0
- data/lib/connectors/base/custom_client.rb +111 -0
- data/lib/connectors/connector_status.rb +31 -0
- data/lib/connectors/crawler/scheduler.rb +32 -0
- data/lib/connectors/example/connector.rb +57 -0
- data/lib/connectors/example/example_attachments/first_attachment.txt +1 -0
- data/lib/connectors/example/example_attachments/second_attachment.txt +1 -0
- data/lib/connectors/example/example_attachments/third_attachment.txt +1 -0
- data/lib/connectors/gitlab/adapter.rb +50 -0
- data/lib/connectors/gitlab/connector.rb +67 -0
- data/lib/connectors/gitlab/custom_client.rb +44 -0
- data/lib/connectors/gitlab/extractor.rb +69 -0
- data/lib/connectors/mongodb/connector.rb +138 -0
- data/lib/connectors/registry.rb +52 -0
- data/lib/connectors/sync_status.rb +21 -0
- data/lib/connectors.rb +16 -0
- data/lib/connectors_app/// +13 -0
- data/lib/connectors_service.rb +24 -0
- data/lib/connectors_utility.rb +16 -0
- data/lib/core/configuration.rb +48 -0
- data/lib/core/connector_settings.rb +142 -0
- data/lib/core/elastic_connector_actions.rb +269 -0
- data/lib/core/heartbeat.rb +32 -0
- data/lib/core/native_scheduler.rb +24 -0
- data/lib/core/output_sink/base_sink.rb +33 -0
- data/lib/core/output_sink/combined_sink.rb +38 -0
- data/lib/core/output_sink/console_sink.rb +51 -0
- data/lib/core/output_sink/es_sink.rb +74 -0
- data/lib/core/output_sink.rb +13 -0
- data/lib/core/scheduler.rb +158 -0
- data/lib/core/single_scheduler.rb +29 -0
- data/lib/core/sync_job_runner.rb +111 -0
- data/lib/core.rb +16 -0
- data/lib/list_connectors.rb +22 -0
- data/lib/stubs/app_config.rb +35 -0
- data/lib/stubs/connectors/stats.rb +35 -0
- data/lib/stubs/service_type.rb +13 -0
- data/lib/utility/constants.rb +20 -0
- data/lib/utility/cron.rb +81 -0
- data/lib/utility/elasticsearch/index/language_data.yml +111 -0
- data/lib/utility/elasticsearch/index/mappings.rb +104 -0
- data/lib/utility/elasticsearch/index/text_analysis_settings.rb +226 -0
- data/lib/utility/environment.rb +33 -0
- data/lib/utility/errors.rb +132 -0
- data/lib/utility/es_client.rb +84 -0
- data/lib/utility/exception_tracking.rb +64 -0
- data/lib/utility/extension_mapping_util.rb +123 -0
- data/lib/utility/logger.rb +84 -0
- data/lib/utility/middleware/basic_auth.rb +27 -0
- data/lib/utility/middleware/bearer_auth.rb +27 -0
- data/lib/utility/middleware/restrict_hostnames.rb +73 -0
- data/lib/utility.rb +16 -0
- metadata +487 -0
@@ -0,0 +1,158 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'time'
|
10
|
+
require 'fugit'
|
11
|
+
require 'core/connector_settings'
|
12
|
+
require 'utility/cron'
|
13
|
+
require 'utility/logger'
|
14
|
+
require 'utility/exception_tracking'
|
15
|
+
|
16
|
+
module Core
  # Base polling scheduler. Subclasses supply the connectors to inspect by
  # overriding #connector_settings; #when_triggered then yields each connector
  # together with the task (:sync, :heartbeat or :configuration) that is due.
  class Scheduler
    # @param poll_interval [Numeric] seconds to sleep between polling passes;
    #   no sleep happens when the value is not positive
    # @param heartbeat_interval [Numeric] seconds added to a connector's
    #   last_seen timestamp to decide whether a heartbeat is due
    def initialize(poll_interval, heartbeat_interval)
      @poll_interval = poll_interval
      @heartbeat_interval = heartbeat_interval
      @is_shutting_down = false
    end

    # Connectors to inspect on every polling pass. Must be overridden.
    #
    # @raise [RuntimeError] always, in this base class
    def connector_settings
      raise 'Not implemented'
    end

    # Polls #connector_settings in a loop until #shutdown is called, yielding
    # (settings, task) for every task that is due. Any StandardError raised
    # during a pass (including from the caller's block) is logged and the loop
    # carries on with the next pass.
    #
    # @yieldparam cs [Object] one element of #connector_settings
    # @yieldparam task [Symbol] :sync, :heartbeat or :configuration
    def when_triggered
      loop do
        connector_settings.each do |cs|
          yield cs, :sync if sync_triggered?(cs)
          yield cs, :heartbeat if heartbeat_triggered?(cs)
          yield cs, :configuration if configuration_triggered?(cs)
        end
        break if @is_shutting_down
      rescue StandardError => e
        Utility::ExceptionTracking.log_exception(e, 'Sync failed due to unexpected error.')
      ensure
        # Runs after every pass, including the one that breaks out of the loop,
        # hence the extra shutdown check before sleeping.
        if @poll_interval > 0 && !@is_shutting_down
          Utility::Logger.info("Sleeping for #{@poll_interval} seconds in #{self.class}.")
          sleep(@poll_interval)
        end
      end
    end

    # Asks the polling loop to stop once the current pass has finished.
    def shutdown
      Utility::Logger.info("Shutting down scheduler #{self.class.name}.")
      @is_shutting_down = true
    end

    private

    # Decides whether a sync is due for the given connector.
    #
    # @return [Boolean]
    def sync_triggered?(connector_settings)
      return false unless connector_registered?(connector_settings.service_type)

      unless connector_settings.valid_index_name?
        Utility::Logger.info("The index name of #{connector_settings.formatted} is invalid.")
        return false
      end

      unless connector_settings.connector_status_allows_sync?
        Utility::Logger.info("#{connector_settings.formatted.capitalize} is in status \"#{connector_settings.connector_status}\" and won't sync yet. Connector needs to be in one of the following statuses: #{Connectors::ConnectorStatus::STATUSES_ALLOWING_SYNC} to run.")

        return false
      end

      # Sync when sync_now flag is true for the connector
      if connector_settings[:sync_now] == true
        Utility::Logger.info("#{connector_settings.formatted.capitalize} is manually triggered to sync now.")
        return true
      end

      # Don't sync if sync is explicitly disabled
      scheduling_settings = connector_settings.scheduling_settings
      unless scheduling_settings.present? && scheduling_settings[:enabled] == true
        Utility::Logger.info("#{connector_settings.formatted.capitalize} scheduling is disabled.")
        return false
      end

      # We want to sync when sync never actually happened
      last_synced = connector_settings[:last_synced]
      if last_synced.nil? || last_synced.empty?
        Utility::Logger.info("#{connector_settings.formatted.capitalize} has never synced yet, running initial sync.")
        return true
      end

      current_schedule = scheduling_settings[:interval]

      # Don't sync if there is no actual scheduling interval
      if current_schedule.nil? || current_schedule.empty?
        Utility::Logger.warn("No sync schedule configured for #{connector_settings.formatted}.")
        return false
      end

      current_schedule = begin
        Utility::Cron.quartz_to_crontab(current_schedule)
      rescue StandardError => e
        Utility::ExceptionTracking.log_exception(e, "Unable to convert quartz (#{current_schedule}) to crontab.")
        return false
      end
      cron_parser = Fugit::Cron.parse(current_schedule)

      # Don't sync if the scheduling interval is non-parsable
      unless cron_parser
        Utility::Logger.error("Unable to parse sync schedule for #{connector_settings.formatted}: expression #{current_schedule} is not a valid Quartz Cron definition.")
        return false
      end

      # A malformed timestamp must not raise out of this method: the exception
      # would abort the whole polling pass and skip the remaining checks for
      # this connector and for every connector after it.
      last_synced_at = begin
        Time.parse(last_synced)
      rescue StandardError
        Utility::Logger.warn("Unable to parse last_synced #{last_synced} for #{connector_settings.formatted}.")
        return false
      end

      next_trigger_time = cron_parser.next_time(last_synced_at)

      # Sync if next trigger for the connector is in past
      if next_trigger_time < Time.now
        Utility::Logger.info("#{connector_settings.formatted.capitalize} sync is triggered by cron schedule #{current_schedule}.")
        return true
      end

      false
    end

    # Decides whether a heartbeat is due: true when last_seen is missing or
    # unparsable, or when it is older than the heartbeat interval.
    #
    # @return [Boolean]
    def heartbeat_triggered?(connector_settings)
      return false unless connector_registered?(connector_settings.service_type)

      last_seen = connector_settings[:last_seen]
      return true if last_seen.nil? || last_seen.empty?
      last_seen = begin
        Time.parse(last_seen)
      rescue StandardError
        Utility::Logger.warn("Unable to parse last_seen #{last_seen}")
        nil
      end
      return true unless last_seen
      last_seen + @heartbeat_interval < Time.now
    end

    # A configuration task is due while the connector is still in CREATED status.
    #
    # @return [Boolean]
    def configuration_triggered?(connector_settings)
      return false unless connector_registered?(connector_settings.service_type)

      connector_settings.connector_status == Connectors::ConnectorStatus::CREATED
    end

    # Checks the registry for the service type, logging when it is unknown.
    #
    # @return [Boolean]
    def connector_registered?(service_type)
      if Connectors::REGISTRY.registered?(service_type)
        true
      else
        Utility::Logger.info("The service type (#{service_type}) is not supported.")
        false
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'core/scheduler'
|
10
|
+
require 'core/connector_settings'
|
11
|
+
require 'utility/logger'
|
12
|
+
require 'utility/exception_tracking'
|
13
|
+
|
14
|
+
module Core
  # Scheduler that watches exactly one connector, identified by its id.
  class SingleScheduler < Core::Scheduler
    # @param connector_id [Object] id handed to Core::ConnectorSettings.fetch_by_id
    # @param poll_interval [Numeric] forwarded to Core::Scheduler
    # @param heartbeat_interval [Numeric] forwarded to Core::Scheduler
    def initialize(connector_id, poll_interval, heartbeat_interval)
      super(poll_interval, heartbeat_interval)
      @connector_id = connector_id
    end

    # Fetches the settings of the watched connector.
    #
    # @return [Array] a one-element array with the fetched settings, or an
    #   empty array when the lookup raised (the error is logged)
    def connector_settings
      [Core::ConnectorSettings.fetch_by_id(@connector_id)]
    rescue StandardError => e
      Utility::ExceptionTracking.log_exception(e, "Could not retrieve the connector by id #{@connector_id} due to unexpected error.")
      []
    end
  end
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'connectors/connector_status'
|
10
|
+
require 'connectors/registry'
|
11
|
+
require 'core/output_sink'
|
12
|
+
require 'utility'
|
13
|
+
|
14
|
+
module Core
  # Raised when the configuration stored for a connector does not have exactly
  # the configurable fields its connector class declares.
  class IncompatibleConfigurableFieldsError < StandardError
    def initialize(service_type, expected_fields, actual_fields)
      super("Connector of service_type '#{service_type}' expected configurable fields: #{expected_fields}, actual stored fields: #{actual_fields}")
    end
  end

  # Runs one sync job for a connector: claims the job, ingests every document
  # the connector yields, deletes indexed documents that were not yielded, and
  # reports the outcome.
  class SyncJobRunner
    # @param connector_settings [Object] settings of the connector to sync;
    #   read for id, index_name, request_pipeline, service_type, configuration
    #   and the ingest-metadata predicates
    def initialize(connector_settings)
      @connector_settings = connector_settings
      @sink = Core::OutputSink::EsSink.new(connector_settings.index_name, @connector_settings.request_pipeline)
      @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
      @connector_instance = Connectors::REGISTRY.connector(connector_settings.service_type, connector_settings.configuration)
      @status = {
        :indexed_document_count => 0,
        :deleted_document_count => 0,
        :error => nil
      }
    end

    # Validates the stored configuration, then performs the sync.
    #
    # @raise [IncompatibleConfigurableFieldsError] when the stored fields differ
    #   from the fields declared by the connector class
    def execute
      validate_configuration!
      do_sync!
    end

    private

    # Claims the job and runs the sync. Errors raised while syncing are
    # recorded in the status, logged and written to the connector status;
    # the job is completed in every case once it has been claimed.
    def do_sync!
      Utility::Logger.info("Starting sync for connector #{@connector_settings.id}.")

      job_id = ElasticConnectorActions.claim_job(@connector_settings.id)

      unless job_id.present?
        Utility::Logger.error("Failed to claim the job for #{@connector_settings.id}. Please check the logs for the cause of this error.")
        return
      end

      begin
        Utility::Logger.debug("Successfully claimed job for connector #{@connector_settings.id}.")

        @connector_instance.do_health_check!

        incoming_ids = []
        existing_ids = ElasticConnectorActions.fetch_document_ids(@connector_settings.index_name)

        Utility::Logger.debug("#{existing_ids.size} documents are present in index #{@connector_settings.index_name}.")

        @connector_instance.yield_documents do |document|
          document = add_ingest_metadata(document)
          @sink.ingest(document)
          incoming_ids << document[:id]
          @status[:indexed_document_count] += 1
        end

        # Whatever is indexed but was not yielded by this run is stale.
        ids_to_delete = existing_ids - incoming_ids.uniq

        Utility::Logger.info("Deleting #{ids_to_delete.size} documents from index #{@connector_settings.index_name}.")

        ids_to_delete.each do |id|
          @sink.delete(id)
          @status[:deleted_document_count] += 1
        end

        @sink.flush
      rescue StandardError => e
        @status[:error] = e.message
        Utility::ExceptionTracking.log_exception(e)
        ElasticConnectorActions.update_connector_status(@connector_settings.id, Connectors::ConnectorStatus::ERROR, Utility::Logger.abbreviated_message(e.message))
      ensure
        Utility::Logger.info("Upserted #{@status[:indexed_document_count]} documents into #{@connector_settings.index_name}.")
        # Documents are removed *from* the index; the message used to say "into".
        Utility::Logger.info("Deleted #{@status[:deleted_document_count]} documents from #{@connector_settings.index_name}.")

        ElasticConnectorActions.complete_sync(@connector_settings.id, job_id, @status.dup)

        if @status[:error]
          Utility::Logger.info("Failed to sync for connector #{@connector_settings.id} with error #{@status[:error]}.")
        else
          Utility::Logger.info("Successfully synced for connector #{@connector_settings.id}.")
        end
      end
    end

    # Adds the ingest flags to the document, in place. A flag is only written
    # when the corresponding setting is truthy.
    #
    # @return [Object] the same document that was passed in
    def add_ingest_metadata(document)
      document.tap do |doc|
        doc['_extract_binary_content'] = @connector_settings.extract_binary_content? if @connector_settings.extract_binary_content?
        doc['_reduce_whitespace'] = @connector_settings.reduce_whitespace? if @connector_settings.reduce_whitespace?
        doc['_run_ml_inference'] = @connector_settings.run_ml_inference? if @connector_settings.run_ml_inference?
      end
    end

    # Compares the sorted, stringified field names declared by the connector
    # class with those stored in the connector settings.
    #
    # @raise [IncompatibleConfigurableFieldsError] when they differ
    def validate_configuration!
      expected_fields = @connector_class.configurable_fields.keys.map(&:to_s).sort
      actual_fields = @connector_settings.configuration.keys.map(&:to_s).sort

      raise IncompatibleConfigurableFieldsError.new(@connector_class.service_type, expected_fields, actual_fields) if expected_fields != actual_fields
    end
  end
end
|
data/lib/core.rb
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'core/configuration'
|
10
|
+
require 'core/connector_settings'
|
11
|
+
require 'core/elastic_connector_actions'
|
12
|
+
require 'core/heartbeat'
|
13
|
+
require 'core/scheduler'
|
14
|
+
require 'core/single_scheduler'
|
15
|
+
require 'core/native_scheduler'
|
16
|
+
require 'core/sync_job_runner'
|
@@ -0,0 +1,22 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'connectors/registry'
|
10
|
+
require 'utility'
|
11
|
+
|
12
|
+
# Logs the display name of every connector known to the registry.
class ListConnectors
  # Runs inside the execution environment set up from App::Config and logs a
  # header line, one line per registered connector, and a closing line.
  def self.run!
    Utility::Environment.set_execution_environment(App::Config) do
      registry = Connectors::REGISTRY

      Utility::Logger.info('Registered connectors:')
      registry.registered_connectors.each do |service_type|
        display_name = registry.connector_class(service_type).display_name
        Utility::Logger.info("- #{display_name}")
      end
      Utility::Logger.info('Bye')
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# Stub configuration object exposing fixed values through class-level readers.
class AppConfig
  # @return [Hash] connector options keyed by string
  def self.connectors
    { 'transient_server_error_retry_delay_minutes' => 5 }
  end

  # @return [Integer]
  def self.content_source_sync_max_errors
    1000
  end

  # @return [Integer]
  def self.content_source_sync_max_consecutive_errors
    10
  end

  # @return [Float]
  def self.content_source_sync_max_error_ratio
    0.15
  end

  # @return [Integer]
  def self.content_source_sync_error_ratio_window_size
    100
  end

  # @return [Boolean]
  def self.content_source_sync_thumbnails_enabled?
    true
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'active_support/inflector'
|
10
|
+
|
11
|
+
module Connectors
  # Stub statistics facade: nothing is recorded, only the key helpers compute
  # a value.
  module Stats
    class << self
      # Calls the given block and returns its result; key and value are ignored.
      def measure(_key, _value = nil, &block)
        block.call
      end

      # Intentionally does nothing.
      def increment(key, value = 1)
        # no op
      end

      # @return [String] the key prefixed with "connectors."
      def prefix_key(key)
        "connectors.#{key}"
      end

      # Derives a snake_case key from a class name, e.g.
      # Connectors::GoogleDrive::Adapter -> google_drive. With deconstantize
      # set to false the last constant itself is used instead of its parent.
      def class_key(klass, deconstantize = true)
        inflector = ActiveSupport::Inflector
        qualified_name = klass.name
        # Connectors::GoogleDrive::Adapter -> Connectors::GoogleDrive
        qualified_name = inflector.deconstantize(qualified_name) if deconstantize
        # Connectors::GoogleDrive -> GoogleDrive -> google_drive
        inflector.underscore(inflector.demodulize(qualified_name))
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
# Stub with a single instance method that returns a fixed string.
class ServiceType
  # @return [String] always the literal 'classify'
  def classify
    'classify'
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
module Utility
  # Shared literal values: reserved document field names, index names and
  # service type identifiers.
  class Constants
    # Reserved document field names.
    THUMBNAIL_FIELDS = %w[_thumbnail_80x100 _thumbnail_310x430].freeze
    SUBEXTRACTOR_RESERVED_FIELDS = %w[_subextracted_as_of _subextracted_version].freeze
    ALLOW_FIELD = '_allow_permissions'
    DENY_FIELD = '_deny_permissions'

    # Index names and the prefix used for content indices.
    CONNECTORS_INDEX = '.elastic-connectors'
    JOB_INDEX = '.elastic-connectors-sync-jobs'
    CONTENT_INDEX_PREFIX = 'search-'

    # Service type identifier of the crawler.
    CRAWLER_SERVICE_TYPE = 'elastic-crawler'
  end
end
|
data/lib/utility/cron.rb
ADDED
@@ -0,0 +1,81 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'utility/logger'
|
10
|
+
|
11
|
+
module Utility
  # taken from https://regex101.com/r/cU7zG2/1
  # The previous regexp allowed days of the week as [0-6], which is not correct because the Kibana scheduler
  # uses [1-7] for days of the week, aligned with the Quartz scheduler: see http://www.quartz-scheduler.org/documentation/2.4.0-SNAPSHOT/tutorials/tutorial-lesson-06.html
  # Just replacing it with [1-7] would also be incorrect, since according to the Cron spec the days of the week
  # are 1-6 for Monday-Saturday, and 0 or 7 for Sunday (7 being non-standard but still widely used). So, we need to
  # allow for 0-7.
  CRON_REGEXP = /^\s*($|#|\w+\s*=|(\?|\*|(?:[0-5]?\d)(?:(?:-|\/|,)(?:[0-5]?\d))?(?:,(?:[0-5]?\d)(?:(?:-|\/|,)(?:[0-5]?\d))?)*)\s+(\?|\*|(?:[0-5]?\d)(?:(?:-|\/|,)(?:[0-5]?\d))?(?:,(?:[0-5]?\d)(?:(?:-|\/|,)(?:[0-5]?\d))?)*)\s+(\?|\*|(?:[01]?\d|2[0-3])(?:(?:-|\/|,)(?:[01]?\d|2[0-3]))?(?:,(?:[01]?\d|2[0-3])(?:(?:-|\/|,)(?:[01]?\d|2[0-3]))?)*)\s+(\?|\*|(?:0?[1-9]|[12]\d|3[01])(?:(?:-|\/|,)(?:0?[1-9]|[12]\d|3[01]))?(?:,(?:0?[1-9]|[12]\d|3[01])(?:(?:-|\/|,)(?:0?[1-9]|[12]\d|3[01]))?)*)\s+(\?|\*|(?:[1-9]|1[012])(?:(?:-|\/|,)(?:[1-9]|1[012]))?(?:L|W)?(?:,(?:[1-9]|1[012])(?:(?:-|\/|,)(?:[1-9]|1[012]))?(?:L|W)?)*|\?|\*|(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)(?:(?:-)(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC))?(?:,(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)(?:(?:-)(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC))?)*)\s+(\?|\*|(?:[0-7])(?:(?:-|\/|,|#)(?:[0-7]))?(?:L)?(?:,(?:[0-7])(?:(?:-|\/|,|#)(?:[0-7]))?(?:L)?)*|\?|\*|(?:MON|TUE|WED|THU|FRI|SAT|SUN)(?:(?:-)(?:MON|TUE|WED|THU|FRI|SAT|SUN))?(?:,(?:MON|TUE|WED|THU|FRI|SAT|SUN)(?:(?:-)(?:MON|TUE|WED|THU|FRI|SAT|SUN))?)*)(|\s)+(\?|\*|(?:|\d{4})(?:(?:-|\/|,)(?:|\d{4}))?(?:,(?:|\d{4})(?:(?:-|\/|,)(?:|\d{4}))?)*))$/

  # Converts Quartz cron expressions into standard five-field crontab expressions.
  # see https://github.com/quartz-scheduler/quartz/blob/master/quartz-core/src/main/java/org/quartz/CronExpression.java
  module Cron
    # Month and day names accepted by CRON_REGEXP that happen to contain the
    # letters used by the Quartz L and W modifiers.
    NAMES_CONTAINING_MODIFIER_LETTERS = /JUL|WED/.freeze
    private_constant :NAMES_CONTAINING_MODIFIER_LETTERS

    # Rejects the Quartz-only '#', 'L' and 'W' modifiers in a single field.
    #
    # @param expr [String] one field of a cron expression
    # @return [String] expr, unchanged
    # @raise [StandardError] when the field uses '#', 'L' or 'W'
    def self.check(expr)
      raise StandardError, "Unsupported expression #{expr} with #" if expr.include?('#')

      # Take the names JUL and WED out before looking for the modifier
      # letters: JUL must not be mistaken for an L modifier, and the presence
      # of WED must not hide a real W modifier elsewhere in the field.
      modifiers = expr.gsub(NAMES_CONTAINING_MODIFIER_LETTERS, '')
      raise StandardError, "Unsupported expression #{expr} with L" if modifiers.include?('L')
      raise StandardError, "Unsupported expression #{expr} with W" if modifiers.include?('W')

      expr
    end

    # Converts a Quartz expression (seconds and an optional year included)
    # into "minute hour day-of-month month day-of-week".
    #
    # @param expression [String] Quartz cron expression
    # @return [String] crontab expression
    # @raise [StandardError] when the expression is not recognised or uses an
    #   unsupported modifier
    def self.quartz_to_crontab(expression)
      # ? is not supported
      converted_expression = expression.tr('?', '*')

      fields = converted_expression.match(CRON_REGEXP)
      # CRON_REGEXP also matches blank lines, '#' and 'name =' prefixes; those
      # matches carry no schedule fields, so the seconds group stays nil.
      raise StandardError, "Unknown format #{expression}" unless fields && fields[2]

      # Unix cron has five: minute, hour, day, month, and dayofweek
      # Quartz adds seconds (group 2) and year (group 9), both dropped here.
      minutes = fields[3]
      hours = fields[4]
      day_of_month = check(fields[5])
      month = check(fields[6])
      day_of_week = scheduler_dow_to_crontab(check(fields[7])).to_s

      converted_expression = "#{minutes} #{hours} #{day_of_month} #{month} #{day_of_week}"

      Utility::Logger.debug("Converted Quartz Cron expression '#{expression}' to Standard Cron Expression '#{converted_expression}'")

      converted_expression
    end

    # As described above, Quartz uses 1-7 for days of the week, starting with Sunday,
    # while Unix cron uses 0-6, also starting with Sunday, plus 7 as an extra non-standard index for Sunday.
    # (see https://en.wikipedia.org/wiki/Cron for more details)
    # This means that we need to shift the Quartz days of week that are between 1 and 7 by minus one, but we also allow 0
    # in case it's not a quartz expression but already the cron standard.
    # See also the code in connectors-python that does the same thing: https://github.com/elastic/connectors-python/blob/main/connectors/quartz.py
    #
    # @param day [String] day-of-week field
    # @return [Integer] the shifted day when the field is a single positive number
    # @return [String] the field with every day number shifted when it is a
    #   list, range or stepped value; the unchanged field when it holds no
    #   digits or contains day 0
    def self.scheduler_dow_to_crontab(day)
      return day unless /\d/.match?(day)

      if /\A\d+\z/.match?(day)
        number = day.to_i
        return number <= 0 ? day : number - 1
      end

      # Only the part before '/' names days; what follows is a step width and
      # must not be shifted.
      items = day.split(',').map { |item| item.partition('/') }
      already_crontab = items.any? { |days, _slash, _step| days.scan(/\d+/).any? { |d| d.to_i.zero? } }
      return day if already_crontab

      shifted_items = items.map do |days, slash, step|
        days.gsub(/\d+/) { |d| (d.to_i - 1).to_s } + slash + step
      end
      shifted_items.join(',')
    end
  end
end
|
@@ -0,0 +1,111 @@
|
|
1
|
+
---
|
2
|
+
da:
|
3
|
+
name: Danish
|
4
|
+
stemmer: danish
|
5
|
+
stop_words: _danish_
|
6
|
+
de:
|
7
|
+
name: German
|
8
|
+
stemmer: light_german
|
9
|
+
stop_words: _german_
|
10
|
+
en:
|
11
|
+
name: English
|
12
|
+
stemmer: light_english
|
13
|
+
stop_words: _english_
|
14
|
+
es:
|
15
|
+
name: Spanish
|
16
|
+
stemmer: light_spanish
|
17
|
+
stop_words: _spanish_
|
18
|
+
fr:
|
19
|
+
name: French
|
20
|
+
stemmer: light_french
|
21
|
+
stop_words: _french_
|
22
|
+
custom_filter_definitions:
|
23
|
+
fr-elision:
|
24
|
+
type: elision
|
25
|
+
articles:
|
26
|
+
- l
|
27
|
+
- m
|
28
|
+
- t
|
29
|
+
- qu
|
30
|
+
- n
|
31
|
+
- s
|
32
|
+
- j
|
33
|
+
- d
|
34
|
+
- c
|
35
|
+
- jusqu
|
36
|
+
- quoiqu
|
37
|
+
- lorsqu
|
38
|
+
- puisqu
|
39
|
+
articles_case: true
|
40
|
+
prepended_filters:
|
41
|
+
- fr-elision
|
42
|
+
it:
|
43
|
+
name: Italian
|
44
|
+
stemmer: light_italian
|
45
|
+
stop_words: _italian_
|
46
|
+
custom_filter_definitions:
|
47
|
+
it-elision:
|
48
|
+
type: elision
|
49
|
+
articles:
|
50
|
+
- c
|
51
|
+
- l
|
52
|
+
- all
|
53
|
+
- dall
|
54
|
+
- dell
|
55
|
+
- nell
|
56
|
+
- sull
|
57
|
+
- coll
|
58
|
+
- pell
|
59
|
+
- gl
|
60
|
+
- agl
|
61
|
+
- dagl
|
62
|
+
- degl
|
63
|
+
- negl
|
64
|
+
- sugl
|
65
|
+
- un
|
66
|
+
- m
|
67
|
+
- t
|
68
|
+
- s
|
69
|
+
- v
|
70
|
+
- d
|
71
|
+
articles_case: true
|
72
|
+
prepended_filters:
|
73
|
+
- it-elision
|
74
|
+
ja:
|
75
|
+
name: Japanese
|
76
|
+
stemmer: light_english
|
77
|
+
stop_words: _english_
|
78
|
+
postpended_filters:
|
79
|
+
- cjk_bigram
|
80
|
+
ko:
|
81
|
+
name: Korean
|
82
|
+
stemmer: light_english
|
83
|
+
stop_words: _english_
|
84
|
+
postpended_filters:
|
85
|
+
- cjk_bigram
|
86
|
+
nl:
|
87
|
+
name: Dutch
|
88
|
+
stemmer: dutch
|
89
|
+
stop_words: _dutch_
|
90
|
+
pt:
|
91
|
+
name: Portuguese
|
92
|
+
stemmer: light_portuguese
|
93
|
+
stop_words: _portuguese_
|
94
|
+
pt-br:
|
95
|
+
name: Portuguese (Brazil)
|
96
|
+
stemmer: brazilian
|
97
|
+
stop_words: _brazilian_
|
98
|
+
ru:
|
99
|
+
name: Russian
|
100
|
+
stemmer: russian
|
101
|
+
stop_words: _russian_
|
102
|
+
th:
|
103
|
+
name: Thai
|
104
|
+
stemmer: light_english
|
105
|
+
stop_words: _thai_
|
106
|
+
zh:
|
107
|
+
name: Chinese
|
108
|
+
stemmer: light_english
|
109
|
+
stop_words: _english_
|
110
|
+
postpended_filters:
|
111
|
+
- cjk_bigram
|