connectors_service 8.5.0.1
This diff shows the contents of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +7 -0
- data/LICENSE +93 -0
- data/NOTICE.txt +2 -0
- data/bin/connectors_service +4 -0
- data/bin/list_connectors +4 -0
- data/config/connectors.yml +25 -0
- data/lib/app/app.rb +25 -0
- data/lib/app/config.rb +132 -0
- data/lib/app/console_app.rb +278 -0
- data/lib/app/dispatcher.rb +121 -0
- data/lib/app/menu.rb +104 -0
- data/lib/app/preflight_check.rb +134 -0
- data/lib/app/version.rb +10 -0
- data/lib/connectors/base/adapter.rb +119 -0
- data/lib/connectors/base/connector.rb +57 -0
- data/lib/connectors/base/custom_client.rb +111 -0
- data/lib/connectors/connector_status.rb +31 -0
- data/lib/connectors/crawler/scheduler.rb +32 -0
- data/lib/connectors/example/connector.rb +57 -0
- data/lib/connectors/example/example_attachments/first_attachment.txt +1 -0
- data/lib/connectors/example/example_attachments/second_attachment.txt +1 -0
- data/lib/connectors/example/example_attachments/third_attachment.txt +1 -0
- data/lib/connectors/gitlab/adapter.rb +50 -0
- data/lib/connectors/gitlab/connector.rb +67 -0
- data/lib/connectors/gitlab/custom_client.rb +44 -0
- data/lib/connectors/gitlab/extractor.rb +69 -0
- data/lib/connectors/mongodb/connector.rb +138 -0
- data/lib/connectors/registry.rb +52 -0
- data/lib/connectors/sync_status.rb +21 -0
- data/lib/connectors.rb +16 -0
- data/lib/connectors_app/// +13 -0
- data/lib/connectors_service.rb +24 -0
- data/lib/connectors_utility.rb +16 -0
- data/lib/core/configuration.rb +48 -0
- data/lib/core/connector_settings.rb +142 -0
- data/lib/core/elastic_connector_actions.rb +269 -0
- data/lib/core/heartbeat.rb +32 -0
- data/lib/core/native_scheduler.rb +24 -0
- data/lib/core/output_sink/base_sink.rb +33 -0
- data/lib/core/output_sink/combined_sink.rb +38 -0
- data/lib/core/output_sink/console_sink.rb +51 -0
- data/lib/core/output_sink/es_sink.rb +74 -0
- data/lib/core/output_sink.rb +13 -0
- data/lib/core/scheduler.rb +158 -0
- data/lib/core/single_scheduler.rb +29 -0
- data/lib/core/sync_job_runner.rb +111 -0
- data/lib/core.rb +16 -0
- data/lib/list_connectors.rb +22 -0
- data/lib/stubs/app_config.rb +35 -0
- data/lib/stubs/connectors/stats.rb +35 -0
- data/lib/stubs/service_type.rb +13 -0
- data/lib/utility/constants.rb +20 -0
- data/lib/utility/cron.rb +81 -0
- data/lib/utility/elasticsearch/index/language_data.yml +111 -0
- data/lib/utility/elasticsearch/index/mappings.rb +104 -0
- data/lib/utility/elasticsearch/index/text_analysis_settings.rb +226 -0
- data/lib/utility/environment.rb +33 -0
- data/lib/utility/errors.rb +132 -0
- data/lib/utility/es_client.rb +84 -0
- data/lib/utility/exception_tracking.rb +64 -0
- data/lib/utility/extension_mapping_util.rb +123 -0
- data/lib/utility/logger.rb +84 -0
- data/lib/utility/middleware/basic_auth.rb +27 -0
- data/lib/utility/middleware/bearer_auth.rb +27 -0
- data/lib/utility/middleware/restrict_hostnames.rb +73 -0
- data/lib/utility.rb +16 -0
- metadata +487 -0
data/lib/utility/elasticsearch/index/mappings.rb
@@ -0,0 +1,104 @@
#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License;
# you may not use this file except in compliance with the Elastic License.
#

# frozen_string_literal: true

module Utility
  module Elasticsearch
    module Index
      module Mappings
        ENUM_IGNORE_ABOVE = 2048

        DATE_FIELD_MAPPING = {
          type: 'date'
        }

        KEYWORD_FIELD_MAPPING = {
          type: 'keyword'
        }

        TEXT_FIELD_MAPPING = {
          type: 'text',
          analyzer: 'iq_text_base',
          index_options: 'freqs',
          fields: {
            'stem': {
              type: 'text',
              analyzer: 'iq_text_stem'
            },
            'prefix' => {
              type: 'text',
              analyzer: 'i_prefix',
              search_analyzer: 'q_prefix',
              index_options: 'docs'
            },
            'delimiter' => {
              type: 'text',
              analyzer: 'iq_text_delimiter',
              index_options: 'freqs'
            },
            'joined': {
              type: 'text',
              analyzer: 'i_text_bigram',
              search_analyzer: 'q_text_bigram',
              index_options: 'freqs'
            },
            'enum': {
              type: 'keyword',
              ignore_above: ENUM_IGNORE_ABOVE
            }
          }
        }

        WORKPLACE_SEARCH_SUBEXTRACTION_STAMP_FIELD_MAPPINGS = {
          _subextracted_as_of: DATE_FIELD_MAPPING,
          _subextracted_version: KEYWORD_FIELD_MAPPING
        }.freeze

        CRAWLER_FIELD_MAPPINGS = {
          additional_urls: KEYWORD_FIELD_MAPPING,
          body_content: TEXT_FIELD_MAPPING,
          domains: KEYWORD_FIELD_MAPPING,
          headings: TEXT_FIELD_MAPPING,
          last_crawled_at: DATE_FIELD_MAPPING,
          links: KEYWORD_FIELD_MAPPING,
          meta_description: TEXT_FIELD_MAPPING,
          meta_keywords: KEYWORD_FIELD_MAPPING,
          title: TEXT_FIELD_MAPPING,
          url: KEYWORD_FIELD_MAPPING,
          url_host: KEYWORD_FIELD_MAPPING,
          url_path: KEYWORD_FIELD_MAPPING,
          url_path_dir1: KEYWORD_FIELD_MAPPING,
          url_path_dir2: KEYWORD_FIELD_MAPPING,
          url_path_dir3: KEYWORD_FIELD_MAPPING,
          url_port: KEYWORD_FIELD_MAPPING,
          url_scheme: KEYWORD_FIELD_MAPPING
        }.freeze

        def self.default_text_fields_mappings(connectors_index:, crawler_index: false)
          {
            dynamic: true,
            dynamic_templates: [
              {
                data: {
                  match_mapping_type: 'string',
                  mapping: TEXT_FIELD_MAPPING
                }
              }
            ],
            properties: {
              id: KEYWORD_FIELD_MAPPING
            }.tap do |properties|
              properties.merge!(WORKPLACE_SEARCH_SUBEXTRACTION_STAMP_FIELD_MAPPINGS) if connectors_index
            end.tap do |properties|
              properties.merge!(CRAWLER_FIELD_MAPPINGS) if crawler_index
            end
          }
        end
      end
    end
  end
end
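
For orientation, a minimal sketch of how the module above composes its result. The require path assumes the gem's data/lib directory is on the load path:

require 'utility/elasticsearch/index/mappings'

# Connector indices get the Workplace Search sub-extraction stamp fields.
mappings = Utility::Elasticsearch::Index::Mappings.default_text_fields_mappings(connectors_index: true)
mappings[:properties].key?(:_subextracted_as_of) # => true

# Crawler indices get the crawler field mappings on top of the defaults.
crawler = Utility::Elasticsearch::Index::Mappings.default_text_fields_mappings(connectors_index: false, crawler_index: true)
crawler[:properties].key?(:url_host) # => true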

data/lib/utility/elasticsearch/index/text_analysis_settings.rb
@@ -0,0 +1,226 @@
#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License;
# you may not use this file except in compliance with the Elastic License.
#

# frozen_string_literal: true

require 'yaml'

module Utility
  module Elasticsearch
    module Index
      class TextAnalysisSettings
        class UnsupportedLanguageCode < StandardError; end

        DEFAULT_LANGUAGE = :en
        FRONT_NGRAM_MAX_GRAM = 12
        LANGUAGE_DATA_FILE_PATH = File.join(File.dirname(__FILE__), 'language_data.yml')

        GENERIC_FILTERS = {
          front_ngram: {
            type: 'edge_ngram',
            min_gram: 1,
            max_gram: FRONT_NGRAM_MAX_GRAM
          },
          delimiter: {
            type: 'word_delimiter_graph',
            generate_word_parts: true,
            generate_number_parts: true,
            catenate_words: true,
            catenate_numbers: true,
            catenate_all: true,
            preserve_original: false,
            split_on_case_change: true,
            split_on_numerics: true,
            stem_english_possessive: true
          },
          bigram_joiner: {
            type: 'shingle',
            token_separator: '',
            max_shingle_size: 2,
            output_unigrams: false
          },
          bigram_joiner_unigrams: {
            type: 'shingle',
            token_separator: '',
            max_shingle_size: 2,
            output_unigrams: true
          },
          bigram_max_size: {
            type: 'length',
            min: 0,
            max: 16
          }
        }.freeze

        NON_ICU_ANALYSIS_SETTINGS = {
          tokenizer_name: 'standard', folding_filters: %w(cjk_width lowercase asciifolding)
        }.freeze

        ICU_ANALYSIS_SETTINGS = {
          tokenizer_name: 'icu_tokenizer', folding_filters: %w(icu_folding)
        }.freeze

        def initialize(language_code: nil, analysis_icu: false)
          @language_code = (language_code || DEFAULT_LANGUAGE).to_sym

          raise UnsupportedLanguageCode, "Language '#{language_code}' is not supported" unless language_data[@language_code]

          @analysis_icu = analysis_icu
          @analysis_settings = icu_settings(analysis_icu)
        end

        def to_h
          {
            analysis: {
              analyzer: analyzer_definitions,
              filter: filter_definitions
            },
            index: {
              similarity: {
                default: {
                  type: 'BM25'
                }
              }
            }
          }
        end

        private

        attr_reader :language_code, :analysis_settings

        def icu_settings(analysis_settings)
          return ICU_ANALYSIS_SETTINGS if analysis_settings

          NON_ICU_ANALYSIS_SETTINGS
        end

        def stemmer_name
          language_data[language_code][:stemmer]
        end

        def stop_words_name_or_list
          language_data[language_code][:stop_words]
        end

        def custom_filter_definitions
          language_data[language_code][:custom_filter_definitions] || {}
        end

        def prepended_filters
          language_data[language_code][:prepended_filters] || []
        end

        def postpended_filters
          language_data[language_code][:postpended_filters] || []
        end

        def stem_filter_name
          "#{language_code}-stem-filter".to_sym
        end

        def stop_words_filter_name
          "#{language_code}-stop-words-filter".to_sym
        end

        def filter_definitions
          definitions = GENERIC_FILTERS.dup

          definitions[stem_filter_name] = {
            type: 'stemmer',
            name: stemmer_name
          }

          definitions[stop_words_filter_name] = {
            type: 'stop',
            stopwords: stop_words_name_or_list
          }

          definitions.merge(custom_filter_definitions)
        end

        def analyzer_definitions
          definitions = {}

          definitions[:i_prefix] = {
            tokenizer: analysis_settings[:tokenizer_name],
            filter: [
              *analysis_settings[:folding_filters],
              'front_ngram'
            ]
          }

          definitions[:q_prefix] = {
            tokenizer: analysis_settings[:tokenizer_name],
            filter: [
              *analysis_settings[:folding_filters]
            ]
          }

          definitions[:iq_text_base] = {
            tokenizer: analysis_settings[:tokenizer_name],
            filter: [
              *analysis_settings[:folding_filters],
              stop_words_filter_name
            ]
          }

          definitions[:iq_text_stem] = {
            tokenizer: analysis_settings[:tokenizer_name],
            filter: [
              *prepended_filters,
              *analysis_settings[:folding_filters],
              stop_words_filter_name,
              stem_filter_name,
              *postpended_filters
            ]
          }

          definitions[:iq_text_delimiter] = {
            tokenizer: 'whitespace',
            filter: [
              *prepended_filters,
              'delimiter',
              *analysis_settings[:folding_filters],
              stop_words_filter_name,
              stem_filter_name,
              *postpended_filters
            ]
          }

          definitions[:i_text_bigram] = {
            tokenizer: analysis_settings[:tokenizer_name],
            filter: [
              *analysis_settings[:folding_filters],
              stem_filter_name,
              'bigram_joiner',
              'bigram_max_size'
            ]
          }

          definitions[:q_text_bigram] = {
            tokenizer: analysis_settings[:tokenizer_name],
            filter: [
              *analysis_settings[:folding_filters],
              stem_filter_name,
              'bigram_joiner_unigrams',
              'bigram_max_size'
            ]
          }

          definitions
        end

        def language_data
          @language_data ||= YAML.safe_load(
            File.read(LANGUAGE_DATA_FILE_PATH),
            symbolize_names: true
          )
        end
      end
    end
  end
end
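
A usage sketch for the class above, assuming the requested language (here French) is among those defined in the bundled language_data.yml:

require 'utility/elasticsearch/index/text_analysis_settings'

settings = Utility::Elasticsearch::Index::TextAnalysisSettings.new(language_code: :fr)
settings.to_h[:analysis][:analyzer].keys
# => [:i_prefix, :q_prefix, :iq_text_base, :iq_text_stem, :iq_text_delimiter, :i_text_bigram, :q_text_bigram]

# Unknown codes fail fast with the dedicated error:
begin
  Utility::Elasticsearch::Index::TextAnalysisSettings.new(language_code: :zz)
rescue Utility::Elasticsearch::Index::TextAnalysisSettings::UnsupportedLanguageCode => e
  e.message # => "Language 'zz' is not supported"
end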

data/lib/utility/environment.rb
@@ -0,0 +1,33 @@
#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License;
# you may not use this file except in compliance with the Elastic License.
#

require 'logger'
require 'utility/logger'
require 'active_support/core_ext/module'

module Utility
  module Environment
    def self.set_execution_environment(config, &block)
      # Set UTC as the timezone
      ENV['TZ'] = 'UTC'
      Logger.level = config[:log_level]
      es_config = config[:elasticsearch]
      disable_warnings = if es_config.has_key?(:disable_warnings)
                           es_config[:disable_warnings]
                         else
                           true
                         end

      if disable_warnings
        Logger.info('Disabling warnings')
        Kernel.silence_warnings(&block)
      else
        Logger.info('Enabling warnings')
        Kernel.enable_warnings(&block)
      end
    end
  end
end
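
A sketch of the intended call, assuming the symbol-keyed hash that config/connectors.yml is parsed into; the host and log level shown are placeholder values:

require 'utility/environment'

config = {
  log_level: 'info',
  elasticsearch: { hosts: 'http://localhost:9200', disable_warnings: true }
}

Utility::Environment.set_execution_environment(config) do
  # runs with TZ=UTC, the configured log level, and Ruby warnings silenced
end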

data/lib/utility/errors.rb
@@ -0,0 +1,132 @@
#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License;
# you may not use this file except in compliance with the Elastic License.
#

require 'active_support/core_ext/string'

module Utility
  class DocumentError
    attr_accessor :error_class, :error_message, :stack_trace, :error_id

    def initialize(error_class, error_message, stack_trace, error_id)
      @error_class = error_class
      @error_message = error_message
      @error_id = error_id

      # keywords must be < 32kb, UTF-8 chars can be up to 3 bytes, thus 32k/3 ~= 10k
      # See https://github.com/elastic/workplace-search-team/issues/1723
      @stack_trace = stack_trace.truncate(10_000)
    end

    def to_h
      {
        'error_class' => error_class,
        'error_message' => error_message,
        'stack_trace' => stack_trace,
        'error_id' => error_id
      }
    end
  end

  class ClientError < StandardError; end
  class EvictionWithNoProgressError < StandardError; end
  class EvictionError < StandardError
    attr_accessor :cursors

    def initialize(message = nil, cursors: nil)
      super(message)
      @cursors = cursors
    end
  end

  class SuspendedJobError < StandardError
    attr_accessor :suspend_until, :cursors

    def initialize(message = nil, suspend_until:, cursors: nil)
      super(message)
      @suspend_until = suspend_until
      @cursors = cursors
    end
  end
  class ThrottlingError < SuspendedJobError; end
  class TransientServerError < SuspendedJobError; end
  class UnrecoverableServerError < StandardError; end
  class TransientSubextractorError < StandardError; end
  class JobDocumentLimitError < StandardError; end
  class JobClaimingError < StandardError; end

  class MonitoringError < StandardError
    attr_accessor :tripped_by

    def initialize(message = nil, tripped_by: nil)
      super("#{message}#{tripped_by.present? ? " Tripped by - #{tripped_by.class}: #{tripped_by.message}" : ''}")
      @tripped_by = tripped_by
    end
  end
  class MaxSuccessiveErrorsExceededError < MonitoringError; end
  class MaxErrorsExceededError < MonitoringError; end
  class MaxErrorsInWindowExceededError < MonitoringError; end

  class JobSyncNotPossibleYetError < StandardError
    attr_accessor :sync_will_be_possible_at

    def initialize(message = nil, sync_will_be_possible_at: nil)
      human_readable_errors = []

      human_readable_errors.push(message) unless message.nil?
      human_readable_errors.push("Content source was created too recently to schedule jobs, next job scheduling is possible at #{sync_will_be_possible_at}.") unless sync_will_be_possible_at.nil?

      super(human_readable_errors.join(' '))
    end
  end
  class PlatinumLicenseRequiredError < StandardError; end
  class JobInterruptedError < StandardError; end
  class JobCannotBeUpdatedError < StandardError; end
  class SecretInvalidError < StandardError; end
  class InvalidIndexingConfigurationError < StandardError; end
  class InvalidTokenError < StandardError; end
  class TokenRefreshFailedError < StandardError; end
  class ConnectorNotAvailableError < StandardError; end

  # For when we want to explicitly set a #cause but can't
  class ExplicitlyCausedError < StandardError
    attr_reader :reason

    def initialize(reason)
      @reason = reason
    end
  end

  class PublishingFailedError < ExplicitlyCausedError; end

  class Error
    attr_reader :status_code, :code, :message

    def initialize(status_code, code, message)
      @status_code = status_code
      @code = code
      @message = message
    end

    def to_h
      {
        'code' => @code,
        'message' => @message
      }
    end
  end

  class HealthCheckFailedError < StandardError
    def initialize(msg = nil)
      super("Health check failed for 3rd-party service: #{msg}")
    end
  end

  INTERNAL_SERVER_ERROR = Utility::Error.new(500, 'INTERNAL_SERVER_ERROR', 'Internal server error')
  INVALID_API_KEY = Utility::Error.new(401, 'INVALID_API_KEY', 'Invalid API key')
  UNSUPPORTED_AUTH_SCHEME = Utility::Error.new(401, 'UNSUPPORTED_AUTH_SCHEME', 'Unsupported authorization scheme')
  INVALID_ACCESS_TOKEN = Utility::Error.new(401, 'INVALID_ACCESS_TOKEN', 'Invalid/expired access token, please refresh the token')
  TOKEN_REFRESH_ERROR = Utility::Error.new(401, 'TOKEN_REFRESH_ERROR', 'Failed to refresh token, please re-authenticate the application')
end
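
As a sketch of the intended flow: a connector that gets rate-limited raises one of the SuspendedJobError subclasses, and a caller reads the resume time and sync cursors off the rescued exception. The delay and cursor values here are hypothetical:

require 'utility/errors'

begin
  raise Utility::ThrottlingError.new(
    'Rate limited by upstream API',
    suspend_until: Time.now + 300, # hypothetical retry delay
    cursors: { 'page' => 42 }      # hypothetical sync progress
  )
rescue Utility::SuspendedJobError => e
  e.suspend_until # when the job may resume
  e.cursors       # progress to persist before suspending
end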

data/lib/utility/es_client.rb
@@ -0,0 +1,84 @@
#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License;
# you may not use this file except in compliance with the Elastic License.
#

# frozen_string_literal: true

require 'logger'
require 'elasticsearch'

module Utility
  class EsClient < ::Elasticsearch::Client
    class IndexingFailedError < StandardError
      def initialize(message, error = nil)
        super(message)
        @cause = error
      end

      attr_reader :cause
    end

    def initialize(es_config)
      super(connection_configs(es_config))
    end

    def connection_configs(es_config)
      configs = {}
      configs[:api_key] = es_config[:api_key] if es_config[:api_key]
      if es_config[:cloud_id]
        configs[:cloud_id] = es_config[:cloud_id]
      elsif es_config[:hosts]
        configs[:hosts] = es_config[:hosts]
      else
        raise 'Either elasticsearch.cloud_id or elasticsearch.hosts should be configured.'
      end
      configs[:retry_on_failure] = es_config[:retry_on_failure] || false
      configs[:request_timeout] = es_config[:request_timeout] || nil
      configs[:log] = es_config[:log] || false
      configs[:trace] = es_config[:trace] || false

      # if log or trace is activated, we use the application logger
      configs[:logger] = if configs[:log] || configs[:trace]
                           Utility::Logger.logger
                         else
                           # silence!
                           ::Logger.new(IO::NULL)
                         end
      configs
    end

    def bulk(arguments = {})
      raise_if_necessary(super(arguments))
    end

    private

    def raise_if_necessary(response)
      if response['errors']
        first_error = nil

        response['items'].each do |item|
          %w[index delete].each do |op|
            if item.has_key?(op) && item[op].has_key?('error')
              first_error = item

              break
            end
          end
        end

        if first_error
          trace_id = Utility::Logger.generate_trace_id
          Utility::Logger.error("Failed to index documents into Elasticsearch. First error in response is: #{first_error.to_json}")
          short_message = Utility::Logger.abbreviated_message(first_error.to_json)
          raise IndexingFailedError.new("Failed to index documents into Elasticsearch with an error '#{short_message}'. Look up the error ID [#{trace_id}] in the application logs to see the full error message.")
        else
          raise IndexingFailedError.new('Failed to index documents into Elasticsearch due to unknown error. Try enabling tracing for Elasticsearch and checking the logs.')
        end
      end
      response
    end
  end
end
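
A construction sketch, assuming the same symbol-keyed elasticsearch config section as above; the host and index name are hypothetical, and #bulk needs a reachable cluster:

require 'utility/es_client'

client = Utility::EsClient.new(
  hosts: 'http://localhost:9200',
  retry_on_failure: 3,
  request_timeout: 120
)

# Behaves like Elasticsearch::Client#bulk, but raises IndexingFailedError
# when the response reports item-level errors.
client.bulk(body: [{ index: { _index: 'my-index', data: { title: 'hello' } } }])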

data/lib/utility/exception_tracking.rb
@@ -0,0 +1,64 @@
#
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
# or more contributor license agreements. Licensed under the Elastic License;
# you may not use this file except in compliance with the Elastic License.
#

# frozen_string_literal: true

require 'bson'
require 'utility/logger'

module Utility
  class ExceptionTracking
    class << self
      def capture_message(message, context = {})
        Utility::Logger.error("Error: #{message}. Context: #{context.inspect}")

        # When the method is called from a rescue block, our return value may leak outside of its
        # intended scope, so let's explicitly return nil here to be safe.
        nil
      end

      def capture_exception(exception, context = {})
        Utility::Logger.log_stacktrace(generate_stack_trace(exception))
        Utility::Logger.error("Context: #{context.inspect}") if context
      end

      def log_exception(exception, message = nil)
        Utility::Logger.error(message) if message
        Utility::Logger.log_stacktrace(generate_stack_trace(exception))
      end

      def augment_exception(exception)
        unless exception.respond_to?(:id)
          exception.instance_eval do
            def id
              @error_id ||= BSON::ObjectId.new.to_s
            end
          end
        end
      end

      def generate_error_message(exception, message, context)
        context = { :message_id => exception.id }.merge(context || {}) if exception.respond_to?(:id)
        context_message = context && "Context: #{context.inspect}"
        ['Exception', message, exception.class.to_s, exception.message, context_message]
          .compact
          .map { |part| part.to_s.dup.force_encoding('UTF-8') }
          .join(': ')
      end

      def generate_stack_trace(exception)
        full_message = exception.full_message

        cause = exception
        while cause.cause != cause && (cause = cause.cause)
          full_message << "Cause:\n#{cause.full_message}"
        end

        full_message.dup.force_encoding('UTF-8')
      end
    end
  end
end
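
A sketch of the intended pattern: tag an exception with a stable id, then log it with a message that references that id:

require 'utility/exception_tracking'

begin
  raise StandardError, 'boom'
rescue StandardError => e
  Utility::ExceptionTracking.augment_exception(e) # defines e.id lazily
  Utility::ExceptionTracking.log_exception(e, "Sync failed (error id: #{e.id})")
end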