RubyGems - connectors_service - Versions diffs - 8.6.0.4.pre.20221116T024501Z → 8.6.0.4 - Mend

connectors_service 8.6.0.4.pre.20221116T024501Z → 8.6.0.4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (49)

checksums.yaml +4 -4
data/config/connectors.yml +6 -6
data/lib/app/app.rb +0 -4
data/lib/app/dispatcher.rb +17 -42
data/lib/app/preflight_check.rb +0 -11
data/lib/connectors/base/connector.rb +14 -43
data/lib/connectors/example/connector.rb +0 -6
data/lib/connectors/gitlab/connector.rb +1 -6
data/lib/connectors/mongodb/connector.rb +43 -47
data/lib/connectors/sync_status.rb +1 -6
data/lib/core/configuration.rb +1 -3
data/lib/core/connector_settings.rb +16 -52
data/lib/core/elastic_connector_actions.rb +59 -320
data/lib/core/output_sink/base_sink.rb +33 -0
data/lib/core/output_sink/combined_sink.rb +38 -0
data/lib/core/output_sink/console_sink.rb +51 -0
data/lib/core/output_sink/es_sink.rb +74 -0
data/lib/core/{ingestion.rb → output_sink.rb} +5 -1
data/lib/core/scheduler.rb +10 -40
data/lib/core/single_scheduler.rb +1 -1
data/lib/core/sync_job_runner.rb +16 -72
data/lib/core.rb +0 -4
data/lib/utility/constants.rb +0 -2
data/lib/utility/errors.rb +12 -0
data/lib/utility/logger.rb +1 -1
data/lib/utility.rb +4 -11
metadata +9 -27
data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +0 -173
data/lib/connectors/base/advanced_snippet_validator.rb +0 -34
data/lib/connectors/base/simple_rules_parser.rb +0 -42
data/lib/connectors/example/example_advanced_snippet_validator.rb +0 -35
data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +0 -35
data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +0 -22
data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +0 -292
data/lib/connectors/mongodb/mongo_rules_parser.rb +0 -81
data/lib/connectors/tolerable_error_helper.rb +0 -43
data/lib/core/connector_job.rb +0 -210
data/lib/core/filtering/post_process_engine.rb +0 -39
data/lib/core/filtering/post_process_result.rb +0 -27
data/lib/core/filtering/simple_rule.rb +0 -141
data/lib/core/filtering/validation_job_runner.rb +0 -53
data/lib/core/filtering/validation_status.rb +0 -17
data/lib/core/filtering.rb +0 -17
data/lib/core/ingestion/es_sink.rb +0 -118
data/lib/core/jobs/consumer.rb +0 -114
data/lib/core/jobs/producer.rb +0 -26
data/lib/utility/bulk_queue.rb +0 -85
data/lib/utility/error_monitor.rb +0 -108
data/lib/utility/filtering.rb +0 -22

data/lib/core/connector_job.rb DELETED Viewed

@@ -1,210 +0,0 @@
-#
-# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-# or more contributor license agreements. Licensed under the Elastic License;
-# you may not use this file except in compliance with the Elastic License.
-#
-# frozen_string_literal: true
-require 'active_support/core_ext/hash/indifferent_access'
-require 'connectors/sync_status'
-require 'core/elastic_connector_actions'
-require 'utility'
-module Core
-  class ConnectorJob
-    DEFAULT_PAGE_SIZE = 100
-    def self.fetch_by_id(job_id)
-      es_response = ElasticConnectorActions.get_job(job_id)
-      return nil unless es_response[:found]
-      new(es_response)
-    end
-    def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
-      status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
-      query = { bool: { must: [{ terms: status_term }] } }
-      return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
-      query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
-      fetch_jobs_by_query(query, page_size)
-    end
-    def self.orphaned_jobs(_page_size = DEFAULT_PAGE_SIZE)
-      []
-    end
-    def self.stuck_jobs(_page_size = DEFAULT_PAGE_SIZE)
-      []
-    end
-    def self.enqueue(_connector_id)
-      nil
-    end
-    def id
-      @elasticsearch_response[:_id]
-    end
-    def [](property_name)
-      @elasticsearch_response[:_source][property_name]
-    end
-    def error
-      self[:error]
-    end
-    def status
-      self[:status]
-    end
-    def in_progress?
-      status == Connectors::SyncStatus::IN_PROGRESS
-    end
-    def canceling?
-      status == Connectors::SyncStatus::CANCELING
-    end
-    def suspended?
-      status == Connectors::SyncStatus::SUSPENDED
-    end
-    def canceled?
-      status == Connectors::SyncStatus::CANCELED
-    end
-    def pending?
-      Connectors::SyncStatus::PENDING_STATUSES.include?(status)
-    end
-    def active?
-      Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
-    end
-    def terminated?
-      Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
-    end
-    def connector_snapshot
-      self[:connector] || {}
-    end
-    def connector_id
-      @elasticsearch_response[:_source][:connector][:id]
-    end
-    def index_name
-      connector_snapshot[:index_name]
-    end
-    def language
-      connector_snapshot[:language]
-    end
-    def service_type
-      connector_snapshot[:service_type]
-    end
-    def configuration
-      connector_snapshot[:configuration]
-    end
-    def filtering
-      Utility::Filtering.extract_filter(connector_snapshot[:filtering])
-    end
-    def pipeline
-      @elasticsearch_response[:_source][:pipeline]
-    end
-    def connector
-      @connector ||= ConnectorSettings.fetch_by_id(connector_id)
-    end
-    def done!(ingestion_stats = {}, connector_metadata = {})
-      terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
-    end
-    def error!(message, ingestion_stats = {}, connector_metadata = {})
-      terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
-    end
-    def cancel!(ingestion_stats = {}, connector_metadata = {})
-      terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
-    end
-    def with_concurrency_control
-      response = ElasticConnectorActions.get_job(id)
-      yield response, response['_seq_no'], response['_primary_term']
-    end
-    def make_running!
-      with_concurrency_control do |es_doc, seq_no, primary_term|
-        now = Time.now
-        doc = {
-          status: Connectors::SyncStatus::IN_PROGRESS,
-          started_at: now,
-          last_seen: now,
-          worker_hostname: Socket.gethostname
-        }
-        ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
-      end
-    end
-    def es_source
-      @elasticsearch_response[:_source]
-    end
-    private
-    def self.fetch_jobs_by_query(query, page_size)
-      results = []
-      offset = 0
-      loop do
-        response = ElasticConnectorActions.search_jobs(query, page_size, offset)
-        hits = response.dig('hits', 'hits') || []
-        total = response.dig('hits', 'total', 'value') || 0
-        results += hits.map { |hit| new(hit) }
-        break if results.size >= total
-        offset += hits.size
-      end
-      results
-    end
-    def initialize(es_response)
-      # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
-      @elasticsearch_response = es_response.with_indifferent_access
-    end
-    def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
-      ingestion_stats ||= {}
-      ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
-      doc = {
-        :last_seen => Time.now,
-        :completed_at => Time.now,
-        :status => status,
-        :error => error
-      }.merge(ingestion_stats)
-      doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
-      doc[:metadata] = connector_metadata if connector_metadata&.any?
-      ElasticConnectorActions.update_job_fields(id, doc)
-    end
-    def seq_no
-      @elasticsearch_response[:_seq_no]
-    end
-    def primary_term
-      @elasticsearch_response[:_primary_term]
-    end
-  end
-end

data/lib/core/filtering/post_process_engine.rb DELETED Viewed

@@ -1,39 +0,0 @@
-#
-# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-# or more contributor license agreements. Licensed under the Elastic License;
-# you may not use this file except in compliance with the Elastic License.
-#
-# frozen_string_literal: true
-require 'core/filtering'
-require 'utility/filtering'
-module Core
-  module Filtering
-    class PostProcessEngine
-      attr_reader :rules
-      def initialize(job_description)
-        @rules = ordered_rules(job_description.dig('connector', 'filtering'))
-      end
-      def process(document)
-        @rules.each do |rule|
-          if rule.match?(document.stringify_keys)
-            return PostProcessResult.new(document, rule)
-          end
-        end
-        PostProcessResult.new(document, SimpleRule::DEFAULT_RULE)
-      end
-      private
-      def ordered_rules(job_filtering)
-        job_rules = Utility::Filtering.extract_filter(job_filtering)['rules']
-        sorted_rules = job_rules.sort_by { |rule| rule['order'] }.reject { |rule| rule['id'] == Core::Filtering::SimpleRule::DEFAULT_RULE_ID }
-        sorted_rules.each_with_object([]) { |rule, output| output << SimpleRule.new(rule) }
-      end
-    end
-  end
-end

data/lib/core/filtering/post_process_result.rb DELETED Viewed

@@ -1,27 +0,0 @@
-#
-# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-# or more contributor license agreements. Licensed under the Elastic License;
-# you may not use this file except in compliance with the Elastic License.
-#
-# frozen_string_literal: true
-require 'utility/logger'
-module Core
-  module Filtering
-    class PostProcessResult
-      attr_reader :document, :matching_rule
-      def initialize(document, matching_rule)
-        @document = document
-        @matching_rule = matching_rule
-        Utility::Logger.debug("Document '#{document['id']}' matched filtering rule: #{matching_rule.id}. It will be #{matching_rule.policy}d")
-      end
-      def is_include?
-        matching_rule.is_include?
-      end
-    end
-  end
-end

data/lib/core/filtering/simple_rule.rb DELETED Viewed

@@ -1,141 +0,0 @@
-#
-# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-# or more contributor license agreements. Licensed under the Elastic License;
-# you may not use this file except in compliance with the Elastic License.
-#
-# frozen_string_literal: true
-require 'utility/logger'
-module Core
-  module Filtering
-    class SimpleRule
-      DEFAULT_RULE_ID = 'DEFAULT'
-      class Policy
-        INCLUDE = 'include'
-        EXCLUDE = 'exclude'
-      end
-      class Rule
-        REGEX = 'regex'
-        EQUALS = 'equals'
-        STARTS_WITH = 'starts_with'
-        ENDS_WITH = 'ends_with'
-        CONTAINS = 'contains'
-        LESS_THAN = '<'
-        GREATER_THAN = '>'
-      end
-      attr_reader :policy, :field, :rule, :value, :id
-      def initialize(rule_hash)
-        @policy = rule_hash.fetch('policy')
-        @field = rule_hash.fetch('field')
-        @rule = rule_hash.fetch('rule')
-        @value = rule_hash.fetch('value')
-        @id = rule_hash.fetch('id')
-        @rule_hash = rule_hash
-      rescue KeyError => e
-        raise "#{e.key} is required"
-      end
-      def self.from_args(id, policy, field, rule, value)
-        SimpleRule.new(
-          {
-            'id' => id,
-            'policy' => policy,
-            'field' => field,
-            'rule' => rule,
-            'value' => value
-          }
-        )
-      end
-      DEFAULT_RULE = SimpleRule.new(
-        'policy' => 'include',
-        'field' => '_',
-        'rule' => 'regex',
-        'value' => '.*',
-        'id' => SimpleRule::DEFAULT_RULE_ID
-      )
-      def match?(document)
-        return true if id == DEFAULT_RULE_ID
-        doc_value = document[field]
-        return false if doc_value.nil?
-        coerced_value = coerce(doc_value)
-        case rule
-        when Rule::EQUALS
-          case coerced_value
-          when Integer
-            doc_value == coerced_value
-          when DateTime, Time
-            doc_value.to_s == coerced_value.to_s
-          else
-            doc_value.to_s == coerced_value
-          end
-        when Rule::STARTS_WITH
-          doc_value.to_s.start_with?(value)
-        when Rule::ENDS_WITH
-          doc_value.to_s.end_with?(value)
-        when Rule::CONTAINS
-          doc_value.to_s.include?(value)
-        when Rule::REGEX
-          doc_value.to_s.match(/#{value}/)
-        when Rule::LESS_THAN
-          doc_value < coerced_value
-        when Rule::GREATER_THAN
-          doc_value > coerced_value
-        else
-          false
-        end
-      end
-      def coerce(doc_value)
-        case doc_value
-        when String
-          value.to_s
-        when Integer
-          value.to_i
-        when DateTime, Time
-          to_date(value)
-        when TrueClass, FalseClass # Ruby doesn't have a Boolean type, TIL
-          to_bool(value).to_s
-        else
-          value.to_s
-        end
-      rescue StandardError => e
-        Utility::Logger.debug("Failed to coerce value '#{value}' (#{value.class}) based on document value '#{doc_value}' (#{doc_value.class}) due to error: #{e.class}: #{e.message}")
-        value.to_s
-      end
-      def is_include?
-        policy == Policy::INCLUDE
-      end
-      def is_exclude?
-        policy == Policy::EXCLUDE
-      end
-      def to_h
-        @rule_hash
-      end
-      private
-      def to_bool(str)
-        return true if str == true || str =~ (/^(true|t|yes|y|on|1)$/i)
-        return false if str == false || str.blank? || str =~ (/^(false|f|no|n|off|0)$/i)
-        raise ArgumentError.new("invalid value for Boolean: \"#{str}\"")
-      end
-      def to_date(str)
-        DateTime.parse(str)
-      rescue ArgumentError
-        Time.at(str.to_i) # try with it as an int string of millis
-      end
-    end
-  end
-end

data/lib/core/filtering/validation_job_runner.rb DELETED Viewed

@@ -1,53 +0,0 @@
-#
-# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-# or more contributor license agreements. Licensed under the Elastic License;
-# you may not use this file except in compliance with the Elastic License.
-#
-# frozen_string_literal: true
-require 'connectors/connector_status'
-require 'connectors/registry'
-module Core
-  module Filtering
-    DEFAULT_DOMAIN = 'DEFAULT'
-    class ValidationJobRunner
-      def initialize(connector_settings)
-        @connector_settings = connector_settings
-        @connector_class = Connectors::REGISTRY.connector_class(connector_settings.service_type)
-        @validation_finished = false
-        @status = { :error => nil }
-      end
-      def execute
-        Utility::Logger.info("Starting a validation job for connector #{@connector_settings.id}.")
-        validation_result = @connector_class.validate_filtering(@connector_settings.filtering[:draft])
-        # currently only used for connectors -> DEFAULT domain can be assumed (will be changed with the integration of crawler)
-        ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_result })
-        @validation_finished = true
-      rescue StandardError => e
-        Utility::ExceptionTracking.log_exception(e)
-        validation_failed_result = { :state => Core::Filtering::ValidationStatus::INVALID,
-                                     :errors => [
-                                       { :ids => [], :messages => ['Unknown problem occurred while validating, see logs for details.'] }
-                                     ] }
-        ElasticConnectorActions.update_filtering_validation(@connector_settings.id, { DEFAULT_DOMAIN => validation_failed_result })
-      ensure
-        if !@validation_finished && !@status[:error].present?
-          @status[:error] = 'Validation thread did not finish execution. Check connector logs for more details.'
-        end
-        if @status[:error]
-          Utility::Logger.warn("Failed to validate filtering for connector #{@connector_settings.id} with error '#{@status[:error]}'.")
-        else
-          Utility::Logger.info("Successfully validated filtering for connector #{@connector_settings.id}.")
-        end
-      end
-    end
-  end
-end

data/lib/core/filtering/validation_status.rb DELETED Viewed

@@ -1,17 +0,0 @@
-#
-# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-# or more contributor license agreements. Licensed under the Elastic License;
-# you may not use this file except in compliance with the Elastic License.
-#
-# frozen_string_literal: true
-module Core
-  module Filtering
-    class ValidationStatus
-      INVALID = 'invalid'
-      VALID = 'valid'
-      EDITED = 'edited'
-    end
-  end
-end

data/lib/core/filtering.rb DELETED Viewed

@@ -1,17 +0,0 @@
-#
-# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-# or more contributor license agreements. Licensed under the Elastic License;
-# you may not use this file except in compliance with the Elastic License.
-#
-# frozen_string_literal: true
-require 'core/filtering/post_process_engine'
-require 'core/filtering/post_process_result'
-require 'core/filtering/simple_rule'
-require 'core/filtering/validation_job_runner'
-require 'core/filtering/validation_status'
-module Core::Filtering
-  DEFAULT_DOMAIN = 'DEFAULT'
-end

data/lib/core/ingestion/es_sink.rb DELETED Viewed

@@ -1,118 +0,0 @@
-#
-# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
-# or more contributor license agreements. Licensed under the Elastic License;
-# you may not use this file except in compliance with the Elastic License.
-#
-# frozen_string_literal: true
-require 'app/config'
-require 'utility/bulk_queue'
-require 'utility/es_client'
-require 'utility/logger'
-require 'elasticsearch/api'
-#
-# This class is responsible for sending the data to the data storage.
-# While we don't actually allow to output our data anywhere except
-# Elasticsearch, we still want to be able to do so sometime in future.
-#
-# This class should stay simple and any change to the class should be careful
-# with the thought of introducing other sinks in future.
-module Core
-  module Ingestion
-    class EsSink
-      def initialize(index_name, request_pipeline, bulk_queue = Utility::BulkQueue.new, max_allowed_document_size = 5 * 1024 * 1024)
-        @client = Utility::EsClient.new(App::Config[:elasticsearch])
-        @index_name = index_name
-        @request_pipeline = request_pipeline
-        @operation_queue = bulk_queue
-        @max_allowed_document_size = max_allowed_document_size
-        @queued = {
-          :indexed_document_count => 0,
-          :deleted_document_count => 0,
-          :indexed_document_volume => 0
-        }
-        @completed = {
-          :indexed_document_count => 0,
-          :deleted_document_count => 0,
-          :indexed_document_volume => 0
-        }
-      end
-      def ingest(document)
-        if document.nil? || document.empty?
-          Utility::Logger.warn('Connector attempted to ingest an empty document, skipping')
-          return
-        end
-        id = document['id']
-        serialized_document = serialize(document)
-        document_size = serialized_document.bytesize
-        if @max_allowed_document_size > 0 && document_size > @max_allowed_document_size
-          Utility::Logger.warn("Connector attempted to ingest too large document with id=#{document['id']} [#{document_size}/#{@max_allowed_document_size}], skipping the document.")
-          return
-        end
-        index_op = serialize({ 'index' => { '_index' => @index_name, '_id' => id } })
-        flush unless @operation_queue.will_fit?(index_op, serialized_document)
-        @operation_queue.add(
-          index_op,
-          serialized_document
-        )
-        @queued[:indexed_document_count] += 1
-        @queued[:indexed_document_volume] += document_size
-      end
-      def ingest_multiple(documents)
-        documents.each { |doc| ingest(doc) }
-      end
-      def delete(id)
-        return if id.nil?
-        delete_op = serialize({ 'delete' => { '_index' => @index_name, '_id' => id } })
-        flush unless @operation_queue.will_fit?(delete_op)
-        @operation_queue.add(delete_op)
-        @queued[:deleted_document_count] += 1
-      end
-      def delete_multiple(ids)
-        ids.each { |id| delete(id) }
-      end
-      def flush
-        data = @operation_queue.pop_all
-        return if data.empty?
-        @client.bulk(:body => data, :pipeline => @request_pipeline)
-        @completed[:indexed_document_count] += @queued[:indexed_document_count]
-        @completed[:deleted_document_count] += @queued[:deleted_document_count]
-        @completed[:indexed_document_volume] += @queued[:indexed_document_volume]
-        @queued[:indexed_document_count] = 0
-        @queued[:deleted_document_count] = 0
-        @queued[:indexed_document_volume] = 0
-      end
-      def ingestion_stats
-        @completed.dup
-      end
-      private
-      def serialize(document)
-        Elasticsearch::API.serializer.dump(document)
-      end
-    end
-  end
-end