connectors_service 8.6.0.4.pre.20221114T233727Z → 8.6.0.4.pre.20221116T024501Z

Files changed (33)
  1. checksums.yaml +4 -4
  2. data/config/connectors.yml +4 -4
  3. data/lib/app/app.rb +4 -0
  4. data/lib/app/dispatcher.rb +30 -17
  5. data/lib/connectors/base/advanced_snippet_against_schema_validator.rb +173 -0
  6. data/lib/connectors/base/advanced_snippet_validator.rb +34 -0
  7. data/lib/connectors/base/connector.rb +27 -5
  8. data/lib/connectors/example/connector.rb +3 -12
  9. data/lib/connectors/example/example_advanced_snippet_validator.rb +35 -0
  10. data/lib/connectors/gitlab/connector.rb +3 -12
  11. data/lib/connectors/gitlab/gitlab_advanced_snippet_validator.rb +35 -0
  12. data/lib/connectors/mongodb/connector.rb +9 -24
  13. data/lib/connectors/mongodb/mongo_advanced_snippet_against_schema_validator.rb +22 -0
  14. data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb +292 -0
  15. data/lib/connectors/sync_status.rb +6 -1
  16. data/lib/connectors/tolerable_error_helper.rb +43 -0
  17. data/lib/core/connector_job.rb +96 -23
  18. data/lib/core/connector_settings.rb +29 -6
  19. data/lib/core/elastic_connector_actions.rb +77 -55
  20. data/lib/core/filtering/validation_job_runner.rb +1 -1
  21. data/lib/core/ingestion/es_sink.rb +68 -9
  22. data/lib/core/ingestion.rb +0 -1
  23. data/lib/core/jobs/consumer.rb +114 -0
  24. data/lib/core/jobs/producer.rb +26 -0
  25. data/lib/core/single_scheduler.rb +1 -1
  26. data/lib/core/sync_job_runner.rb +20 -12
  27. data/lib/core.rb +2 -0
  28. data/lib/utility/error_monitor.rb +108 -0
  29. data/lib/utility/errors.rb +0 -12
  30. data/lib/utility/logger.rb +0 -1
  31. data/lib/utility.rb +6 -0
  32. metadata +12 -3
  33. data/lib/core/ingestion/ingester.rb +0 -90

data/lib/connectors/mongodb/mongo_advanced_snippet_schema.rb
@@ -0,0 +1,292 @@
+ #
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ # or more contributor license agreements. Licensed under the Elastic License;
+ # you may not use this file except in compliance with the Elastic License.
+ #
+
+ # frozen_string_literal: true
+
+ module Connectors
+   module MongoDB
+     module AdvancedSnippet
+       # Pipeline stages: https://www.mongodb.com/docs/manual/reference/operator/aggregation-pipeline/
+       ALLOWED_PIPELINE_STAGES = %w[
+         $addFields $bucket $bucketAuto $changeStream $collStats $count $densify
+         $documents $facet $fill $geoNear $graphLookup $group $indexStats $limit
+         $listSessions $lookup $match $merge $out $planCacheStats $project $redact
+         $replaceRoot $replaceWith $sample $search $searchMeta $set $setWindowFields
+         $skip $sort $sortByCount $unionWith $unset $unwind
+       ]
+
+       # All except the $out, $merge, $geoNear, and $changeStream stages can appear multiple times in a pipeline.
+       # Source: https://www.mongodb.com/docs/manual/reference/operator/aggregation-pipeline/
+       PIPELINE_STAGES_ALLOWED_ONCE = %w[$out $merge $geoNear $changeStream]
+
+       NON_NEGATIVE_INTEGER = ->(value) { value.is_a?(Integer) && value >= 0 }
+       READ_CONCERN_LEVEL = ->(level) { %w[local available majority linearizable].include?(level) }
+       STRING_OR_DOCUMENT = ->(value) { value.is_a?(Hash) || value.is_a?(String) }
+       MUTUAL_EXCLUSIVE_FILTER = ->(fields) { fields.size <= 1 }
+
+       AGGREGATION_PIPELINE = lambda { |pipeline|
+         return false unless pipeline.is_a?(Array)
+
+         allowed_once_appearances = Set.new
+
+         pipeline.flat_map(&:keys).each do |key|
+           return false unless ALLOWED_PIPELINE_STAGES.include?(key)
+
+           if PIPELINE_STAGES_ALLOWED_ONCE.include?(key)
+             return false if allowed_once_appearances.include?(key)
+
+             allowed_once_appearances.add(key)
+           end
+         end
+
+         true
+       }
+
+       # Ruby has no 'Boolean' class
+       BOOLEAN = ->(value) { value.is_a?(TrueClass) || value.is_a?(FalseClass) }
+
+       COLLATION = {
+         :name => 'collation',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           {
+             :name => 'locale',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'caseLevel',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'caseFirst',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'strength',
+             :type => Integer,
+             :optional => true
+           },
+           {
+             :name => 'numericOrdering',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'alternate',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'maxVariable',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'backwards',
+             :type => BOOLEAN,
+             :optional => true
+           },
+         ]
+       }
+
+       CURSOR_TYPE = ->(cursor) { [:tailable, :tailable_await].include?(cursor) }
+
+       # Aggregate options: https://www.mongodb.com/docs/manual/reference/method/db.collection.aggregate/
+       AGGREGATE_OPTIONS = {
+         :name => 'options',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           {
+             :name => 'explain',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'allowDiskUse',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'cursor',
+             :type => Hash,
+             :optional => true,
+             :fields => [
+               {
+                 :name => 'batchSize',
+                 :type => NON_NEGATIVE_INTEGER
+               }
+             ]
+           },
+           {
+             :name => 'maxTimeMS',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'bypassDocumentValidation',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'readConcern',
+             :type => Hash,
+             :optional => true,
+             :fields => [
+               {
+                 :name => 'level',
+                 :type => READ_CONCERN_LEVEL
+               }
+             ]
+           },
+           COLLATION,
+           {
+             :name => 'hint',
+             :type => STRING_OR_DOCUMENT,
+             :optional => true
+           },
+           {
+             :name => 'comment',
+             :type => String,
+             :optional => true
+           },
+           {
+             :name => 'writeConcern',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'let',
+             :type => Hash,
+             :optional => true
+           }
+         ]
+       }
+
+       AGGREGATE_PIPELINE = {
+         :name => 'pipeline',
+         :type => AGGREGATION_PIPELINE,
+         :optional => true,
+       }
+
+       AGGREGATE = {
+         :name => 'aggregate',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           AGGREGATE_PIPELINE,
+           AGGREGATE_OPTIONS
+         ]
+       }
+
+       FIND_OPTIONS = {
+         :name => 'options',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           {
+             :name => 'allowDiskUse',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'allowPartialResults',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'batchSize',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           COLLATION,
+           {
+             :name => 'cursorType',
+             :type => CURSOR_TYPE,
+             :optional => true
+           },
+           {
+             :name => 'limit',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'maxTimeMS',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'modifiers',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'noCursorTimeout',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'oplogReplay',
+             :type => BOOLEAN,
+             :optional => true
+           },
+           {
+             :name => 'projection',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'skip',
+             :type => NON_NEGATIVE_INTEGER,
+             :optional => true
+           },
+           {
+             :name => 'sort',
+             :type => Hash,
+             :optional => true
+           },
+           {
+             :name => 'let',
+             :type => Hash,
+             :optional => true
+           }
+         ]
+       }
+
+       # TODO: return true for now. Will be more involved (basically needs full query parsing or "dummy" execution against a running instance)
+       FILTER = ->(_filter) { true }
+
+       FIND_FILTER = {
+         :name => 'filter',
+         :type => FILTER
+       }
+
+       FIND = {
+         :name => 'find',
+         :type => Hash,
+         :optional => true,
+         :fields => [
+           FIND_OPTIONS,
+           FIND_FILTER
+         ]
+       }
+
+       SCHEMA = {
+         :fields => {
+           :constraints => MUTUAL_EXCLUSIVE_FILTER,
+           :values => [
+             AGGREGATE,
+             FIND
+           ]
+         }
+       }
+     end
+   end
+ end
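
For orientation, a minimal advanced snippet that this schema is intended to accept could look like the sketch below. The payload keys mirror the field names declared above, and SCHEMA's MUTUAL_EXCLUSIVE_FILTER constraint means a snippet may carry either find or aggregate, but not both. This is a hypothetical payload, not taken from the gem's tests; the actual validation entry point lives in mongo_advanced_snippet_against_schema_validator.rb (not shown in this diff).

# Hypothetical payload sketch; keys follow the schema's field names.
advanced_snippet = {
  'find' => {
    'filter' => { 'genre' => 'fantasy' },          # FILTER accepts anything for now (see TODO above)
    'options' => { 'limit' => 100, 'skip' => 0 }   # both validated by NON_NEGATIVE_INTEGER
  }
}
# An aggregate-based snippet would use the 'aggregate' key instead, with a
# 'pipeline' array checked by AGGREGATION_PIPELINE against ALLOWED_PIPELINE_STAGES.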

data/lib/connectors/sync_status.rb
@@ -26,11 +26,16 @@ module Connectors
      ERROR
    ]
 
-   PENDING_STATUES = [
+   PENDING_STATUSES = [
      PENDING,
      SUSPENDED
    ]
 
+   ACTIVE_STATUSES = [
+     IN_PROGRESS,
+     CANCELING
+   ]
+
    TERMINAL_STATUSES = [
      CANCELED,
      COMPLETED,
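
Taken together, the three constant groups partition the job lifecycle. A sketch of the resulting sets, assuming the usual connectors-protocol status strings:

Connectors::SyncStatus::PENDING_STATUSES   # waiting to be claimed: pending, suspended
Connectors::SyncStatus::ACTIVE_STATUSES    # being driven by a worker: in_progress, canceling
Connectors::SyncStatus::TERMINAL_STATUSES  # finished: canceled, completed, error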

data/lib/connectors/tolerable_error_helper.rb
@@ -0,0 +1,43 @@
+ #
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ # or more contributor license agreements. Licensed under the Elastic License;
+ # you may not use this file except in compliance with the Elastic License.
+ #
+
+ require 'utility/logger'
+ require 'utility/exception_tracking'
+ require 'utility/error_monitor'
+
+ module Connectors
+   class TolerableErrorHelper
+     def initialize(error_monitor)
+       @error_monitor = error_monitor
+     end
+
+     def yield_single_document(identifier: nil)
+       Utility::Logger.debug("Extracting single document for #{identifier}") if identifier
+       yield
+       @error_monitor.note_success
+     rescue *fatal_exception_classes => e
+       Utility::ExceptionTracking.augment_exception(e)
+       Utility::Logger.error("Encountered a fall-through error during extraction#{identifying_error_message(identifier)}: #{e.class}: #{e.message} {:message_id => #{e.id}}")
+       raise
+     rescue StandardError => e
+       Utility::ExceptionTracking.augment_exception(e)
+       Utility::Logger.warn("Encountered error during extraction#{identifying_error_message(identifier)}: #{e.class}: #{e.message} {:message_id => #{e.id}}")
+       @error_monitor.note_error(e, :id => e.id)
+     end
+
+     private
+
+     def identifying_error_message(identifier)
+       identifier.present? ? " of '#{identifier}'" : ''
+     end
+
+     def fatal_exception_classes
+       [
+         Utility::ErrorMonitor::MonitoringError
+       ]
+     end
+   end
+ end
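
A plausible call site, sketched from the API above: a connector wraps per-document extraction in yield_single_document so that an individual failure is logged and counted by the error monitor, while MonitoringError (the monitor's signal that the error threshold was exceeded, raised from note_error) falls through the fatal-class rescue and aborts the whole sync. The loop and serialize call are illustrative, not the gem's actual connector code:

helper = Connectors::TolerableErrorHelper.new(error_monitor)

raw_docs.each do |raw_doc|
  helper.yield_single_document(identifier: raw_doc[:id]) do
    yield_doc(serialize(raw_doc)) # one bad document is tolerated and counted
  end
end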

data/lib/core/connector_job.rb
@@ -15,18 +15,22 @@ module Core
    class ConnectorJob
      DEFAULT_PAGE_SIZE = 100
 
-     # Error Classes
-     class ConnectorJobNotFoundError < StandardError; end
-
      def self.fetch_by_id(job_id)
        es_response = ElasticConnectorActions.get_job(job_id)
+       return nil unless es_response[:found]
 
-       raise ConnectorJobNotFoundError.new("Connector job with id=#{job_id} was not found.") unless es_response[:found]
        new(es_response)
      end
 
-     def self.pending_jobs(page_size = DEFAULT_PAGE_SIZE)
-       query = { terms: { status: Connectors::SyncStatus::PENDING_STATUES } }
+     def self.pending_jobs(connectors_ids: [], page_size: DEFAULT_PAGE_SIZE)
+       status_term = { status: Connectors::SyncStatus::PENDING_STATUSES }
+
+       query = { bool: { must: [{ terms: status_term }] } }
+
+       return fetch_jobs_by_query(query, page_size) if connectors_ids.empty?
+
+       query[:bool][:must] << { terms: { 'connector.id' => connectors_ids } }
+
        fetch_jobs_by_query(query, page_size)
      end
 
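The reworked pending_jobs composes both conditions under bool/must instead of a bare terms query. For pending_jobs(connectors_ids: ['id-1']), the query it builds would be the following (status strings assumed per the connectors protocol):

{
  bool: {
    must: [
      { terms: { status: %w[pending suspended] } },
      { terms: { 'connector.id' => ['id-1'] } }
    ]
  }
}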

@@ -50,6 +54,10 @@ module Core
        @elasticsearch_response[:_source][property_name]
      end
 
+     def error
+       self[:error]
+     end
+
      def status
        self[:status]
      end
@@ -62,16 +70,36 @@ module Core
        status == Connectors::SyncStatus::CANCELING
      end
 
+     def suspended?
+       status == Connectors::SyncStatus::SUSPENDED
+     end
+
+     def canceled?
+       status == Connectors::SyncStatus::CANCELED
+     end
+
+     def pending?
+       Connectors::SyncStatus::PENDING_STATUSES.include?(status)
+     end
+
+     def active?
+       Connectors::SyncStatus::ACTIVE_STATUSES.include?(status)
+     end
+
+     def terminated?
+       Connectors::SyncStatus::TERMINAL_STATUSES.include?(status)
+     end
+
      def connector_snapshot
-       self[:connector]
+       self[:connector] || {}
      end
 
      def connector_id
-       connector_snapshot[:id]
+       @elasticsearch_response[:_source][:connector][:id]
      end
 
      def index_name
-       connector_snapshot[:configuration]
+       connector_snapshot[:index_name]
      end
 
      def language
@@ -91,33 +119,51 @@ module Core
      end
 
      def pipeline
-       connector_snapshot[:pipeline]
+       @elasticsearch_response[:_source][:pipeline]
      end
 
      def connector
        @connector ||= ConnectorSettings.fetch_by_id(connector_id)
      end
 
-     def reload_connector!
-       @connector = nil
-       connector
+     def done!(ingestion_stats = {}, connector_metadata = {})
+       terminate!(Connectors::SyncStatus::COMPLETED, nil, ingestion_stats, connector_metadata)
      end
 
-     def reload
-       es_response = ElasticConnectorActions.get_job(id)
-       raise ConnectorJobNotFoundError.new("Connector job with id=#{id} was not found.") unless es_response[:found]
-       # TODO: remove the usage of with_indifferent_access. get_id method is expected to return a hash
-       @elasticsearch_response = es_response.with_indifferent_access
-       @connector = nil
+     def error!(message, ingestion_stats = {}, connector_metadata = {})
+       terminate!(Connectors::SyncStatus::ERROR, message, ingestion_stats, connector_metadata)
      end
 
-     private
+     def cancel!(ingestion_stats = {}, connector_metadata = {})
+       terminate!(Connectors::SyncStatus::CANCELED, nil, ingestion_stats, connector_metadata)
+     end
 
-     def initialize(es_response)
-       # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
-       @elasticsearch_response = es_response.with_indifferent_access
+     def with_concurrency_control
+       response = ElasticConnectorActions.get_job(id)
+
+       yield response, response['_seq_no'], response['_primary_term']
      end
 
+     def make_running!
+       with_concurrency_control do |es_doc, seq_no, primary_term|
+         now = Time.now
+         doc = {
+           status: Connectors::SyncStatus::IN_PROGRESS,
+           started_at: now,
+           last_seen: now,
+           worker_hostname: Socket.gethostname
+         }
+
+         ElasticConnectorActions.update_job_fields(es_doc[:_id], doc, seq_no, primary_term)
+       end
+     end
+
+     def es_source
+       @elasticsearch_response[:_source]
+     end
+
+     private
+
      def self.fetch_jobs_by_query(query, page_size)
        results = []
        offset = 0
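
make_running! leans on Elasticsearch optimistic concurrency control: it reads the job under with_concurrency_control and hands the captured _seq_no/_primary_term to update_job_fields, so a stale claim fails with a version conflict instead of silently overwriting another worker's update. A sketch of a caller handling that race; the exact conflict class is an assumption and depends on the Elasticsearch client version in use:

begin
  job.make_running!
rescue Elastic::Transport::Transport::Errors::Conflict
  # seq_no/primary_term no longer match: another worker claimed the job first.
  Utility::Logger.info("Job #{job.id} was claimed by another worker, skipping")
end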
@@ -133,5 +179,32 @@ module Core
 
        results
      end
+
+     def initialize(es_response)
+       # TODO: remove the usage of with_indifferent_access. The initialize method should expect a hash argument
+       @elasticsearch_response = es_response.with_indifferent_access
+     end
+
+     def terminate!(status, error = nil, ingestion_stats = {}, connector_metadata = {})
+       ingestion_stats ||= {}
+       ingestion_stats[:total_document_count] = ElasticConnectorActions.document_count(index_name)
+       doc = {
+         :last_seen => Time.now,
+         :completed_at => Time.now,
+         :status => status,
+         :error => error
+       }.merge(ingestion_stats)
+       doc[:canceled_at] = Time.now if status == Connectors::SyncStatus::CANCELED
+       doc[:metadata] = connector_metadata if connector_metadata&.any?
+       ElasticConnectorActions.update_job_fields(id, doc)
+     end
+
+     def seq_no
+       @elasticsearch_response[:_seq_no]
+     end
+
+     def primary_term
+       @elasticsearch_response[:_primary_term]
+     end
    end
  end
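
All three public terminal transitions funnel into the private terminate!, which stamps timing fields, merges the ingestion stats, and persists via update_job_fields. A sketch of how a sync runner might finish a job; the stat keys are illustrative (they match the counters read back in connector_settings.rb below):

stats = { :indexed_document_count => 42, :deleted_document_count => 3 }

if job.canceling?
  job.cancel!(stats)             # status canceled; terminate! also stamps canceled_at
elsif sync_error
  job.error!(sync_error, stats)  # status error; the message lands in :error
else
  job.done!(stats)               # status completed
end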

data/lib/core/connector_settings.rb
@@ -8,7 +8,6 @@
 
  require 'active_support/core_ext/hash/indifferent_access'
  require 'connectors/connector_status'
- require 'connectors/registry'
  require 'core/elastic_connector_actions'
  require 'utility'
 
@@ -24,18 +23,16 @@ module Core
 
      DEFAULT_PAGE_SIZE = 100
 
-     # Error Classes
-     class ConnectorNotFoundError < StandardError; end
-
      def self.fetch_by_id(connector_id)
        es_response = ElasticConnectorActions.get_connector(connector_id)
-       connectors_meta = ElasticConnectorActions.connectors_meta
+       return nil unless es_response[:found]
 
-       raise ConnectorNotFoundError.new("Connector with id=#{connector_id} was not found.") unless es_response[:found]
+       connectors_meta = ElasticConnectorActions.connectors_meta
        new(es_response, connectors_meta)
      end
 
      def self.fetch_native_connectors(page_size = DEFAULT_PAGE_SIZE)
+       require 'connectors/registry' unless defined?(Connectors::REGISTRY)
        query = {
          bool: {
            filter: [
@@ -122,6 +119,32 @@ module Core
        index_name&.start_with?(Utility::Constants::CONTENT_INDEX_PREFIX)
      end
 
+     def ready_for_sync?
+       Connectors::REGISTRY.registered?(service_type) &&
+         valid_index_name? &&
+         connector_status_allows_sync?
+     end
+
+     def running?
+       @elasticsearch_response[:_source][:last_sync_status] == Connectors::SyncStatus::IN_PROGRESS
+     end
+
+     def update_last_sync!(job)
+       doc = {
+         :last_sync_status => job.status,
+         :last_synced => Time.now,
+         :last_sync_error => job.error,
+         :error => job.error
+       }
+
+       if job.terminated?
+         doc[:last_indexed_document_count] = job[:indexed_document_count]
+         doc[:last_deleted_document_count] = job[:deleted_document_count]
+       end
+
+       Core::ElasticConnectorActions.update_connector_fields(job.connector_id, doc)
+     end
+
      private
 
      def initialize(es_response, connectors_meta)
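
After a job reaches a terminal status, update_last_sync! mirrors the outcome onto the connector document, and the document counters are copied only when job.terminated? holds. Illustrative resulting fields for a successful sync (values are examples, status string assumed):

{
  :last_sync_status => 'completed',
  :last_synced => Time.now,
  :last_sync_error => nil,
  :error => nil,
  :last_indexed_document_count => 42,  # written only when job.terminated?
  :last_deleted_document_count => 3
}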