dataflow-rb 0.9.0

Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.env.test.example +6 -0
  3. data/.gitignore +14 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +46 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/dataflow-rb.gemspec +42 -0
  13. data/lib/config/mongoid.yml +21 -0
  14. data/lib/dataflow/adapters/csv_adapter.rb +123 -0
  15. data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
  16. data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
  17. data/lib/dataflow/adapters/psql_adapter.rb +21 -0
  18. data/lib/dataflow/adapters/settings.rb +33 -0
  19. data/lib/dataflow/adapters/sql_adapter.rb +322 -0
  20. data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
  21. data/lib/dataflow/errors/not_implemented_error.rb +7 -0
  22. data/lib/dataflow/event_mixin.rb +77 -0
  23. data/lib/dataflow/extensions/mongo_driver.rb +21 -0
  24. data/lib/dataflow/extensions/msgpack.rb +19 -0
  25. data/lib/dataflow/logger.rb +27 -0
  26. data/lib/dataflow/node.rb +37 -0
  27. data/lib/dataflow/nodes/compute_node.rb +495 -0
  28. data/lib/dataflow/nodes/data_node.rb +331 -0
  29. data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
  30. data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
  31. data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
  32. data/lib/dataflow/nodes/filter/where_node.rb +44 -0
  33. data/lib/dataflow/nodes/join_node.rb +151 -0
  34. data/lib/dataflow/nodes/map_node.rb +50 -0
  35. data/lib/dataflow/nodes/merge_node.rb +33 -0
  36. data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
  37. data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
  38. data/lib/dataflow/nodes/select_keys_node.rb +39 -0
  39. data/lib/dataflow/nodes/snapshot_node.rb +77 -0
  40. data/lib/dataflow/nodes/sql_query_node.rb +50 -0
  41. data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
  42. data/lib/dataflow/nodes/upsert_node.rb +68 -0
  43. data/lib/dataflow/properties_mixin.rb +35 -0
  44. data/lib/dataflow/schema_mixin.rb +134 -0
  45. data/lib/dataflow/version.rb +4 -0
  46. data/lib/dataflow-rb.rb +72 -0
  47. metadata +371 -0
@@ -0,0 +1,33 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     class Settings
+       attr_accessor :connection_uri, :db_name, :indexes, :adapter_type,
+                     :dataset_name, :read_dataset_name, :write_dataset_name, :schema
+
+       def initialize(data_node: nil, connection_uri: nil, db_name: nil,
+                      dataset_name: nil, indexes: nil, adapter_type: nil, schema: nil)
+         @connection_uri = connection_uri
+
+         # first, try to set the options based on the data node's settings
+         if data_node.present?
+           @db_name = data_node.db_name
+           @dataset_name = data_node.name
+           @read_dataset_name = data_node.read_dataset_name
+           @write_dataset_name = data_node.write_dataset_name
+           @indexes = data_node.indexes
+           @schema = data_node.schema
+         end
+
+         # override if needed
+         @db_name ||= db_name
+         @dataset_name ||= dataset_name
+         @read_dataset_name ||= dataset_name
+         @write_dataset_name ||= dataset_name
+         @indexes ||= indexes
+         @adapter_type ||= adapter_type
+         @schema ||= schema
+       end
+     end
+   end
+ end
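
For context, a minimal sketch of constructing these settings directly, without a data node (the values below are hypothetical, not taken from the gem):

    settings = Dataflow::Adapters::Settings.new(
      db_name: 'dataflow_test',        # hypothetical database name
      dataset_name: 'users',
      adapter_type: 'postgresql',
      indexes: [{ 'key' => ['id'], 'unique' => true }]
    )
    settings.read_dataset_name   # => 'users' (falls back to dataset_name)
    settings.write_dataset_name  # => 'users'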
@@ -0,0 +1,322 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     # Interface between a data node and a SQL database.
+     # We use Sequel to perform all the store/retrieve operations.
+     class SqlAdapter
+       class << self
+         # Get (or create) a client that satisfies the given connection settings.
+         # @param settings [Hash] Represents the connection settings to the DB.
+         # @param db_name [String] The database name to which the client will connect.
+         # @return [Sequel::Database] a sequel database object.
+         def client(settings, db_name: nil)
+           @clients ||= {}
+
+           case settings.adapter_type
+           when 'mysql2'
+             host = ENV['MOJACO_MYSQL_ADDRESS'] || '127.0.0.1'
+             port = ENV['MOJACO_MYSQL_PORT'] || '3306'
+             user = ENV['MOJACO_MYSQL_USER']
+             password = ENV['MOJACO_MYSQL_PASSWORD']
+           when 'postgresql'
+             host = ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
+             port = ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
+             user = ENV['MOJACO_POSTGRESQL_USER'] || 'eurico'
+             password = ENV['MOJACO_POSTGRESQL_PASSWORD'] || 'eurico'
+           end
+
+           db_name ||= settings.db_name
+           user_password = user
+           user_password += ":#{password}" if password.present?
+
+           uri = "#{settings.adapter_type}://#{user_password}@#{host}:#{port}"
+           connection_uri = settings.connection_uri || "#{uri}/#{db_name}"
+
+           return @clients[connection_uri] if @clients[connection_uri].present?
+
+           # first, make sure the DB is created (if it is not an external db)
+           is_external_db = settings.connection_uri.present?
+           try_create_db(uri, db_name, user, password) unless is_external_db
+
+           # then, create the connection object
+           @clients[connection_uri] ||= Sequel.connect("#{connection_uri}?encoding=utf8")
+         end
+
+         # Used internally to try to create the DB automatically.
+         # @param uri [String] the connection uri to the DB.
+         # @param db_name [String] the database name.
+         # @param user [String] the database user.
+         # @param password [String] the database password.
+         # @return [Boolean] whether the db was created or not.
+         def try_create_db(uri, db_name, user, password)
+           Sequel.connect(uri, user: user, password: password) do |db|
+             db.run("CREATE DATABASE #{db_name}")
+             true
+           end
+         rescue Sequel::DatabaseError
+           # ignore the error (e.g. the database already exists)
+           false
+         end
+
+         # Force the clients to disconnect their connections.
+         # Use before forking.
+         def disconnect_clients
+           @clients ||= {}
+           @clients.values.each(&:disconnect)
+         end
+       end
+
+       SYSTEM_ID = :_id
+
+       attr_reader :settings
+       attr_reader :client
+
+       def initialize(args)
+         update_settings(args)
+         @client = SqlAdapter.client(settings)
+         @schema = settings.schema || [] # TODO: detect if the table schema has a mismatch
+       end
+
+       def update_settings(args)
+         @settings = Dataflow::Adapters::Settings.new(args)
+       end
+
+       def set_schema(schema)
+         @schema = schema
+       end
+
+       # retrieve a single element from a data node
+       def find(where: {}, fields: [], sort: {}, offset: 0)
+         all(where: where, fields: fields, sort: sort, offset: offset, limit: 1).first
+       end
+
+       # retrieve all elements from a data node
+       def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+         res = client[settings.read_dataset_name.to_sym]
+
+         # if no fields are given, automatically
+         # select all the fields except the system _id
+         fields = res.columns.reject { |x| x == SYSTEM_ID } if fields.blank?
+
+         res = res.select(*fields.map(&:to_sym)) if fields.present?
+         res = apply_query(res, where)
+
+         (sort || {}).each do |k, v|
+           sort_value = v == 1 ? k.to_sym : Sequel.desc(k.to_sym)
+           res = res.order(sort_value)
+         end
+
+         res = res.offset(offset) if offset > 0
+         res = res.limit(limit) if limit > 0
+
+         if block_given?
+           yield res
+         else
+           res.to_a
+         end
+       end
+
+       # Create queries that permit processing the whole dataset in parallel without using offsets.
+       def ordered_system_id_queries(batch_size:)
+         ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
+         queries_count = (ids.size / batch_size.to_f).ceil
+         Array.new(queries_count) do |i|
+           from = ids[i * batch_size]
+           to = ids[(i + 1) * batch_size] || ids[-1]
+           is_last = i == queries_count - 1
+
+           where_query = { SYSTEM_ID => { '>=' => from } }
+           operator = is_last ? '<=' : '<'
+           where_query[SYSTEM_ID][operator] = to
+
+           where_query
+         end
+       end
+
+       # count the number of records
+       def count(where: {})
+         res = client[settings.read_dataset_name.to_sym]
+         res = apply_query(res, where)
+         res.count
+       rescue Sequel::DatabaseError
+         0
+       end
+
+       # Save the given records
+       # TODO: support :replace_by parameter
+       def save(records:)
+         dataset = client[settings.write_dataset_name.to_sym]
+         columns = dataset.columns.reject { |x| x == SYSTEM_ID }
+
+         tabular_data = records.map do |record|
+           columns.map { |col| record[col] }
+         end
+
+         dataset.insert_ignore.import(columns, tabular_data)
+       end
+
+       # Delete records that match the options.
+       # @param where query to apply on the delete operation.
+       # @note this deletes on the read dataset,
+       #   i.e. changes are seen immediately in the case of double-buffered datasets
+       def delete(where: {})
+         res = client[settings.read_dataset_name.to_sym]
+         res = apply_query(res, where)
+         res.delete
+       end
+
+       # recreate the table/collection
+       def recreate_dataset(dataset: nil)
+         dataset ||= settings.write_dataset_name.to_sym
+         client.drop_table?(dataset)
+
+         unless @schema.present?
+           p 'WARNING: recreate dataset aborted: no schema'
+           return
+         end
+
+         create_table(dataset, @schema)
+       end
+
+       # Create the indexes on this dataset.
+       # @param dataset [String] Specify on which dataset the operation will be performed.
+       #   Default: the adapter's settings' dataset.
+       # @param type [Symbol] select which indexes type to create.
+       #   Can be :all (default), :unique_only, :non_unique_only.
+       # TODO: add support for a :drop_retry_on_error parameter.
+       def create_indexes(dataset: nil, type: :all)
+         dataset ||= settings.write_dataset_name
+         dataset = dataset.to_sym
+         indexes = (settings.indexes || [])
+
+         case type
+         when :unique_only
+           indexes = indexes.select { |idx| idx['unique'] }
+         when :non_unique_only
+           indexes = indexes.reject { |idx| idx['unique'] }
+         end
+
+         indexes.each do |index|
+           params = index_parameters(index)
+
+           begin
+             client.add_index(dataset, *params)
+           rescue Sequel::DatabaseError => e
+             # ignore if the index already exists
+             raise e unless e.wrapped_exception.is_a?(PG::DuplicateTable)
+           end
+         end
+       end
+
+       def usage(dataset:)
+         indexes = retrieve_collection_indexes(dataset)
+         table_usage = fetch_table_usage(dataset: dataset)
+         table_usage.merge(effective_indexes: indexes)
+       end
+
+       private
+
+       MAX_INT = 2_147_483_647
+       MAX_VARCHAR = 255
+
+       def create_table(dataset, schema)
+         client.create_table(dataset.to_sym) do
+           # always add an _id field to be used internally
+           primary_key SYSTEM_ID
+
+           schema.each do |column, info|
+             type = info[:type]
+             max_size = info[:max] || info.dig(:types, type, :max)
+
+             case type
+             when 'object', 'string'
+               max_size ||= info.dig(:types, 'string', :max) || MAX_VARCHAR + 1
+               col_type = if max_size <= MAX_VARCHAR
+                            "varchar(#{max_size})"
+                          else
+                            'text'
+                          end
+             when 'time'
+               col_type = 'timestamp'
+             when 'integer'
+               max_size ||= MAX_INT + 1
+               col_type = if max_size <= MAX_INT
+                            'integer'
+                          else
+                            'bigint'
+                          end
+             when 'numeric'
+               col_type = 'real'
+             when 'array', 'hash'
+               p "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
+               col_type = 'json'
+             else
+               p "Error: unexpected type '#{type}'. Keeping as-is."
+               col_type = type
+             end
+
+             # create a column with the given type
+             p "#{column} #{type} -> #{col_type}"
+             column(column.to_sym, col_type)
+           end
+         end
+       end
+
+       def apply_query(res, opts)
+         queries = transform_to_query(opts)
+         queries.each do |query_args|
+           res = res.where(*query_args)
+         end
+         res
+       end
+
+       def transform_to_query(opts)
+         # map to a series of AND-ed clauses
+         opts.flat_map do |k, v|
+           if v.is_a? Hash
+             v.map do |operator, value|
+               case operator
+               when '!='
+                 if value.is_a? Array
+                   ["#{k} NOT IN ?", value]
+                 else
+                   ["#{k} <> ?", value]
+                 end
+               when '<'
+                 ["#{k} < ?", value]
+               when '<='
+                 ["#{k} <= ?", value]
+               when '>'
+                 ["#{k} > ?", value]
+               when '>='
+                 ["#{k} >= ?", value]
+               end
+             end
+           else
+             # e.g. simple matches { 'id' => 1 } or IN clauses { 'id' => [1, 2] }
+             # are supported with simple hashes
+             [[{ k.to_sym => v }]]
+           end
+         end
+       end
+
+       # Required index format for sequel:
+       # :keys, unique: true
+       def index_parameters(index)
+         index = index.with_indifferent_access
+         keys = Array(index[:key]).map(&:to_sym)
+         params = [keys]
+         params << { unique: true } if index[:unique]
+         params
+       end
+
+       def retrieve_collection_indexes(collection)
+         psql_indexes = client.indexes(collection)
+         psql_indexes.values.map do |idx|
+           cols = idx[:columns].map(&:to_s)
+           index = { 'key' => cols }
+           index['unique'] = true if idx[:unique]
+           index
+         end.compact
+       end
+     end
+   end
+ end
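
To illustrate the where-clause format that transform_to_query accepts, a usage sketch (the adapter construction values and field names are hypothetical):

    adapter = Dataflow::Adapters::SqlAdapter.new(
      db_name: 'dataflow_test', dataset_name: 'users', adapter_type: 'postgresql'
    )

    # plain values become equality / IN clauses; nested hashes use the operators above
    a_week_ago = Time.now - 7 * 24 * 3600
    adapter.all(where: { 'status' => %w(active pending),            # status IN (...)
                         'updated_at' => { '>=' => a_week_ago } })  # updated_at >= ?

    # process the whole dataset in batches without OFFSET:
    adapter.ordered_system_id_queries(batch_size: 1000).each do |query|
      batch = adapter.all(where: query)
      # ... process the batch ...
    end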
@@ -0,0 +1,7 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Errors
+     class InvalidConfigurationError < StandardError
+     end
+   end
+ end
@@ -0,0 +1,7 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Errors
+     class NotImplementedError < StandardError
+     end
+   end
+ end
@@ -0,0 +1,77 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module EventMixin
+     extend ActiveSupport::Concern
+
+     module ClassMethods
+       def event(event_name)
+         # re-open the base class
+         handlers_var_name = "@#{event_name}_handlers"
+
+         # Defines a class method called "event_name".
+         # It will serve as a class-level (global) event handler for this class.
+         # @yield (optional) the event handler to add
+         # @return Array the list of event handlers
+         define_singleton_method(event_name) do |&block|
+           handlers = instance_variable_get(handlers_var_name)
+
+           unless handlers
+             handlers = []
+             instance_variable_set(handlers_var_name, [])
+           end
+
+           if block.present?
+             handlers << block
+             instance_variable_set(handlers_var_name, handlers)
+           end
+
+           # return all events from the hierarchy
+           superclass_handlers = []
+           superclass = self.superclass
+           while superclass
+             superclass_handlers += superclass.instance_variable_get(
+               :"@#{event_name}_handlers"
+             ) || []
+             superclass = superclass.superclass
+           end
+
+           handlers + superclass_handlers
+         end
+
+         # Defines a method called "event_name".
+         # It will serve as an instance-level event handler.
+         # @yield (optional) the event handler to add
+         # @return Array the list of event handlers
+         define_method(event_name) do |&block|
+           handlers = instance_variable_get(handlers_var_name)
+
+           unless handlers
+             handlers = []
+             instance_variable_set(handlers_var_name, [])
+           end
+
+           if block.present?
+             handlers << block
+             instance_variable_set(handlers_var_name, handlers)
+           end
+
+           handlers
+         end
+
+         # Defines a way to fire the event: "on_event_name(evt)"
+         # @param *args a variable list of arguments passed to the handlers
+         define_method("on_#{event_name}") do |*args|
+           handlers = send(event_name) + self.class.send(event_name)
+           handlers.each do |handler|
+             begin
+               handler.call(self, *args)
+             rescue StandardError => e
+               @logger&.log("ERROR IN HANDLER [on_#{event_name}]: #{e}")
+               # ignore errors raised by handlers
+             end
+           end
+         end
+       end
+     end
+   end
+ end
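
A sketch of how this mixin is typically wired up (the class and event names below are hypothetical):

    class Job
      include Dataflow::EventMixin
      event :computing_finished
    end

    # class-level handler, shared by every instance
    Job.computing_finished do |job, status|
      puts "#{job.class} finished: #{status}"
    end

    job = Job.new
    job.computing_finished { |_job, status| puts "instance handler: #{status}" }
    job.on_computing_finished('ok')  # calls each handler with (job, 'ok')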
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+ module Mongo
+   class Collection
+     class View
+       attr_reader :cursor
+
+       def initial_query
+         @cursor = nil
+         result = nil
+
+         read_with_retry do
+           server = read.select_server(cluster, false)
+           result = send_initial_query(server)
+           @cursor = Cursor.new(view, result, server)
+         end
+
+         result
+       end
+     end
+   end
+ end
@@ -0,0 +1,19 @@
+ # frozen_string_literal: true
+
+ Time.class_eval do
+   def to_msgpack(out = '')
+     iso8601.to_msgpack(out)
+   end
+ end
+
+ DateTime.class_eval do
+   def to_msgpack(out = '')
+     iso8601.to_msgpack(out)
+   end
+ end
+
+ Date.class_eval do
+   def to_msgpack(out = '')
+     iso8601.to_msgpack(out)
+   end
+ end
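
With these patches, time-like values pack as ISO 8601 strings instead of failing as unsupported types; a quick round-trip sketch (the timestamp shown is hypothetical):

    require 'time'
    require 'msgpack'

    packed = Time.now.to_msgpack
    MessagePack.unpack(packed)  # => "2017-03-05T12:34:56+00:00" (a plain string)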
@@ -0,0 +1,27 @@
+ # frozen_string_literal: true
+ module Dataflow
+   class Logger
+     attr_accessor :prefix
+     attr_accessor :use_notifications
+
+     def initialize(prefix:, use_notifications: false)
+       @prefix = prefix
+       @use_notifications = use_notifications
+       @@impl = LoggerImpl.new
+     end
+
+     def log(str)
+       return if ENV['RACK_ENV'] == 'test'
+       now = DateTime.now.strftime('%y-%m-%d %H:%M:%S')
+       message = "[#{now}] #{prefix} :: #{str}"
+       logger_impl = @@impl
+       logger_impl.log(message)
+     end
+
+     class LoggerImpl
+       def log(message)
+         puts message
+       end
+     end
+   end
+ end
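
A minimal usage sketch (the prefix and message are hypothetical; the output format follows the strftime pattern above):

    logger = Dataflow::Logger.new(prefix: 'Dataflow')
    logger.log('computing node users...')
    # => [17-03-05 12:34:56] Dataflow :: computing node users...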
@@ -0,0 +1,37 @@
+ # frozen_string_literal: true
+ module Dataflow
+   # Define a (default) common interface for nodes.
+   # These may be overridden by specific implementations.
+   module Node
+     # Returns either a DataNode or a ComputeNode that matches the given id
+     def self.find(id)
+       begin
+         return Dataflow::Nodes::DataNode.find(id)
+       rescue Mongoid::Errors::DocumentNotFound
+         # try again against a compute node
+       end
+
+       Dataflow::Nodes::ComputeNode.find(id)
+     end
+
+     def updated?
+       true
+     end
+
+     def recompute(*args)
+       # Interface only, for recursion purposes
+     end
+
+     # Overridden in compute nodes
+     def valid_for_computation?
+       true
+     end
+
+     def validate!
+       # raise if the normal model validations do not pass.
+       valid = valid_for_computation?
+       raise Dataflow::Errors::InvalidConfigurationError, errors.messages unless valid
+       true
+     end
+   end
+ end
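
A sketch of the lookup in use (some_id stands in for an actual node id):

    node = Dataflow::Node.find(some_id)  # tries DataNode first, then ComputeNode
    node.validate!   # raises Dataflow::Errors::InvalidConfigurationError if invalid
    node.recompute   # no-op by default; compute nodes override this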