dataflow-rb 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.env.test.example +6 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +46 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/dataflow-rb.gemspec +42 -0
- data/lib/config/mongoid.yml +21 -0
- data/lib/dataflow/adapters/csv_adapter.rb +123 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
- data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
- data/lib/dataflow/adapters/psql_adapter.rb +21 -0
- data/lib/dataflow/adapters/settings.rb +33 -0
- data/lib/dataflow/adapters/sql_adapter.rb +322 -0
- data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
- data/lib/dataflow/errors/not_implemented_error.rb +7 -0
- data/lib/dataflow/event_mixin.rb +77 -0
- data/lib/dataflow/extensions/mongo_driver.rb +21 -0
- data/lib/dataflow/extensions/msgpack.rb +19 -0
- data/lib/dataflow/logger.rb +27 -0
- data/lib/dataflow/node.rb +37 -0
- data/lib/dataflow/nodes/compute_node.rb +495 -0
- data/lib/dataflow/nodes/data_node.rb +331 -0
- data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
- data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
- data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
- data/lib/dataflow/nodes/filter/where_node.rb +44 -0
- data/lib/dataflow/nodes/join_node.rb +151 -0
- data/lib/dataflow/nodes/map_node.rb +50 -0
- data/lib/dataflow/nodes/merge_node.rb +33 -0
- data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
- data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
- data/lib/dataflow/nodes/select_keys_node.rb +39 -0
- data/lib/dataflow/nodes/snapshot_node.rb +77 -0
- data/lib/dataflow/nodes/sql_query_node.rb +50 -0
- data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
- data/lib/dataflow/nodes/upsert_node.rb +68 -0
- data/lib/dataflow/properties_mixin.rb +35 -0
- data/lib/dataflow/schema_mixin.rb +134 -0
- data/lib/dataflow/version.rb +4 -0
- data/lib/dataflow-rb.rb +72 -0
- metadata +371 -0
data/lib/dataflow/nodes/data_node.rb
@@ -0,0 +1,331 @@
```ruby
# frozen_string_literal: true
module Dataflow
  module Nodes
    # Data nodes are used to build a data computing/transformation graph.
    # At each step we can save the results to a (temp) table.
    #
    # Nodes::DataNode represents one of the data nodes.
    # It is meant to be treated as an interface and should not be used directly.
    class DataNode
      include Mongoid::Document
      include Dataflow::Node
      include Dataflow::PropertiesMixin
      include Dataflow::EventMixin
      include Dataflow::SchemaMixin

      event :schema_inference_started
      event :schema_inference_progressed
      event :schema_inference_finished

      event :export_started
      event :export_progressed
      event :export_finished

      # make sure we have only one node per db/table combination
      index({ db_name: 1, name: 1 }, unique: true)

      # The database name used by this node
      field :db_name, type: String, editable: false

      # The dataset name used by this node for storage.
      field :name, type: String

      # The schema of this node
      field :schema, type: Hash, editable: false
      field :inferred_schema, type: Hash, editable: false
      field :inferred_schema_at, type: Time, editable: false
      # How many samples were used to infer the schema
      field :inferred_schema_from, type: Integer, editable: false

      # The time when this node was last updated
      field :updated_at, type: Time, editable: false

      # One of the possible backends this node will use, e.g.: :mongodb, :csv, :mysql
      field :db_backend, type: Symbol, editable: false, default: :mongodb

      # Represents the time in seconds within which to expect an update on this node
      field :update_expected_within, type: Integer, default: 0

      # The indexes this node will implement on its dataset.
      # Indexes should be in the following format:
      # [
      #   { key: 'id' },
      #   { key: 'updated_at' },
      #   { key: ['id', 'updated_at'], unique: true }
      # ]
      field :indexes, type: Array, default: []

      # whether to use double buffering or not
      field :use_double_buffering, type: Boolean, editable: false, default: false

      # internal use: where to read/write from. Use 1 and 2 for legacy reasons.
      field :read_dataset_idx, type: Integer, editable: false, default: 1
      field :write_dataset_idx, type: Integer, editable: false, default: 2

      # Necessary fields:
      validates_presence_of :db_name
      validates_presence_of :name

      # Before create: run default initializations
      before_create :set_defaults

      # Sets the default parameters before creating the object.
      def set_defaults
        self.schema = schema || {}

        # Use the schema as the inferred schema if none is provided.
        # This is useful when there is no need to infer schemas (e.g. in SQL).
        self.inferred_schema ||= schema
      end

      # Callback: after creation make sure the underlying dataset matches this node's properties.
      after_create do
        handle_dataset_settings_changed
      end

      # Callback: after save, make sure the underlying dataset is valid if
      # any dataset-related property changed.
      after_save do
        if name_changed? || indexes_changed? || db_backend_changed?
          handle_dataset_settings_changed
        end
      end

      # When the dataset properties changed, notify the adapter to handle the new settings.
      def handle_dataset_settings_changed
        db_adapter.update_settings(data_node: self)

        # recreate the dataset if there is no data
        if db_adapter.count.zero?
          db_adapter.recreate_dataset(dataset: read_dataset_name)
        end

        db_adapter.create_indexes(dataset: read_dataset_name)
      end

      # Finds and returns a record from the dataset, based on the given options.
      # @param where [Hash] the condition to apply for retrieving the element.
      #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
      #        An empty option hash will retrieve any record.
      # @return [Hash] returns a single record from the dataset.
      def find(where: {})
        db_adapter.find(where: where)
      end

      # Returns all the records from a dataset that match the options.
      # @param where [Hash] the condition to apply for retrieving the element.
      #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
      #        An empty option hash will retrieve any record.
      # @param fields [Array] Array of strings representing which fields to include.
      #        e.g.: ['id', 'updated_at'] will only return these two fields.
      # @param sort [Hash] represents the sorting of the returned dataset.
      #        e.g. { 'id' => 1, 'updated_at' => -1 } will sort by
      #        id ASC and by updated_at DESC.
      # @param limit [Integer] limits the amount of records returned.
      # @param offset [Integer] starting offset of the records returned.
      #        Use with limit to implement pagination.
      # @yield [db_client] When a block is passed, yields the db client on which .each
      #        can be called to stream the results rather than load everything in memory.
      #        Other methods can also be called depending on the backend,
      #        the downside being back-end portability (use at your own risk).
      def all(where: {}, fields: [], sort: {}, limit: 0, offset: 0, &block)
        db_adapter.all(where: where, fields: fields, sort: sort, limit: limit, offset: offset, &block)
      end

      # Supports paginating efficiently through the dataset.
      # @param where [Hash] the condition to apply for retrieving the element.
      #        e.g.: { 'id' => 1 } will fetch a record with the id 1.
      #        An empty option hash will retrieve any record.
      #        IMPORTANT: do not use the system id in the query. It will be overwritten.
      # @param fields [Array] Array of strings representing which fields to include.
      #        e.g.: ['id', 'updated_at'] will only return these two fields.
      # @param limit [Integer] limits the amount of records returned.
      # @param cursor [String] indicates from which page should the results be returned.
      # @return [Hash] with 2 fields:
      #         - data [Array] that contains the fetched records
      #         - next_cursor [String] a string to pass into the subsequent
      #           calls to fetch the next page of the data
      def all_paginated(where: {}, fields: [], cursor: nil)
        db_adapter.all_paginated(where: where, fields: fields, cursor: cursor)
      end

      # Returns a list of ordered (ASC) system IDs.
      # @param batch_size [Integer] how many IDs to select per query.
      # These can be used to process the dataset in parallel by querying on a sub-section:
      #   queries = node.ordered_system_id_queries
      #   Parallel.each(queries) do |query|
      #     process(node.all(where: query))
      #   end
      def ordered_system_id_queries(batch_size:)
        db_adapter.ordered_system_id_queries(batch_size: batch_size)
      end

      # Counts how many records match the condition, or all records if no condition is given.
      # @return [Integer] the record count.
      def count(where: {})
        db_adapter.count(where: where)
      end

      # Adds the given records to the dataset and updates the updated_at time.
      # @param records [Array] an array of the records to be added.
      def add(records:)
        return if records.blank?
        db_adapter.save(records: records)
        self.updated_at = Time.now
        save!
      end

      # Clear the data that matches the options.
      def clear(where: {})
        db_adapter.delete(where: where)
      end

      # Update this node's schema.
      def update_schema(sch)
        self.schema = sch
        db_adapter.update_settings(data_node: self)
      end

      # Recreates a dataset.
      # @param dataset_type [Symbol] select which dataset to recreate.
      #        Can be :read or :write.
      def recreate_dataset(dataset_type: :read)
        # fetch the proper dataset name
        dataset = send("#{dataset_type}_dataset_name")
        db_adapter.recreate_dataset(dataset: dataset)
      end

      # Applies unique indexes on the dataset.
      # As this will be enforcing constraints, it is best applied
      # before adding any data.
      # @param dataset_type [Symbol] select which dataset to index.
      #        Can be :read or :write.
      def create_unique_indexes(dataset_type: :read)
        dataset = send("#{dataset_type}_dataset_name")
        db_adapter.create_indexes(dataset: dataset, type: :unique_only)
      end

      # Applies non-unique indexes on the dataset.
      # For performance reasons, these indexes are best applied
      # after adding data (especially on large import operations).
      def create_non_unique_indexes(dataset_type: :read)
        dataset = send("#{dataset_type}_dataset_name")
        db_adapter.create_indexes(dataset: dataset, type: :non_unique_only)
      end

      def read_dataset_name
        return @temporary_read_dataset if @temporary_read_dataset

        if use_double_buffering
          "#{name}_buffer#{read_dataset_idx}"
        else
          name
        end
      end

      def write_dataset_name
        if use_double_buffering
          "#{name}_buffer#{write_dataset_idx}"
        else
          name
        end
      end

      # Use to select from which dataset you want to read.
      # A possible use case is to read from an old dataset name.
      # @param dataset [String] the dataset name to read from.
      #        It must be a valid dataset name for the current settings.
      def read_dataset_name=(dataset)
        return unless valid_dataset_names.include?(dataset)
        @temporary_read_dataset = dataset
        db_adapter.update_settings(data_node: self)
        dataset
      end

      def swap_read_write_datasets!
        raise Dataflow::Errors::InvalidConfigurationError, "#swap_read_write_datasets! called on \"#{name}\" but \"use_double_buffering\" is not activated." unless use_double_buffering
        tmp = read_dataset_idx
        self.read_dataset_idx = write_dataset_idx
        self.write_dataset_idx = tmp
        db_adapter.update_settings(data_node: self)
        save!
      end

      def import(connection_opts: {}, keys: nil)
        importer = db_adapter(connection_opts)
        records = importer.all
        add(records: records)
      end

      def export(connection_opts: { db_backend: :csv }, keys: nil, where: {})
        on_export_started(connection_opts: connection_opts, keys: keys)
        # instantiate and export without saving anything
        Export::ToCsvNode.new(dependency_ids: [self], query: where.to_json).compute_impl
        on_export_finished
      end

      # Retrieves some information about this node and its usage.
      def info(write_dataset: false)
        dataset = write_dataset ? write_dataset_name : read_dataset_name
        usage = db_adapter.usage(dataset: dataset)
        {
          name: name,
          type: self.class.to_s,
          dataset: dataset,
          db_backend: db_backend,
          updated_at: updated_at,
          record_count: count,
          indexes: indexes,
          effective_indexes: usage[:effective_indexes],
          mem_usage: usage[:memory],
          storage_usage: usage[:storage]
        }
      end

      def use_symbols?
        (db_backend.to_s =~ /sql/).present?
      end

      private

      def db_adapter(connection_opts = {})
        db_backend = connection_opts[:db_backend] || self.db_backend

        opts = connection_opts.deep_dup
        opts.delete(:db_backend)
        has_options = opts.present?

        case db_backend.downcase.to_s
        when 'mongodb'
          return Adapters::MongoDbAdapter.new(opts) if has_options
          @mongodb_adapter ||= Adapters::MongoDbAdapter.new(data_node: self)
          return @mongodb_adapter
        when 'csv'
          return Adapters::CsvAdapter.new(opts) if has_options
          @csv_adapter ||= Adapters::CsvAdapter.new(data_node: self)
          return @csv_adapter
        when 'mysql'
          opts[:adapter_type] = 'mysql2'
          return Adapters::SqlAdapter.new(opts) if has_options
          @mysql_adapter ||= Adapters::MysqlAdapter.new(data_node: self, adapter_type: 'mysql2')
          return @mysql_adapter
        when 'postgresql'
          opts[:adapter_type] = 'postgresql'
          return Adapters::SqlAdapter.new(opts) if has_options
          @postgresql_adapter ||= Adapters::PsqlAdapter.new(data_node: self, adapter_type: 'postgresql')
          return @postgresql_adapter
        end

        raise Errors::NotImplementedError, "'#{db_backend}' backend is not implemented."
      end

      def valid_dataset_names
        if use_double_buffering
          ["#{name}_buffer1", "#{name}_buffer2"]
        else
          [name]
        end
      end
    end # class DataNode
  end # module Nodes
end # module Dataflow
```
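For orientation, here is a minimal usage sketch of the DataNode API above. It is not taken from the gem's documentation: it assumes a working Mongoid/MongoDB configuration (see lib/config/mongoid.yml), uses made-up database and dataset names, and only calls methods defined in this file.

```ruby
# Hypothetical usage sketch -- db/dataset names are illustrative.
require 'dataflow-rb'

# Create (or fetch) a node backed by the default :mongodb backend.
node = Dataflow::Nodes::DataNode.create(
  db_name: 'example_db',   # assumed database name
  name:    'users',        # assumed dataset name
  indexes: [{ key: 'id', unique: true }]
)

# Enforce unique indexes before loading data, then add records.
node.create_unique_indexes
node.add(records: [{ 'id' => 1, 'email' => 'a@example.com' },
                   { 'id' => 2, 'email' => 'b@example.com' }])

# Query the dataset.
node.count                       # => 2
node.find(where: { 'id' => 1 })  # => a single record hash
node.all(fields: %w(id email), sort: { 'id' => 1 }, limit: 10)

# Paginate, or split the dataset into parallel-friendly queries.
page = node.all_paginated(fields: %w(id))
node.ordered_system_id_queries(batch_size: 1_000).each do |query|
  node.all(where: query) # process each batch
end

node.info # => hash with record_count, indexes, storage usage, etc.
```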
data/lib/dataflow/nodes/export/to_csv_node.rb
@@ -0,0 +1,54 @@
```ruby
# frozen_string_literal: true
module Dataflow
  module Nodes
    module Export
      # Export a dataset to CSV
      class ToCsvNode < ComputeNode
        ensure_dependencies exactly: 1

        # A JSON encoded query to pass along.
        field :query, type: String, default: {}.to_json

        def compute_impl
          node = dependencies.first
          where = JSON.parse(query)

          # fetch the schema
          sch = node.infer_partial_schema(where: where, extended: true)

          # re-order the schema if needed
          if node.respond_to? :keys
            sch = node.keys.map { |k| [k, sch[k]] }.to_h if keys.present?
          end

          # create the dataset
          csv_adapter = Adapters::CsvAdapter.new(data_node: node)
          csv_adapter.set_schema(sch)
          csv_adapter.recreate_dataset

          # export in parallel
          max_per_process = 1000
          max_per_process = limit_per_process if limit_per_process < 0

          data_count = [node.count(where: where), 1].max
          equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
          count_per_process = [max_per_process, equal_split_per_process].min

          queries = node.ordered_system_id_queries(batch_size: count_per_process)

          parallel_each(queries.each_with_index) do |query, _idx|
            # TODO: re-enable the on_export_progressed event
            # progress = (idx / queries.count.to_f * 100).ceil
            # on_export_progressed(pct_complete: progress)

            batch = node.all(where: query.merge(where))
            csv_adapter.save(records: batch)
          end

          # needed by the csv exporter to finalize in a single file
          csv_adapter.on_save_finished
        end
      end
    end
  end
end
```
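A hedged sketch of how this export node might be driven. DataNode#export (shown earlier in this diff) already builds a ToCsvNode internally; constructing one by hand mirrors that call. Where the CSV file ends up depends on how the CsvAdapter is configured (see csv_adapter.rb and data/.env.test.example), which is outside this excerpt.

```ruby
# Hypothetical sketch: export the records matching a query to CSV.
# `node` is a Dataflow::Nodes::DataNode as in the previous example.

# Simplest route: delegate to DataNode#export, which builds a ToCsvNode internally.
node.export(where: { 'id' => 1 })

# Equivalent, constructing the compute node directly (as #export does):
csv_export = Dataflow::Nodes::Export::ToCsvNode.new(
  dependency_ids: [node],
  query: { 'id' => 1 }.to_json
)
csv_export.compute_impl
```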
data/lib/dataflow/nodes/filter/drop_while_node.rb
@@ -0,0 +1,117 @@
```ruby
# frozen_string_literal: true
module Dataflow
  module Nodes
    module Filter
      # Makes a sequence based on a key (e.g. id), orders it (e.g. by time),
      # and then applies the same logic as ruby's drop_while.
      # See: https://ruby-doc.org/core-2.4.0/Array.html#method-i-drop_while
      class DropWhileNode < ComputeNode
        VALID_OPS = %w(eq ne le lt ge gt).freeze
        VALID_MODES = %w(both left right).freeze

        # group by the id key
        field :id_key, type: String, required_for_computing: true
        # then sort by the sort_by
        field :sort_by, type: String, required_for_computing: true
        field :sort_asc, type: Boolean, required_for_computing: true, default: true

        # then apply a drop_while on { field op value }
        field :field, type: String, required_for_computing: true
        field :op, type: String, required_for_computing: true, values: VALID_OPS
        field :value, required_for_computing: true
        field :drop_mode, type: String, required_for_computing: true, values: VALID_MODES, default: VALID_MODES[0]

        ensure_data_node_exists
        ensure_dependencies exactly: 1

        def compute_impl
          base_node = dependencies.first
          records_count = base_node.count
          return if records_count == 0

          ids = base_node.all(fields: [id_key]) do |results|
            results.distinct(id_key)
          end
          count_per_process = (ids.count / Parallel.processor_count.to_f).ceil
          limit = limit_per_process.to_i
          count_per_process = [limit, count_per_process].min if limit > 0

          parallel_each(ids.each_slice(count_per_process)) do |ids_slice|
            # ids.each_slice(count_per_process) do |ids_slice|
            process_ids(node: base_node, ids: ids_slice)
          end
        end

        private

        def process_ids(node:, ids:)
          records = node.all(where: { id_key => ids })
          groups = records.group_by { |x| x[id_key] }

          result = groups.flat_map do |_, group|
            process_group(group)
          end.compact

          data_node.add(records: result)
        end

        # sort the record group and then proceed to drop the elements
        # that satisfy the condition
        def process_group(record_group)
          sort_tokens = record_dig_tokens(key: sort_by, use_sym: dependencies.first.use_symbols?)
          group = record_group.sort_by { |x| x.dig(*sort_tokens) }
          group = group.reverse unless sort_asc
          modes = drop_mode == 'both' ? %w(left right) : [drop_mode]

          modes.each do |mode|
            # if we want to drop on the right,
            # reverse the array, drop on the left and reverse again
            group = group.reverse if mode == 'right'
            group = drop_while(group)
            group = group.reverse if mode == 'right'
          end

          group
        end

        # apply a single drop_while on the group.
        def drop_while(group)
          value_tokens = record_dig_tokens(key: field, use_sym: dependencies.first.use_symbols?)

          case op.to_s.downcase
          when 'eq'
            group.drop_while { |x| x.dig(*value_tokens) == value }
          when 'ne'
            group.drop_while { |x| x.dig(*value_tokens) != value }
          when 'le'
            group.drop_while do |x|
              val = x.dig(*value_tokens)
              next true if val.nil? # drop nil values
              val <= value
            end
          when 'lt'
            group.drop_while do |x|
              val = x.dig(*value_tokens)
              next true if val.nil? # drop nil values
              val < value
            end
          when 'ge'
            group.drop_while do |x|
              val = x.dig(*value_tokens)
              next true if val.nil? # drop nil values
              val >= value
            end
          when 'gt'
            group.drop_while do |x|
              val = x.dig(*value_tokens)
              next true if val.nil? # drop nil values
              val > value
            end
          else
            raise Errors::InvalidConfigurationError, "Invalid op key: #{op}"
          end
        end
      end
    end
  end
end
```
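To make the drop semantics concrete, here is a plain-Ruby illustration (hypothetical records, not gem API) of what process_group does for a single id group when op is 'le', value is 0, and drop_mode is 'both':

```ruby
# One id group, already sorted by the sort_by key (here 'time', ascending).
group = [
  { 'time' => 1, 'v' => 0 },
  { 'time' => 2, 'v' => 0 },
  { 'time' => 3, 'v' => 5 },
  { 'time' => 4, 'v' => 0 }
]

# The 'le 0' condition used by drop_while above.
drop = ->(records) { records.drop_while { |x| x['v'] <= 0 } }

# 'left' trims leading matches; 'right' reverses, trims, reverses back;
# 'both' runs one pass of each, matching the modes.each loop.
left = drop.call(group)                          # => rows at time 3 and 4
both = drop.call(drop.call(group).reverse).reverse
both                                             # => [{ 'time' => 3, 'v' => 5 }]
```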
data/lib/dataflow/nodes/filter/newest_node.rb
@@ -0,0 +1,66 @@
```ruby
# frozen_string_literal: true
module Dataflow
  module Nodes
    module Filter
      # Select the newest record among records with the same id key.
      class NewestNode < ComputeNode
        field :id_key, type: String, required_for_computing: true
        field :date_key, type: String, required_for_computing: true

        ensure_data_node_exists
        ensure_dependencies exactly: 1

        private

        def ensure_keys_are_set!
          raise Errors::InvalidConfigurationError, 'Id key must be set.' if id_key.blank?
          raise Errors::InvalidConfigurationError, 'Date key must be set.' if date_key.blank?
        end

        def compute_impl
          base_node = dependencies.first
          records_count = base_node.count
          return if records_count == 0

          ids = base_node.all(fields: [id_key]) do |results|
            results.distinct(id_key)
          end
          count_per_process = (ids.count / Parallel.processor_count.to_f).ceil
          limit = limit_per_process.to_i
          count_per_process = [limit, count_per_process].min if limit > 0

          parallel_each(ids.each_slice(count_per_process)) do |ids_slice|
            # ids.each_slice(count_per_process) do |ids_slice|
            process_ids(node: base_node, ids: ids_slice)
          end
        end

        def process_ids(node:, ids:)
          metadata = node.all(where: { id_key => ids }, fields: [id_key, date_key])
          groups = metadata.group_by { |x| x[id_key] }
          newest_record_metadata = filter_by_newest(groups: groups,
                                                    date_key: date_key)
          records = newest_record_metadata.map do |meta|
            query = {
              id_key => meta[id_key],
              date_key => meta[date_key]
            }
            node.find(where: query)
          end.compact

          data_node.add(records: records)
        end

        def filter_by_newest(groups:, date_key:)
          groups.map do |_, entries|
            # sort by date ASC and select the newest
            entries
              .sort_by do |x|
                x[date_key].is_a?(Time) ? x[date_key] : Timeliness.parse(x[date_key])
              end.last
          end
        end
      end
    end
  end
end
```
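The grouping and sorting done by process_ids and filter_by_newest boils down to "keep the latest row per id". A plain-Ruby illustration on hypothetical metadata rows (id_key 'id', date_key 'updated_at'):

```ruby
require 'time'

metadata = [
  { 'id' => 1, 'updated_at' => Time.parse('2017-01-01') },
  { 'id' => 1, 'updated_at' => Time.parse('2017-03-01') },
  { 'id' => 2, 'updated_at' => Time.parse('2017-02-01') }
]

# Group by id, sort each group by date ASC, keep the last (newest) entry.
newest = metadata.group_by { |x| x['id'] }.map do |_, entries|
  entries.sort_by { |x| x['updated_at'] }.last
end
# => the 2017-03-01 row for id 1 and the single row for id 2;
#    NewestNode then re-fetches each full record via node.find(where: ...).
```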
data/lib/dataflow/nodes/filter/where_node.rb
@@ -0,0 +1,44 @@
```ruby
# frozen_string_literal: true
module Dataflow
  module Nodes
    module Filter
      # Select records that match the condition.
      class WhereNode < ComputeNode
        VALID_OPS = %w(eq ne le lt ge gt).freeze

        field :key, type: String, required_for_computing: true
        field :op, type: String, required_for_computing: true, values: VALID_OPS
        field :value, required_for_computing: true

        ensure_data_node_exists
        ensure_dependencies exactly: 1

        private

        def compute_batch(records:)
          where(records: records)
        end

        def where(records:)
          tokens = record_dig_tokens(key: key, use_sym: dependencies.first.use_symbols?)
          case op.to_s.downcase
          when 'eq'
            records.select { |x| x.dig(*tokens) == value }
          when 'ne'
            records.select { |x| x.dig(*tokens) != value }
          when 'le'
            records.select { |x| x.dig(*tokens) <= value }
          when 'lt'
            records.select { |x| x.dig(*tokens) < value }
          when 'ge'
            records.select { |x| x.dig(*tokens) >= value }
          when 'gt'
            records.select { |x| x.dig(*tokens) > value }
          else
            raise Errors::InvalidConfigurationError, "Invalid op key: #{op}"
          end
        end
      end
    end
  end
end
```
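The filtering itself is a plain Hash#dig plus a comparison. Illustration only (hypothetical records; record_dig_tokens is a helper defined elsewhere in the gem that splits a key such as 'stats.score' into dig tokens, so the tokens are spelled out by hand here) for op 'ge' with value 5:

```ruby
records = [
  { 'stats' => { 'score' => 10 } },
  { 'stats' => { 'score' => 3 } }
]

tokens = %w(stats score)               # what record_dig_tokens would produce
records.select { |x| x.dig(*tokens) >= 5 }
# => [{ 'stats' => { 'score' => 10 } }]
```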