dataflow-rb 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.env.test.example +6 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +46 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/dataflow-rb.gemspec +42 -0
- data/lib/config/mongoid.yml +21 -0
- data/lib/dataflow/adapters/csv_adapter.rb +123 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
- data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
- data/lib/dataflow/adapters/psql_adapter.rb +21 -0
- data/lib/dataflow/adapters/settings.rb +33 -0
- data/lib/dataflow/adapters/sql_adapter.rb +322 -0
- data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
- data/lib/dataflow/errors/not_implemented_error.rb +7 -0
- data/lib/dataflow/event_mixin.rb +77 -0
- data/lib/dataflow/extensions/mongo_driver.rb +21 -0
- data/lib/dataflow/extensions/msgpack.rb +19 -0
- data/lib/dataflow/logger.rb +27 -0
- data/lib/dataflow/node.rb +37 -0
- data/lib/dataflow/nodes/compute_node.rb +495 -0
- data/lib/dataflow/nodes/data_node.rb +331 -0
- data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
- data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
- data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
- data/lib/dataflow/nodes/filter/where_node.rb +44 -0
- data/lib/dataflow/nodes/join_node.rb +151 -0
- data/lib/dataflow/nodes/map_node.rb +50 -0
- data/lib/dataflow/nodes/merge_node.rb +33 -0
- data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
- data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
- data/lib/dataflow/nodes/select_keys_node.rb +39 -0
- data/lib/dataflow/nodes/snapshot_node.rb +77 -0
- data/lib/dataflow/nodes/sql_query_node.rb +50 -0
- data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
- data/lib/dataflow/nodes/upsert_node.rb +68 -0
- data/lib/dataflow/properties_mixin.rb +35 -0
- data/lib/dataflow/schema_mixin.rb +134 -0
- data/lib/dataflow/version.rb +4 -0
- data/lib/dataflow-rb.rb +72 -0
- metadata +371 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 39235e7bba48dcc339e007eefd360ae549439a29
+  data.tar.gz: b0c424349d25ecc970a8e74e72f7bbc00d43a0b9
+SHA512:
+  metadata.gz: 96d3a5bc08fb881025d6379e3453efb42e6a0cb7d87f773d3acb44f75236f519ce73e118b8bc51d924ab2862fb56925c6bad05907eb7fd5a4b07c0c19e49422a
+  data.tar.gz: b420ecbcf013232b770260f613ffa9ff298e15b02f41de58ccb5057839897652ce52381b43ee6f41d849ec5dc8ed60ca3af68d9b71bebb4737179affba9b1b05
data/.env.test.example
ADDED
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016-2017, Phybbit Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,46 @@
+# Dataflow
+
+The purpose of this gem is to help build complex dataflows and support automating long-running batch processes.
+It handles parallelizing computation whenever it can and re-computing dependencies that are not up to date.
+
+There are two main concepts in describing a computing graph:
+- data-nodes, which support storing/retrieving data from databases
+- compute-nodes, which support arbitrary processing, can depend on any number of nodes (compute/data), and can push their results to a data-node if needed
+
+The main use case is to represent data sources with data-nodes and link those to compute-nodes. Upon computing, the node will store the result in another data-node.
+
+The graph's metadata (e.g. nodes' dependencies, properties) is stored in MongoDB. It also uses MongoDB as the default DB for data-node storage, as it allows for quick schema-less prototyping. MySQL and PostgreSQL are also supported (through [Sequel](https://github.com/jeremyevans/sequel)).
+
+This repository only includes the most common nodes. Other repos will include custom (application-dependent) nodes.
+
+It has some similarities with the [Luigi](https://github.com/spotify/luigi) Python module.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'dataflow-rb'
+```
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install dataflow-rb
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Development
+
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+## Contributing
+
+Bug reports and pull requests are welcome on GitHub at https://github.com/phybbit/dataflow-rb.
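The Usage section of the README above is still a TODO. As a rough illustration of the data-node / compute-node flow it describes, here is a hypothetical sketch using the node classes listed in this release (`Dataflow::Nodes::DataNode`, `Dataflow::Nodes::ComputeNode`); the property names (`name`, `db_name`, `dependency_ids`, `data_node_id`) and the `recompute` call are assumptions, not documented API:

```ruby
# Hypothetical sketch only: property names and #recompute are assumptions,
# not taken from the gem's documentation.
require 'dataflow-rb'

# Data nodes represent storage (MongoDB by default).
events = Dataflow::Nodes::DataNode.create(db_name: 'dataflow', name: 'raw_events')
output = Dataflow::Nodes::DataNode.create(db_name: 'dataflow', name: 'processed_events')

# A compute node depends on other nodes and pushes its result to a data node.
compute = Dataflow::Nodes::ComputeNode.create(
  name: 'process_events',
  dependency_ids: [events.id],
  data_node_id: output.id
)

# Recompute this node and any dependencies that are not up to date.
compute.recompute
```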
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require "bundler/setup"
+require "dataflow"
+
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+
+# (If you use this, don't forget to add pry to your Gemfile!)
+require "pry"
+Pry.start
+
+# require "irb"
+# IRB.start
data/bin/setup
ADDED
data/dataflow-rb.gemspec
ADDED
@@ -0,0 +1,42 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'dataflow/version'
+
+Gem::Specification.new do |spec|
+  spec.name = 'dataflow-rb'
+  spec.version = Dataflow::VERSION
+  spec.authors = ['okoriko']
+  spec.email = ['eurico@phybbit.com']
+
+  spec.summary = %q{Helps building data and automation pipelines. It handles recomputing dependencies and parallel execution.}
+  spec.description = %q{Helps building data pipelines. It handles recomputing dependencies and parallel execution.}
+  spec.homepage = 'https://phybbit.com'
+
+  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.bindir = 'exe'
+  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+
+  spec.add_development_dependency 'bundler'
+  spec.add_development_dependency 'rake'
+  spec.add_development_dependency 'rspec'
+  spec.add_development_dependency 'byebug'
+  spec.add_development_dependency 'pry-byebug'
+  spec.add_development_dependency 'timecop'
+  spec.add_development_dependency 'ruby-prof'
+  spec.add_development_dependency 'dotenv'
+
+  spec.add_dependency 'activesupport', '>= 4.0.0'
+  spec.add_dependency 'schema-inference', '~>1.2.1'
+  spec.add_dependency 'parallel', '~>1.10'
+  spec.add_dependency 'mongoid', '~>6.0'
+  spec.add_dependency 'sequel', '~>4.0'
+  spec.add_dependency 'mysql2', '~>0.4'
+  spec.add_dependency 'pg', '~>0.19'
+  spec.add_dependency 'sequel_pg', '~>1.6'
+  spec.add_dependency 'msgpack', '~>1.0'
+  spec.add_dependency 'smarter_csv', '1.1.0'
+  spec.add_dependency 'timeliness', '~>0.3'
+  spec.add_dependency 'chronic', '~>0.10'
+end
data/lib/config/mongoid.yml
ADDED
@@ -0,0 +1,21 @@
+test:
+  clients:
+    default:
+      database: dataflow_test
+      hosts:
+        - localhost:27017
+      options:
+        read:
+          mode: :primary
+        max_pool_size: 1
+
+default:
+  clients:
+    default:
+      database: dataflow
+      hosts:
+        - localhost:27017
+      options:
+        read:
+          mode: :primary
+        max_pool_size: 10
data/lib/dataflow/adapters/csv_adapter.rb
ADDED
@@ -0,0 +1,123 @@
+# frozen_string_literal: true
+require 'securerandom'
+
+module Dataflow
+  module Adapters
+    # Interface between a data node and csv.
+    # We use mongodb to perform all the store/retrieve operations.
+    class CsvAdapter
+      include Dataflow::SchemaMixin
+
+      attr_reader :settings
+
+      def initialize(args)
+        # make sure the CsvPath exist
+        `mkdir -p #{Dataflow::CsvPath}`
+        update_settings(args)
+      end
+
+      def update_settings(args)
+        @settings = Dataflow::Adapters::Settings.new(args)
+        @schema = [] # TODO: pre-fetch the csv's schema
+      end
+
+      def set_schema(schema)
+        @schema = schema
+      end
+
+      # retrieve a single element from a data node
+      def find(where: opts = {})
+        raise Errors::NotImplementedError, '#find is not yet support on CSV.'
+      end
+
+      # retrieve all elements from a data node
+      def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+        SmarterCSV.process(file_path, strings_as_keys: true)
+      rescue Errno::ENOENT => e
+        []
+      end
+
+      # count the number of records
+      def count(where: {})
+        all(where: where).count
+      end
+
+      # save the given records
+      def save(records:)
+        write_csv_part(records, keys: @schema.keys)
+      end
+
+      def on_save_finished
+        write_single_csv(keys: @schema.keys)
+      end
+
+      def remove(_opts = {})
+        raise Errors::NotImplementedError, '#find is not yet support on CSV.'
+      end
+
+      def recreate_dataset(dataset: nil)
+        # simply delete the file
+        delete_file(file_path)
+        # and any parts if any is still there
+        file_parts.each { |part| delete_file(part) }
+      end
+
+      def create_indexes(*); end
+
+      private
+
+      def delete_file(path)
+        File.delete(path)
+      rescue Errno::ENOENT => e
+        # no file present, no problem
+      end
+
+      def file_path
+        filename = "#{settings.db_name}.#{settings.dataset_name}.csv"
+        "#{Dataflow::CsvPath}/#{filename}"
+      end
+
+      def file_parts
+        part = "#{settings.db_name}.#{settings.dataset_name}.csv.part_"
+        Dir["#{file_path}.part_*"]
+      end
+
+      def write_csv_part(data, keys:)
+        # prepare the data
+        key_tokens = keys.map { |key| record_dig_tokens(key: key) }
+        rows = data.map do |datum|
+          key_tokens.map { |tokens| datum.dig(*tokens) }
+        end
+
+        # dump in a part file
+        uuid = SecureRandom.hex
+        CSV.open("#{file_path}.part_#{uuid}", 'w') do |csv|
+          rows.each { |row| csv << row }
+        end
+      end
+
+      def write_single_csv(keys:)
+        # export headers
+        header_filepath = "#{file_path}.header"
+        CSV.open(header_filepath, 'w') do |csv|
+          csv << keys
+        end
+
+        # make sure the destination file is deleted
+        delete_file(file_path)
+
+        # merge the files into the output
+        files = [header_filepath] + file_parts
+        files.each do |file|
+          # cat each file to the destination file
+          `cat #{file} >> #{file_path}`
+        end
+
+        # remove the intermediary files
+        files.each do |file|
+          delete_file(file)
+        end
+      end
+    end
+  end
+end
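A note on the adapter above: `#save` dumps each batch of records into a uniquely named `<db>.<dataset>.csv.part_<uuid>` file, and `#on_save_finished` writes a header file and concatenates header plus parts into the final CSV. A minimal sketch of that lifecycle follows; the constructor arguments are assumptions, since the accepted settings keys live in `settings.rb`, which is not shown in this diff:

```ruby
# Illustrative sketch only: the settings keys passed to CsvAdapter.new
# are assumptions.
adapter = Dataflow::Adapters::CsvAdapter.new(db_name: 'dataflow', dataset_name: 'events')
adapter.set_schema('id' => { type: 'integer' }, 'name' => { type: 'string' })

# Each call writes a separate .part_<uuid> file, so parallel workers can
# save without contending on a single file.
adapter.save(records: [{ 'id' => 1, 'name' => 'a' }])
adapter.save(records: [{ 'id' => 2, 'name' => 'b' }])

# Writes the header file and cats header + parts into the final CSV.
adapter.on_save_finished
```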
data/lib/dataflow/adapters/mongo_db_adapter.rb
ADDED
@@ -0,0 +1,307 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and mongodb.
+    # We use mongodb to perform all the store/retrieve operations.
+    class MongoDbAdapter
+      SYSTEM_ID = '_id'
+
+      class << self
+        def client(settings, db_name: nil)
+          @clients ||= {}
+          host = ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
+          port = '27017'
+          connection_uri = settings.connection_uri || "#{host}:#{port}"
+          db_name ||= settings.db_name
+          @clients["#{connection_uri}.#{db_name}"] ||= Mongo::Client.new([connection_uri], database: db_name)
+        end
+
+        def admin_client(settings)
+          return @admin_client if @admin_client
+          @admin_client = client(settings, db_name: 'admin')
+        end
+
+        # Force the clients to disconnect their connections.
+        # Use before forking.
+        def disconnect_clients
+          @clients ||= {}
+          @clients.values.each(&:close)
+        end
+      end
+
+      attr_reader :settings
+      attr_reader :client
+
+      def initialize(args)
+        update_settings(args)
+        @client = MongoDbAdapter.client(settings)
+        @admin_client = MongoDbAdapter.admin_client(settings)
+      end
+
+      def update_settings(args)
+        @settings = Dataflow::Adapters::Settings.new(args)
+      end
+
+      # retrieve a single element from a data node
+      def find(where: {}, fields: [], sort: {}, offset: 0)
+        all(where: where, fields: fields, sort: sort, offset: offset, limit: 1).first
+      end
+
+      # retrieve all elements from a data node
+      def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+        projection = fields.map { |field| [field, 1] }
+
+        unless fields.map(&:to_s).include?(SYSTEM_ID)
+          # by default, do not select the _id field
+          projection << [SYSTEM_ID, 0].freeze
+        end
+
+        opts = transform_to_query(where)
+        res = client[read_dataset_name].find(opts)
+        res = res.projection(projection.to_h)
+
+        res = res.sort(sort) if sort
+        res = res.skip(offset) if offset > 0
+        res = res.limit(limit) if limit > 0
+
+        if block_given?
+          yield res
+        else
+          res.to_a
+        end
+      end
+
+      # Helper that supports paginating through the whole dataset at fixed
+      # performance. Unlike using offset/skip which requires to read through
+      # the skipped content (high usage of CPU), we use the internal mongo
+      # cursor to get batch of results.
+      # @return [Hash] with 2 fields: data and next_cursor for the next call
+      def all_paginated(where: {}, fields: [], cursor: nil)
+        cursor = cursor.to_i
+        data = []
+
+        # If there is no cursor, we make the initial query
+        # get the first batch of data and get the cursor id.
+        if cursor.zero?
+          all(where: where, fields: fields) do |res|
+            results = res.initial_query
+            data = results.documents
+            cursor = res.cursor.id
+          end
+        end
+
+        # The first query's result batch is a small 101 set of results
+        # so we want to get one more batch of data.
+        # However, there might be queries whose results are very small
+        # and the resulting cursor is 0. In such case there is no more
+        # data to be fetched.
+        unless cursor.zero?
+          # send a getMore command on the cursor id
+          command = { getMore: cursor, collection: read_dataset_name }
+          result = client.database.command(command).documents[0]
+          cursor = result['cursor']['id']
+          data += result['cursor']['nextBatch']
+        end
+
+        # We want to return the cursor as a string.
+        # If there is no cursor (zero) then make it empty
+        cursor = '' if cursor.zero?
+
+        { 'data' => data, 'next_cursor' => cursor.to_s }
+      rescue Mongo::Error::OperationFailure
+        { 'data' => data, 'next_cursor' => '' }
+      end
+
+      # Create queries that permit processing the whole dataset in parallel without using offsets.
+      def ordered_system_id_queries(batch_size:)
+        ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
+        queries_count = (ids.size / batch_size.to_f).ceil
+        Array.new(queries_count) do |i|
+          from = ids[i * batch_size]
+          to = ids[(i + 1) * batch_size] || ids[-1]
+          is_last = i == queries_count - 1
+
+          where_query = { SYSTEM_ID => { '>=' => from } }
+          operator = is_last ? '<=' : '<'
+          where_query[SYSTEM_ID][operator] = to
+
+          where_query
+        end
+      end
+
+      # count the number of records
+      def count(where: {})
+        client[read_dataset_name].count(transform_to_query(where))
+      end
+
+      # Save the given records.
+      # @param replace_by [Array] if the replace_by key is provided,
+      #        it will try to replace records with the matching key,
+      #        or insert if none is found.
+      def save(records:, replace_by: nil)
+        if replace_by.present?
+          replace_keys = Array(replace_by)
+          bulk_ops = records.map do |record|
+            filter = replace_keys.map { |x| [x, record[x]] }.to_h
+            {
+              replace_one: {
+                filter: filter,
+                replacement: record,
+                upsert: true
+              }
+            }
+          end
+          client[write_dataset_name].bulk_write(bulk_ops, ordered: false)
+        else
+          save_many(records: records)
+        end
+      end
+
+      # Delete records that match the options.
+      # @param where query to apply on the delete operation.
+      def delete(where: {})
+        client[read_dataset_name].delete_many(transform_to_query(where))
+      end
+
+      # recreate the table/collection
+      def recreate_dataset(dataset: nil)
+        dataset ||= write_dataset_name
+        collection = client[dataset]
+        collection.drop
+        collection.create
+      end
+
+      # Create the indexes on this dataset.
+      # @param dataset [String] Specify on which dataset the operation will be performed.
+      #        Default: the adatpter's settings' dataset.
+      # @param type [Symbol] select which indexes type to create.
+      #        Can be :all (default), :unique_only, :non_unique_only
+      def create_indexes(dataset: nil, type: :all, drop_retry_on_error: true)
+        dataset ||= write_dataset_name
+        return unless settings.indexes.present?
+
+        indexes = (settings.indexes || [])
+
+        case type
+        when :unique_only
+          indexes = indexes.select { |idx| idx['unique'] }
+        when :non_unique_only
+          indexes = indexes.reject { |idx| idx['unique'] }
+        end
+
+        indexes = indexes.map { |x| format_index(x) }
+        client[dataset].indexes.create_many(indexes)
+      rescue Mongo::Error::OperationFailure => e
+        raise e unless drop_retry_on_error
+        client[dataset].indexes.drop_all
+        create_indexes(drop_retry_on_error: false)
+      end
+
+      def usage(dataset:)
+        indexes = retrieve_collection_indexes(dataset)
+        command = { collstats: dataset }
+        result = client.database.command(command).documents[0]
+        {
+          memory: result['size'],
+          storage: result['storageSize'],
+          effective_indexes: indexes
+        }
+      rescue Mongo::Error::OperationFailure, Mongo::Error::InvalidCollectionName
+        {
+          memory: 0,
+          storage: 0,
+          effective_indexes: indexes
+        }
+      end
+
+      private
+
+      def write_dataset_name
+        settings.write_dataset_name
+      end
+
+      def read_dataset_name
+        settings.read_dataset_name
+      end
+
+      def transform_to_query(opts)
+        sanitized_opts = {}
+        opts.each do |k, v|
+          if v.is_a? Array
+            # e.g. { 'id' => [1,2] } transform to mongodb IN clauses
+            sanitized_opts[k] = { '$in' => v.map { |value| try_cast_value(k, value) } }
+          elsif v.is_a? Hash
+            sanitized_opts[k] = {}
+            v.each do |operator, value|
+              case operator.to_s
+              when '!='
+                # we still need to check and transform into
+                if value.is_a? Array
+                  # { '$nin' => [value] }
+                  sanitized_opts[k]['$nin'] = value.map { |x| try_cast_value(k, x) }
+                else
+                  # or {'$ne' => value }
+                  sanitized_opts[k]['$ne'] = try_cast_value(k, value)
+                end
+              when '<'
+                sanitized_opts[k]['$lt'] = try_cast_value(k, value)
+              when '<='
+                sanitized_opts[k]['$lte'] = try_cast_value(k, value)
+              when '>'
+                sanitized_opts[k]['$gt'] = try_cast_value(k, value)
+              when '>='
+                sanitized_opts[k]['$gte'] = try_cast_value(k, value)
+              end
+            end
+          else
+            sanitized_opts[k] = try_cast_value(k, v)
+          end
+        end
+        sanitized_opts
+      end
+
+      def try_cast_value(field, value)
+        # cast to time when querying on _mojaco_updated_at
+        return Timeliness.parse(value) || value if field =~ /_mojaco_updated_at/
+        # cast to ObjectId when querying on _id
+        return BSON::ObjectId(value) if field == SYSTEM_ID && value.is_a?(String)
+
+        # TODO: add other casts based on the field type
+        value
+      end
+
+      def save_many(records:)
+        client[write_dataset_name].insert_many(records, ordered: false)
+      rescue Mongo::Error::BulkWriteError => e
+        dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
+        # don't raise if it is errors about duplicated keys
+        raise e unless dup_key_error
+      end
+
+      # Required index format for mongodb:
+      # { :key => { name: 1 }, :unique => true },
+      def format_index(dataset_index)
+        dataset_index = dataset_index.with_indifferent_access
+
+        index_key = {}
+        keys = Array(dataset_index[:key])
+        keys.each { |k| index_key[k] = 1 }
+        index = { key: index_key }
+        index[:unique] = true if dataset_index[:unique]
+        index
+      end
+
+      def retrieve_collection_indexes(collection)
+        mongo_indexes = client[collection].indexes
+        mongo_indexes.map do |idx|
+          # skip the default index
+          next if idx['key'].keys == ['_id']
+
+          index = { 'key' => idx['key'].keys }
+          index['unique'] = true if idx['unique']
+          index
+        end.compact
+      end
+    end
+  end
+end
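The `transform_to_query` helper above defines the hash-based query language this adapter accepts in its `where:` arguments: plain values match exactly, arrays become `$in` clauses, and operator hashes (`'<'`, `'<='`, `'>'`, `'>='`, `'!='`) map onto the corresponding MongoDB operators. A few examples of that mapping, derived directly from the code shown:

```ruby
# Plain values are matched as-is (with _id strings cast to BSON::ObjectId
# and _mojaco_updated_at values parsed as times).
where = { 'status' => 'done' }

# Array values become $in clauses:
where = { 'id' => [1, 2, 3] }
# => { 'id' => { '$in' => [1, 2, 3] } }

# Operator hashes map to Mongo comparison operators:
where = { 'count' => { '>=' => 10, '<' => 100 } }
# => { 'count' => { '$gte' => 10, '$lt' => 100 } }

# '!=' becomes $ne for scalars and $nin for arrays:
where = { 'status' => { '!=' => %w(failed skipped) } }
# => { 'status' => { '$nin' => ['failed', 'skipped'] } }
```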
data/lib/dataflow/adapters/mysql_adapter.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and mongodb.
+    # We use mongodb to perform all the store/retrieve operations.
+    class MysqlAdapter < SqlAdapter
+      def fetch_table_usage(dataset:)
+        size = client["SELECT data_length + index_length as size from information_schema.TABLES WHERE table_schema = '#{settings.db_name}' and table_name = '#{dataset}'"].first[:size]
+        {
+          memory: size,
+          storage: size
+        }
+      rescue Sequel::DatabaseError => e
+        {
+          memory: 0,
+          storage: 0
+        }
+      end
+    end
+  end
+end
data/lib/dataflow/adapters/psql_adapter.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and mongodb.
+    # We use mongodb to perform all the store/retrieve operations.
+    class PsqlAdapter < SqlAdapter
+      def fetch_table_usage(dataset:)
+        size = client["SELECT pg_relation_size('#{dataset}') as size"].first[:size]
+        {
+          memory: size,
+          storage: size
+        }
+      rescue Sequel::DatabaseError
+        {
+          memory: 0,
+          storage: 0
+        }
+      end
+    end
+  end
+end