dataflow-rb 0.9.0

Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.env.test.example +6 -0
  3. data/.gitignore +14 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +46 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/dataflow-rb.gemspec +42 -0
  13. data/lib/config/mongoid.yml +21 -0
  14. data/lib/dataflow/adapters/csv_adapter.rb +123 -0
  15. data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
  16. data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
  17. data/lib/dataflow/adapters/psql_adapter.rb +21 -0
  18. data/lib/dataflow/adapters/settings.rb +33 -0
  19. data/lib/dataflow/adapters/sql_adapter.rb +322 -0
  20. data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
  21. data/lib/dataflow/errors/not_implemented_error.rb +7 -0
  22. data/lib/dataflow/event_mixin.rb +77 -0
  23. data/lib/dataflow/extensions/mongo_driver.rb +21 -0
  24. data/lib/dataflow/extensions/msgpack.rb +19 -0
  25. data/lib/dataflow/logger.rb +27 -0
  26. data/lib/dataflow/node.rb +37 -0
  27. data/lib/dataflow/nodes/compute_node.rb +495 -0
  28. data/lib/dataflow/nodes/data_node.rb +331 -0
  29. data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
  30. data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
  31. data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
  32. data/lib/dataflow/nodes/filter/where_node.rb +44 -0
  33. data/lib/dataflow/nodes/join_node.rb +151 -0
  34. data/lib/dataflow/nodes/map_node.rb +50 -0
  35. data/lib/dataflow/nodes/merge_node.rb +33 -0
  36. data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
  37. data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
  38. data/lib/dataflow/nodes/select_keys_node.rb +39 -0
  39. data/lib/dataflow/nodes/snapshot_node.rb +77 -0
  40. data/lib/dataflow/nodes/sql_query_node.rb +50 -0
  41. data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
  42. data/lib/dataflow/nodes/upsert_node.rb +68 -0
  43. data/lib/dataflow/properties_mixin.rb +35 -0
  44. data/lib/dataflow/schema_mixin.rb +134 -0
  45. data/lib/dataflow/version.rb +4 -0
  46. data/lib/dataflow-rb.rb +72 -0
  47. metadata +371 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 39235e7bba48dcc339e007eefd360ae549439a29
+   data.tar.gz: b0c424349d25ecc970a8e74e72f7bbc00d43a0b9
+ SHA512:
+   metadata.gz: 96d3a5bc08fb881025d6379e3453efb42e6a0cb7d87f773d3acb44f75236f519ce73e118b8bc51d924ab2862fb56925c6bad05907eb7fd5a4b07c0c19e49422a
+   data.tar.gz: b420ecbcf013232b770260f613ffa9ff298e15b02f41de58ccb5057839897652ce52381b43ee6f41d849ec5dc8ed60ca3af68d9b71bebb4737179affba9b1b05
data/.env.test.example ADDED
@@ -0,0 +1,6 @@
+ # These need to be set for the tests to run properly.
+ # You need to create a .env.test and set these properly:
+ #MOJACO_MYSQL_USER=
+ #MOJACO_MYSQL_PASSWORD=
+ #MOJACO_POSTGRESQL_USER=
+ #MOJACO_POSTGRESQL_PASSWORD=
data/.gitignore ADDED
@@ -0,0 +1,14 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+ /datanodes
+ .idea
+ .byebug_history
+ .tags
+ .env.test
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --format documentation
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
+ language: ruby
+ rvm:
+   - 2.3.3
+ before_install: gem install bundler -v 1.14.3
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in dataflow-rb.gemspec
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2016-2017, Phybbit Ltd.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,46 @@
+ # Dataflow
+
+ The purpose of this gem is to help build complex dataflows and to support automating long-running batch processes.
+ It handles parallelizing computation whenever it can and re-computing dependencies that are not up-to-date.
+
+ There are two main concepts in describing a computing graph:
+ - data-nodes, which support storing/retrieving data from databases
+ - compute-nodes, which support arbitrary processing, can depend on any number of nodes (compute or data) and can push their results to a data-node if needed
+
+ The main use case is to represent data sources with data-nodes and link those to compute-nodes. Upon computing, a node stores its result in another data-node.
+
+ The graph's metadata (e.g. nodes' dependencies, properties) is stored in MongoDB. MongoDB is also used as the default DB for data-node storage, as it allows for quick schema-less prototyping. MySQL and PostgreSQL are also supported (through [Sequel](https://github.com/jeremyevans/sequel)).
+
+ This repository only includes the most common nodes. Other repositories will include custom (application-dependent) nodes.
+
+ It has some similarities with the [Luigi](https://github.com/spotify/luigi) Python module.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'dataflow-rb'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install dataflow-rb
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Development
+
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/phybbit/dataflow-rb.
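Since the README's Usage section is still a TODO, here is a minimal, hypothetical sketch of how the data-node / compute-node concepts described above could fit together. The class names match files shipped in this gem (`Dataflow::Nodes::DataNode`, `Dataflow::Nodes::ComputeNode`), but the field names (`db_name`, `dependency_ids`, `data_node_id`) and methods (`add`, `recompute`) are assumptions, not documented API:

```ruby
require 'dataflow-rb'

# Hypothetical sketch -- field and method names are assumptions, not documented API.

# Data nodes backed by the default MongoDB storage.
source = Dataflow::Nodes::DataNode.create(db_name: 'dataflow', name: 'raw_events')
result = Dataflow::Nodes::DataNode.create(db_name: 'dataflow', name: 'events_by_user')

# A compute node that depends on the source node and writes to the result node.
compute = Dataflow::Nodes::ComputeNode.create(
  name: 'group_events_by_user',
  dependency_ids: [source.id],
  data_node_id: result.id
)

# Store some raw data, then (re)compute anything that is out of date.
source.add(records: [{ 'user_id' => 1, 'event' => 'login' }])
compute.recompute
```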
data/Rakefile ADDED
@@ -0,0 +1,6 @@
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "dataflow-rb"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ require "pry"
+ Pry.start
+
+ # require "irb"
+ # IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ set -euo pipefail
+ IFS=$'\n\t'
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/dataflow-rb.gemspec ADDED
@@ -0,0 +1,42 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'dataflow/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = 'dataflow-rb'
+   spec.version       = Dataflow::VERSION
+   spec.authors       = ['okoriko']
+   spec.email         = ['eurico@phybbit.com']
+
+   spec.summary       = %q{Helps building data and automation pipelines. It handles recomputing dependencies and parallel execution.}
+   spec.description   = %q{Helps building data pipelines. It handles recomputing dependencies and parallel execution.}
+   spec.homepage      = 'https://phybbit.com'
+
+   spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir        = 'exe'
+   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ['lib']
+
+   spec.add_development_dependency 'bundler'
+   spec.add_development_dependency 'rake'
+   spec.add_development_dependency 'rspec'
+   spec.add_development_dependency 'byebug'
+   spec.add_development_dependency 'pry-byebug'
+   spec.add_development_dependency 'timecop'
+   spec.add_development_dependency 'ruby-prof'
+   spec.add_development_dependency 'dotenv'
+
+   spec.add_dependency 'activesupport', '>= 4.0.0'
+   spec.add_dependency 'schema-inference', '~>1.2.1'
+   spec.add_dependency 'parallel', '~>1.10'
+   spec.add_dependency 'mongoid', '~>6.0'
+   spec.add_dependency 'sequel', '~>4.0'
+   spec.add_dependency 'mysql2', '~>0.4'
+   spec.add_dependency 'pg', '~>0.19'
+   spec.add_dependency 'sequel_pg', '~>1.6'
+   spec.add_dependency 'msgpack', '~>1.0'
+   spec.add_dependency 'smarter_csv', '1.1.0'
+   spec.add_dependency 'timeliness', '~>0.3'
+   spec.add_dependency 'chronic', '~>0.10'
+ end
data/lib/config/mongoid.yml ADDED
@@ -0,0 +1,21 @@
+ test:
+   clients:
+     default:
+       database: dataflow_test
+       hosts:
+         - localhost:27017
+       options:
+         read:
+           mode: :primary
+         max_pool_size: 1
+
+ default:
+   clients:
+     default:
+       database: dataflow
+       hosts:
+         - localhost:27017
+       options:
+         read:
+           mode: :primary
+         max_pool_size: 10
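For reference, a Mongoid-based application (the gem depends on `mongoid ~>6.0`) can load one of the environments defined in a file like this with the standard `Mongoid.load!` call. A minimal sketch, assuming the file is read from the gem's `lib/config` directory:

```ruby
require 'mongoid'

# Load the 'default' environment: database 'dataflow' on localhost:27017.
Mongoid.load!('lib/config/mongoid.yml', :default)

# Any Mongoid document defined afterwards (e.g. the gem's node classes)
# will persist to the configured database.
```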
data/lib/dataflow/adapters/csv_adapter.rb ADDED
@@ -0,0 +1,123 @@
+ # frozen_string_literal: true
+ require 'securerandom'
+
+ module Dataflow
+   module Adapters
+     # Interface between a data node and CSV files.
+     # We use CSV files on disk to perform all the store/retrieve operations.
+     class CsvAdapter
+       include Dataflow::SchemaMixin
+
+       attr_reader :settings
+
+       def initialize(args)
+         # make sure the CsvPath exists
+         `mkdir -p #{Dataflow::CsvPath}`
+         update_settings(args)
+       end
+
+       def update_settings(args)
+         @settings = Dataflow::Adapters::Settings.new(args)
+         @schema = [] # TODO: pre-fetch the csv's schema
+       end
+
+       def set_schema(schema)
+         @schema = schema
+       end
+
+       # retrieve a single element from a data node
+       def find(where: {})
+         raise Errors::NotImplementedError, '#find is not yet supported on CSV.'
+       end
+
+       # retrieve all elements from a data node
+       def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+         SmarterCSV.process(file_path, strings_as_keys: true)
+       rescue Errno::ENOENT
+         []
+       end
+
+       # count the number of records
+       def count(where: {})
+         all(where: where).count
+       end
+
+       # save the given records
+       def save(records:)
+         write_csv_part(records, keys: @schema.keys)
+       end
+
+       def on_save_finished
+         write_single_csv(keys: @schema.keys)
+       end
+
+       def remove(_opts = {})
+         raise Errors::NotImplementedError, '#remove is not yet supported on CSV.'
+       end
+
+       def recreate_dataset(dataset: nil)
+         # simply delete the file
+         delete_file(file_path)
+         # and any parts, if any are still there
+         file_parts.each { |part| delete_file(part) }
+       end
+
+       def create_indexes(*); end
+
+       private
+
+       def delete_file(path)
+         File.delete(path)
+       rescue Errno::ENOENT
+         # no file present, no problem
+       end
+
+       def file_path
+         filename = "#{settings.db_name}.#{settings.dataset_name}.csv"
+         "#{Dataflow::CsvPath}/#{filename}"
+       end
+
+       def file_parts
+         Dir["#{file_path}.part_*"]
+       end
+
+       def write_csv_part(data, keys:)
+         # prepare the data
+         key_tokens = keys.map { |key| record_dig_tokens(key: key) }
+         rows = data.map do |datum|
+           key_tokens.map { |tokens| datum.dig(*tokens) }
+         end
+
+         # dump in a part file
+         uuid = SecureRandom.hex
+         CSV.open("#{file_path}.part_#{uuid}", 'w') do |csv|
+           rows.each { |row| csv << row }
+         end
+       end
+
+       def write_single_csv(keys:)
+         # export headers
+         header_filepath = "#{file_path}.header"
+         CSV.open(header_filepath, 'w') do |csv|
+           csv << keys
+         end
+
+         # make sure the destination file is deleted
+         delete_file(file_path)
+
+         # merge the files into the output
+         files = [header_filepath] + file_parts
+         files.each do |file|
+           # cat each file to the destination file
+           `cat #{file} >> #{file_path}`
+         end
+
+         # remove the intermediary files
+         files.each do |file|
+           delete_file(file)
+         end
+       end
+     end
+   end
+ end
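`CsvAdapter` above writes each batch of records to a uniquely named `.part_*` file (`write_csv_part`) and later concatenates a header file plus all parts into a single CSV (`write_single_csv`). Here is a standalone sketch of that same pattern using only Ruby's standard library; the file names and sample records are illustrative, not part of the gem's API:

```ruby
require 'csv'
require 'securerandom'
require 'fileutils'

OUTPUT = 'out/events.csv'
FileUtils.mkdir_p(File.dirname(OUTPUT))

keys = %w[user_id event]

# Write one part file per batch, as in CsvAdapter#write_csv_part.
def write_part(records, keys, output)
  rows = records.map { |r| keys.map { |k| r[k] } }
  CSV.open("#{output}.part_#{SecureRandom.hex}", 'w') do |csv|
    rows.each { |row| csv << row }
  end
end

write_part([{ 'user_id' => 1, 'event' => 'login' }], keys, OUTPUT)
write_part([{ 'user_id' => 2, 'event' => 'logout' }], keys, OUTPUT)

# Merge: header first, then every part, then clean up (as in #write_single_csv).
File.open(OUTPUT, 'w') do |out|
  out.write(keys.to_csv)
  Dir["#{OUTPUT}.part_*"].each { |part| out.write(File.read(part)) }
end
Dir["#{OUTPUT}.part_*"].each { |part| File.delete(part) }
```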
data/lib/dataflow/adapters/mongo_db_adapter.rb ADDED
@@ -0,0 +1,307 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     # Interface between a data node and mongodb.
+     # We use mongodb to perform all the store/retrieve operations.
+     class MongoDbAdapter
+       SYSTEM_ID = '_id'
+
+       class << self
+         def client(settings, db_name: nil)
+           @clients ||= {}
+           host = ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
+           port = '27017'
+           connection_uri = settings.connection_uri || "#{host}:#{port}"
+           db_name ||= settings.db_name
+           @clients["#{connection_uri}.#{db_name}"] ||= Mongo::Client.new([connection_uri], database: db_name)
+         end
+
+         def admin_client(settings)
+           return @admin_client if @admin_client
+           @admin_client = client(settings, db_name: 'admin')
+         end
+
+         # Force the clients to disconnect their connections.
+         # Use before forking.
+         def disconnect_clients
+           @clients ||= {}
+           @clients.values.each(&:close)
+         end
+       end
+
+       attr_reader :settings
+       attr_reader :client
+
+       def initialize(args)
+         update_settings(args)
+         @client = MongoDbAdapter.client(settings)
+         @admin_client = MongoDbAdapter.admin_client(settings)
+       end
+
+       def update_settings(args)
+         @settings = Dataflow::Adapters::Settings.new(args)
+       end
+
+       # retrieve a single element from a data node
+       def find(where: {}, fields: [], sort: {}, offset: 0)
+         all(where: where, fields: fields, sort: sort, offset: offset, limit: 1).first
+       end
+
+       # retrieve all elements from a data node
+       def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+         projection = fields.map { |field| [field, 1] }
+
+         unless fields.map(&:to_s).include?(SYSTEM_ID)
+           # by default, do not select the _id field
+           projection << [SYSTEM_ID, 0].freeze
+         end
+
+         opts = transform_to_query(where)
+         res = client[read_dataset_name].find(opts)
+         res = res.projection(projection.to_h)
+
+         res = res.sort(sort) if sort
+         res = res.skip(offset) if offset > 0
+         res = res.limit(limit) if limit > 0
+
+         if block_given?
+           yield res
+         else
+           res.to_a
+         end
+       end
+
+       # Helper that supports paginating through the whole dataset at fixed
+       # performance. Unlike offset/skip, which requires reading through the
+       # skipped content (high CPU usage), we use the internal mongo
+       # cursor to get batches of results.
+       # @return [Hash] with 2 fields: data and next_cursor for the next call
+       def all_paginated(where: {}, fields: [], cursor: nil)
+         cursor = cursor.to_i
+         data = []
+
+         # If there is no cursor, we make the initial query,
+         # get the first batch of data and get the cursor id.
+         if cursor.zero?
+           all(where: where, fields: fields) do |res|
+             results = res.initial_query
+             data = results.documents
+             cursor = res.cursor.id
+           end
+         end
+
+         # The first query's result batch is a small set of 101 results,
+         # so we want to get one more batch of data.
+         # However, there might be queries whose results are very small
+         # and the resulting cursor is 0. In such a case there is no more
+         # data to be fetched.
+         unless cursor.zero?
+           # send a getMore command on the cursor id
+           command = { getMore: cursor, collection: read_dataset_name }
+           result = client.database.command(command).documents[0]
+           cursor = result['cursor']['id']
+           data += result['cursor']['nextBatch']
+         end
+
+         # We want to return the cursor as a string.
+         # If there is no cursor (zero) then make it empty.
+         cursor = '' if cursor.zero?
+
+         { 'data' => data, 'next_cursor' => cursor.to_s }
+       rescue Mongo::Error::OperationFailure
+         { 'data' => data, 'next_cursor' => '' }
+       end
+
+       # Create queries that permit processing the whole dataset in parallel without using offsets.
+       def ordered_system_id_queries(batch_size:)
+         ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
+         queries_count = (ids.size / batch_size.to_f).ceil
+         Array.new(queries_count) do |i|
+           from = ids[i * batch_size]
+           to = ids[(i + 1) * batch_size] || ids[-1]
+           is_last = i == queries_count - 1
+
+           where_query = { SYSTEM_ID => { '>=' => from } }
+           operator = is_last ? '<=' : '<'
+           where_query[SYSTEM_ID][operator] = to
+
+           where_query
+         end
+       end
+
+       # count the number of records
+       def count(where: {})
+         client[read_dataset_name].count(transform_to_query(where))
+       end
+
+       # Save the given records.
+       # @param replace_by [Array] if the replace_by key is provided,
+       #        it will try to replace records with the matching key,
+       #        or insert if none is found.
+       def save(records:, replace_by: nil)
+         if replace_by.present?
+           replace_keys = Array(replace_by)
+           bulk_ops = records.map do |record|
+             filter = replace_keys.map { |x| [x, record[x]] }.to_h
+             {
+               replace_one: {
+                 filter: filter,
+                 replacement: record,
+                 upsert: true
+               }
+             }
+           end
+           client[write_dataset_name].bulk_write(bulk_ops, ordered: false)
+         else
+           save_many(records: records)
+         end
+       end
+
+       # Delete records that match the options.
+       # @param where query to apply on the delete operation.
+       def delete(where: {})
+         client[read_dataset_name].delete_many(transform_to_query(where))
+       end
+
+       # recreate the table/collection
+       def recreate_dataset(dataset: nil)
+         dataset ||= write_dataset_name
+         collection = client[dataset]
+         collection.drop
+         collection.create
+       end
+
+       # Create the indexes on this dataset.
+       # @param dataset [String] Specify on which dataset the operation will be performed.
+       #        Default: the adapter's settings' dataset.
+       # @param type [Symbol] select which indexes type to create.
+       #        Can be :all (default), :unique_only, :non_unique_only.
+       def create_indexes(dataset: nil, type: :all, drop_retry_on_error: true)
+         dataset ||= write_dataset_name
+         return unless settings.indexes.present?
+
+         indexes = (settings.indexes || [])
+
+         case type
+         when :unique_only
+           indexes = indexes.select { |idx| idx['unique'] }
+         when :non_unique_only
+           indexes = indexes.reject { |idx| idx['unique'] }
+         end
+
+         indexes = indexes.map { |x| format_index(x) }
+         client[dataset].indexes.create_many(indexes)
+       rescue Mongo::Error::OperationFailure => e
+         raise e unless drop_retry_on_error
+         client[dataset].indexes.drop_all
+         create_indexes(drop_retry_on_error: false)
+       end
+
+       def usage(dataset:)
+         indexes = retrieve_collection_indexes(dataset)
+         command = { collstats: dataset }
+         result = client.database.command(command).documents[0]
+         {
+           memory: result['size'],
+           storage: result['storageSize'],
+           effective_indexes: indexes
+         }
+       rescue Mongo::Error::OperationFailure, Mongo::Error::InvalidCollectionName
+         {
+           memory: 0,
+           storage: 0,
+           effective_indexes: indexes
+         }
+       end
+
+       private
+
+       def write_dataset_name
+         settings.write_dataset_name
+       end
+
+       def read_dataset_name
+         settings.read_dataset_name
+       end
+
+       def transform_to_query(opts)
+         sanitized_opts = {}
+         opts.each do |k, v|
+           if v.is_a? Array
+             # e.g. { 'id' => [1,2] } transforms into a mongodb $in clause
+             sanitized_opts[k] = { '$in' => v.map { |value| try_cast_value(k, value) } }
+           elsif v.is_a? Hash
+             sanitized_opts[k] = {}
+             v.each do |operator, value|
+               case operator.to_s
+               when '!='
+                 # we still need to check the value and transform into either
+                 if value.is_a? Array
+                   # { '$nin' => [value] }
+                   sanitized_opts[k]['$nin'] = value.map { |x| try_cast_value(k, x) }
+                 else
+                   # or { '$ne' => value }
+                   sanitized_opts[k]['$ne'] = try_cast_value(k, value)
+                 end
+               when '<'
+                 sanitized_opts[k]['$lt'] = try_cast_value(k, value)
+               when '<='
+                 sanitized_opts[k]['$lte'] = try_cast_value(k, value)
+               when '>'
+                 sanitized_opts[k]['$gt'] = try_cast_value(k, value)
+               when '>='
+                 sanitized_opts[k]['$gte'] = try_cast_value(k, value)
+               end
+             end
+           else
+             sanitized_opts[k] = try_cast_value(k, v)
+           end
+         end
+         sanitized_opts
+       end
+
+       def try_cast_value(field, value)
+         # cast to time when querying on _mojaco_updated_at
+         return Timeliness.parse(value) || value if field =~ /_mojaco_updated_at/
+         # cast to ObjectId when querying on _id
+         return BSON::ObjectId(value) if field == SYSTEM_ID && value.is_a?(String)
+
+         # TODO: add other casts based on the field type
+         value
+       end
+
+       def save_many(records:)
+         client[write_dataset_name].insert_many(records, ordered: false)
+       rescue Mongo::Error::BulkWriteError => e
+         dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
+         # don't raise if the errors are only about duplicated keys
+         raise e unless dup_key_error
+       end
+
+       # Required index format for mongodb:
+       # { :key => { name: 1 }, :unique => true }
+       def format_index(dataset_index)
+         dataset_index = dataset_index.with_indifferent_access
+
+         index_key = {}
+         keys = Array(dataset_index[:key])
+         keys.each { |k| index_key[k] = 1 }
+         index = { key: index_key }
+         index[:unique] = true if dataset_index[:unique]
+         index
+       end
+
+       def retrieve_collection_indexes(collection)
+         mongo_indexes = client[collection].indexes
+         mongo_indexes.map do |idx|
+           # skip the default index
+           next if idx['key'].keys == ['_id']
+
+           index = { 'key' => idx['key'].keys }
+           index['unique'] = true if idx['unique']
+           index
+         end.compact
+       end
+     end
+   end
+ end
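The `transform_to_query` helper above defines the where-hash mini-DSL used throughout the adapter. For illustration, here is how a where hash maps onto the resulting MongoDB query, based on the operator handling shown above; the field names and values are just examples:

```ruby
where = {
  'id'                 => [1, 2, 3],              # array  -> $in
  'age'                => { '>=' => 18, '<' => 65 },
  'status'             => { '!=' => 'archived' }, # scalar -> $ne (an array would become $nin)
  '_mojaco_updated_at' => { '>' => '2017-01-01' } # parsed to a Time by try_cast_value
}

# transform_to_query(where) produces roughly:
# {
#   'id'                 => { '$in' => [1, 2, 3] },
#   'age'                => { '$gte' => 18, '$lt' => 65 },
#   'status'             => { '$ne' => 'archived' },
#   '_mojaco_updated_at' => { '$gt' => <2017-01-01 as a Time> }
# }
```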
data/lib/dataflow/adapters/mysql_adapter.rb ADDED
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     # Interface between a data node and MySQL.
+     # We use MySQL to perform all the store/retrieve operations.
+     class MysqlAdapter < SqlAdapter
+       def fetch_table_usage(dataset:)
+         size = client["SELECT data_length + index_length as size from information_schema.TABLES WHERE table_schema = '#{settings.db_name}' and table_name = '#{dataset}'"].first[:size]
+         {
+           memory: size,
+           storage: size
+         }
+       rescue Sequel::DatabaseError
+         {
+           memory: 0,
+           storage: 0
+         }
+       end
+     end
+   end
+ end
data/lib/dataflow/adapters/psql_adapter.rb ADDED
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     # Interface between a data node and PostgreSQL.
+     # We use PostgreSQL to perform all the store/retrieve operations.
+     class PsqlAdapter < SqlAdapter
+       def fetch_table_usage(dataset:)
+         size = client["SELECT pg_relation_size('#{dataset}') as size"].first[:size]
+         {
+           memory: size,
+           storage: size
+         }
+       rescue Sequel::DatabaseError
+         {
+           memory: 0,
+           storage: 0
+         }
+       end
+     end
+   end
+ end
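Both SQL adapters rely on Sequel's ability to treat a raw SQL string as a dataset, which is why they read `.first[:size]`. A minimal standalone sketch of that pattern; the connection URL and table name are placeholders:

```ruby
require 'sequel'

# Placeholder connection URL; adjust user/password/host/database as needed.
DB = Sequel.connect('postgres://user:password@localhost/dataflow')

# Sequel turns a raw SQL string into a dataset; .first returns a symbol-keyed hash.
row = DB["SELECT pg_relation_size('my_table') AS size"].first
puts row[:size]
```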