dataflow-rb 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.env.test.example +6 -0
  3. data/.gitignore +14 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +46 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/dataflow-rb.gemspec +42 -0
  13. data/lib/config/mongoid.yml +21 -0
  14. data/lib/dataflow/adapters/csv_adapter.rb +123 -0
  15. data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
  16. data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
  17. data/lib/dataflow/adapters/psql_adapter.rb +21 -0
  18. data/lib/dataflow/adapters/settings.rb +33 -0
  19. data/lib/dataflow/adapters/sql_adapter.rb +322 -0
  20. data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
  21. data/lib/dataflow/errors/not_implemented_error.rb +7 -0
  22. data/lib/dataflow/event_mixin.rb +77 -0
  23. data/lib/dataflow/extensions/mongo_driver.rb +21 -0
  24. data/lib/dataflow/extensions/msgpack.rb +19 -0
  25. data/lib/dataflow/logger.rb +27 -0
  26. data/lib/dataflow/node.rb +37 -0
  27. data/lib/dataflow/nodes/compute_node.rb +495 -0
  28. data/lib/dataflow/nodes/data_node.rb +331 -0
  29. data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
  30. data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
  31. data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
  32. data/lib/dataflow/nodes/filter/where_node.rb +44 -0
  33. data/lib/dataflow/nodes/join_node.rb +151 -0
  34. data/lib/dataflow/nodes/map_node.rb +50 -0
  35. data/lib/dataflow/nodes/merge_node.rb +33 -0
  36. data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
  37. data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
  38. data/lib/dataflow/nodes/select_keys_node.rb +39 -0
  39. data/lib/dataflow/nodes/snapshot_node.rb +77 -0
  40. data/lib/dataflow/nodes/sql_query_node.rb +50 -0
  41. data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
  42. data/lib/dataflow/nodes/upsert_node.rb +68 -0
  43. data/lib/dataflow/properties_mixin.rb +35 -0
  44. data/lib/dataflow/schema_mixin.rb +134 -0
  45. data/lib/dataflow/version.rb +4 -0
  46. data/lib/dataflow-rb.rb +72 -0
  47. metadata +371 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 39235e7bba48dcc339e007eefd360ae549439a29
+   data.tar.gz: b0c424349d25ecc970a8e74e72f7bbc00d43a0b9
+ SHA512:
+   metadata.gz: 96d3a5bc08fb881025d6379e3453efb42e6a0cb7d87f773d3acb44f75236f519ce73e118b8bc51d924ab2862fb56925c6bad05907eb7fd5a4b07c0c19e49422a
+   data.tar.gz: b420ecbcf013232b770260f613ffa9ff298e15b02f41de58ccb5057839897652ce52381b43ee6f41d849ec5dc8ed60ca3af68d9b71bebb4737179affba9b1b05
data/.env.test.example ADDED
@@ -0,0 +1,6 @@
+ # These need to be set for the tests to run properly.
+ # You need to create a .env.test and set these properly:
+ #MOJACO_MYSQL_USER=
+ #MOJACO_MYSQL_PASSWORD=
+ #MOJACO_POSTGRESQL_USER=
+ #MOJACO_POSTGRESQL_PASSWORD=
data/.gitignore ADDED
@@ -0,0 +1,14 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+ /datanodes
+ .idea
+ .byebug_history
+ .tags
+ .env.test
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --format documentation
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
+ language: ruby
+ rvm:
+   - 2.3.3
+ before_install: gem install bundler -v 1.14.3
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in dataflow.gemspec
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2016-2017, Phybbit Ltd.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,46 @@
+ # Dataflow
+
+ The purpose of this gem is to help build complex dataflows and support automating long-running batch processes.
+ It handles parallelizing computation whenever it can and re-computing dependencies that are not up to date.
+
+ There are two main concepts for describing a computation graph:
+ - data-nodes, which support storing/retrieving data from databases
+ - compute-nodes, which support arbitrary processing, can depend on any number of nodes (compute or data) and can push their results to a data-node if needed
+
+ The main use case is to represent data sources with data-nodes and link those to compute-nodes. Upon computing, the node will store the result in another data-node.
+
+ The graph's metadata (e.g. nodes' dependencies, properties) is stored in MongoDB. It also uses MongoDB as the default DB for data-node storage, as it allows for quick schema-less prototyping. MySQL and PostgreSQL are also supported (through [Sequel](https://github.com/jeremyevans/sequel)).
+
+ This repository only includes the most common nodes. Other repos will include custom (application-dependent) nodes.
+
+ It has some similarities with the [Luigi](https://github.com/spotify/luigi) Python module.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'dataflow-rb'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install dataflow-rb
+
+ ## Usage
+
+ TODO: Write usage instructions here
+
+ ## Development
+
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/phybbit/dataflow-rb.
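Since the README's Usage section is still a TODO, here is a hypothetical sketch of how the data-node/compute-node concepts described above might fit together. Only the class names are inferred from the file list in this diff (`data_node.rb`, `compute_node.rb`); the attribute names (`db_name`, `name`, `dependency_ids`, `data_node_id`) and the `recompute` call are assumptions, not confirmed by the package.

```ruby
# Hypothetical usage sketch -- attribute and method names are assumptions.
require 'dataflow-rb'

# Data nodes represent stored records (MongoDB by default).
raw = Dataflow::Nodes::DataNode.create(db_name: 'example', name: 'raw_records')
out = Dataflow::Nodes::DataNode.create(db_name: 'example', name: 'computed_records')

# A compute node depends on other nodes and pushes its result to a data node.
compute = Dataflow::Nodes::ComputeNode.create(
  name: 'compute_records',
  dependency_ids: [raw.id],
  data_node_id: out.id
)

# Assumed entry point: recompute out-of-date dependencies, then this node.
compute.recompute
```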
data/Rakefile ADDED
@@ -0,0 +1,6 @@
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "dataflow"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ require "pry"
+ Pry.start
+
+ # require "irb"
+ # IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ set -euo pipefail
+ IFS=$'\n\t'
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/dataflow-rb.gemspec ADDED
@@ -0,0 +1,42 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'dataflow/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = 'dataflow-rb'
+   spec.version = Dataflow::VERSION
+   spec.authors = ['okoriko']
+   spec.email = ['eurico@phybbit.com']
+
+   spec.summary = %q{Helps building data and automation pipelines. It handles recomputing dependencies and parallel execution.}
+   spec.description = %q{Helps building data pipelines. It handles recomputing dependencies and parallel execution.}
+   spec.homepage = 'https://phybbit.com'
+
+   spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir = 'exe'
+   spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ['lib']
+
+   spec.add_development_dependency 'bundler'
+   spec.add_development_dependency 'rake'
+   spec.add_development_dependency 'rspec'
+   spec.add_development_dependency 'byebug'
+   spec.add_development_dependency 'pry-byebug'
+   spec.add_development_dependency 'timecop'
+   spec.add_development_dependency 'ruby-prof'
+   spec.add_development_dependency 'dotenv'
+
+   spec.add_dependency 'activesupport', '>= 4.0.0'
+   spec.add_dependency 'schema-inference', '~>1.2.1'
+   spec.add_dependency 'parallel', '~>1.10'
+   spec.add_dependency 'mongoid', '~>6.0'
+   spec.add_dependency 'sequel', '~>4.0'
+   spec.add_dependency 'mysql2', '~>0.4'
+   spec.add_dependency 'pg', '~>0.19'
+   spec.add_dependency 'sequel_pg', '~>1.6'
+   spec.add_dependency 'msgpack', '~>1.0'
+   spec.add_dependency 'smarter_csv', '1.1.0'
+   spec.add_dependency 'timeliness', '~>0.3'
+   spec.add_dependency 'chronic', '~>0.10'
+ end
data/lib/config/mongoid.yml ADDED
@@ -0,0 +1,21 @@
+ test:
+   clients:
+     default:
+       database: dataflow_test
+       hosts:
+         - localhost:27017
+       options:
+         read:
+           mode: :primary
+         max_pool_size: 1
+
+ default:
+   clients:
+     default:
+       database: dataflow
+       hosts:
+         - localhost:27017
+       options:
+         read:
+           mode: :primary
+         max_pool_size: 10
data/lib/dataflow/adapters/csv_adapter.rb ADDED
@@ -0,0 +1,123 @@
+ # frozen_string_literal: true
+ require 'securerandom'
+
+ module Dataflow
+   module Adapters
+     # Interface between a data node and csv.
+     # We use CSV files to perform all the store/retrieve operations.
+     class CsvAdapter
+       include Dataflow::SchemaMixin
+
+       attr_reader :settings
+
+       def initialize(args)
+         # make sure the CsvPath exists
+         `mkdir -p #{Dataflow::CsvPath}`
+         update_settings(args)
+       end
+
+       def update_settings(args)
+         @settings = Dataflow::Adapters::Settings.new(args)
+         @schema = [] # TODO: pre-fetch the csv's schema
+       end
+
+       def set_schema(schema)
+         @schema = schema
+       end
+
+       # retrieve a single element from a data node
+       def find(where: {})
+         raise Errors::NotImplementedError, '#find is not yet supported on CSV.'
+       end
+
+       # retrieve all elements from a data node
+       def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+         SmarterCSV.process(file_path, strings_as_keys: true)
+       rescue Errno::ENOENT => e
+         []
+       end
+
+       # count the number of records
+       def count(where: {})
+         all(where: where).count
+       end
+
+       # save the given records
+       def save(records:)
+         write_csv_part(records, keys: @schema.keys)
+       end
+
+       def on_save_finished
+         write_single_csv(keys: @schema.keys)
+       end
+
+       def remove(_opts = {})
+         raise Errors::NotImplementedError, '#remove is not yet supported on CSV.'
+       end
+
+       def recreate_dataset(dataset: nil)
+         # simply delete the file
+         delete_file(file_path)
+         # and any parts, if any are still there
+         file_parts.each { |part| delete_file(part) }
+       end
+
+       def create_indexes(*); end
+
+       private
+
+       def delete_file(path)
+         File.delete(path)
+       rescue Errno::ENOENT => e
+         # no file present, no problem
+       end
+
+       def file_path
+         filename = "#{settings.db_name}.#{settings.dataset_name}.csv"
+         "#{Dataflow::CsvPath}/#{filename}"
+       end
+
+       def file_parts
+         part = "#{settings.db_name}.#{settings.dataset_name}.csv.part_"
+         Dir["#{file_path}.part_*"]
+       end
+
+       def write_csv_part(data, keys:)
+         # prepare the data
+         key_tokens = keys.map { |key| record_dig_tokens(key: key) }
+         rows = data.map do |datum|
+           key_tokens.map { |tokens| datum.dig(*tokens) }
+         end
+
+         # dump in a part file
+         uuid = SecureRandom.hex
+         CSV.open("#{file_path}.part_#{uuid}", 'w') do |csv|
+           rows.each { |row| csv << row }
+         end
+       end
+
+       def write_single_csv(keys:)
+         # export headers
+         header_filepath = "#{file_path}.header"
+         CSV.open(header_filepath, 'w') do |csv|
+           csv << keys
+         end
+
+         # make sure the destination file is deleted
+         delete_file(file_path)
+
+         # merge the files into the output
+         files = [header_filepath] + file_parts
+         files.each do |file|
+           # cat each file to the destination file
+           `cat #{file} >> #{file_path}`
+         end
+
+         # remove the intermediary files
+         files.each do |file|
+           delete_file(file)
+         end
+       end
+     end
+   end
+ end
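For orientation, a hypothetical sketch of the CSV adapter's two-phase write as implemented above. The constructor arguments shown are assumptions about what `Dataflow::Adapters::Settings` accepts; they are not confirmed by this diff.

```ruby
# Hypothetical sketch of CsvAdapter's two-phase write (argument names assumed).
adapter = Dataflow::Adapters::CsvAdapter.new(db_name: 'example', dataset_name: 'records')
adapter.set_schema('id' => { type: 'integer' }, 'name' => { type: 'string' })

# Each #save call dumps its records into a separate "<file>.part_<uuid>" file.
adapter.save(records: [{ 'id' => 1, 'name' => 'a' }])
adapter.save(records: [{ 'id' => 2, 'name' => 'b' }])

# #on_save_finished writes the header file and concatenates header + parts
# into "<db_name>.<dataset_name>.csv" under Dataflow::CsvPath.
adapter.on_save_finished
```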
data/lib/dataflow/adapters/mongo_db_adapter.rb ADDED
@@ -0,0 +1,307 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     # Interface between a data node and mongodb.
+     # We use mongodb to perform all the store/retrieve operations.
+     class MongoDbAdapter
+       SYSTEM_ID = '_id'
+
+       class << self
+         def client(settings, db_name: nil)
+           @clients ||= {}
+           host = ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
+           port = '27017'
+           connection_uri = settings.connection_uri || "#{host}:#{port}"
+           db_name ||= settings.db_name
+           @clients["#{connection_uri}.#{db_name}"] ||= Mongo::Client.new([connection_uri], database: db_name)
+         end
+
+         def admin_client(settings)
+           return @admin_client if @admin_client
+           @admin_client = client(settings, db_name: 'admin')
+         end
+
+         # Force the clients to disconnect their connections.
+         # Use before forking.
+         def disconnect_clients
+           @clients ||= {}
+           @clients.values.each(&:close)
+         end
+       end
+
+       attr_reader :settings
+       attr_reader :client
+
+       def initialize(args)
+         update_settings(args)
+         @client = MongoDbAdapter.client(settings)
+         @admin_client = MongoDbAdapter.admin_client(settings)
+       end
+
+       def update_settings(args)
+         @settings = Dataflow::Adapters::Settings.new(args)
+       end
+
+       # retrieve a single element from a data node
+       def find(where: {}, fields: [], sort: {}, offset: 0)
+         all(where: where, fields: fields, sort: sort, offset: offset, limit: 1).first
+       end
+
+       # retrieve all elements from a data node
+       def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+         projection = fields.map { |field| [field, 1] }
+
+         unless fields.map(&:to_s).include?(SYSTEM_ID)
+           # by default, do not select the _id field
+           projection << [SYSTEM_ID, 0].freeze
+         end
+
+         opts = transform_to_query(where)
+         res = client[read_dataset_name].find(opts)
+         res = res.projection(projection.to_h)
+
+         res = res.sort(sort) if sort
+         res = res.skip(offset) if offset > 0
+         res = res.limit(limit) if limit > 0
+
+         if block_given?
+           yield res
+         else
+           res.to_a
+         end
+       end
+
+       # Helper that supports paginating through the whole dataset with
+       # consistent performance. Unlike offset/skip, which requires reading
+       # through the skipped content (high CPU usage), we use the internal
+       # mongo cursor to get batches of results.
+       # @return [Hash] with 2 fields: data and next_cursor for the next call
+       def all_paginated(where: {}, fields: [], cursor: nil)
+         cursor = cursor.to_i
+         data = []
+
+         # If there is no cursor, we make the initial query,
+         # get the first batch of data and get the cursor id.
+         if cursor.zero?
+           all(where: where, fields: fields) do |res|
+             results = res.initial_query
+             data = results.documents
+             cursor = res.cursor.id
+           end
+         end
+
+         # The first query's result batch is a small set (101 documents),
+         # so we want to get one more batch of data.
+         # However, there might be queries whose results are very small
+         # and the resulting cursor is 0. In such a case there is no more
+         # data to be fetched.
+         unless cursor.zero?
+           # send a getMore command on the cursor id
+           command = { getMore: cursor, collection: read_dataset_name }
+           result = client.database.command(command).documents[0]
+           cursor = result['cursor']['id']
+           data += result['cursor']['nextBatch']
+         end
+
+         # We want to return the cursor as a string.
+         # If there is no cursor (zero) then make it empty.
+         cursor = '' if cursor.zero?
+
+         { 'data' => data, 'next_cursor' => cursor.to_s }
+       rescue Mongo::Error::OperationFailure
+         { 'data' => data, 'next_cursor' => '' }
+       end
+
+       # Create queries that permit processing the whole dataset in parallel without using offsets.
+       def ordered_system_id_queries(batch_size:)
+         ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
+         queries_count = (ids.size / batch_size.to_f).ceil
+         Array.new(queries_count) do |i|
+           from = ids[i * batch_size]
+           to = ids[(i + 1) * batch_size] || ids[-1]
+           is_last = i == queries_count - 1
+
+           where_query = { SYSTEM_ID => { '>=' => from } }
+           operator = is_last ? '<=' : '<'
+           where_query[SYSTEM_ID][operator] = to
+
+           where_query
+         end
+       end
+
+       # count the number of records
+       def count(where: {})
+         client[read_dataset_name].count(transform_to_query(where))
+       end
+
+       # Save the given records.
+       # @param replace_by [Array] if the replace_by key is provided,
+       #   it will try to replace records with the matching key,
+       #   or insert if none is found.
+       def save(records:, replace_by: nil)
+         if replace_by.present?
+           replace_keys = Array(replace_by)
+           bulk_ops = records.map do |record|
+             filter = replace_keys.map { |x| [x, record[x]] }.to_h
+             {
+               replace_one: {
+                 filter: filter,
+                 replacement: record,
+                 upsert: true
+               }
+             }
+           end
+           client[write_dataset_name].bulk_write(bulk_ops, ordered: false)
+         else
+           save_many(records: records)
+         end
+       end
+
+       # Delete records that match the options.
+       # @param where query to apply on the delete operation.
+       def delete(where: {})
+         client[read_dataset_name].delete_many(transform_to_query(where))
+       end
+
+       # recreate the table/collection
+       def recreate_dataset(dataset: nil)
+         dataset ||= write_dataset_name
+         collection = client[dataset]
+         collection.drop
+         collection.create
+       end
+
+       # Create the indexes on this dataset.
+       # @param dataset [String] Specify on which dataset the operation will be performed.
+       #   Default: the adapter's settings' dataset.
+       # @param type [Symbol] select which indexes type to create.
+       #   Can be :all (default), :unique_only, :non_unique_only
+       def create_indexes(dataset: nil, type: :all, drop_retry_on_error: true)
+         dataset ||= write_dataset_name
+         return unless settings.indexes.present?
+
+         indexes = (settings.indexes || [])
+
+         case type
+         when :unique_only
+           indexes = indexes.select { |idx| idx['unique'] }
+         when :non_unique_only
+           indexes = indexes.reject { |idx| idx['unique'] }
+         end
+
+         indexes = indexes.map { |x| format_index(x) }
+         client[dataset].indexes.create_many(indexes)
+       rescue Mongo::Error::OperationFailure => e
+         raise e unless drop_retry_on_error
+         client[dataset].indexes.drop_all
+         create_indexes(drop_retry_on_error: false)
+       end
+
+       def usage(dataset:)
+         indexes = retrieve_collection_indexes(dataset)
+         command = { collstats: dataset }
+         result = client.database.command(command).documents[0]
+         {
+           memory: result['size'],
+           storage: result['storageSize'],
+           effective_indexes: indexes
+         }
+       rescue Mongo::Error::OperationFailure, Mongo::Error::InvalidCollectionName
+         {
+           memory: 0,
+           storage: 0,
+           effective_indexes: indexes
+         }
+       end
+
+       private
+
+       def write_dataset_name
+         settings.write_dataset_name
+       end
+
+       def read_dataset_name
+         settings.read_dataset_name
+       end
+
+       def transform_to_query(opts)
+         sanitized_opts = {}
+         opts.each do |k, v|
+           if v.is_a? Array
+             # e.g. { 'id' => [1,2] } transforms to a mongodb IN clause
+             sanitized_opts[k] = { '$in' => v.map { |value| try_cast_value(k, value) } }
+           elsif v.is_a? Hash
+             sanitized_opts[k] = {}
+             v.each do |operator, value|
+               case operator.to_s
+               when '!='
+                 # we still need to check and transform into
+                 if value.is_a? Array
+                   # { '$nin' => [value] }
+                   sanitized_opts[k]['$nin'] = value.map { |x| try_cast_value(k, x) }
+                 else
+                   # or { '$ne' => value }
+                   sanitized_opts[k]['$ne'] = try_cast_value(k, value)
+                 end
+               when '<'
+                 sanitized_opts[k]['$lt'] = try_cast_value(k, value)
+               when '<='
+                 sanitized_opts[k]['$lte'] = try_cast_value(k, value)
+               when '>'
+                 sanitized_opts[k]['$gt'] = try_cast_value(k, value)
+               when '>='
+                 sanitized_opts[k]['$gte'] = try_cast_value(k, value)
+               end
+             end
+           else
+             sanitized_opts[k] = try_cast_value(k, v)
+           end
+         end
+         sanitized_opts
+       end
+
+       def try_cast_value(field, value)
+         # cast to time when querying on _mojaco_updated_at
+         return Timeliness.parse(value) || value if field =~ /_mojaco_updated_at/
+         # cast to ObjectId when querying on _id
+         return BSON::ObjectId(value) if field == SYSTEM_ID && value.is_a?(String)
+
+         # TODO: add other casts based on the field type
+         value
+       end
+
+       def save_many(records:)
+         client[write_dataset_name].insert_many(records, ordered: false)
+       rescue Mongo::Error::BulkWriteError => e
+         dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
+         # don't raise if the errors are only about duplicated keys
+         raise e unless dup_key_error
+       end
+
+       # Required index format for mongodb:
+       # { :key => { name: 1 }, :unique => true }
+       def format_index(dataset_index)
+         dataset_index = dataset_index.with_indifferent_access
+
+         index_key = {}
+         keys = Array(dataset_index[:key])
+         keys.each { |k| index_key[k] = 1 }
+         index = { key: index_key }
+         index[:unique] = true if dataset_index[:unique]
+         index
+       end
+
+       def retrieve_collection_indexes(collection)
+         mongo_indexes = client[collection].indexes
+         mongo_indexes.map do |idx|
+           # skip the default index
+           next if idx['key'].keys == ['_id']
+
+           index = { 'key' => idx['key'].keys }
+           index['unique'] = true if idx['unique']
+           index
+         end.compact
+       end
+     end
+   end
+ end
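As a quick reference, here is an editor-added illustration of the translation that `transform_to_query` above performs, based directly on the code shown; the field names and values are arbitrary examples.

```ruby
# Editor-added illustration of MongoDbAdapter#transform_to_query (values arbitrary).
where = {
  'id'    => [1, 2],                    # Array              => '$in'
  'count' => { '>=' => 10, '<' => 20 }, # comparison hash    => '$gte' / '$lt'
  'name'  => { '!=' => %w[a b] }        # '!=' with an Array => '$nin'
}
# Expected MongoDB filter:
# {
#   'id'    => { '$in'  => [1, 2] },
#   'count' => { '$gte' => 10, '$lt' => 20 },
#   'name'  => { '$nin' => ['a', 'b'] }
# }
```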
data/lib/dataflow/adapters/mysql_adapter.rb ADDED
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     # Interface between a data node and MySQL.
+     # We use MySQL to perform all the store/retrieve operations.
+     class MysqlAdapter < SqlAdapter
+       def fetch_table_usage(dataset:)
+         size = client["SELECT data_length + index_length as size from information_schema.TABLES WHERE table_schema = '#{settings.db_name}' and table_name = '#{dataset}'"].first[:size]
+         {
+           memory: size,
+           storage: size
+         }
+       rescue Sequel::DatabaseError => e
+         {
+           memory: 0,
+           storage: 0
+         }
+       end
+     end
+   end
+ end
data/lib/dataflow/adapters/psql_adapter.rb ADDED
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+ module Dataflow
+   module Adapters
+     # Interface between a data node and PostgreSQL.
+     # We use PostgreSQL to perform all the store/retrieve operations.
+     class PsqlAdapter < SqlAdapter
+       def fetch_table_usage(dataset:)
+         size = client["SELECT pg_relation_size('#{dataset}') as size"].first[:size]
+         {
+           memory: size,
+           storage: size
+         }
+       rescue Sequel::DatabaseError
+         {
+           memory: 0,
+           storage: 0
+         }
+       end
+     end
+   end
+ end