dataflow-rb 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.env.test.example +6 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +46 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/dataflow-rb.gemspec +42 -0
- data/lib/config/mongoid.yml +21 -0
- data/lib/dataflow/adapters/csv_adapter.rb +123 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
- data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
- data/lib/dataflow/adapters/psql_adapter.rb +21 -0
- data/lib/dataflow/adapters/settings.rb +33 -0
- data/lib/dataflow/adapters/sql_adapter.rb +322 -0
- data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
- data/lib/dataflow/errors/not_implemented_error.rb +7 -0
- data/lib/dataflow/event_mixin.rb +77 -0
- data/lib/dataflow/extensions/mongo_driver.rb +21 -0
- data/lib/dataflow/extensions/msgpack.rb +19 -0
- data/lib/dataflow/logger.rb +27 -0
- data/lib/dataflow/node.rb +37 -0
- data/lib/dataflow/nodes/compute_node.rb +495 -0
- data/lib/dataflow/nodes/data_node.rb +331 -0
- data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
- data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
- data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
- data/lib/dataflow/nodes/filter/where_node.rb +44 -0
- data/lib/dataflow/nodes/join_node.rb +151 -0
- data/lib/dataflow/nodes/map_node.rb +50 -0
- data/lib/dataflow/nodes/merge_node.rb +33 -0
- data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
- data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
- data/lib/dataflow/nodes/select_keys_node.rb +39 -0
- data/lib/dataflow/nodes/snapshot_node.rb +77 -0
- data/lib/dataflow/nodes/sql_query_node.rb +50 -0
- data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
- data/lib/dataflow/nodes/upsert_node.rb +68 -0
- data/lib/dataflow/properties_mixin.rb +35 -0
- data/lib/dataflow/schema_mixin.rb +134 -0
- data/lib/dataflow/version.rb +4 -0
- data/lib/dataflow-rb.rb +72 -0
- metadata +371 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 39235e7bba48dcc339e007eefd360ae549439a29
+  data.tar.gz: b0c424349d25ecc970a8e74e72f7bbc00d43a0b9
+SHA512:
+  metadata.gz: 96d3a5bc08fb881025d6379e3453efb42e6a0cb7d87f773d3acb44f75236f519ce73e118b8bc51d924ab2862fb56925c6bad05907eb7fd5a4b07c0c19e49422a
+  data.tar.gz: b420ecbcf013232b770260f613ffa9ff298e15b02f41de58ccb5057839897652ce52381b43ee6f41d849ec5dc8ed60ca3af68d9b71bebb4737179affba9b1b05
data/.env.test.example
ADDED
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016-2017, Phybbit Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,46 @@
+# Dataflow
+
+The purpose of this gem is to help build complex dataflows and support automating long-running batch processes.
+It handles parallelizing computation whenever it can and re-computing dependencies that are not up to date.
+
+There are two main concepts in describing a computing graph:
+- data-nodes, which support storing/retrieving data from databases
+- compute-nodes, which support arbitrary processing, can depend on any number of nodes (compute/data), and can push their results to a data-node if needed
+
+The main use case is to represent data sources with data-nodes and link those to compute-nodes. Upon computing, the node will store the result in another data-node.
+
+The graph's metadata (e.g. nodes' dependencies, properties) is stored in MongoDB. It also uses MongoDB as the default DB for data-node storage, as it allows for quick schema-less prototyping. MySQL and PostgreSQL are also supported (through [Sequel](https://github.com/jeremyevans/sequel)).
+
+This repository only includes the most common nodes. Other repos will include custom (application-dependent) nodes.
+
+It has some similarities with the [Luigi](https://github.com/spotify/luigi) Python module.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'dataflow-rb'
+```
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install dataflow-rb
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Development
+
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+## Contributing
+
+Bug reports and pull requests are welcome on GitHub at https://github.com/phybbit/dataflow-rb.
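The Usage section of the README above is still a TODO. As a rough illustration of the data-node / compute-node flow it describes, here is a hypothetical sketch using the node classes listed in this release (`Dataflow::Nodes::DataNode`, `Dataflow::Nodes::ComputeNode`); the property names (`name`, `db_name`, `dependency_ids`, `data_node_id`) and the `recompute` call are assumptions, not documented API:

```ruby
# Hypothetical sketch only: property names and #recompute are assumptions,
# not taken from the gem's documentation.
require 'dataflow-rb'

# Data nodes represent storage (MongoDB by default).
events = Dataflow::Nodes::DataNode.create(db_name: 'dataflow', name: 'raw_events')
output = Dataflow::Nodes::DataNode.create(db_name: 'dataflow', name: 'processed_events')

# A compute node depends on other nodes and pushes its result to a data node.
compute = Dataflow::Nodes::ComputeNode.create(
  name: 'process_events',
  dependency_ids: [events.id],
  data_node_id: output.id
)

# Recompute this node and any dependencies that are not up to date.
compute.recompute
```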
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require "bundler/setup"
+require "dataflow"
+
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+
+# (If you use this, don't forget to add pry to your Gemfile!)
+require "pry"
+Pry.start
+
+# require "irb"
+# IRB.start
data/bin/setup
ADDED
data/dataflow-rb.gemspec
ADDED
@@ -0,0 +1,42 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'dataflow/version'
+
+Gem::Specification.new do |spec|
+  spec.name = 'dataflow-rb'
+  spec.version = Dataflow::VERSION
+  spec.authors = ['okoriko']
+  spec.email = ['eurico@phybbit.com']
+
+  spec.summary = %q{Helps building data and automation pipelines. It handles recomputing dependencies and parallel execution.}
+  spec.description = %q{Helps building data pipelines. It handles recomputing dependencies and parallel execution.}
+  spec.homepage = 'https://phybbit.com'
+
+  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.bindir = 'exe'
+  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+
+  spec.add_development_dependency 'bundler'
+  spec.add_development_dependency 'rake'
+  spec.add_development_dependency 'rspec'
+  spec.add_development_dependency 'byebug'
+  spec.add_development_dependency 'pry-byebug'
+  spec.add_development_dependency 'timecop'
+  spec.add_development_dependency 'ruby-prof'
+  spec.add_development_dependency 'dotenv'
+
+  spec.add_dependency 'activesupport', '>= 4.0.0'
+  spec.add_dependency 'schema-inference', '~>1.2.1'
+  spec.add_dependency 'parallel', '~>1.10'
+  spec.add_dependency 'mongoid', '~>6.0'
+  spec.add_dependency 'sequel', '~>4.0'
+  spec.add_dependency 'mysql2', '~>0.4'
+  spec.add_dependency 'pg', '~>0.19'
+  spec.add_dependency 'sequel_pg', '~>1.6'
+  spec.add_dependency 'msgpack', '~>1.0'
+  spec.add_dependency 'smarter_csv', '1.1.0'
+  spec.add_dependency 'timeliness', '~>0.3'
+  spec.add_dependency 'chronic', '~>0.10'
+end
data/lib/config/mongoid.yml
ADDED
@@ -0,0 +1,21 @@
+test:
+  clients:
+    default:
+      database: dataflow_test
+      hosts:
+        - localhost:27017
+      options:
+        read:
+          mode: :primary
+        max_pool_size: 1
+
+default:
+  clients:
+    default:
+      database: dataflow
+      hosts:
+        - localhost:27017
+      options:
+        read:
+          mode: :primary
+        max_pool_size: 10
data/lib/dataflow/adapters/csv_adapter.rb
ADDED
@@ -0,0 +1,123 @@
+# frozen_string_literal: true
+require 'securerandom'
+
+module Dataflow
+  module Adapters
+    # Interface between a data node and csv.
+    # We use mongodb to perform all the store/retrieve operations.
+    class CsvAdapter
+      include Dataflow::SchemaMixin
+
+      attr_reader :settings
+
+      def initialize(args)
+        # make sure the CsvPath exist
+        `mkdir -p #{Dataflow::CsvPath}`
+        update_settings(args)
+      end
+
+      def update_settings(args)
+        @settings = Dataflow::Adapters::Settings.new(args)
+        @schema = [] # TODO: pre-fetch the csv's schema
+      end
+
+      def set_schema(schema)
+        @schema = schema
+      end
+
+      # retrieve a single element from a data node
+      def find(where: opts = {})
+        raise Errors::NotImplementedError, '#find is not yet support on CSV.'
+      end
+
+      # retrieve all elements from a data node
+      def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+        SmarterCSV.process(file_path, strings_as_keys: true)
+      rescue Errno::ENOENT => e
+        []
+      end
+
+      # count the number of records
+      def count(where: {})
+        all(where: where).count
+      end
+
+      # save the given records
+      def save(records:)
+        write_csv_part(records, keys: @schema.keys)
+      end
+
+      def on_save_finished
+        write_single_csv(keys: @schema.keys)
+      end
+
+      def remove(_opts = {})
+        raise Errors::NotImplementedError, '#find is not yet support on CSV.'
+      end
+
+      def recreate_dataset(dataset: nil)
+        # simply delete the file
+        delete_file(file_path)
+        # and any parts if any is still there
+        file_parts.each { |part| delete_file(part) }
+      end
+
+      def create_indexes(*); end
+
+      private
+
+      def delete_file(path)
+        File.delete(path)
+      rescue Errno::ENOENT => e
+        # no file present, no problem
+      end
+
+      def file_path
+        filename = "#{settings.db_name}.#{settings.dataset_name}.csv"
+        "#{Dataflow::CsvPath}/#{filename}"
+      end
+
+      def file_parts
+        part = "#{settings.db_name}.#{settings.dataset_name}.csv.part_"
+        Dir["#{file_path}.part_*"]
+      end
+
+      def write_csv_part(data, keys:)
+        # prepare the data
+        key_tokens = keys.map { |key| record_dig_tokens(key: key) }
+        rows = data.map do |datum|
+          key_tokens.map { |tokens| datum.dig(*tokens) }
+        end
+
+        # dump in a part file
+        uuid = SecureRandom.hex
+        CSV.open("#{file_path}.part_#{uuid}", 'w') do |csv|
+          rows.each { |row| csv << row }
+        end
+      end
+
+      def write_single_csv(keys:)
+        # export headers
+        header_filepath = "#{file_path}.header"
+        CSV.open(header_filepath, 'w') do |csv|
+          csv << keys
+        end
+
+        # make sure the destination file is deleted
+        delete_file(file_path)
+
+        # merge the files into the output
+        files = [header_filepath] + file_parts
+        files.each do |file|
+          # cat each file to the destination file
+          `cat #{file} >> #{file_path}`
+        end
+
+        # remove the intermediary files
+        files.each do |file|
+          delete_file(file)
+        end
+      end
+    end
+  end
+end
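A note on the adapter above: `#save` dumps each batch of records into a uniquely named `<db>.<dataset>.csv.part_<uuid>` file, and `#on_save_finished` writes a header file and concatenates header plus parts into the final CSV. A minimal sketch of that lifecycle follows; the constructor arguments are assumptions, since the accepted settings keys live in `settings.rb`, which is not shown in this diff:

```ruby
# Illustrative sketch only: the settings keys passed to CsvAdapter.new
# are assumptions.
adapter = Dataflow::Adapters::CsvAdapter.new(db_name: 'dataflow', dataset_name: 'events')
adapter.set_schema('id' => { type: 'integer' }, 'name' => { type: 'string' })

# Each call writes a separate .part_<uuid> file, so parallel workers can
# save without contending on a single file.
adapter.save(records: [{ 'id' => 1, 'name' => 'a' }])
adapter.save(records: [{ 'id' => 2, 'name' => 'b' }])

# Writes the header file and cats header + parts into the final CSV.
adapter.on_save_finished
```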
data/lib/dataflow/adapters/mongo_db_adapter.rb
ADDED
@@ -0,0 +1,307 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and mongodb.
+    # We use mongodb to perform all the store/retrieve operations.
+    class MongoDbAdapter
+      SYSTEM_ID = '_id'
+
+      class << self
+        def client(settings, db_name: nil)
+          @clients ||= {}
+          host = ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
+          port = '27017'
+          connection_uri = settings.connection_uri || "#{host}:#{port}"
+          db_name ||= settings.db_name
+          @clients["#{connection_uri}.#{db_name}"] ||= Mongo::Client.new([connection_uri], database: db_name)
+        end
+
+        def admin_client(settings)
+          return @admin_client if @admin_client
+          @admin_client = client(settings, db_name: 'admin')
+        end
+
+        # Force the clients to disconnect their connections.
+        # Use before forking.
+        def disconnect_clients
+          @clients ||= {}
+          @clients.values.each(&:close)
+        end
+      end
+
+      attr_reader :settings
+      attr_reader :client
+
+      def initialize(args)
+        update_settings(args)
+        @client = MongoDbAdapter.client(settings)
+        @admin_client = MongoDbAdapter.admin_client(settings)
+      end
+
+      def update_settings(args)
+        @settings = Dataflow::Adapters::Settings.new(args)
+      end
+
+      # retrieve a single element from a data node
+      def find(where: {}, fields: [], sort: {}, offset: 0)
+        all(where: where, fields: fields, sort: sort, offset: offset, limit: 1).first
+      end
+
+      # retrieve all elements from a data node
+      def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+        projection = fields.map { |field| [field, 1] }
+
+        unless fields.map(&:to_s).include?(SYSTEM_ID)
+          # by default, do not select the _id field
+          projection << [SYSTEM_ID, 0].freeze
+        end
+
+        opts = transform_to_query(where)
+        res = client[read_dataset_name].find(opts)
+        res = res.projection(projection.to_h)
+
+        res = res.sort(sort) if sort
+        res = res.skip(offset) if offset > 0
+        res = res.limit(limit) if limit > 0
+
+        if block_given?
+          yield res
+        else
+          res.to_a
+        end
+      end
+
+      # Helper that supports paginating through the whole dataset at fixed
+      # performance. Unlike using offset/skip which requires to read through
+      # the skipped content (high usage of CPU), we use the internal mongo
+      # cursor to get batch of results.
+      # @return [Hash] with 2 fields: data and next_cursor for the next call
+      def all_paginated(where: {}, fields: [], cursor: nil)
+        cursor = cursor.to_i
+        data = []
+
+        # If there is no cursor, we make the initial query
+        # get the first batch of data and get the cursor id.
+        if cursor.zero?
+          all(where: where, fields: fields) do |res|
+            results = res.initial_query
+            data = results.documents
+            cursor = res.cursor.id
+          end
+        end
+
+        # The first query's result batch is a small 101 set of results
+        # so we want to get one more batch of data.
+        # However, there might be queries whose results are very small
+        # and the resulting cursor is 0. In such case there is no more
+        # data to be fetched.
+        unless cursor.zero?
+          # send a getMore command on the cursor id
+          command = { getMore: cursor, collection: read_dataset_name }
+          result = client.database.command(command).documents[0]
+          cursor = result['cursor']['id']
+          data += result['cursor']['nextBatch']
+        end
+
+        # We want to return the cursor as a string.
+        # If there is no cursor (zero) then make it empty
+        cursor = '' if cursor.zero?
+
+        { 'data' => data, 'next_cursor' => cursor.to_s }
+      rescue Mongo::Error::OperationFailure
+        { 'data' => data, 'next_cursor' => '' }
+      end
+
+      # Create queries that permit processing the whole dataset in parallel without using offsets.
+      def ordered_system_id_queries(batch_size:)
+        ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
+        queries_count = (ids.size / batch_size.to_f).ceil
+        Array.new(queries_count) do |i|
+          from = ids[i * batch_size]
+          to = ids[(i + 1) * batch_size] || ids[-1]
+          is_last = i == queries_count - 1
+
+          where_query = { SYSTEM_ID => { '>=' => from } }
+          operator = is_last ? '<=' : '<'
+          where_query[SYSTEM_ID][operator] = to
+
+          where_query
+        end
+      end
+
+      # count the number of records
+      def count(where: {})
+        client[read_dataset_name].count(transform_to_query(where))
+      end
+
+      # Save the given records.
+      # @param replace_by [Array] if the replace_by key is provided,
+      #        it will try to replace records with the matching key,
+      #        or insert if none is found.
+      def save(records:, replace_by: nil)
+        if replace_by.present?
+          replace_keys = Array(replace_by)
+          bulk_ops = records.map do |record|
+            filter = replace_keys.map { |x| [x, record[x]] }.to_h
+            {
+              replace_one: {
+                filter: filter,
+                replacement: record,
+                upsert: true
+              }
+            }
+          end
+          client[write_dataset_name].bulk_write(bulk_ops, ordered: false)
+        else
+          save_many(records: records)
+        end
+      end
+
+      # Delete records that match the options.
+      # @param where query to apply on the delete operation.
+      def delete(where: {})
+        client[read_dataset_name].delete_many(transform_to_query(where))
+      end
+
+      # recreate the table/collection
+      def recreate_dataset(dataset: nil)
+        dataset ||= write_dataset_name
+        collection = client[dataset]
+        collection.drop
+        collection.create
+      end
+
+      # Create the indexes on this dataset.
+      # @param dataset [String] Specify on which dataset the operation will be performed.
+      #        Default: the adatpter's settings' dataset.
+      # @param type [Symbol] select which indexes type to create.
+      #        Can be :all (default), :unique_only, :non_unique_only
+      def create_indexes(dataset: nil, type: :all, drop_retry_on_error: true)
+        dataset ||= write_dataset_name
+        return unless settings.indexes.present?
+
+        indexes = (settings.indexes || [])
+
+        case type
+        when :unique_only
+          indexes = indexes.select { |idx| idx['unique'] }
+        when :non_unique_only
+          indexes = indexes.reject { |idx| idx['unique'] }
+        end
+
+        indexes = indexes.map { |x| format_index(x) }
+        client[dataset].indexes.create_many(indexes)
+      rescue Mongo::Error::OperationFailure => e
+        raise e unless drop_retry_on_error
+        client[dataset].indexes.drop_all
+        create_indexes(drop_retry_on_error: false)
+      end
+
+      def usage(dataset:)
+        indexes = retrieve_collection_indexes(dataset)
+        command = { collstats: dataset }
+        result = client.database.command(command).documents[0]
+        {
+          memory: result['size'],
+          storage: result['storageSize'],
+          effective_indexes: indexes
+        }
+      rescue Mongo::Error::OperationFailure, Mongo::Error::InvalidCollectionName
+        {
+          memory: 0,
+          storage: 0,
+          effective_indexes: indexes
+        }
+      end
+
+      private
+
+      def write_dataset_name
+        settings.write_dataset_name
+      end
+
+      def read_dataset_name
+        settings.read_dataset_name
+      end
+
+      def transform_to_query(opts)
+        sanitized_opts = {}
+        opts.each do |k, v|
+          if v.is_a? Array
+            # e.g. { 'id' => [1,2] } transform to mongodb IN clauses
+            sanitized_opts[k] = { '$in' => v.map { |value| try_cast_value(k, value) } }
+          elsif v.is_a? Hash
+            sanitized_opts[k] = {}
+            v.each do |operator, value|
+              case operator.to_s
+              when '!='
+                # we still need to check and transform into
+                if value.is_a? Array
+                  # { '$nin' => [value] }
+                  sanitized_opts[k]['$nin'] = value.map { |x| try_cast_value(k, x) }
+                else
+                  # or {'$ne' => value }
+                  sanitized_opts[k]['$ne'] = try_cast_value(k, value)
+                end
+              when '<'
+                sanitized_opts[k]['$lt'] = try_cast_value(k, value)
+              when '<='
+                sanitized_opts[k]['$lte'] = try_cast_value(k, value)
+              when '>'
+                sanitized_opts[k]['$gt'] = try_cast_value(k, value)
+              when '>='
+                sanitized_opts[k]['$gte'] = try_cast_value(k, value)
+              end
+            end
+          else
+            sanitized_opts[k] = try_cast_value(k, v)
+          end
+        end
+        sanitized_opts
+      end
+
+      def try_cast_value(field, value)
+        # cast to time when querying on _mojaco_updated_at
+        return Timeliness.parse(value) || value if field =~ /_mojaco_updated_at/
+        # cast to ObjectId when querying on _id
+        return BSON::ObjectId(value) if field == SYSTEM_ID && value.is_a?(String)
+
+        # TODO: add other casts based on the field type
+        value
+      end
+
+      def save_many(records:)
+        client[write_dataset_name].insert_many(records, ordered: false)
+      rescue Mongo::Error::BulkWriteError => e
+        dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
+        # don't raise if it is errors about duplicated keys
+        raise e unless dup_key_error
+      end
+
+      # Required index format for mongodb:
+      # { :key => { name: 1 }, :unique => true },
+      def format_index(dataset_index)
+        dataset_index = dataset_index.with_indifferent_access
+
+        index_key = {}
+        keys = Array(dataset_index[:key])
+        keys.each { |k| index_key[k] = 1 }
+        index = { key: index_key }
+        index[:unique] = true if dataset_index[:unique]
+        index
+      end
+
+      def retrieve_collection_indexes(collection)
+        mongo_indexes = client[collection].indexes
+        mongo_indexes.map do |idx|
+          # skip the default index
+          next if idx['key'].keys == ['_id']
+
+          index = { 'key' => idx['key'].keys }
+          index['unique'] = true if idx['unique']
+          index
+        end.compact
+      end
+    end
+  end
+end
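The `transform_to_query` helper above defines the hash-based query language this adapter accepts in its `where:` arguments: plain values match exactly, arrays become `$in` clauses, and operator hashes (`'<'`, `'<='`, `'>'`, `'>='`, `'!='`) map onto the corresponding MongoDB operators. A few examples of that mapping, derived directly from the code shown:

```ruby
# Plain values are matched as-is (with _id strings cast to BSON::ObjectId
# and _mojaco_updated_at values parsed as times).
where = { 'status' => 'done' }

# Array values become $in clauses:
where = { 'id' => [1, 2, 3] }
# => { 'id' => { '$in' => [1, 2, 3] } }

# Operator hashes map to Mongo comparison operators:
where = { 'count' => { '>=' => 10, '<' => 100 } }
# => { 'count' => { '$gte' => 10, '$lt' => 100 } }

# '!=' becomes $ne for scalars and $nin for arrays:
where = { 'status' => { '!=' => %w(failed skipped) } }
# => { 'status' => { '$nin' => ['failed', 'skipped'] } }
```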
data/lib/dataflow/adapters/mysql_adapter.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and mongodb.
+    # We use mongodb to perform all the store/retrieve operations.
+    class MysqlAdapter < SqlAdapter
+      def fetch_table_usage(dataset:)
+        size = client["SELECT data_length + index_length as size from information_schema.TABLES WHERE table_schema = '#{settings.db_name}' and table_name = '#{dataset}'"].first[:size]
+        {
+          memory: size,
+          storage: size
+        }
+      rescue Sequel::DatabaseError => e
+        {
+          memory: 0,
+          storage: 0
+        }
+      end
+    end
+  end
+end
data/lib/dataflow/adapters/psql_adapter.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and mongodb.
+    # We use mongodb to perform all the store/retrieve operations.
+    class PsqlAdapter < SqlAdapter
+      def fetch_table_usage(dataset:)
+        size = client["SELECT pg_relation_size('#{dataset}') as size"].first[:size]
+        {
+          memory: size,
+          storage: size
+        }
+      rescue Sequel::DatabaseError
+        {
+          memory: 0,
+          storage: 0
+        }
+      end
+    end
+  end
+end