dataflow-rb 0.9.0
- checksums.yaml +7 -0
- data/.env.test.example +6 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +46 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/dataflow-rb.gemspec +42 -0
- data/lib/config/mongoid.yml +21 -0
- data/lib/dataflow/adapters/csv_adapter.rb +123 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
- data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
- data/lib/dataflow/adapters/psql_adapter.rb +21 -0
- data/lib/dataflow/adapters/settings.rb +33 -0
- data/lib/dataflow/adapters/sql_adapter.rb +322 -0
- data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
- data/lib/dataflow/errors/not_implemented_error.rb +7 -0
- data/lib/dataflow/event_mixin.rb +77 -0
- data/lib/dataflow/extensions/mongo_driver.rb +21 -0
- data/lib/dataflow/extensions/msgpack.rb +19 -0
- data/lib/dataflow/logger.rb +27 -0
- data/lib/dataflow/node.rb +37 -0
- data/lib/dataflow/nodes/compute_node.rb +495 -0
- data/lib/dataflow/nodes/data_node.rb +331 -0
- data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
- data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
- data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
- data/lib/dataflow/nodes/filter/where_node.rb +44 -0
- data/lib/dataflow/nodes/join_node.rb +151 -0
- data/lib/dataflow/nodes/map_node.rb +50 -0
- data/lib/dataflow/nodes/merge_node.rb +33 -0
- data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
- data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
- data/lib/dataflow/nodes/select_keys_node.rb +39 -0
- data/lib/dataflow/nodes/snapshot_node.rb +77 -0
- data/lib/dataflow/nodes/sql_query_node.rb +50 -0
- data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
- data/lib/dataflow/nodes/upsert_node.rb +68 -0
- data/lib/dataflow/properties_mixin.rb +35 -0
- data/lib/dataflow/schema_mixin.rb +134 -0
- data/lib/dataflow/version.rb +4 -0
- data/lib/dataflow-rb.rb +72 -0
- metadata +371 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 39235e7bba48dcc339e007eefd360ae549439a29
+  data.tar.gz: b0c424349d25ecc970a8e74e72f7bbc00d43a0b9
+SHA512:
+  metadata.gz: 96d3a5bc08fb881025d6379e3453efb42e6a0cb7d87f773d3acb44f75236f519ce73e118b8bc51d924ab2862fb56925c6bad05907eb7fd5a4b07c0c19e49422a
+  data.tar.gz: b420ecbcf013232b770260f613ffa9ff298e15b02f41de58ccb5057839897652ce52381b43ee6f41d849ec5dc8ed60ca3af68d9b71bebb4737179affba9b1b05
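These digests can be recomputed from the archives inside the `.gem` file. A minimal sketch using only Ruby's standard library (it assumes the gem has already been unpacked, e.g. with `tar -xf dataflow-rb-0.9.0.gem`, which yields `metadata.gz` and `data.tar.gz`):

```ruby
require 'digest'

# Recompute the digests recorded in checksums.yaml for each archive.
%w[metadata.gz data.tar.gz].each do |name|
  bytes = File.binread(name)
  puts "#{name} SHA1:   #{Digest::SHA1.hexdigest(bytes)}"
  puts "#{name} SHA512: #{Digest::SHA512.hexdigest(bytes)}"
end
```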
data/.env.test.example
ADDED
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2016-2017, Phybbit Ltd.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,46 @@
+# Dataflow
+
+The purpose of this gem is to help build complex dataflows and to support automating long-running batch processes.
+It parallelizes computation whenever it can and re-computes dependencies that are not up-to-date.
+
+There are two main concepts in describing a computing graph:
+- data-nodes, which support storing/retrieving data from databases
+- compute-nodes, which support arbitrary processing, can depend on any number of nodes (compute/data) and can push their results to a data-node if needed
+
+The main use case is to represent data sources with data-nodes and link those to compute-nodes. Upon computing, the node will store the result in another data-node.
+
+The graph's metadata (e.g. nodes' dependencies, properties) is stored in MongoDB. It also uses MongoDB as the default DB for the data-node storage as it allows for quick schema-less prototyping. MySQL and PostgreSQL are also supported (through [Sequel](https://github.com/jeremyevans/sequel)).
+
+This repository only includes the most common nodes. Other repos will include custom (application-dependent) nodes.
+
+It has some similarities with the [Luigi](https://github.com/spotify/luigi) python module.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+```ruby
+gem 'dataflow-rb'
+```
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install dataflow-rb
+
+## Usage
+
+TODO: Write usage instructions here
+
+## Development
+
+After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+## Contributing
+
+Bug reports and pull requests are welcome on GitHub at https://github.com/phybbit/dataflow-rb.
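Since the README's usage section is still a TODO, here is a minimal, gem-independent sketch of the core idea it describes: a graph where each node declares its dependencies and is re-run only when one of them is newer than its last output. This is illustrative plain Ruby, not dataflow-rb's actual API.

```ruby
# Illustrative only: a toy dependency graph, not dataflow-rb's API.
Node = Struct.new(:name, :deps, :updated_at, :compute) do
  # A node is stale when any dependency was updated after it.
  def stale?
    deps.any? { |d| d.updated_at > updated_at }
  end

  # Recompute dependencies first, then ourselves, but only if needed.
  def recompute!
    deps.each { |d| d.recompute! if d.stale? }
    return unless stale?
    compute.call
    self.updated_at = Time.now
  end
end

raw = Node.new('raw_data', [], Time.now, -> {})
agg = Node.new('aggregate', [raw], Time.at(0), -> { puts 'computing aggregate...' })
agg.recompute! # re-runs only the out-of-date parts of the graph
```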
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require "bundler/setup"
+require "dataflow-rb"
+
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+
+# (If you use this, don't forget to add pry to your Gemfile!)
+require "pry"
+Pry.start
+
+# require "irb"
+# IRB.start
data/bin/setup
ADDED
data/dataflow-rb.gemspec
ADDED
@@ -0,0 +1,42 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'dataflow/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = 'dataflow-rb'
+  spec.version       = Dataflow::VERSION
+  spec.authors       = ['okoriko']
+  spec.email         = ['eurico@phybbit.com']
+
+  spec.summary       = %q{Helps building data and automation pipelines. It handles recomputing dependencies and parallel execution.}
+  spec.description   = %q{Helps building data pipelines. It handles recomputing dependencies and parallel execution.}
+  spec.homepage      = 'https://phybbit.com'
+
+  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.bindir        = 'exe'
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ['lib']
+
+  spec.add_development_dependency 'bundler'
+  spec.add_development_dependency 'rake'
+  spec.add_development_dependency 'rspec'
+  spec.add_development_dependency 'byebug'
+  spec.add_development_dependency 'pry-byebug'
+  spec.add_development_dependency 'timecop'
+  spec.add_development_dependency 'ruby-prof'
+  spec.add_development_dependency 'dotenv'
+
+  spec.add_dependency 'activesupport', '>= 4.0.0'
+  spec.add_dependency 'schema-inference', '~>1.2.1'
+  spec.add_dependency 'parallel', '~>1.10'
+  spec.add_dependency 'mongoid', '~>6.0'
+  spec.add_dependency 'sequel', '~>4.0'
+  spec.add_dependency 'mysql2', '~>0.4'
+  spec.add_dependency 'pg', '~>0.19'
+  spec.add_dependency 'sequel_pg', '~>1.6'
+  spec.add_dependency 'msgpack', '~>1.0'
+  spec.add_dependency 'smarter_csv', '1.1.0'
+  spec.add_dependency 'timeliness', '~>0.3'
+  spec.add_dependency 'chronic', '~>0.10'
+end
data/lib/config/mongoid.yml
ADDED
@@ -0,0 +1,21 @@
+test:
+  clients:
+    default:
+      database: dataflow_test
+      hosts:
+        - localhost:27017
+      options:
+        read:
+          mode: :primary
+        max_pool_size: 1
+
+default:
+  clients:
+    default:
+      database: dataflow
+      hosts:
+        - localhost:27017
+      options:
+        read:
+          mode: :primary
+        max_pool_size: 10
data/lib/dataflow/adapters/csv_adapter.rb
ADDED
@@ -0,0 +1,123 @@
+# frozen_string_literal: true
+require 'securerandom'
+
+module Dataflow
+  module Adapters
+    # Interface between a data node and csv.
+    # Stores/retrieves the node's data as CSV files on disk.
+    class CsvAdapter
+      include Dataflow::SchemaMixin
+
+      attr_reader :settings
+
+      def initialize(args)
+        # make sure the CsvPath exists
+        `mkdir -p #{Dataflow::CsvPath}`
+        update_settings(args)
+      end
+
+      def update_settings(args)
+        @settings = Dataflow::Adapters::Settings.new(args)
+        @schema = [] # TODO: pre-fetch the csv's schema
+      end
+
+      def set_schema(schema)
+        @schema = schema
+      end
+
+      # retrieve a single element from a data node
+      def find(where: {})
+        raise Errors::NotImplementedError, '#find is not yet supported on CSV.'
+      end
+
+      # retrieve all elements from a data node
+      def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+        SmarterCSV.process(file_path, strings_as_keys: true)
+      rescue Errno::ENOENT
+        []
+      end
+
+      # count the number of records
+      def count(where: {})
+        all(where: where).count
+      end
+
+      # save the given records
+      def save(records:)
+        write_csv_part(records, keys: @schema.keys)
+      end
+
+      def on_save_finished
+        write_single_csv(keys: @schema.keys)
+      end
+
+      def remove(_opts = {})
+        raise Errors::NotImplementedError, '#remove is not yet supported on CSV.'
+      end
+
+      def recreate_dataset(dataset: nil)
+        # simply delete the file
+        delete_file(file_path)
+        # and any parts if any are still there
+        file_parts.each { |part| delete_file(part) }
+      end
+
+      def create_indexes(*); end
+
+      private
+
+      def delete_file(path)
+        File.delete(path)
+      rescue Errno::ENOENT
+        # no file present, no problem
+      end
+
+      def file_path
+        filename = "#{settings.db_name}.#{settings.dataset_name}.csv"
+        "#{Dataflow::CsvPath}/#{filename}"
+      end
+
+      def file_parts
+        part_prefix = "#{file_path}.part_"
+        Dir["#{part_prefix}*"]
+      end
+
+      def write_csv_part(data, keys:)
+        # prepare the data
+        key_tokens = keys.map { |key| record_dig_tokens(key: key) }
+        rows = data.map do |datum|
+          key_tokens.map { |tokens| datum.dig(*tokens) }
+        end
+
+        # dump in a part file
+        uuid = SecureRandom.hex
+        CSV.open("#{file_path}.part_#{uuid}", 'w') do |csv|
+          rows.each { |row| csv << row }
+        end
+      end
+
+      def write_single_csv(keys:)
+        # export headers
+        header_filepath = "#{file_path}.header"
+        CSV.open(header_filepath, 'w') do |csv|
+          csv << keys
+        end
+
+        # make sure the destination file is deleted
+        delete_file(file_path)
+
+        # merge the files into the output
+        files = [header_filepath] + file_parts
+        files.each do |file|
+          # cat each file to the destination file
+          `cat #{file} >> #{file_path}`
+        end
+
+        # remove the intermediary files
+        files.each do |file|
+          delete_file(file)
+        end
+      end
+    end
+  end
+end
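The adapter's save path is worth calling out: each `save` writes an independent `.part_<uuid>` file, and `on_save_finished` concatenates a header file plus all parts into the final CSV, which lets parallel workers write without coordinating. A standalone sketch of the same pattern (file names here are illustrative, not the adapter's):

```ruby
require 'csv'
require 'securerandom'

out = 'dataset.csv'

# Each worker dumps its batch into its own part file; no locking needed.
def write_part(out, rows)
  CSV.open("#{out}.part_#{SecureRandom.hex}", 'w') { |csv| rows.each { |r| csv << r } }
end

write_part(out, [[1, 'a'], [2, 'b']])
write_part(out, [[3, 'c']])

# Finalize: write the header, then append every part and clean up.
File.delete(out) if File.exist?(out)
CSV.open("#{out}.header", 'w') { |csv| csv << %w[id name] }
(["#{out}.header"] + Dir["#{out}.part_*"]).each do |f|
  File.open(out, 'a') { |dst| dst.write(File.read(f)) }
  File.delete(f)
end
```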
data/lib/dataflow/adapters/mongo_db_adapter.rb
ADDED
@@ -0,0 +1,307 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and mongodb.
+    # We use mongodb to perform all the store/retrieve operations.
+    class MongoDbAdapter
+      SYSTEM_ID = '_id'
+
+      class << self
+        def client(settings, db_name: nil)
+          @clients ||= {}
+          host = ENV['MOJACO_MONGO_ADDRESS'] || '127.0.0.1'
+          port = '27017'
+          connection_uri = settings.connection_uri || "#{host}:#{port}"
+          db_name ||= settings.db_name
+          @clients["#{connection_uri}.#{db_name}"] ||= Mongo::Client.new([connection_uri], database: db_name)
+        end
+
+        def admin_client(settings)
+          return @admin_client if @admin_client
+          @admin_client = client(settings, db_name: 'admin')
+        end
+
+        # Force the clients to disconnect their connections.
+        # Use before forking.
+        def disconnect_clients
+          @clients ||= {}
+          @clients.values.each(&:close)
+        end
+      end
+
+      attr_reader :settings
+      attr_reader :client
+
+      def initialize(args)
+        update_settings(args)
+        @client = MongoDbAdapter.client(settings)
+        @admin_client = MongoDbAdapter.admin_client(settings)
+      end
+
+      def update_settings(args)
+        @settings = Dataflow::Adapters::Settings.new(args)
+      end
+
+      # retrieve a single element from a data node
+      def find(where: {}, fields: [], sort: {}, offset: 0)
+        all(where: where, fields: fields, sort: sort, offset: offset, limit: 1).first
+      end
+
+      # retrieve all elements from a data node
+      def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+        projection = fields.map { |field| [field, 1] }
+
+        unless fields.map(&:to_s).include?(SYSTEM_ID)
+          # by default, do not select the _id field
+          projection << [SYSTEM_ID, 0].freeze
+        end
+
+        opts = transform_to_query(where)
+        res = client[read_dataset_name].find(opts)
+        res = res.projection(projection.to_h)
+
+        res = res.sort(sort) if sort
+        res = res.skip(offset) if offset > 0
+        res = res.limit(limit) if limit > 0
+
+        if block_given?
+          yield res
+        else
+          res.to_a
+        end
+      end
+
+      # Helper that supports paginating through the whole dataset at fixed
+      # performance. Unlike using offset/skip, which requires reading through
+      # the skipped content (high CPU usage), we use the internal mongo
+      # cursor to get batches of results.
+      # @return [Hash] with 2 fields: data and next_cursor for the next call
+      def all_paginated(where: {}, fields: [], cursor: nil)
+        cursor = cursor.to_i
+        data = []
+
+        # If there is no cursor, we make the initial query,
+        # get the first batch of data and get the cursor id.
+        if cursor.zero?
+          all(where: where, fields: fields) do |res|
+            results = res.initial_query
+            data = results.documents
+            cursor = res.cursor.id
+          end
+        end
+
+        # The first query's result batch is a small 101-document set,
+        # so we want to get one more batch of data.
+        # However, there might be queries whose results are very small
+        # and the resulting cursor is 0. In such a case there is no more
+        # data to be fetched.
+        unless cursor.zero?
+          # send a getMore command on the cursor id
+          command = { getMore: cursor, collection: read_dataset_name }
+          result = client.database.command(command).documents[0]
+          cursor = result['cursor']['id']
+          data += result['cursor']['nextBatch']
+        end
+
+        # We want to return the cursor as a string.
+        # If there is no cursor (zero) then make it empty.
+        cursor = '' if cursor.zero?
+
+        { 'data' => data, 'next_cursor' => cursor.to_s }
+      rescue Mongo::Error::OperationFailure
+        { 'data' => data, 'next_cursor' => '' }
+      end
+
+      # Create queries that permit processing the whole dataset in parallel without using offsets.
+      def ordered_system_id_queries(batch_size:)
+        ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID].to_s }
+        queries_count = (ids.size / batch_size.to_f).ceil
+        Array.new(queries_count) do |i|
+          from = ids[i * batch_size]
+          to = ids[(i + 1) * batch_size] || ids[-1]
+          is_last = i == queries_count - 1
+
+          where_query = { SYSTEM_ID => { '>=' => from } }
+          operator = is_last ? '<=' : '<'
+          where_query[SYSTEM_ID][operator] = to
+
+          where_query
+        end
+      end
+
+      # count the number of records
+      def count(where: {})
+        client[read_dataset_name].count(transform_to_query(where))
+      end
+
+      # Save the given records.
+      # @param replace_by [Array] if the replace_by key is provided,
+      #        it will try to replace records with the matching key,
+      #        or insert if none is found.
+      def save(records:, replace_by: nil)
+        if replace_by.present?
+          replace_keys = Array(replace_by)
+          bulk_ops = records.map do |record|
+            filter = replace_keys.map { |x| [x, record[x]] }.to_h
+            {
+              replace_one: {
+                filter: filter,
+                replacement: record,
+                upsert: true
+              }
+            }
+          end
+          client[write_dataset_name].bulk_write(bulk_ops, ordered: false)
+        else
+          save_many(records: records)
+        end
+      end
+
+      # Delete records that match the options.
+      # @param where query to apply on the delete operation.
+      def delete(where: {})
+        client[read_dataset_name].delete_many(transform_to_query(where))
+      end
+
+      # recreate the table/collection
+      def recreate_dataset(dataset: nil)
+        dataset ||= write_dataset_name
+        collection = client[dataset]
+        collection.drop
+        collection.create
+      end
+
+      # Create the indexes on this dataset.
+      # @param dataset [String] Specify on which dataset the operation will be performed.
+      #        Default: the adapter's settings' dataset.
+      # @param type [Symbol] select which index types to create.
+      #        Can be :all (default), :unique_only, :non_unique_only.
+      def create_indexes(dataset: nil, type: :all, drop_retry_on_error: true)
+        dataset ||= write_dataset_name
+        return unless settings.indexes.present?
+
+        indexes = (settings.indexes || [])
+
+        case type
+        when :unique_only
+          indexes = indexes.select { |idx| idx['unique'] }
+        when :non_unique_only
+          indexes = indexes.reject { |idx| idx['unique'] }
+        end
+
+        indexes = indexes.map { |x| format_index(x) }
+        client[dataset].indexes.create_many(indexes)
+      rescue Mongo::Error::OperationFailure => e
+        raise e unless drop_retry_on_error
+        client[dataset].indexes.drop_all
+        create_indexes(drop_retry_on_error: false)
+      end
+
+      def usage(dataset:)
+        indexes = retrieve_collection_indexes(dataset)
+        command = { collstats: dataset }
+        result = client.database.command(command).documents[0]
+        {
+          memory: result['size'],
+          storage: result['storageSize'],
+          effective_indexes: indexes
+        }
+      rescue Mongo::Error::OperationFailure, Mongo::Error::InvalidCollectionName
+        {
+          memory: 0,
+          storage: 0,
+          effective_indexes: indexes
+        }
+      end
+
+      private
+
+      def write_dataset_name
+        settings.write_dataset_name
+      end
+
+      def read_dataset_name
+        settings.read_dataset_name
+      end
+
+      def transform_to_query(opts)
+        sanitized_opts = {}
+        opts.each do |k, v|
+          if v.is_a? Array
+            # e.g. { 'id' => [1,2] } transforms into a mongodb IN clause
+            sanitized_opts[k] = { '$in' => v.map { |value| try_cast_value(k, value) } }
+          elsif v.is_a? Hash
+            sanitized_opts[k] = {}
+            v.each do |operator, value|
+              case operator.to_s
+              when '!='
+                # we still need to check the value and transform into either
+                if value.is_a? Array
+                  # { '$nin' => [values] }
+                  sanitized_opts[k]['$nin'] = value.map { |x| try_cast_value(k, x) }
+                else
+                  # or { '$ne' => value }
+                  sanitized_opts[k]['$ne'] = try_cast_value(k, value)
+                end
+              when '<'
+                sanitized_opts[k]['$lt'] = try_cast_value(k, value)
+              when '<='
+                sanitized_opts[k]['$lte'] = try_cast_value(k, value)
+              when '>'
+                sanitized_opts[k]['$gt'] = try_cast_value(k, value)
+              when '>='
+                sanitized_opts[k]['$gte'] = try_cast_value(k, value)
+              end
+            end
+          else
+            sanitized_opts[k] = try_cast_value(k, v)
+          end
+        end
+        sanitized_opts
+      end
+
+      def try_cast_value(field, value)
+        # cast to time when querying on _mojaco_updated_at
+        return Timeliness.parse(value) || value if field =~ /_mojaco_updated_at/
+        # cast to ObjectId when querying on _id
+        return BSON::ObjectId(value) if field == SYSTEM_ID && value.is_a?(String)
+
+        # TODO: add other casts based on the field type
+        value
+      end
+
+      def save_many(records:)
+        client[write_dataset_name].insert_many(records, ordered: false)
+      rescue Mongo::Error::BulkWriteError => e
+        dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
+        # don't raise if the errors are only about duplicated keys
+        raise e unless dup_key_error
+      end
+
+      # Required index format for mongodb:
+      # { :key => { name: 1 }, :unique => true }
+      def format_index(dataset_index)
+        dataset_index = dataset_index.with_indifferent_access
+
+        index_key = {}
+        keys = Array(dataset_index[:key])
+        keys.each { |k| index_key[k] = 1 }
+        index = { key: index_key }
+        index[:unique] = true if dataset_index[:unique]
+        index
+      end
+
+      def retrieve_collection_indexes(collection)
+        mongo_indexes = client[collection].indexes
+        mongo_indexes.map do |idx|
+          # skip the default index
+          next if idx['key'].keys == ['_id']
+
+          index = { 'key' => idx['key'].keys }
+          index['unique'] = true if idx['unique']
+          index
+        end.compact
+      end
+    end
+  end
+end
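To make `transform_to_query`'s operator mapping concrete, here is the shape of the translation it performs. The input/output pair follows directly from the case statement above; the field names and values are illustrative.

```ruby
# Illustrative input for transform_to_query ...
where = {
  'id'                 => [1, 2, 3],                 # array  -> $in
  'age'                => { '>=' => 21, '<' => 65 }, # hash   -> range operators
  'status'             => { '!=' => 'deleted' },     # scalar != -> $ne
  '_mojaco_updated_at' => { '>' => '2017-01-01' }    # value cast by try_cast_value
}

# ... and the MongoDB filter it produces (the timestamp string is parsed
# into a Time by try_cast_value):
# {
#   'id'                 => { '$in'  => [1, 2, 3] },
#   'age'                => { '$gte' => 21, '$lt' => 65 },
#   'status'             => { '$ne'  => 'deleted' },
#   '_mojaco_updated_at' => { '$gt'  => 2017-01-01 00:00:00 +0000 }
# }
```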
data/lib/dataflow/adapters/mysql_adapter.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and mysql.
+    # We use Sequel to perform all the store/retrieve operations.
+    class MysqlAdapter < SqlAdapter
+      def fetch_table_usage(dataset:)
+        size = client["SELECT data_length + index_length as size from information_schema.TABLES WHERE table_schema = '#{settings.db_name}' and table_name = '#{dataset}'"].first[:size]
+        {
+          memory: size,
+          storage: size
+        }
+      rescue Sequel::DatabaseError
+        {
+          memory: 0,
+          storage: 0
+        }
+      end
+    end
+  end
+end
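One note on the query above: it interpolates `db_name` and `dataset` straight into the SQL string. Both come from the adapter's own settings rather than user input, so this is mostly a style issue, but Sequel also accepts bound placeholders. A hedged sketch of the same lookup with parameters (`client` is assumed to be the Sequel::Database the SqlAdapter wraps):

```ruby
# Sketch: the same information_schema lookup with bound parameters.
row = client[
  'SELECT data_length + index_length AS size ' \
  'FROM information_schema.TABLES ' \
  'WHERE table_schema = ? AND table_name = ?',
  settings.db_name, dataset
].first
size = row ? row[:size] : 0
```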
data/lib/dataflow/adapters/psql_adapter.rb
ADDED
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+module Dataflow
+  module Adapters
+    # Interface between a data node and postgresql.
+    # We use Sequel to perform all the store/retrieve operations.
+    class PsqlAdapter < SqlAdapter
+      def fetch_table_usage(dataset:)
+        size = client["SELECT pg_relation_size('#{dataset}') as size"].first[:size]
+        {
+          memory: size,
+          storage: size
+        }
+      rescue Sequel::DatabaseError
+        {
+          memory: 0,
+          storage: 0
+        }
+      end
+    end
+  end
+end
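As a usage note: `pg_relation_size` only counts the table's main fork, so the memory/storage figures exclude indexes and TOAST data. If the total on-disk footprint were wanted, PostgreSQL's `pg_total_relation_size` would be the drop-in alternative (a hedged variant, not what the adapter does):

```ruby
# Variant: include indexes and TOAST data in the reported size.
size = client["SELECT pg_total_relation_size(?) AS size", dataset].first[:size]
```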