dataflow-rb 0.9.1 → 0.9.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +15 -2
- data/CHANGELOG.md +15 -0
- data/README.md +45 -1
- data/lib/dataflow/adapters/sql_adapter.rb +2 -2
- data/lib/dataflow/nodes/join_node.rb +9 -3
- data/lib/dataflow/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: da8a0cc4aa93a9a282f672e830d2ab8931e6fe58
|
4
|
+
data.tar.gz: a4a205460bcda2715d1e5bd16b4fe0982a0f652c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14cdd199d230e5048d599372798343274bc130cc906dcb4f39449bb4dd54eec89bd06047ef16560e0cedc15588d333701550bdc0ad5ba37d6511b9935d7b7d5d
|
7
|
+
data.tar.gz: 1d1658b28845cd78128d44e0f9acae8848117ebc304ec37f059f326faa2d22f9547c47405773becfa49f99a7071c6c11ce02b73926b14994f9c6c4f0c7643489
|
data/.travis.yml
CHANGED
@@ -1,4 +1,17 @@
|
|
1
|
+
dist: trusty
|
2
|
+
sudo: required
|
1
3
|
language: ruby
|
2
4
|
rvm:
|
3
|
-
- 2.3.
|
4
|
-
before_install:
|
5
|
+
- 2.3.1
|
6
|
+
before_install:
|
7
|
+
- gem install bundler -v 1.14.3
|
8
|
+
- mysql -e 'CREATE DATABASE dataflow_test;'
|
9
|
+
- psql -c 'create database dataflow_test;' -U postgres
|
10
|
+
services:
|
11
|
+
- mongodb
|
12
|
+
- mysql
|
13
|
+
- postgresql
|
14
|
+
env:
|
15
|
+
- MOJACO_MYSQL_USER=root MOJACO_POSTGRESQL_USER=postgres
|
16
|
+
addons:
|
17
|
+
postgresql: "9.6"
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
#### 0.9.2
|
6
|
+
- [2f3129c] Fix bug when joining datasets directly in SQL
|
7
|
+
- Updated the readme with some information on how to use the gem
|
8
|
+
- Set up .travis.yml
|
9
|
+
|
10
|
+
#### 0.9.1
|
11
|
+
- Fixed the gem public information
|
12
|
+
|
13
|
+
#### 0.9.0
|
14
|
+
- Extracted the open-source version
|
15
|
+
|
data/README.md
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/Phybbit/dataflow-rb.svg?branch=master)](https://travis-ci.org/Phybbit/dataflow-rb)
|
2
|
+
|
1
3
|
# Dataflow
|
2
4
|
|
3
5
|
The purpose of this gem is to help build complex dataflows and to support the automation of long-running batch processes.
|
@@ -31,9 +33,51 @@ Or install it yourself as:
|
|
31
33
|
|
32
34
|
$ gem install dataflow-rb
|
33
35
|
|
36
|
+
You also need to install:
|
37
|
+
- mongodb 3.2 (required)
|
38
|
+
- postgresql (optional)
|
39
|
+
- mysql (optional)
|
40
|
+
|
34
41
|
## Usage
|
35
42
|
|
36
|
-
|
43
|
+
```ruby
|
44
|
+
require 'dataflow-rb'
|
45
|
+
|
46
|
+
# Create a data node
|
47
|
+
node1 = Dataflow::Nodes::DataNode.create(db_name: 'test', name: 'data_source1')
|
48
|
+
node1.add(records: [{id: 1, first_name: 'hello'}])
|
49
|
+
node1.all
|
50
|
+
# => [{"id"=>1, "first_name"=>"hello"}]
|
51
|
+
|
52
|
+
node2 = Dataflow::Nodes::DataNode.create(db_name: 'test', name: 'data_source2')
|
53
|
+
node2.add(records: [{id: 1, last_name: 'world'}])
|
54
|
+
node2.all
|
55
|
+
# => [{"id"=>1, "last_name"=>"world"}]
|
56
|
+
|
57
|
+
# We will keep the results of the computation in this dataset
|
58
|
+
result_node = Dataflow::Nodes::DataNode.create(db_name: 'test', name: 'result')
|
59
|
+
|
60
|
+
# Join the 2 datasets by id:
|
61
|
+
compute_node = Dataflow::Nodes::JoinNode.create(
|
62
|
+
name: 'join',
|
63
|
+
dependency_ids: [node1, node2],
|
64
|
+
data_node_id: result_node,
|
65
|
+
key1: 'id',
|
66
|
+
key2: 'id'
|
67
|
+
)
|
68
|
+
compute_node.compute
|
69
|
+
compute_node.data_node.all
|
70
|
+
# => [{"id"=>1, "first_name"=>"hello", "last_name"=>"world"}]
|
71
|
+
compute_node.all # this is just a facade for the above
|
72
|
+
# => [{"id"=>1, "first_name"=>"hello", "last_name"=>"world"}]
|
73
|
+
|
74
|
+
# Fetch the data again later:
|
75
|
+
result_node = Dataflow::Nodes::DataNode.find_by(name: 'result')
|
76
|
+
# or the shorthand:
|
77
|
+
result_node = Dataflow.data_node('result')
|
78
|
+
result_node.all
|
79
|
+
# => [{"id"=>1, "first_name"=>"hello", "last_name"=>"world"}]
|
80
|
+
```
|
37
81
|
|
38
82
|
## Development
|
39
83
|
|
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED
@@ -21,8 +21,8 @@ module Dataflow
|
|
21
21
|
when 'postgresql'
|
22
22
|
host = ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
|
23
23
|
port = ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
|
24
|
-
user = ENV['MOJACO_POSTGRESQL_USER']
|
25
|
-
password = ENV['MOJACO_POSTGRESQL_PASSWORD']
|
24
|
+
user = ENV['MOJACO_POSTGRESQL_USER']
|
25
|
+
password = ENV['MOJACO_POSTGRESQL_PASSWORD']
|
26
26
|
end
|
27
27
|
|
28
28
|
db_name ||= settings.db_name
|
data/lib/dataflow/nodes/join_node.rb
CHANGED
@@ -52,7 +52,7 @@ module Dataflow
|
|
52
52
|
|
53
53
|
private
|
54
54
|
|
55
|
-
def
|
55
|
+
def sql_join_query
|
56
56
|
fields = required_schema.keys
|
57
57
|
select_keys = dependencies[0].schema.keys.map { |x| "d1.#{x}" } + (dependencies[1].schema.keys - dependencies[0].schema.keys).map { |x| "d2.#{x}" }
|
58
58
|
query = "INSERT INTO #{write_dataset_name} (#{fields.join(',')})
|
@@ -60,10 +60,16 @@ module Dataflow
|
|
60
60
|
FROM #{dependencies[0].read_dataset_name} as d1
|
61
61
|
INNER JOIN #{dependencies[1].read_dataset_name} as d2
|
62
62
|
ON d1.#{key1} = d2.#{key2}"
|
63
|
-
p query
|
64
|
-
db_adapter.client[query].to_a
|
65
63
|
end
|
66
64
|
|
65
|
+
def execute_sql_join
|
66
|
+
query = sql_join_query
|
67
|
+
# TODO: work on a better way to interface this
|
68
|
+
sql_adapter = data_node.send(:db_adapter)
|
69
|
+
sql_adapter.client[query].to_a
|
70
|
+
end
|
71
|
+
|
72
|
+
|
67
73
|
def compute_batch(records:)
|
68
74
|
join(n1_records: records)
|
69
75
|
end
|
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dataflow-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.1
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eurico Doirado
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-02-
|
11
|
+
date: 2017-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -302,6 +302,7 @@ files:
|
|
302
302
|
- ".gitignore"
|
303
303
|
- ".rspec"
|
304
304
|
- ".travis.yml"
|
305
|
+
- CHANGELOG.md
|
305
306
|
- Gemfile
|
306
307
|
- LICENSE
|
307
308
|
- README.md
|