dataflow-rb 0.9.1 → 0.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +15 -2
- data/CHANGELOG.md +15 -0
- data/README.md +45 -1
- data/lib/dataflow/adapters/sql_adapter.rb +2 -2
- data/lib/dataflow/nodes/join_node.rb +9 -3
- data/lib/dataflow/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: da8a0cc4aa93a9a282f672e830d2ab8931e6fe58
|
4
|
+
data.tar.gz: a4a205460bcda2715d1e5bd16b4fe0982a0f652c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14cdd199d230e5048d599372798343274bc130cc906dcb4f39449bb4dd54eec89bd06047ef16560e0cedc15588d333701550bdc0ad5ba37d6511b9935d7b7d5d
|
7
|
+
data.tar.gz: 1d1658b28845cd78128d44e0f9acae8848117ebc304ec37f059f326faa2d22f9547c47405773becfa49f99a7071c6c11ce02b73926b14994f9c6c4f0c7643489
|
data/.travis.yml
CHANGED
@@ -1,4 +1,17 @@
|
|
1
|
+
dist: trusty
|
2
|
+
sudo: required
|
1
3
|
language: ruby
|
2
4
|
rvm:
|
3
|
-
- 2.3.
|
4
|
-
before_install:
|
5
|
+
- 2.3.1
|
6
|
+
before_install:
|
7
|
+
- gem install bundler -v 1.14.3
|
8
|
+
- mysql -e 'CREATE DATABASE dataflow_test;'
|
9
|
+
- psql -c 'create database dataflow_test;' -U postgres
|
10
|
+
services:
|
11
|
+
- mongodb
|
12
|
+
- mysql
|
13
|
+
- postgresql
|
14
|
+
env:
|
15
|
+
- MOJACO_MYSQL_USER=root MOJACO_POSTGRESQL_USER=postgres
|
16
|
+
addons:
|
17
|
+
postgresql: "9.6"
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
|
4
|
+
|
5
|
+
#### 0.9.2
|
6
|
+
- [2f3129c] Fix bug when joining datasets directly in SQL
|
7
|
+
- Updated the readme with some information on how to use the gem
|
8
|
+
- Set up .travis.yml
|
9
|
+
|
10
|
+
#### 0.9.1
|
11
|
+
- Fixed the gem public information
|
12
|
+
|
13
|
+
#### 0.9.0
|
14
|
+
- Extracted the open-source version
|
15
|
+
|
data/README.md
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/Phybbit/dataflow-rb.svg)](https://travis-ci.org/Phybbit/dataflow-rb)
|
2
|
+
|
1
3
|
# Dataflow
|
2
4
|
|
3
5
|
The purpose of this gem is to help build complex dataflows and to support automating long-running batch processes.
|
@@ -31,9 +33,51 @@ Or install it yourself as:
|
|
31
33
|
|
32
34
|
$ gem install dataflow-rb
|
33
35
|
|
36
|
+
You also need to install:
|
37
|
+
- mongodb 3.2 (required)
|
38
|
+
- postgresql (optional)
|
39
|
+
- mysql (optional)
|
40
|
+
|
34
41
|
## Usage
|
35
42
|
|
36
|
-
|
43
|
+
```ruby
|
44
|
+
require 'dataflow-rb'
|
45
|
+
|
46
|
+
# Create a data node
|
47
|
+
node1 = Dataflow::Nodes::DataNode.create(db_name: 'test', name: 'data_source1')
|
48
|
+
node1.add(records: [{id: 1, first_name: 'hello'}])
|
49
|
+
node1.all
|
50
|
+
# => [{"id"=>1, "first_name"=>"hello"}]
|
51
|
+
|
52
|
+
node2 = Dataflow::Nodes::DataNode.create(db_name: 'test', name: 'data_source2')
|
53
|
+
node2.add(records: [{id: 1, last_name: 'world'}])
|
54
|
+
node2.all
|
55
|
+
# => [{"id"=>1, "last_name"=>"world"}]
|
56
|
+
|
57
|
+
# We will keep the results of the computation in this dataset
|
58
|
+
result_node = Dataflow::Nodes::DataNode.create(db_name: 'test', name: 'result')
|
59
|
+
|
60
|
+
# Join the 2 datasets by id:
|
61
|
+
compute_node = Dataflow::Nodes::JoinNode.create(
|
62
|
+
name: 'join',
|
63
|
+
dependency_ids: [node1, node2],
|
64
|
+
data_node_id: result_node,
|
65
|
+
key1: 'id',
|
66
|
+
key2: 'id'
|
67
|
+
)
|
68
|
+
compute_node.compute
|
69
|
+
compute_node.data_node.all
|
70
|
+
# => [{"id"=>1, "first_name"=>"hello", "last_name"=>"world"}]
|
71
|
+
compute_node.all # this is just a facade for the above
|
72
|
+
# => [{"id"=>1, "first_name"=>"hello", "last_name"=>"world"}]
|
73
|
+
|
74
|
+
# Fetch the data again later:
|
75
|
+
result_node = Dataflow::Nodes::DataNode.find_by(name: 'result')
|
76
|
+
# or the short hand:
|
77
|
+
result_node = Dataflow.data_node('result')
|
78
|
+
result_node.all
|
79
|
+
# => [{"id"=>1, "first_name"=>"hello", "last_name"=>"world"}]
|
80
|
+
```
|
37
81
|
|
38
82
|
## Development
|
39
83
|
|
@@ -21,8 +21,8 @@ module Dataflow
|
|
21
21
|
when 'postgresql'
|
22
22
|
host = ENV['MOJACO_POSTGRESQL_ADDRESS'] || '127.0.0.1'
|
23
23
|
port = ENV['MOJACO_POSTGRESQL_PORT'] || '5432'
|
24
|
-
user = ENV['MOJACO_POSTGRESQL_USER']
|
25
|
-
password = ENV['MOJACO_POSTGRESQL_PASSWORD']
|
24
|
+
user = ENV['MOJACO_POSTGRESQL_USER']
|
25
|
+
password = ENV['MOJACO_POSTGRESQL_PASSWORD']
|
26
26
|
end
|
27
27
|
|
28
28
|
db_name ||= settings.db_name
|
@@ -52,7 +52,7 @@ module Dataflow
|
|
52
52
|
|
53
53
|
private
|
54
54
|
|
55
|
-
def
|
55
|
+
def sql_join_query
|
56
56
|
fields = required_schema.keys
|
57
57
|
select_keys = dependencies[0].schema.keys.map { |x| "d1.#{x}" } + (dependencies[1].schema.keys - dependencies[0].schema.keys).map { |x| "d2.#{x}" }
|
58
58
|
query = "INSERT INTO #{write_dataset_name} (#{fields.join(',')})
|
@@ -60,10 +60,16 @@ module Dataflow
|
|
60
60
|
FROM #{dependencies[0].read_dataset_name} as d1
|
61
61
|
INNER JOIN #{dependencies[1].read_dataset_name} as d2
|
62
62
|
ON d1.#{key1} = d2.#{key2}"
|
63
|
-
p query
|
64
|
-
db_adapter.client[query].to_a
|
65
63
|
end
|
66
64
|
|
65
|
+
def execute_sql_join
|
66
|
+
query = sql_join_query
|
67
|
+
# TODO: work on a better way to interface this
|
68
|
+
sql_adapter = data_node.send(:db_adapter)
|
69
|
+
sql_adapter.client[query].to_a
|
70
|
+
end
|
71
|
+
|
72
|
+
|
67
73
|
def compute_batch(records:)
|
68
74
|
join(n1_records: records)
|
69
75
|
end
|
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dataflow-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.9.
|
4
|
+
version: 0.9.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eurico Doirado
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-02-
|
11
|
+
date: 2017-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -302,6 +302,7 @@ files:
|
|
302
302
|
- ".gitignore"
|
303
303
|
- ".rspec"
|
304
304
|
- ".travis.yml"
|
305
|
+
- CHANGELOG.md
|
305
306
|
- Gemfile
|
306
307
|
- LICENSE
|
307
308
|
- README.md
|