datafusion 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +45 -0
- data/bin/datafusion +27 -19
- data/datafusion.gemspec +1 -0
- data/lib/datafusion/db_executor.rb +4 -17
- data/lib/datafusion/debug_executor.rb +2 -4
- data/lib/datafusion/integrations.rb +33 -3
- data/lib/datafusion/version.rb +1 -1
- data/lib/datafusion.rb +24 -14
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 845ce5180fa9f3f7095e877220e627e4950dcbce
|
4
|
+
data.tar.gz: 97b1440a756415f07c563be88ef9c095aa0a52c0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 274395d436809093a756c535bf68783e1165e2cbf8079a76ed7a6c0006ae92fa6b6d9aff57c238904ba5ebc26b1dae4dba493b416ea474d42275a0082ef25825
|
7
|
+
data.tar.gz: 1f1e1df5a6a74836513e8761237f0f7faaa309f45475308da763ce475222e48273c43cee34604337cbad5ea0015e9cf02d494cfaaa2459d6d6dc4d1170e4e6e9
|
data/README.md
CHANGED
@@ -99,6 +99,51 @@ and have that parsed by datafusion and set up a `postgres` instance to be able t
|
|
99
99
|
integrate with them and give you the ability to fuse and dissect your data across
|
100
100
|
sources.
|
101
101
|
|
102
|
+
## Agent
|
103
|
+
|
104
|
+
This part documents the always-on agent, which schedules data refreshes across your
|
105
|
+
defined integration sources.
|
106
|
+
|
107
|
+
## Cached Tables
|
108
|
+
|
109
|
+
You can define a refresh schedule and a cached view on a table with the `cached`
|
110
|
+
key like so:
|
111
|
+
|
112
|
+
```yaml
|
113
|
+
tables:
|
114
|
+
- name: ware1
|
115
|
+
database: db
|
116
|
+
collection: foobar
|
117
|
+
cached:
|
118
|
+
name: mt_ware1
|
119
|
+
query: select * from ware1
|
120
|
+
refresh: 1s
|
121
|
+
```
|
122
|
+
|
123
|
+
* A `refresh` is a natural-language shorthand for a time period, such as `1m`, `5hr`, `15s`, and so on.
|
124
|
+
* The `query` should mostly be a `select-all` from your defined table name.
|
125
|
+
* The `name` part is a name which is available for you during your regular SQL queries,
|
126
|
+
so that you can mix-and-match real-time and soft-real-time (cached) data.
|
127
|
+
|
128
|
+
## Refreshing
|
129
|
+
|
130
|
+
The agent comes with a built-in scheduler which is able to execute refresh queries on
|
131
|
+
your postgres cluster.
|
132
|
+
|
133
|
+
Start it like so:
|
134
|
+
|
135
|
+
```
|
136
|
+
$ datafusion -f integrations.yaml -c postgres://postgres:@localhost -a
|
137
|
+
```
|
138
|
+
|
139
|
+
And it will immediately come to life, telling you how many schedules it maintains, and
|
140
|
+
printing various output during the data refresh process.
|
141
|
+
|
142
|
+
It is advisable to keep this agent up directly under `systemd` or `upstart`, and look
|
143
|
+
for `ERROR`-level log entries, which indicate job failures.
|
144
|
+
|
145
|
+
|
146
|
+
|
102
147
|
|
103
148
|
# Contributing
|
104
149
|
|
data/bin/datafusion
CHANGED
@@ -14,10 +14,12 @@ end
|
|
14
14
|
# $ datafusion --fuse integrations.yml
|
15
15
|
# $ datafusion --agent
|
16
16
|
#
|
17
|
+
|
17
18
|
o = Slop::Options.new
|
18
|
-
o.string '-f', '--
|
19
|
-
o.string '-
|
20
|
-
o.
|
19
|
+
o.string '-f', '--file', 'Integrations file (URL or local)'
|
20
|
+
o.string '-c', '--connection', 'Connection string to fusion engine (postgres)', default: 'postgres://localhost'
|
21
|
+
o.bool '-s', '--setup', 'Setup integrations', default: false
|
22
|
+
o.bool '-a', '--agent', 'Connection string (i.e postgres://localhost)', default: false
|
21
23
|
o.bool '-d', '--dryrun', 'dry run for refreshes', default: false
|
22
24
|
|
23
25
|
o.on '--version', 'print the version' do
|
@@ -30,22 +32,28 @@ end
|
|
30
32
|
end
|
31
33
|
opts = Slop::Parser.new(o).parse(ARGV)
|
32
34
|
|
33
|
-
|
34
|
-
|
35
|
-
puts Datafusion.fuse(opts[:user], opts[:fuse])
|
36
|
-
else
|
37
|
-
bail "Error: please provide a file to fuse", opts
|
38
|
-
end
|
39
|
-
elsif opts[:fuse] && opts[:agent]
|
40
|
-
|
41
|
-
exec_class = Datafusion::DebugExecutor
|
42
|
-
unless opts[:dryrun]
|
43
|
-
exec_class = Datafusion::DbExecutor
|
44
|
-
end
|
45
|
-
exec = exec_class.new(opts[:agent])
|
46
|
-
sched = Datafusion.refresh(opts[:fuse], exec)
|
47
|
-
Datafusion.log.info("Running refresh agent.")
|
48
|
-
sched.join
|
35
|
+
unless opts[:file]
|
36
|
+
bail("Please provide a file", opts)
|
49
37
|
end
|
50
38
|
|
39
|
+
unless opts[:connection]
|
40
|
+
bail("Please provide a connection", opts)
|
41
|
+
end
|
51
42
|
|
43
|
+
exec_class = Datafusion::DebugExecutor.new
|
44
|
+
unless opts[:dryrun]
|
45
|
+
exec_class = Datafusion::DbExecutor.new(opts[:connection])
|
46
|
+
end
|
47
|
+
|
48
|
+
file = opts[:file]
|
49
|
+
|
50
|
+
if opts[:setup]
|
51
|
+
puts Datafusion.fuse(file, exec_class, opts)
|
52
|
+
elsif opts[:agent]
|
53
|
+
sched = Datafusion.refresh(file, exec_class, opts)
|
54
|
+
Datafusion.log.info("Running refresh agent.")
|
55
|
+
sched.join
|
56
|
+
else
|
57
|
+
bail("Please pick a mode: --setup | --agent", opts)
|
58
|
+
end
|
59
|
+
|
data/datafusion.gemspec
CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_dependency 'colorize', '~> 0.7.7'
|
24
24
|
spec.add_dependency 'rufus-scheduler', '~> 3.2.0'
|
25
25
|
spec.add_dependency 'sequel', '~> 4.3.0'
|
26
|
+
spec.add_dependency 'retriable', '~> 2.1.0'
|
26
27
|
|
27
28
|
spec.add_development_dependency "bundler", "~> 1.10"
|
28
29
|
spec.add_development_dependency "rake", "~> 10.0"
|
@@ -7,25 +7,12 @@ module Datafusion
|
|
7
7
|
def initialize(conn)
|
8
8
|
@db = Sequel.connect(conn)
|
9
9
|
end
|
10
|
-
|
11
|
-
|
12
|
-
# TODO use refresh [..] concurrently
|
13
|
-
#
|
14
|
-
# This means we also need to define a unique index per materialized
|
15
|
-
# view so that PG will know how to use MVCC.
|
16
|
-
#
|
17
|
-
# This needs some code to detect:
|
18
|
-
# 1. At setup time - when an index is already there, don't add it.
|
19
|
-
# 2. At refresh time - if a table doesn't have any data, it cannot be
|
20
|
-
# refreshed with concurrently - it needs a normal refresh first.
|
21
|
-
#
|
22
|
-
# For now we refresh and block.
|
23
|
-
#
|
10
|
+
|
11
|
+
def execute(sql, label='')
|
24
12
|
run = rand(36**5).to_s(36)
|
25
13
|
|
26
|
-
Datafusion.log.info("#{TAG}: starting run id:#{run} for #{
|
27
|
-
|
28
|
-
@db[refresh_sql].each do |r|
|
14
|
+
Datafusion.log.info("#{TAG}: starting run id:#{run} for: '#{label}'")
|
15
|
+
@db[sql].each do |r|
|
29
16
|
Datafusion.log.info("#{TAG}: out: #{r}")
|
30
17
|
end
|
31
18
|
Datafusion.log.info("#{TAG}: finished run id:#{run}")
|
@@ -1,11 +1,41 @@
|
|
1
1
|
require 'erb'
|
2
2
|
require 'yaml'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'retriable'
|
5
|
+
require 'uri'
|
3
6
|
|
4
7
|
module Datafusion
|
5
8
|
class Integrations
|
6
|
-
def self.load(
|
7
|
-
|
8
|
-
|
9
|
+
def self.load(file, opts={})
|
10
|
+
retry_count = opts[:retry_count] || 20
|
11
|
+
Retriable.retriable :tries => retry_count, :on_retry => self.method(:could_not_open) do
|
12
|
+
erb = ERB.new(open(file).read)
|
13
|
+
YAML.load(erb.result(binding))
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.could_not_open(exception, try, elapsed_time, next_interval)
|
18
|
+
Datafusion.log.error("#{exception.class}: '#{exception.message}' - #{try} tries in #{elapsed_time} seconds and #{next_interval} seconds until the next try.")
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.render(file, opts)
|
22
|
+
pguser = URI(opts[:connection] || "").user || 'postgres'
|
23
|
+
|
24
|
+
integs = Integrations.load(file)
|
25
|
+
|
26
|
+
out = ""
|
27
|
+
integs.each do |k, v|
|
28
|
+
erb = SnippetRenderer.new(v["kind"], v.merge({"user" => pguser, "name" => k}))
|
29
|
+
out << erb.render()
|
30
|
+
end
|
31
|
+
return out
|
32
|
+
end
|
33
|
+
|
34
|
+
def self.schedules(file)
|
35
|
+
integs = Integrations.load(file)
|
36
|
+
integs.map do |k, v|
|
37
|
+
v["tables"].map{|t| t["cached"] }.compact
|
38
|
+
end.flatten
|
9
39
|
end
|
10
40
|
end
|
11
41
|
end
|
data/lib/datafusion/version.rb
CHANGED
data/lib/datafusion.rb
CHANGED
@@ -17,29 +17,39 @@ module Datafusion
|
|
17
17
|
@log = logger
|
18
18
|
end
|
19
19
|
|
20
|
-
def self.fuse(
|
21
|
-
|
22
|
-
out
|
23
|
-
integs.each do |k, v|
|
24
|
-
erb = SnippetRenderer.new(v["kind"], v.merge({"user" => pguser, "name" => k}))
|
25
|
-
out += erb.render()
|
26
|
-
end
|
27
|
-
out
|
20
|
+
def self.fuse(file, executor, opts)
|
21
|
+
out = Integrations.render(file, opts)
|
22
|
+
executor.execute(out, "integrations")
|
28
23
|
end
|
29
24
|
|
30
|
-
def self.refresh(file, executor)
|
31
|
-
|
32
|
-
schedules = integs.map do |k, v|
|
33
|
-
v["tables"].map{|t| t["cached"] }.compact
|
34
|
-
end.flatten
|
25
|
+
def self.refresh(file, executor, opts)
|
26
|
+
schedules = Integrations.schedules(file)
|
35
27
|
Datafusion.log.info("Discovered #{schedules.size} schedule(s).")
|
36
28
|
|
37
29
|
scheduler = Rufus::Scheduler.new
|
38
30
|
schedules.each do |schedule|
|
39
31
|
scheduler.every(schedule["refresh"]) do
|
40
|
-
|
32
|
+
#
|
33
|
+
# TODO use refresh [..] concurrently
|
34
|
+
#
|
35
|
+
# This means we also need to define a unique index per materialized
|
36
|
+
# view so that PG will know how to use MVCC.
|
37
|
+
#
|
38
|
+
# This needs some code to detect:
|
39
|
+
# 1. At setup time - when an index is already there, don't add it.
|
40
|
+
# 2. At refresh time - if a table doesn't have any data, it cannot be
|
41
|
+
# refreshed with concurrently - it needs a normal refresh first.
|
42
|
+
#
|
43
|
+
# For now we refresh and block.
|
44
|
+
#
|
45
|
+
refresh_sql = "REFRESH materialized view #{schedule['name']}"
|
46
|
+
|
47
|
+
executor.execute(refresh_sql, "schedule: #{schedule}")
|
41
48
|
end
|
42
49
|
end
|
50
|
+
def scheduler.on_error(job, error)
|
51
|
+
Datafusion.log.error("SCHEDULER: intercepted error in #{job.id}: #{error.message}")
|
52
|
+
end
|
43
53
|
scheduler
|
44
54
|
end
|
45
55
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datafusion
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dotan Nahum
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: slop
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 4.3.0
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: retriable
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 2.1.0
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 2.1.0
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: bundler
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|