datafusion 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +45 -0
- data/bin/datafusion +27 -19
- data/datafusion.gemspec +1 -0
- data/lib/datafusion/db_executor.rb +4 -17
- data/lib/datafusion/debug_executor.rb +2 -4
- data/lib/datafusion/integrations.rb +33 -3
- data/lib/datafusion/version.rb +1 -1
- data/lib/datafusion.rb +24 -14
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 845ce5180fa9f3f7095e877220e627e4950dcbce
|
4
|
+
data.tar.gz: 97b1440a756415f07c563be88ef9c095aa0a52c0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 274395d436809093a756c535bf68783e1165e2cbf8079a76ed7a6c0006ae92fa6b6d9aff57c238904ba5ebc26b1dae4dba493b416ea474d42275a0082ef25825
|
7
|
+
data.tar.gz: 1f1e1df5a6a74836513e8761237f0f7faaa309f45475308da763ce475222e48273c43cee34604337cbad5ea0015e9cf02d494cfaaa2459d6d6dc4d1170e4e6e9
|
data/README.md
CHANGED
@@ -99,6 +99,51 @@ and have that parsed by datafusion and set up a `postgres` instance to be able t
|
|
99
99
|
integrate with them and give you the ability to fuse and dissect your data across
|
100
100
|
sources.
|
101
101
|
|
102
|
+
## Agent
|
103
|
+
|
104
|
+
This part documents the always-on agent, which schedules data refreshes across your
|
105
|
+
defined integration sources.
|
106
|
+
|
107
|
+
## Cached Tables
|
108
|
+
|
109
|
+
You can define a refresh schedule and a cached view on a table with the `cached`
|
110
|
+
key like so:
|
111
|
+
|
112
|
+
```yaml
|
113
|
+
tables:
|
114
|
+
- name: ware1
|
115
|
+
database: db
|
116
|
+
collection: foobar
|
117
|
+
cached:
|
118
|
+
name: mt_ware1
|
119
|
+
query: select * from ware1
|
120
|
+
refresh: 1s
|
121
|
+
```
|
122
|
+
|
123
|
+
* A `refresh` is a natural-language shorthand for a time period, such as `1m`, `5hr`, `15s` and so on.
|
124
|
+
* The `query` should, in most cases, be a plain select-all (`select * from ...`) on your defined table name.
|
125
|
+
* The `name` is the identifier under which the cached view is available in your regular SQL queries,
|
126
|
+
so that you can mix-and-match real-time and soft-real-time (cached) data.
|
127
|
+
|
128
|
+
## Refreshing
|
129
|
+
|
130
|
+
The agent comes with a built-in scheduler which is able to execute refresh queries on
|
131
|
+
your postgres cluster.
|
132
|
+
|
133
|
+
Start it like so:
|
134
|
+
|
135
|
+
```
|
136
|
+
$ datafusion -f integrations.yaml -c postgres://postgres:@localhost -a
|
137
|
+
```
|
138
|
+
|
139
|
+
And it will immediately come to life, telling you how many schedules it maintains, and
|
140
|
+
various output during the refresh process of the data.
|
141
|
+
|
142
|
+
It is advisable to keep this agent up directly under `systemd` or `upstart`, and look
|
143
|
+
for `ERROR`-level log entries, which indicate job failures.
|
144
|
+
|
145
|
+
|
146
|
+
|
102
147
|
|
103
148
|
# Contributing
|
104
149
|
|
data/bin/datafusion
CHANGED
@@ -14,10 +14,12 @@ end
|
|
14
14
|
# $ datafusion --file integrations.yml --setup
|
15
15
|
# $ datafusion --file integrations.yml --agent
|
16
16
|
#
|
17
|
+
|
17
18
|
o = Slop::Options.new
|
18
|
-
o.string '-f', '--
|
19
|
-
o.string '-
|
20
|
-
o.
|
19
|
+
o.string '-f', '--file', 'Integrations file (URL or local)'
|
20
|
+
o.string '-c', '--connection', 'Connection string to fusion engine (postgres)', default: 'postgres://localhost'
|
21
|
+
o.bool '-s', '--setup', 'Setup integrations', default: false
|
22
|
+
o.bool '-a', '--agent', 'Connection string (i.e postgres://localhost)', default: false
|
21
23
|
o.bool '-d', '--dryrun', 'dry run for refreshes', default: false
|
22
24
|
|
23
25
|
o.on '--version', 'print the version' do
|
@@ -30,22 +32,28 @@ end
|
|
30
32
|
end
|
31
33
|
opts = Slop::Parser.new(o).parse(ARGV)
|
32
34
|
|
33
|
-
|
34
|
-
|
35
|
-
puts Datafusion.fuse(opts[:user], opts[:fuse])
|
36
|
-
else
|
37
|
-
bail "Error: please provide a file to fuse", opts
|
38
|
-
end
|
39
|
-
elsif opts[:fuse] && opts[:agent]
|
40
|
-
|
41
|
-
exec_class = Datafusion::DebugExecutor
|
42
|
-
unless opts[:dryrun]
|
43
|
-
exec_class = Datafusion::DbExecutor
|
44
|
-
end
|
45
|
-
exec = exec_class.new(opts[:agent])
|
46
|
-
sched = Datafusion.refresh(opts[:fuse], exec)
|
47
|
-
Datafusion.log.info("Running refresh agent.")
|
48
|
-
sched.join
|
35
|
+
unless opts[:file]
|
36
|
+
bail("Please provide a file", opts)
|
49
37
|
end
|
50
38
|
|
39
|
+
unless opts[:connection]
|
40
|
+
bail("Please provide a connection", opts)
|
41
|
+
end
|
51
42
|
|
43
|
+
exec_class = Datafusion::DebugExecutor.new
|
44
|
+
unless opts[:dryrun]
|
45
|
+
exec_class = Datafusion::DbExecutor.new(opts[:connection])
|
46
|
+
end
|
47
|
+
|
48
|
+
file = opts[:file]
|
49
|
+
|
50
|
+
if opts[:setup]
|
51
|
+
puts Datafusion.fuse(file, exec_class, opts)
|
52
|
+
elsif opts[:agent]
|
53
|
+
sched = Datafusion.refresh(file, exec_class, opts)
|
54
|
+
Datafusion.log.info("Running refresh agent.")
|
55
|
+
sched.join
|
56
|
+
else
|
57
|
+
bail("Please pick a mode: --setup | --agent", opts)
|
58
|
+
end
|
59
|
+
|
data/datafusion.gemspec
CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
|
|
23
23
|
spec.add_dependency 'colorize', '~> 0.7.7'
|
24
24
|
spec.add_dependency 'rufus-scheduler', '~> 3.2.0'
|
25
25
|
spec.add_dependency 'sequel', '~> 4.3.0'
|
26
|
+
spec.add_dependency 'retriable', '~> 2.1.0'
|
26
27
|
|
27
28
|
spec.add_development_dependency "bundler", "~> 1.10"
|
28
29
|
spec.add_development_dependency "rake", "~> 10.0"
|
@@ -7,25 +7,12 @@ module Datafusion
|
|
7
7
|
def initialize(conn)
|
8
8
|
@db = Sequel.connect(conn)
|
9
9
|
end
|
10
|
-
|
11
|
-
|
12
|
-
# TODO use refresh [..] concurrently
|
13
|
-
#
|
14
|
-
# This means we also need to define a unique index per materialized
|
15
|
-
# view so that PG will know how to use MVCC.
|
16
|
-
#
|
17
|
-
# This needs some code to detect:
|
18
|
-
# 1. At setup time - when an index is already there, don't add it.
|
19
|
-
# 2. At refresh time - if a table doesn't have any data, it cannot be
|
20
|
-
# refreshed with concurrently - it needs a normal refresh first.
|
21
|
-
#
|
22
|
-
# For now we refresh and block.
|
23
|
-
#
|
10
|
+
|
11
|
+
def execute(sql, label='')
|
24
12
|
run = rand(36**5).to_s(36)
|
25
13
|
|
26
|
-
Datafusion.log.info("#{TAG}: starting run id:#{run} for #{
|
27
|
-
|
28
|
-
@db[refresh_sql].each do |r|
|
14
|
+
Datafusion.log.info("#{TAG}: starting run id:#{run} for: '#{label}'")
|
15
|
+
@db[sql].each do |r|
|
29
16
|
Datafusion.log.info("#{TAG}: out: #{r}")
|
30
17
|
end
|
31
18
|
Datafusion.log.info("#{TAG}: finished run id:#{run}")
|
@@ -1,11 +1,41 @@
|
|
1
1
|
require 'erb'
|
2
2
|
require 'yaml'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'retriable'
|
5
|
+
require 'uri'
|
3
6
|
|
4
7
|
module Datafusion
|
5
8
|
class Integrations
|
6
|
-
def self.load(
|
7
|
-
|
8
|
-
|
9
|
+
def self.load(file, opts={})
|
10
|
+
retry_count = opts[:retry_count] || 20
|
11
|
+
Retriable.retriable :tries => retry_count, :on_retry => self.method(:could_not_open) do
|
12
|
+
erb = ERB.new(open(file).read)
|
13
|
+
YAML.load(erb.result(binding))
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.could_not_open(exception, try, elapsed_time, next_interval)
|
18
|
+
Datafusion.log.error("#{exception.class}: '#{exception.message}' - #{try} tries in #{elapsed_time} seconds and #{next_interval} seconds until the next try.")
|
19
|
+
end
|
20
|
+
|
21
|
+
def self.render(file, opts)
|
22
|
+
pguser = URI(opts[:connection] || "").user || 'postgres'
|
23
|
+
|
24
|
+
integs = Integrations.load(file)
|
25
|
+
|
26
|
+
out = ""
|
27
|
+
integs.each do |k, v|
|
28
|
+
erb = SnippetRenderer.new(v["kind"], v.merge({"user" => pguser, "name" => k}))
|
29
|
+
out << erb.render()
|
30
|
+
end
|
31
|
+
return out
|
32
|
+
end
|
33
|
+
|
34
|
+
def self.schedules(file)
|
35
|
+
integs = Integrations.load(file)
|
36
|
+
integs.map do |k, v|
|
37
|
+
v["tables"].map{|t| t["cached"] }.compact
|
38
|
+
end.flatten
|
9
39
|
end
|
10
40
|
end
|
11
41
|
end
|
data/lib/datafusion/version.rb
CHANGED
data/lib/datafusion.rb
CHANGED
@@ -17,29 +17,39 @@ module Datafusion
|
|
17
17
|
@log = logger
|
18
18
|
end
|
19
19
|
|
20
|
-
def self.fuse(
|
21
|
-
|
22
|
-
out
|
23
|
-
integs.each do |k, v|
|
24
|
-
erb = SnippetRenderer.new(v["kind"], v.merge({"user" => pguser, "name" => k}))
|
25
|
-
out += erb.render()
|
26
|
-
end
|
27
|
-
out
|
20
|
+
def self.fuse(file, executor, opts)
|
21
|
+
out = Integrations.render(file, opts)
|
22
|
+
executor.execute(out, "integrations")
|
28
23
|
end
|
29
24
|
|
30
|
-
def self.refresh(file, executor)
|
31
|
-
|
32
|
-
schedules = integs.map do |k, v|
|
33
|
-
v["tables"].map{|t| t["cached"] }.compact
|
34
|
-
end.flatten
|
25
|
+
def self.refresh(file, executor, opts)
|
26
|
+
schedules = Integrations.schedules(file)
|
35
27
|
Datafusion.log.info("Discovered #{schedules.size} schedule(s).")
|
36
28
|
|
37
29
|
scheduler = Rufus::Scheduler.new
|
38
30
|
schedules.each do |schedule|
|
39
31
|
scheduler.every(schedule["refresh"]) do
|
40
|
-
|
32
|
+
#
|
33
|
+
# TODO use refresh [..] concurrently
|
34
|
+
#
|
35
|
+
# This means we also need to define a unique index per materialized
|
36
|
+
# view so that PG will know how to use MVCC.
|
37
|
+
#
|
38
|
+
# This needs some code to detect:
|
39
|
+
# 1. At setup time - when an index is already there, don't add it.
|
40
|
+
# 2. At refresh time - if a table doesn't have any data, it cannot be
|
41
|
+
# refreshed with concurrently - it needs a normal refresh first.
|
42
|
+
#
|
43
|
+
# For now we refresh and block.
|
44
|
+
#
|
45
|
+
refresh_sql = "REFRESH materialized view #{schedule['name']}"
|
46
|
+
|
47
|
+
executor.execute(refresh_sql, "schedule: #{schedule}")
|
41
48
|
end
|
42
49
|
end
|
50
|
+
def scheduler.on_error(job, error)
|
51
|
+
Datafusion.log.error("SCHEDULER: intercepted error in #{job.id}: #{error.message}")
|
52
|
+
end
|
43
53
|
scheduler
|
44
54
|
end
|
45
55
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datafusion
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dotan Nahum
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: slop
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 4.3.0
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: retriable
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: 2.1.0
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 2.1.0
|
69
83
|
- !ruby/object:Gem::Dependency
|
70
84
|
name: bundler
|
71
85
|
requirement: !ruby/object:Gem::Requirement
|