datafusion 0.0.3 → 0.0.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e3b5c2ed07b12b3e782f1682e25788b34166f992
-  data.tar.gz: 00ff332020e14d21b672794d95b0ccdd0863b8c2
+  metadata.gz: 845ce5180fa9f3f7095e877220e627e4950dcbce
+  data.tar.gz: 97b1440a756415f07c563be88ef9c095aa0a52c0
 SHA512:
-  metadata.gz: 5f64b338193e26ed427257de847879c4ea3743b23417487322bc0574e290a0abd7611a7baadfe14f1b1d5d21d7c43ad958425c4f2bc1d317600068051eddcc75
-  data.tar.gz: 4e4ddb04652c10f466a549ecdec841455f2f9425f6ad19fe8186cbbacebf096a71d3cc3fd4466b7ad9552e6083c21b0fd7f25c1083388f76ea42929d00854a08
+  metadata.gz: 274395d436809093a756c535bf68783e1165e2cbf8079a76ed7a6c0006ae92fa6b6d9aff57c238904ba5ebc26b1dae4dba493b416ea474d42275a0082ef25825
+  data.tar.gz: 1f1e1df5a6a74836513e8761237f0f7faaa309f45475308da763ce475222e48273c43cee34604337cbad5ea0015e9cf02d494cfaaa2459d6d6dc4d1170e4e6e9
data/README.md CHANGED
@@ -99,6 +99,51 @@ and have that parsed by datafusion and set up a `postgres` instance to be able t
 integrate with them and give you the ability to fuse and dissect your data across
 sources.
 
+## Agent
+
+This section documents the always-on agent, which schedules data refreshes across your
+defined integration sources.
+
+## Cached Tables
+
+You can define a refresh schedule and a cached view on a table with the `cached`
+key like so:
+
+```yaml
+tables:
+  - name: ware1
+    database: db
+    collection: foobar
+    cached:
+      name: mt_ware1
+      query: select * from ware1
+      refresh: 1s
+```
+
+* A `refresh` is natural-language shorthand for a time period, such as `1m`, `5hr`, or `15s`.
+* The `query` will usually be a select-all from your defined table name.
+* The `name` is the identifier that becomes available in your regular SQL queries,
+  so that you can mix and match real-time and soft-real-time (cached) data.
+
+## Refreshing
+
+The agent comes with a built-in scheduler which executes refresh queries on
+your postgres cluster.
+
+Start it like so:
+
+```
+$ datafusion -f integrations.yaml -c postgres://postgres:@localhost --agent
+```
+
+It will immediately come to life, report how many schedules it maintains, and
+print progress output while the data refreshes.
+
+It is advisable to keep this agent running under `systemd` or `upstart`, and to watch
+for `ERROR`-level log entries that indicate job failures.
+
+
+
 
 # Contributing
 
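The `refresh` values above are handed to rufus-scheduler (a declared dependency of this gem) via its `every` call, which parses duration strings like these. A minimal sketch of that behaviour; the block body is illustrative only and not part of the gem:

```ruby
require 'rufus-scheduler'

# Minimal sketch, assuming rufus-scheduler ~> 3.2 as pinned in the gemspec.
# In datafusion's agent, the scheduled block issues a
# "REFRESH materialized view ..." statement instead of printing.
scheduler = Rufus::Scheduler.new

scheduler.every '1m' do
  puts "refreshing cached view mt_ware1..."
end

scheduler.join
```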
data/bin/datafusion CHANGED
@@ -14,10 +14,12 @@ end
 # $ datafusion --fuse integrations.yml
 # $ datafusion --agent
 #
+
 o = Slop::Options.new
-o.string '-f', '--fuse', ''
-o.string '-u', '--user', '', default: 'postgres'
-o.string '-a', '--agent', 'Connection string (i.e postgres://localhost)', default: ""
+o.string '-f', '--file', 'Integrations file (URL or local)'
+o.string '-c', '--connection', 'Connection string to fusion engine (postgres)', default: 'postgres://localhost'
+o.bool '-s', '--setup', 'Setup integrations', default: false
+o.bool '-a', '--agent', 'Run the refresh agent', default: false
 o.bool '-d', '--dryrun', 'dry run for refreshes', default: false
 
 o.on '--version', 'print the version' do
@@ -30,22 +32,28 @@ end
 end
 opts = Slop::Parser.new(o).parse(ARGV)
 
-if opts[:fuse] && opts[:agent].empty?
-  if File.exist?(opts[:fuse])
-    puts Datafusion.fuse(opts[:user], opts[:fuse])
-  else
-    bail "Error: please provide a file to fuse", opts
-  end
-elsif opts[:fuse] && opts[:agent]
-
-  exec_class = Datafusion::DebugExecutor
-  unless opts[:dryrun]
-    exec_class = Datafusion::DbExecutor
-  end
-  exec = exec_class.new(opts[:agent])
-  sched = Datafusion.refresh(opts[:fuse], exec)
-  Datafusion.log.info("Running refresh agent.")
-  sched.join
+unless opts[:file]
+  bail("Please provide a file", opts)
 end
 
+unless opts[:connection]
+  bail("Please provide a connection", opts)
+end
 
+exec_class = Datafusion::DebugExecutor.new
+unless opts[:dryrun]
+  exec_class = Datafusion::DbExecutor.new(opts[:connection])
+end
+
+file = opts[:file]
+
+if opts[:setup]
+  puts Datafusion.fuse(file, exec_class, opts)
+elsif opts[:agent]
+  sched = Datafusion.refresh(file, exec_class, opts)
+  Datafusion.log.info("Running refresh agent.")
+  sched.join
+else
+  bail("Please pick a mode: --setup | --agent", opts)
+end
+
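For reference, a rough sketch of what the two new modes amount to when driven from Ruby rather than the CLI. The entry points and executors are taken from this diff; the file path, connection string, and option hash are placeholders:

```ruby
require 'datafusion'

file = 'integrations.yaml'                                # placeholder path (a URL also works)
opts = { connection: 'postgres://postgres:@localhost' }   # placeholder connection

# --dryrun swaps the database executor for the printing one.
executor = Datafusion::DebugExecutor.new
# executor = Datafusion::DbExecutor.new(opts[:connection])  # the non-dryrun path

# --setup: render the integration snippets once and execute them.
puts Datafusion.fuse(file, executor, opts)

# --agent: discover the cached-table schedules and block on the scheduler.
sched = Datafusion.refresh(file, executor, opts)
sched.join
```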
data/datafusion.gemspec CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'colorize', '~> 0.7.7'
   spec.add_dependency 'rufus-scheduler', '~> 3.2.0'
   spec.add_dependency 'sequel', '~> 4.3.0'
+  spec.add_dependency 'retriable', '~> 2.1.0'
 
   spec.add_development_dependency "bundler", "~> 1.10"
   spec.add_development_dependency "rake", "~> 10.0"
@@ -7,25 +7,12 @@ module Datafusion
     def initialize(conn)
       @db = Sequel.connect(conn)
     end
-    def exec(schedule)
-      #
-      # TODO use refresh [..] concurrently
-      #
-      # This means we also need to define a unique index per materialized
-      # view so that PG will know how to use MVCC.
-      #
-      # This needs some code to detect:
-      # 1. At setup time - when an index is already there, don't add it.
-      # 2. At refresh time - if a table doesn't have any data, it cannot be
-      #    refreshed with concurrently - it needs a normal refresh first.
-      #
-      # For now we refresh and block.
-      #
+
+    def execute(sql, label='')
       run = rand(36**5).to_s(36)
 
-      Datafusion.log.info("#{TAG}: starting run id:#{run} for #{schedule}")
-      refresh_sql = "REFRESH materialized view #{schedule['name']}"
-      @db[refresh_sql].each do |r|
+      Datafusion.log.info("#{TAG}: starting run id:#{run} for: '#{label}'")
+      @db[sql].each do |r|
         Datafusion.log.info("#{TAG}: out: #{r}")
       end
       Datafusion.log.info("#{TAG}: finished run id:#{run}")
@@ -1,9 +1,7 @@
 module Datafusion
   class DebugExecutor
-    def initialize(conn)
-    end
-    def exec(schedule)
-      puts "EXECUTE: #{schedule}"
+    def execute(stuff, label='')
+      puts "-- EXECUTE: #{label}\n#{stuff}"
     end
   end
 end
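Both executors now share one duck-typed contract: any object that responds to `execute(sql, label='')` can be handed to `Datafusion.fuse` or `Datafusion.refresh`. A small illustrative executor under that assumption (this class is not part of the gem):

```ruby
# Illustrative only: a third executor conforming to the same interface as
# DbExecutor and DebugExecutor by responding to execute(sql, label='').
class CountingExecutor
  attr_reader :count

  def initialize
    @count = 0
  end

  def execute(sql, label='')
    @count += 1
    puts "[#{label}] statement ##{@count}: #{sql}"
  end
end

executor = CountingExecutor.new
executor.execute("REFRESH materialized view mt_ware1", "schedule: mt_ware1")
```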
@@ -1,11 +1,41 @@
 require 'erb'
 require 'yaml'
+require 'open-uri'
+require 'retriable'
+require 'uri'
 
 module Datafusion
   class Integrations
-    def self.load(integfile)
-      erb = ERB.new(File.read(integfile))
-      YAML.load(erb.result(binding))
+    def self.load(file, opts={})
+      retry_count = opts[:retry_count] || 20
+      Retriable.retriable :tries => retry_count, :on_retry => self.method(:could_not_open) do
+        erb = ERB.new(open(file).read)
+        YAML.load(erb.result(binding))
+      end
+    end
+
+    def self.could_not_open(exception, try, elapsed_time, next_interval)
+      Datafusion.log.error("#{exception.class}: '#{exception.message}' - #{try} tries in #{elapsed_time} seconds and #{next_interval} seconds until the next try.")
+    end
+
+    def self.render(file, opts)
+      pguser = URI(opts[:connection] || "").user || 'postgres'
+
+      integs = Integrations.load(file)
+
+      out = ""
+      integs.each do |k, v|
+        erb = SnippetRenderer.new(v["kind"], v.merge({"user" => pguser, "name" => k}))
+        out << erb.render()
+      end
+      return out
+    end
+
+    def self.schedules(file)
+      integs = Integrations.load(file)
+      integs.map do |k, v|
+        v["tables"].map{|t| t["cached"] }.compact
+      end.flatten
     end
   end
 end
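`Integrations.load` now reads the file through open-uri, so it accepts a local path or a URL, and it retries failed opens via the new retriable dependency (20 tries by default, overridable with `:retry_count`). A hedged usage sketch; the file name is a placeholder:

```ruby
require 'datafusion'

# Placeholder; "http://example.com/integrations.yaml" would be read the
# same way through open-uri.
file = 'integrations.yaml'

# The parsed integrations hash; opening is retried up to 5 times here.
integs = Datafusion::Integrations.load(file, retry_count: 5)
puts "integrations: #{integs.keys.join(', ')}"

# All `cached` entries across every table, flattened into one schedule list.
Datafusion::Integrations.schedules(file).each do |s|
  puts "#{s['name']} refreshes every #{s['refresh']}"
end
```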
@@ -1,3 +1,3 @@
 module Datafusion
-  VERSION = "0.0.3"
+  VERSION = "0.0.5"
 end
data/lib/datafusion.rb CHANGED
@@ -17,29 +17,39 @@ module Datafusion
     @log = logger
   end
 
-  def self.fuse(pguser, file)
-    integs = Integrations.load(file)
-    out = ""
-    integs.each do |k, v|
-      erb = SnippetRenderer.new(v["kind"], v.merge({"user" => pguser, "name" => k}))
-      out += erb.render()
-    end
-    out
+  def self.fuse(file, executor, opts)
+    out = Integrations.render(file, opts)
+    executor.execute(out, "integrations")
   end
 
-  def self.refresh(file, executor)
-    integs = Integrations.load(file)
-    schedules = integs.map do |k, v|
-      v["tables"].map{|t| t["cached"] }.compact
-    end.flatten
+  def self.refresh(file, executor, opts)
+    schedules = Integrations.schedules(file)
     Datafusion.log.info("Discovered #{schedules.size} schedule(s).")
 
     scheduler = Rufus::Scheduler.new
     schedules.each do |schedule|
       scheduler.every(schedule["refresh"]) do
-        executor.exec(schedule)
+        #
+        # TODO use refresh [..] concurrently
+        #
+        # This means we also need to define a unique index per materialized
+        # view so that PG will know how to use MVCC.
+        #
+        # This needs some code to detect:
+        # 1. At setup time - when an index is already there, don't add it.
+        # 2. At refresh time - if a table doesn't have any data, it cannot be
+        #    refreshed with concurrently - it needs a normal refresh first.
+        #
+        # For now we refresh and block.
+        #
+        refresh_sql = "REFRESH materialized view #{schedule['name']}"
+
+        executor.execute(refresh_sql, "schedule: #{schedule}")
       end
     end
+    def scheduler.on_error(job, error)
+      Datafusion.log.error("SCHEDULER: intercepted error in #{job.id}: #{error.message}")
+    end
     scheduler
   end
 end
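The TODO carried into the refresh block describes what a non-blocking variant would need: PostgreSQL only accepts `REFRESH MATERIALIZED VIEW ... CONCURRENTLY` when the view carries a unique index and has already been populated once. A hypothetical sketch of that flow through the executor interface; the view name, index name, and key column are placeholders, and the gem itself currently issues only the plain, blocking refresh:

```ruby
require 'datafusion'

executor = Datafusion::DbExecutor.new('postgres://postgres:@localhost')  # placeholder connection

# Setup time: the unique index PostgreSQL requires for a concurrent refresh
# (only needed if one is not already there).
executor.execute("CREATE UNIQUE INDEX mt_ware1_key ON mt_ware1 (id)", "setup: mt_ware1 index")

# The first refresh must be a plain, blocking one so the view holds data.
executor.execute("REFRESH materialized view mt_ware1", "schedule: mt_ware1 (initial)")

# Later refreshes could then run without locking out readers.
executor.execute("REFRESH materialized view CONCURRENTLY mt_ware1", "schedule: mt_ware1")
```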
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: datafusion
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.5
 platform: ruby
 authors:
 - Dotan Nahum
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-01-09 00:00:00.000000000 Z
+date: 2016-01-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: slop
@@ -66,6 +66,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: 4.3.0
+- !ruby/object:Gem::Dependency
+  name: retriable
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.1.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 2.1.0
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement