datafusion 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e3b5c2ed07b12b3e782f1682e25788b34166f992
4
- data.tar.gz: 00ff332020e14d21b672794d95b0ccdd0863b8c2
3
+ metadata.gz: 845ce5180fa9f3f7095e877220e627e4950dcbce
4
+ data.tar.gz: 97b1440a756415f07c563be88ef9c095aa0a52c0
5
5
  SHA512:
6
- metadata.gz: 5f64b338193e26ed427257de847879c4ea3743b23417487322bc0574e290a0abd7611a7baadfe14f1b1d5d21d7c43ad958425c4f2bc1d317600068051eddcc75
7
- data.tar.gz: 4e4ddb04652c10f466a549ecdec841455f2f9425f6ad19fe8186cbbacebf096a71d3cc3fd4466b7ad9552e6083c21b0fd7f25c1083388f76ea42929d00854a08
6
+ metadata.gz: 274395d436809093a756c535bf68783e1165e2cbf8079a76ed7a6c0006ae92fa6b6d9aff57c238904ba5ebc26b1dae4dba493b416ea474d42275a0082ef25825
7
+ data.tar.gz: 1f1e1df5a6a74836513e8761237f0f7faaa309f45475308da763ce475222e48273c43cee34604337cbad5ea0015e9cf02d494cfaaa2459d6d6dc4d1170e4e6e9
data/README.md CHANGED
@@ -99,6 +99,51 @@ and have that parsed by datafusion and set up a `postgres` instance to be able t
99
99
  integrate with them and give you the ability to fuse and dissect your data across
100
100
  sources.
101
101
 
102
+ ## Agent
103
+
104
+ This part documents the always-on agent, which schedules data refreshes across your
105
+ defined integration sources.
106
+
107
+ ## Cached Tables
108
+
109
+ You can define a refresh schedule and a cached view on a table with the `cached`
110
+ key like so:
111
+
112
+ ```yaml
113
+ tables:
114
+ - name: ware1
115
+ database: db
116
+ collection: foobar
117
+ cached:
118
+ name: mt_ware1
119
+ query: select * from ware1
120
+ refresh: 1s
121
+ ```
122
+
123
+ * A `refresh` is a natural-language shorthand for a time period, such as `1m`, `5hr`, or `15s`.
124
+ * The `query` should typically be a `select-all` from your defined table name.
125
+ * The `name` part is a name which is available for you during your regular SQL queries,
126
+ so that you can mix-and-match real-time and soft-real-time (cached) data.
127
+
128
+ ## Refreshing
129
+
130
+ The agent comes with a built-in scheduler which is able to execute refresh queries on
131
+ your postgres cluster.
132
+
133
+ Start it like so:
134
+
135
+ ```
136
+ $ datafusion -f integrations.yaml -c postgres://postgres:@localhost -a
137
+ ```
138
+
139
+ And it will immediately come to life, telling you how many schedules it maintains, and
140
+ various output during the refresh process of the data.
141
+
142
+ It is advisable to keep this agent up directly under `systemd` or `upstart`, and look
143
+ for `ERROR`-level log entries, which indicate job failures.
144
+
145
+
146
+
102
147
 
103
148
  # Contributing
104
149
 
data/bin/datafusion CHANGED
@@ -14,10 +14,12 @@ end
14
14
  # $ datafusion --fuse integrations.yml
15
15
  # $ datafusion --agent
16
16
  #
17
+
17
18
  o = Slop::Options.new
18
- o.string '-f', '--fuse', ''
19
- o.string '-u', '--user', '', default: 'postgres'
20
- o.string '-a', '--agent', 'Connection string (i.e postgres://localhost)', default: ""
19
+ o.string '-f', '--file', 'Integrations file (URL or local)'
20
+ o.string '-c', '--connection', 'Connection string to fusion engine (postgres)', default: 'postgres://localhost'
21
+ o.bool '-s', '--setup', 'Setup integrations', default: false
22
+ o.bool '-a', '--agent', 'Connection string (i.e postgres://localhost)', default: false
21
23
  o.bool '-d', '--dryrun', 'dry run for refreshes', default: false
22
24
 
23
25
  o.on '--version', 'print the version' do
@@ -30,22 +32,28 @@ end
30
32
  end
31
33
  opts = Slop::Parser.new(o).parse(ARGV)
32
34
 
33
- if opts[:fuse] && opts[:agent].empty?
34
- if File.exist?(opts[:fuse])
35
- puts Datafusion.fuse(opts[:user], opts[:fuse])
36
- else
37
- bail "Error: please provide a file to fuse", opts
38
- end
39
- elsif opts[:fuse] && opts[:agent]
40
-
41
- exec_class = Datafusion::DebugExecutor
42
- unless opts[:dryrun]
43
- exec_class = Datafusion::DbExecutor
44
- end
45
- exec = exec_class.new(opts[:agent])
46
- sched = Datafusion.refresh(opts[:fuse], exec)
47
- Datafusion.log.info("Running refresh agent.")
48
- sched.join
35
+ unless opts[:file]
36
+ bail("Please provide a file", opts)
49
37
  end
50
38
 
39
+ unless opts[:connection]
40
+ bail("Please provide a connection", opts)
41
+ end
51
42
 
43
+ exec_class = Datafusion::DebugExecutor.new
44
+ unless opts[:dryrun]
45
+ exec_class = Datafusion::DbExecutor.new(opts[:connection])
46
+ end
47
+
48
+ file = opts[:file]
49
+
50
+ if opts[:setup]
51
+ puts Datafusion.fuse(file, exec_class, opts)
52
+ elsif opts[:agent]
53
+ sched = Datafusion.refresh(file, exec_class, opts)
54
+ Datafusion.log.info("Running refresh agent.")
55
+ sched.join
56
+ else
57
+ bail("Please pick a mode: --setup | --agent", opts)
58
+ end
59
+
data/datafusion.gemspec CHANGED
@@ -23,6 +23,7 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency 'colorize', '~> 0.7.7'
24
24
  spec.add_dependency 'rufus-scheduler', '~> 3.2.0'
25
25
  spec.add_dependency 'sequel', '~> 4.3.0'
26
+ spec.add_dependency 'retriable', '~> 2.1.0'
26
27
 
27
28
  spec.add_development_dependency "bundler", "~> 1.10"
28
29
  spec.add_development_dependency "rake", "~> 10.0"
@@ -7,25 +7,12 @@ module Datafusion
7
7
  def initialize(conn)
8
8
  @db = Sequel.connect(conn)
9
9
  end
10
- def exec(schedule)
11
- #
12
- # TODO use refresh [..] concurrently
13
- #
14
- # This means we also need to define a unique index per materialized
15
- # view so that PG will know how to use MVCC.
16
- #
17
- # This needs some code to detect:
18
- # 1. At setup time - when an index is already there, don't add it.
19
- # 2. At refresh time - if a table doesn't have any data, it cannot be
20
- # refreshed with concurrently - it needs a normal refresh first.
21
- #
22
- # For now we refresh and block.
23
- #
10
+
11
+ def execute(sql, label='')
24
12
  run = rand(36**5).to_s(36)
25
13
 
26
- Datafusion.log.info("#{TAG}: starting run id:#{run} for #{schedule}")
27
- refresh_sql = "REFRESH materialized view #{schedule['name']}"
28
- @db[refresh_sql].each do |r|
14
+ Datafusion.log.info("#{TAG}: starting run id:#{run} for: '#{label}'")
15
+ @db[sql].each do |r|
29
16
  Datafusion.log.info("#{TAG}: out: #{r}")
30
17
  end
31
18
  Datafusion.log.info("#{TAG}: finished run id:#{run}")
@@ -1,9 +1,7 @@
1
1
  module Datafusion
2
2
  class DebugExecutor
3
- def initialize(conn)
4
- end
5
- def exec(schedule)
6
- puts "EXECUTE: #{schedule}"
3
+ def execute(stuff, label='')
4
+ puts "-- EXECUTE: #{label}\n#{stuff}"
7
5
  end
8
6
  end
9
7
  end
@@ -1,11 +1,41 @@
1
1
  require 'erb'
2
2
  require 'yaml'
3
+ require 'open-uri'
4
+ require 'retriable'
5
+ require 'uri'
3
6
 
4
7
  module Datafusion
5
8
  class Integrations
6
- def self.load(integfile)
7
- erb = ERB.new(File.read(integfile))
8
- YAML.load(erb.result(binding))
9
+ def self.load(file, opts={})
10
+ retry_count = opts[:retry_count] || 20
11
+ Retriable.retriable :tries => retry_count, :on_retry => self.method(:could_not_open) do
12
+ erb = ERB.new(open(file).read)
13
+ YAML.load(erb.result(binding))
14
+ end
15
+ end
16
+
17
+ def self.could_not_open(exception, try, elapsed_time, next_interval)
18
+ Datafusion.log.error("#{exception.class}: '#{exception.message}' - #{try} tries in #{elapsed_time} seconds and #{next_interval} seconds until the next try.")
19
+ end
20
+
21
+ def self.render(file, opts)
22
+ pguser = URI(opts[:connection] || "").user || 'postgres'
23
+
24
+ integs = Integrations.load(file)
25
+
26
+ out = ""
27
+ integs.each do |k, v|
28
+ erb = SnippetRenderer.new(v["kind"], v.merge({"user" => pguser, "name" => k}))
29
+ out << erb.render()
30
+ end
31
+ return out
32
+ end
33
+
34
+ def self.schedules(file)
35
+ integs = Integrations.load(file)
36
+ integs.map do |k, v|
37
+ v["tables"].map{|t| t["cached"] }.compact
38
+ end.flatten
9
39
  end
10
40
  end
11
41
  end
@@ -1,3 +1,3 @@
1
1
  module Datafusion
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.5"
3
3
  end
data/lib/datafusion.rb CHANGED
@@ -17,29 +17,39 @@ module Datafusion
17
17
  @log = logger
18
18
  end
19
19
 
20
- def self.fuse(pguser, file)
21
- integs = Integrations.load(file)
22
- out = ""
23
- integs.each do |k, v|
24
- erb = SnippetRenderer.new(v["kind"], v.merge({"user" => pguser, "name" => k}))
25
- out += erb.render()
26
- end
27
- out
20
+ def self.fuse(file, executor, opts)
21
+ out = Integrations.render(file, opts)
22
+ executor.execute(out, "integrations")
28
23
  end
29
24
 
30
- def self.refresh(file, executor)
31
- integs = Integrations.load(file)
32
- schedules = integs.map do |k, v|
33
- v["tables"].map{|t| t["cached"] }.compact
34
- end.flatten
25
+ def self.refresh(file, executor, opts)
26
+ schedules = Integrations.schedules(file)
35
27
  Datafusion.log.info("Discovered #{schedules.size} schedule(s).")
36
28
 
37
29
  scheduler = Rufus::Scheduler.new
38
30
  schedules.each do |schedule|
39
31
  scheduler.every(schedule["refresh"]) do
40
- executor.exec(schedule)
32
+ #
33
+ # TODO use refresh [..] concurrently
34
+ #
35
+ # This means we also need to define a unique index per materialized
36
+ # view so that PG will know how to use MVCC.
37
+ #
38
+ # This needs some code to detect:
39
+ # 1. At setup time - when an index is already there, don't add it.
40
+ # 2. At refresh time - if a table doesn't have any data, it cannot be
41
+ # refreshed with concurrently - it needs a normal refresh first.
42
+ #
43
+ # For now we refresh and block.
44
+ #
45
+ refresh_sql = "REFRESH materialized view #{schedule['name']}"
46
+
47
+ executor.execute(refresh_sql, "schedule: #{schedule}")
41
48
  end
42
49
  end
50
+ def scheduler.on_error(job, error)
51
+ Datafusion.log.error("SCHEDULER: intercepted error in #{job.id}: #{error.message}")
52
+ end
43
53
  scheduler
44
54
  end
45
55
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: datafusion
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dotan Nahum
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-09 00:00:00.000000000 Z
11
+ date: 2016-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: slop
@@ -66,6 +66,20 @@ dependencies:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
68
  version: 4.3.0
69
+ - !ruby/object:Gem::Dependency
70
+ name: retriable
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: 2.1.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: 2.1.0
69
83
  - !ruby/object:Gem::Dependency
70
84
  name: bundler
71
85
  requirement: !ruby/object:Gem::Requirement