azkaban-rb 0.0.6 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,4 +19,5 @@ Gem::Specification.new do |s|
19
19
  s.require_paths = ["lib"]
20
20
 
21
21
  s.add_dependency "httpclient", "~> 2.1.6"
22
+ s.add_dependency "GraphvizR", "~> 0.5.1"
22
23
  end
@@ -23,7 +23,7 @@ task :clean_job_conf do
23
23
  end
24
24
 
25
25
  props :base do
26
- set "udf.import.list" => "oink.,com.linkedin.pig.,com.linkedin.pig.date.,org.apache.pig.piggybank.,com.linkedin.pig.characters."
26
+ set "udf.import.list" => "oink.,org.apache.pig.piggybank."
27
27
  set "hadoop.job.ugi" => "#{@@user_name},hadoop"
28
28
  set "hdfs.default.classpath.dir" => config["hdfs_classpath"]
29
29
  set "jvm.args" => config["jvm_args"] if config["jvm_args"]
@@ -72,6 +72,12 @@ end
72
72
 
73
73
  task :default => :zip
74
74
 
75
+ desc "Visualize the data flow (requires GraphViz installed)"
76
+ task :visualize do
77
+ rakeGraph = RakeGraph.new
78
+ rakeGraph.visualize("ExampleAzkabanDataflow", "example_azkaban_dataflow.png")
79
+ end
80
+
75
81
  # Create a run task for each pig job so we can run using Rake. Parameter substitution is done automatically.
76
82
  Rake.application.tasks.find_all do |task|
77
83
  if task.job && task.job.instance_of?(Azkaban::PigJob)
@@ -82,4 +88,5 @@ Rake.application.tasks.find_all do |task|
82
88
  `bin/pig #{parameters} #{script}`
83
89
  end
84
90
  end
85
- end
91
+ end
92
+
@@ -1,5 +1,6 @@
1
1
  require "azkaban-rb/version"
2
2
  require "azkaban-rb/tasks"
3
+ require "azkaban-rb/visualization"
3
4
 
4
5
  module Azkaban
5
6
  module Rb
@@ -63,7 +63,7 @@ module Azkaban
63
63
  HTTP::Message.mime_type_handler = Proc.new { |path| Azkaban::mime_type_handler(path) }
64
64
 
65
65
  class JobFile
66
- attr_reader :read_locks, :write_locks, :task, :uses
66
+ attr_reader :read_locks, :write_locks, :task, :uses_arg
67
67
 
68
68
  @output_dir = "conf/"
69
69
 
@@ -154,6 +154,7 @@ module Azkaban
154
154
  end
155
155
 
156
156
  def uses(name)
157
+ @uses_arg = name
157
158
  set "pig.script"=>name
158
159
  end
159
160
 
@@ -174,6 +175,7 @@ module Azkaban
174
175
  end
175
176
 
176
177
  def uses(name)
178
+ @uses_arg = name
177
179
  set "job.class"=>name
178
180
  end
179
181
  end
@@ -185,6 +187,7 @@ module Azkaban
185
187
  end
186
188
 
187
189
  def uses(name)
190
+ @uses_arg = name
188
191
  set "java.class"=>name
189
192
  end
190
193
  end
@@ -196,6 +199,7 @@ module Azkaban
196
199
  end
197
200
 
198
201
  def uses(text)
202
+ @uses_arg = text
199
203
  set "command"=>text
200
204
  end
201
205
  end
@@ -1,5 +1,5 @@
1
1
  module Azkaban
2
2
  module Rb
3
- VERSION = "0.0.6"
3
+ VERSION = "0.0.7"
4
4
  end
5
5
  end
@@ -0,0 +1,221 @@
1
+ require 'graphviz_r'
2
+
3
# Builds a data-flow graph from the Rake tasks that carry an Azkaban job and
# renders it through GraphvizR.
#
# Two kinds of nodes exist: one TaskNode per task that declares at least one
# read or write lock, and one DataNode per distinct lock name. A DataEdge runs
# from a data node to every task that reads it, and from a task to every data
# node it writes.
#
# Node appearance can be customised by registering blocks with #set_label,
# #set_shape, #set_fillcolor and #set_fontcolor before calling #visualize.
class RakeGraph
  # Hash of graph node name => Rake task, for every task that has a job with
  # at least one read or write lock.
  attr_reader :tasks

  # Characters that must be escaped inside a Graphviz HTML-like label.
  LABEL_ESCAPES = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' }.freeze

  # namespaces - optional list of scope names; when given, only tasks whose
  #              scope shares at least one entry with it become nodes.
  def initialize(namespaces = nil)
    @namespaces = namespaces
    # Customisation hooks; nil means "use the node's own value".
    @label_block = nil
    @shape_block = nil
    @fillcolor_block = nil
    @fontcolor_block = nil

    @tasks = {}
    Rake.application.tasks.each do |task|
      job = task.job
      next if job.nil?
      next unless (job.read_locks.size + job.write_locks.size) > 0
      @tasks[RakeGraph.task_name(task)] = task
    end

    @nodes = {}
    @edges = []
    construct_graph
  end

  # Returns the graph identifier for a task (or task name string): "TASK"
  # followed by the name with everything except letters, digits and spaces
  # removed.
  #
  # NOTE(review): names that differ only in stripped characters (e.g. "a:b"
  # and "ab") map to the same identifier.
  def self.task_name(task)
    "TASK#{task}".gsub(/[^0-9a-z ]/i, '')
  end

  # Returns the graph identifier for a data path: "DATA" followed by the name
  # with everything except letters, digits and spaces removed.
  #
  # NOTE(review): paths that differ only in stripped characters (e.g. "a/b"
  # and "ab") map to the same identifier and therefore share one node.
  def self.data_name(name)
    "DATA" + name.gsub(/[^0-9a-z ]/i, '')
  end

  # Escapes &, < and > so arbitrary text (shell commands, file paths) can be
  # embedded in an HTML-like label without being parsed as markup.
  def self.escape_label(text)
    text.to_s.gsub(/[&<>]/) { |char| LABEL_ESCAPES[char] }
  end

  # True when no namespace filter is configured, or when the task's scope
  # contains at least one of the configured namespaces.
  def task_in_namespace(task)
    return true if @namespaces.nil? || @namespaces.size == 0
    # to_a lets any Enumerable scope object take part in the intersection.
    !(task.scope.to_a & @namespaces).empty?
  end

  # Resolves a prerequisite name relative to the task's scope and returns the
  # corresponding graph identifier. Each leading '^' drops one trailing scope
  # level. Neither the prerequisite string nor the task's scope is modified.
  def find_prereq(task, prereq)
    ups = prereq[/\A\^*/].length
    scopes = task.scope.to_a
    kept = scopes.first([scopes.size - ups, 0].max)
    RakeGraph.task_name(kept.join('') + prereq[ups..-1])
  end

  # Populates @nodes and @edges from @tasks, honouring the namespace filter.
  def construct_graph
    # first add all of the task nodes
    @tasks.each do |_task_name, task|
      next unless task_in_namespace(task)
      node = TaskNode.new(task)
      @nodes[node.name] = node
    end

    # now add all of the edges and data nodes
    data_nodes = {}
    @nodes.each do |_name, node|
      job = node.task.job
      # find all prereq tasks
      # task.prerequisites.each do |prereq|
      #   prereq = find_prereq(task, prereq)
      #   next unless @nodes.has_key?(prereq)
      #   @edges << TaskEdge.new(prereq, node.name)
      # end

      # find all data reads: data -> task
      job.read_locks.each do |read_lock|
        data_name = RakeGraph.data_name(read_lock)
        data_nodes[data_name] ||= DataNode.new(read_lock)
        @edges << DataEdge.new(data_name, node.name)
      end

      # find all data writes: task -> data
      job.write_locks.each do |write_lock|
        data_name = RakeGraph.data_name(write_lock)
        data_nodes[data_name] ||= DataNode.new(write_lock)
        @edges << DataEdge.new(node.name, data_name)
      end
    end

    # Data nodes are merged in afterwards so @nodes is not modified while it
    # is being iterated.
    data_nodes.each do |key, value|
      @nodes[key] = value
    end
  end

  # Common base for graph nodes: a graph identifier plus a type string.
  class Node
    attr_reader :name, :type

    def initialize(name, type)
      @name = name
      @type = type
    end

    # Default label text colour.
    def fontcolor
      '#000000'
    end

    def to_s
      "#{@type}: #{@name}"
    end
  end

  # Node for a Rake task; its type is the class name of the task's job.
  class TaskNode < Node
    attr_reader :task

    # Fill colour per job class name; other job types get an empty colour.
    FILL_COLORS = {
      'Azkaban::PigJob'     => '#e7a5a5',
      'Azkaban::JavaJob'    => '#E7C6A5',
      'Azkaban::CommandJob' => '#e7e6a5'
    }.freeze

    def initialize(task)
      super(RakeGraph.task_name(task), task.job.class.to_s)
      @task = task
    end

    # HTML-like label: task name on the first line, the job's `uses` argument
    # on the second. Returned as a Symbol, as in the original implementation
    # (presumably so GraphvizR emits it unquoted -- confirm against GraphvizR).
    def label
      title = RakeGraph.escape_label(@task)
      detail = RakeGraph.escape_label(@task.job.uses_arg)
      "<#{title}<br/>#{detail}>".to_sym
    end

    def shape
      :ellipse
    end

    def fillcolor
      FILL_COLORS.fetch(@type, "")
    end
  end

  # Node for a data path named in a read or write lock.
  class DataNode < Node
    attr_reader :filename

    def initialize(filename)
      super(RakeGraph.data_name(filename), "data")
      @filename = filename
    end

    # HTML-like label holding the escaped file name, returned as a Symbol.
    def label
      "<#{RakeGraph.escape_label(@filename)}>".to_sym
    end

    def shape
      :box
    end

    def fillcolor
      '#d2e3f3'
    end
  end

  # Directed edge between two node identifiers. Subclasses set @type.
  class Edge
    attr_reader :source, :dest, :type

    def initialize(source, dest)
      @source = source
      @dest = dest
    end

    def to_s
      "#{source} >> #{dest}"
    end
  end

  # Task-to-task dependency edge, drawn dotted.
  class TaskEdge < Edge
    def initialize(source, dest)
      super(source, dest)
      @type = "task"
    end

    def style
      :dotted
    end
  end

  # Data read/write edge, drawn solid.
  class DataEdge < Edge
    def initialize(source, dest)
      super(source, dest)
      @type = "data"
    end

    def style
      :solid
    end
  end

  # Renders the graph.
  #
  # name        - graph name, also used as the graph label.
  # output_file - path handed to GraphvizR#output.
  def visualize(name, output_file)
    g = GraphvizR.new name
    g.graph[:label => name]
    add_nodes(g)
    add_edges(g)
    g.output output_file
  end

  # Declares every node on the GraphvizR graph. A registered customisation
  # block, when present, overrides the node's own attribute value.
  def add_nodes(g)
    @nodes.each do |name, node|
      g[name][:label => @label_block.nil? ? node.label : @label_block.call(node),
              :shape => @shape_block.nil? ? node.shape : @shape_block.call(node),
              :fillcolor => @fillcolor_block.nil? ? node.fillcolor : @fillcolor_block.call(node),
              :style => :filled,
              :fontcolor => @fontcolor_block.nil? ? node.fontcolor : @fontcolor_block.call(node)]
    end
  end

  # Declares every edge on the GraphvizR graph using the edge's own style.
  def add_edges(g)
    @edges.each do |edge|
      (g[edge.source] >> g[edge.dest])[:style => edge.style]
    end
  end

  # Registers a block that receives a node and returns its label.
  def set_label(&block)
    @label_block = block
  end

  # Registers a block that receives a node and returns its fill colour.
  def set_fillcolor(&block)
    @fillcolor_block = block
  end

  # Registers a block that receives a node and returns its font colour.
  def set_fontcolor(&block)
    @fontcolor_block = block
  end

  # Registers a block that receives a node and returns its shape.
  def set_shape(&block)
    @shape_block = block
  end
end
metadata CHANGED
@@ -1,41 +1,43 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: azkaban-rb
3
3
  version: !ruby/object:Gem::Version
4
- hash: 19
5
4
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 6
10
- version: 0.0.6
5
+ version: 0.0.7
11
6
  platform: ruby
12
7
  authors:
13
- - Matt Hayes
8
+ - Matt Hayes
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
12
 
18
- date: 2011-08-22 00:00:00 Z
13
+ date: 2011-08-23 00:00:00 -07:00
14
+ default_executable:
19
15
  dependencies:
20
- - !ruby/object:Gem::Dependency
21
- name: httpclient
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ~>
27
- - !ruby/object:Gem::Version
28
- hash: 7
29
- segments:
30
- - 2
31
- - 1
32
- - 6
33
- version: 2.1.6
34
- type: :runtime
35
- version_requirements: *id001
16
+ - !ruby/object:Gem::Dependency
17
+ name: httpclient
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 2.1.6
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: GraphvizR
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ~>
34
+ - !ruby/object:Gem::Version
35
+ version: 0.5.1
36
+ type: :runtime
37
+ version_requirements: *id002
36
38
  description: azkaban-rb allows Azkaban jobs to be modeled as rake tasks
37
39
  email:
38
- - matthew.terence.hayes@gmail.com
40
+ - matthew.terence.hayes@gmail.com
39
41
  executables: []
40
42
 
41
43
  extensions: []
@@ -43,22 +45,24 @@ extensions: []
43
45
  extra_rdoc_files: []
44
46
 
45
47
  files:
46
- - .gitignore
47
- - Gemfile
48
- - Rakefile
49
- - azkaban-rb.gemspec
50
- - example/.gitignore
51
- - example/Rakefile
52
- - example/bin/pig
53
- - example/data/input.txt
54
- - example/example_config.yml
55
- - example/hadoop-lzo-0.4.9.jar
56
- - example/pig-0.9.0-core.jar
57
- - example/src/test.pig
58
- - example/src/test2.pig
59
- - lib/azkaban-rb.rb
60
- - lib/azkaban-rb/tasks.rb
61
- - lib/azkaban-rb/version.rb
48
+ - .gitignore
49
+ - Gemfile
50
+ - Rakefile
51
+ - azkaban-rb.gemspec
52
+ - example/.gitignore
53
+ - example/Rakefile
54
+ - example/bin/pig
55
+ - example/data/input.txt
56
+ - example/example_config.yml
57
+ - example/hadoop-lzo-0.4.9.jar
58
+ - example/pig-0.9.0-core.jar
59
+ - example/src/test.pig
60
+ - example/src/test2.pig
61
+ - lib/azkaban-rb.rb
62
+ - lib/azkaban-rb/tasks.rb
63
+ - lib/azkaban-rb/version.rb
64
+ - lib/azkaban-rb/visualization.rb
65
+ has_rdoc: true
62
66
  homepage: https://github.com/matthayes/azkaban-rb
63
67
  licenses: []
64
68
 
@@ -66,29 +70,23 @@ post_install_message:
66
70
  rdoc_options: []
67
71
 
68
72
  require_paths:
69
- - lib
73
+ - lib
70
74
  required_ruby_version: !ruby/object:Gem::Requirement
71
75
  none: false
72
76
  requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- hash: 3
76
- segments:
77
- - 0
78
- version: "0"
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
79
80
  required_rubygems_version: !ruby/object:Gem::Requirement
80
81
  none: false
81
82
  requirements:
82
- - - ">="
83
- - !ruby/object:Gem::Version
84
- hash: 3
85
- segments:
86
- - 0
87
- version: "0"
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
88
86
  requirements: []
89
87
 
90
88
  rubyforge_project: azkaban-rb
91
- rubygems_version: 1.8.7
89
+ rubygems_version: 1.5.1
92
90
  signing_key:
93
91
  specification_version: 3
94
92
  summary: Azkaban job generation using Ruby