tengine_job 1.1.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (70) hide show
  1. checksums.yaml +15 -0
  2. data/Gemfile.lock +78 -48
  3. data/bin/tengine_job +71 -0
  4. data/examples/0004_retry_one_layer.rb +10 -7
  5. data/examples/0027_parallel_ssh_job +9 -0
  6. data/examples/0027_parallel_ssh_jobs.rb +14 -0
  7. data/lib/tengine/job.rb +19 -49
  8. data/lib/tengine/job/dsl.rb +13 -0
  9. data/lib/tengine/job/{dsl_binder.rb → dsl/binder.rb} +4 -4
  10. data/lib/tengine/job/{dsl_evaluator.rb → dsl/evaluator.rb} +2 -2
  11. data/lib/tengine/job/{dsl_loader.rb → dsl/loader.rb} +20 -22
  12. data/lib/tengine/job/runtime.rb +32 -0
  13. data/lib/tengine/job/{drivers → runtime/drivers}/job_control_driver.rb +46 -92
  14. data/lib/tengine/job/{drivers → runtime/drivers}/job_execution_driver.rb +14 -10
  15. data/lib/tengine/job/runtime/drivers/jobnet_control_driver.rb +240 -0
  16. data/lib/tengine/job/{drivers → runtime/drivers}/schedule_driver.rb +4 -4
  17. data/lib/tengine/job/{edge.rb → runtime/edge.rb} +79 -25
  18. data/lib/tengine/job/{executable.rb → runtime/executable.rb} +35 -15
  19. data/lib/tengine/job/{execution.rb → runtime/execution.rb} +19 -11
  20. data/lib/tengine/job/runtime/job_base.rb +5 -0
  21. data/lib/tengine/job/runtime/jobnet.rb +283 -0
  22. data/lib/tengine/job/runtime/junction.rb +44 -0
  23. data/lib/tengine/job/runtime/named_vertex.rb +95 -0
  24. data/lib/tengine/job/runtime/root_jobnet.rb +81 -0
  25. data/lib/tengine/job/{signal.rb → runtime/signal.rb} +99 -13
  26. data/lib/tengine/job/runtime/ssh_job.rb +486 -0
  27. data/lib/tengine/job/{jobnet → runtime}/state_transition.rb +6 -4
  28. data/lib/tengine/job/runtime/stoppable.rb +64 -0
  29. data/lib/tengine/job/runtime/vertex.rb +50 -0
  30. data/lib/tengine/job/structure.rb +20 -0
  31. data/lib/tengine/job/{category.rb → structure/category.rb} +9 -5
  32. data/lib/tengine/job/{jobnet/builder.rb → structure/edge_builder.rb} +11 -7
  33. data/lib/tengine/job/{element_selector_notation.rb → structure/element_selector_notation.rb} +15 -11
  34. data/lib/tengine/job/structure/jobnet_builder.rb +83 -0
  35. data/lib/tengine/job/structure/jobnet_finder.rb +60 -0
  36. data/lib/tengine/job/{name_path.rb → structure/name_path.rb} +2 -2
  37. data/lib/tengine/job/structure/tree.rb +20 -0
  38. data/lib/tengine/job/structure/visitor.rb +67 -0
  39. data/lib/tengine/job/template.rb +24 -0
  40. data/lib/tengine/job/template/edge.rb +37 -0
  41. data/lib/tengine/job/template/expansion.rb +24 -0
  42. data/lib/tengine/job/template/generator.rb +111 -0
  43. data/lib/tengine/job/template/jobnet.rb +83 -0
  44. data/lib/tengine/job/template/junction.rb +14 -0
  45. data/lib/tengine/job/{job.rb → template/named_vertex.rb} +3 -5
  46. data/lib/tengine/job/{root_jobnet_template.rb → template/root_jobnet.rb} +12 -26
  47. data/lib/tengine/job/template/ssh_job.rb +80 -0
  48. data/lib/tengine/job/template/vertex.rb +97 -0
  49. metadata +127 -93
  50. data/lib/tengine/job/connectable.rb +0 -43
  51. data/lib/tengine/job/drivers/jobnet_control_driver.rb +0 -249
  52. data/lib/tengine/job/end.rb +0 -32
  53. data/lib/tengine/job/expansion.rb +0 -37
  54. data/lib/tengine/job/fork.rb +0 -6
  55. data/lib/tengine/job/jobnet.rb +0 -184
  56. data/lib/tengine/job/jobnet/job_state_transition.rb +0 -167
  57. data/lib/tengine/job/jobnet/jobnet_state_transition.rb +0 -110
  58. data/lib/tengine/job/jobnet_actual.rb +0 -84
  59. data/lib/tengine/job/jobnet_template.rb +0 -10
  60. data/lib/tengine/job/join.rb +0 -6
  61. data/lib/tengine/job/junction.rb +0 -29
  62. data/lib/tengine/job/killing.rb +0 -30
  63. data/lib/tengine/job/mm_compatibility.rb +0 -6
  64. data/lib/tengine/job/mm_compatibility/connectable.rb +0 -13
  65. data/lib/tengine/job/root.rb +0 -16
  66. data/lib/tengine/job/root_jobnet_actual.rb +0 -58
  67. data/lib/tengine/job/script_executable.rb +0 -235
  68. data/lib/tengine/job/start.rb +0 -20
  69. data/lib/tengine/job/stoppable.rb +0 -15
  70. data/lib/tengine/job/vertex.rb +0 -181
@@ -0,0 +1,44 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'tengine/job/runtime'
3
+
# Base Vertex that Fork and Join inherit from. It holds no state of its own.
class Tengine::Job::Runtime::Junction < Tengine::Job::Runtime::Vertex

  # Completes the edge the signal arrived on (closed edges are left alone)
  # and then activates this junction, unless one of the incoming edges still
  # answers true to alive_or_closing? (alive_or_closing_or_closed? when the
  # execution is a retry).
  #
  # Deciding with transmitted? alone cannot take closed edges into account,
  # which is why the alive-style predicates are used instead of the former
  #   activate(signal) if prev_edges.all?(&:transmitted?)
  #
  # Diagram: https://cacoo.com/diagrams/hdLgrzYsTBBpV3Wj#D26C1
  def transmit(signal)
    complete_origin_edge(signal, :except_closed => true)
    blocking = signal.execution.retry ? :alive_or_closing_or_closed? : :alive_or_closing?
    incoming = signal.cache(prev_edges)
    return if incoming.any?(&blocking)
    activate(signal)
  end

  # True when every incoming edge reports transmitted?.
  def activatable?
    prev_edges.all? { |edge| edge.transmitted? }
  end

  # Writes a debug trace (a marker line, the source location, this junction
  # and its cached parent) and then lets the signal leave this vertex.
  def activate(signal)
    log = Tengine.logger
    log.debug("a" * 100)
    log.debug("#{__FILE__}##{__LINE__}")
    log.debug("#{object_id} #{inspect}")
    cached_parent = signal.cache(parent)
    log.debug("#{cached_parent.object_id} #{cached_parent.inspect}")
    signal.leave(self)
  end

  # Lets the signal leave this vertex with the :reset action.
  def reset(signal)
    signal.leave(self, :reset)
  end

end
36
+
37
+
38
+ # Branching Vertex that passes a Signal on from one Vertex to several Vertexes.
39
+ class Tengine::Job::Runtime::Fork < Tengine::Job::Runtime::Junction
40
+ end
41
+
42
+ # Merging Vertex that waits for several Vertexes to finish and then passes a Signal on to one Vertex.
43
+ class Tengine::Job::Runtime::Join < Tengine::Job::Runtime::Junction
44
+ end
@@ -0,0 +1,95 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'tengine/job/runtime'
3
+
# A Vertex that stands for a unit of work. It is the ancestor of
# Tengine::Job::Script, which actually runs something, and of
# Tengine::Job::Runtime::Jobnet.
class Tengine::Job::Runtime::NamedVertex < Tengine::Job::Runtime::Vertex
  field :name, :type => String # name of the job

  validates :name, :presence => true

  # Version used for optimistic locking; incremented on every update.
  # It allows each jobnet or leaf SshJob to be locked individually.
  include Tengine::Core::OptimisticLock
  set_locking_field :version
  field :version, :type => Integer, :default => 0

  # Returns the resource identifier of this vertex, built from the host name,
  # the current process id, the root id and this vertex's id. Memoized.
  def name_as_resource
    @name_as_resource ||= begin
      segments = [Tengine::Event.host_name, Process.pid, root.id, id]
      "job:#{segments.map { |segment| segment.to_s }.join('/')}"
    end
  end

  # Compact one-line description: class name (left-justified to 30 columns),
  # id and name.
  # NOTE(review): the format string emits a literal "%" right after "#<";
  # that looks accidental but is kept as is — confirm before changing.
  def short_inspect
    format("#<%%%-30s id: %s name: %s>", self.class.name, id.to_s, name)
  end

  # Execution and stopping happen per leaf job or per jobnet.
  include Tengine::Job::Runtime::Executable
  include Tengine::Job::Runtime::Stoppable

  # Walks up the parents and returns the first one flagged was_expansion, or
  # the topmost vertex when none is. Raises when a vertex without a parent is
  # not a Jobnet.
  def root_or_expansion
    owner = parent
    if owner.nil?
      raise "something wrong!" unless is_a?(Tengine::Job::Runtime::Jobnet)
      return self
    end
    owner.was_expansion ? owner : owner.root_or_expansion
  end

  # Re-entrant wrapper around the inherited update_with_lock.
  #
  # When called while an update_with_lock on this same object is already in
  # progress, the block is simply yielded to and nil is returned; otherwise
  # the call is forwarded to super. Tengine::Job.test_harness_hook is invoked
  # before and after each step with a message naming the step.
  def update_with_lock(*args)
    notify = lambda do |step|
      Tengine::Job.test_harness_hook("[#{self.class.name}] #{name_path} #{step}")
    end

    if @in_update_with_lock
      notify.call("before yield in nested update_with_lock")
      yield if block_given?
      notify.call("after yield in nested update_with_lock")
      return
    end

    @in_update_with_lock = true
    begin
      notify.call("before update_with_lock")
      super(*args) do
        notify.call("before yield in update_with_lock")
        yield if block_given?
        notify.call("after yield in update_with_lock")
      end
      notify.call("after update_with_lock")
    ensure
      @in_update_with_lock = false
    end
  end

  # Looks this vertex up in the template of root_or_expansion by its name
  # path up to the expansion. Returns nil when there is no template.
  # (While being generated from a template, parent may still be nil.)
  def template_vertex
    template = root_or_expansion.template
    return nil if template.nil?
    template.vertex_by_absolute_name_path(name_path_until_expansion)
  end

  # Schedules a reset of everything reachable through this vertex's outgoing
  # edges.
  #
  # Nothing is done when there is no parent, when this vertex is outside the
  # execution's scope, or when the cached parent is already in signal.paths.
  # Otherwise the edges and vertexes found by a TraceEdge visitor are
  # collected, and a callback is queued with signal.call_later that
  #   * sets the in-scope edges to :active under the parent's update_with_lock,
  #   * queues a further callback calling reset on each in-scope
  #     NamedVertex / End.
  def reset_followings(signal)
    return if parent.nil?
    return unless signal.execution.in_scope?(self)
    edge_owner = signal.cache(parent)
    return if signal.paths.include?(edge_owner)

    traced_edges = []
    traced_vertexes = []
    visitor = Tengine::Job::Structure::Visitor::TraceEdge.new do |element|
      if element.is_a?(Tengine::Job::Runtime::Edge)
        traced_edges << element
      else
        traced_vertexes << element
      end
    end
    (next_edges || []).each { |edge| edge.accept_visitor(visitor) }
    signal.paths << edge_owner

    signal.call_later do
      signal.cache(edge_owner).update_with_lock do
        traced_edges.each do |edge|
          next unless signal.execution.in_scope?(edge.destination)
          signal.cache(edge).phase_key = :active
        end
      end
      traced_vertexes.each do |vertex|
        next unless signal.execution.in_scope?(vertex)
        resettable = vertex.is_a?(Tengine::Job::Runtime::NamedVertex) ||
          vertex.is_a?(Tengine::Job::Runtime::End)
        next unless resettable
        signal.call_later do
          signal.cache(vertex).reset(signal)
        end
      end
    end
  end

end
@@ -0,0 +1,81 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'tengine/job/runtime'
3
+
# Vertex representing a root jobnet that is registered by evaluating the DSL.
class Tengine::Job::Runtime::RootJobnet < Tengine::Job::Runtime::Jobnet

  belongs_to :category, :inverse_of => :runtime_root_jobnets, :index => true, :class_name => "Tengine::Job::Structure::Category"

  has_many :executions, :inverse_of => :root_jobnet, :class_name => "Tengine::Job::Runtime::Execution"

  # Creates an Execution for this root jobnet and fires
  # start.execution.job.tengine for it. The root jobnet itself is saved first
  # when it is still a new record.
  #
  # options - attributes for the new Execution. A :sender entry, when present,
  #           is used to fire the event and is not passed to the Execution;
  #           otherwise Tengine::Event.default_sender is used. nil is treated
  #           as "no options".
  #
  # The hash passed by the caller is copied before use, so it is never
  # modified.
  #
  # Returns the created Tengine::Job::Runtime::Execution.
  def execute(options = {})
    options = (options || {}).dup
    event_sender = options.delete(:sender) || Tengine::Event.default_sender
    with(safe: safemode(self.class.collection)).save! if new_record?
    result = Tengine::Job::Runtime::Execution.with(
      safe: safemode(Tengine::Job::Runtime::Execution.collection)
    ).create!(
      options.merge(:root_jobnet_id => self.id)
    )
    event_sender.fire(:"start.execution.job.tengine", :properties => {
        :execution_id => result.id,
        :root_jobnet_id => self.id,
        :target_jobnet_id => self.id
      })
    result
  end

  # Creates a retry Execution for this root jobnet and fires
  # start.execution.job.tengine once the sender has a connection.
  #
  # args - ids of the vertexes to re-run (nested arrays are flattened),
  #        optionally followed by an options hash. A :sender entry in that
  #        hash selects the event sender; the remaining entries become
  #        attributes of the Execution.
  #
  # The options hash passed by the caller is copied before use, so it is
  # never modified.
  #
  # Returns the saved Tengine::Job::Runtime::Execution.
  def rerun(*args)
    options = args.extract_options!.dup
    sender = options.delete(:sender) || Tengine::Event.default_sender
    options = options.merge({
        :retry => true,
        :root_jobnet_id => self.id,
      })
    result = Tengine::Job::Runtime::Execution.new(options)
    result.target_actual_ids ||= []
    result.target_actual_ids += args.flatten
    result.with(safe: safemode(Tengine::Job::Runtime::Execution.collection)).save!
    sender.wait_for_connection do
      sender.fire(:'start.execution.job.tengine', :properties => {
          :execution_id => result.id.to_s
        })
    end
    result
  end

  # Creates an Execution for this root jobnet and fires stop.jobnet.job.tengine
  # with stop_reason "user_stop" from inside EM.run.
  # NOTE(review): EM.run does not return until the reactor is stopped;
  # presumably Tengine::Event.fire takes care of that — confirm.
  #
  # Returns the created Tengine::Job::Runtime::Execution.
  def fire_stop_event(options = Hash.new)
    root_jobnet_id = self.id.to_s
    result = Tengine::Job::Runtime::Execution.create!(
      options.merge(:root_jobnet_id => root_jobnet_id))

    EM.run do
      Tengine::Event.fire(:"stop.jobnet.job.tengine",
        :source_name => name_as_resource,
        :properties => {
          :execution_id => result.id.to_s,
          :root_jobnet_id => root_jobnet_id,
          :target_jobnet_id => root_jobnet_id.to_s,
          :stop_reason => "user_stop",
        })
    end

    return result
  end

  # For a new record, returns the stored root jobnet with the same name and
  # dsl_version (nil when there is none). Returns nil for persisted records.
  def find_duplication
    return nil unless self.new_record?
    self.class.find_by_name(name, :version => self.dsl_version)
  end

  class << self
    # Overrides the class method find_by_name defined in
    # Tengine::Core::FindByName: the lookup is restricted to options[:version],
    # or to Tengine::Core::Setting.dsl_version when no version is given.
    def find_by_name(name, options = {})
      version = options[:version] || Tengine::Core::Setting.dsl_version
      where({:name => name, :dsl_version => version}).first
    end
  end

  # Defines a <key>_class class method for every key of VERTEX_CLASSES that
  # delegates to the method of the same name on Tengine::Job::Runtime::Jobnet.
  Tengine::Job::Runtime::Jobnet::VERTEX_CLASSES.keys.each do |key|
    instance_eval("def #{key}_class; Tengine::Job::Runtime::Jobnet.#{key}_class; end", __FILE__, __LINE__)
  end
end
@@ -1,7 +1,7 @@
1
1
  # -*- coding: utf-8 -*-
2
- require 'tengine/job'
2
+ require 'tengine/job/runtime'
3
3
 
4
- class Tengine::Job::Signal
4
+ class Tengine::Job::Runtime::Signal
5
5
 
6
6
  class Error < StandardError
7
7
  end
@@ -17,7 +17,8 @@ class Tengine::Job::Signal
17
17
  # 一度のroot_jobnet.update_with_lock では :starting が保存されないので、
18
18
  # 2回のroot_jobnet.update_with_lock に分けることができるようにするための
19
19
  # 処理を記憶しておく属性です
20
- attr_accessor :callback
20
+ attr_reader :callback
21
+ attr_reader :callbacks
21
22
 
22
23
  def initialize(event)
23
24
  @event = event
@@ -25,29 +26,114 @@ class Tengine::Job::Signal
25
26
  end
26
27
 
# Clears all per-event working state: the object cache, the visited paths,
# the reservations, the data payload, the legacy single callback and the
# queue of deferred callbacks.
def reset
  @cache = {}
  @paths, @reservations = [], []
  @data = @callback = nil
  @callbacks = []
end
36
+
# Deprecated setter for the single legacy callback. Logs a warning that
# includes the caller's backtrace, then stores the value.
def callback=(value)
  backtrace = caller.join("\n ")
  Tengine.logger.warn("Tengine::Job::Runtime::Signal#callback= is deprecated. Use call_later instead of it.\n " + backtrace)
  @callback = value
end
41
+
# Queues the given block; process_callbacks runs queued blocks in the order
# they were added. Returns the queue.
def call_later(&block)
  @callbacks << block
end
45
+
# Runs the deferred work collected on this signal.
#
# First the call_later queue is drained in FIFO order (blocks queued while
# draining are run too; a separator line is logged before each one). After
# that the legacy single callback is run repeatedly for as long as one is
# set; it is cleared before being called so that it may install a successor.
def process_callbacks
  loop do
    break if callbacks.empty?
    Tengine.logger.debug("-" * 20)
    callbacks.shift.call
  end

  loop do
    break unless callback
    legacy = @callback
    @callback = nil
    legacy.call
  end
end
57
+
# Registers obj in the cache under its cache_key unless an object is already
# stored there.
#
# obj - a single object, an Array of objects (each element is remembered),
#       or nil.
#
# Returns nil for nil, the previously cached object when the key was already
# taken, and obj itself otherwise (for an Array, the Array that was passed).
def remember(obj)
  return nil if obj.nil?

  if obj.is_a?(Array)
    obj.each { |element| remember(element) }
    return obj
  end

  key = cache_key(obj)
  existing = cache(*key)
  return existing if existing
  @cache[key] = obj
  obj
end
70
+
# Cache lookup.
#
# With two arguments (class name, id string) the entry stored under that key
# is returned, or nil. With one argument the cached counterpart of the given
# object is returned, remembering the object first when it is not cached
# yet; an Array is mapped element by element and nil yields nil.
#
# Raises ArgumentError for any other number of arguments.
def cache(*args)
  return @cache[args] if args.length == 2

  unless args.length == 1
    raise ArgumentError, "#{self.class.name}#cache requires 1 or 2 arguments"
  end

  target = args.first
  return nil if target.nil?
  return target.map { |element| cache(element) } if target.is_a?(Array)

  cache(*cache_key(target)) || remember(target)
end
87
+
# Remembers every object that an AllWithEdge visitor yields while the given
# vertex accepts it.
def remember_all(vertex)
  collector = Tengine::Job::Structure::Visitor::AllWithEdge.new do |visited|
    remember(visited)
  end
  vertex.accept_visitor(collector)
end
92
+
# Key under which obj is cached: its class name and its id as a String.
def cache_key(obj)
  class_name = obj.class.name
  [class_name, obj.id.to_s]
end
96
+
# Dumps the cache to the debug log: a rule, the source location, this
# signal's object_id, one line per entry (object_id, key, inspect and a
# " CHANGED" suffix when the entry reports changed?) and a closing rule.
def cache_list
  log = Tengine.logger
  log.debug("-" * 100)
  log.debug("#{__FILE__}##{__LINE__}")
  log.debug("object_id: #{object_id}")
  @cache.each do |key, entry|
    suffix = entry.changed? ? " CHANGED" : ""
    log.debug("#{entry.object_id} #{key.inspect} #{entry.inspect}#{suffix}")
  end
  log.debug("-" * 100)
end
106
+
# Returns the distinct cached objects that report changed?, with every
# changed Edge replaced by its owner.
def changed_vertecs
  dirty = @cache.values.select { |entry| entry.changed? }
  holders = dirty.map do |entry|
    entry.is_a?(Tengine::Job::Runtime::Edge) ? entry.owner : entry
  end
  holders.uniq
end
33
112
 
34
113
  def execution
35
- @execution ||= Tengine::Job::Execution.find(event[:execution_id])
114
+ @execution ||= Tengine::Job::Runtime::Execution.find(event[:execution_id])
36
115
  end
37
116
 
38
117
  def leave(obj, action = :transmit)
39
118
  @paths << obj
40
119
  begin
41
- if obj.is_a?(Tengine::Job::Edge)
42
- obj.destination.send(action, self)
43
- elsif obj.is_a?(Tengine::Job::Vertex)
120
+ if obj.is_a?(Tengine::Job::Runtime::Edge)
121
+ if obj.destination.is_a?(Tengine::Job::Runtime::NamedVertex)
122
+ self.call_later do
123
+ cache(obj.destination).send(action, self)
124
+ end
125
+ else
126
+ cache(obj.destination).send(action, self)
127
+ end
128
+ elsif obj.is_a?(Tengine::Job::Runtime::Vertex)
44
129
  obj.next_edges.each do |edge|
45
- with_paths_backup{ edge.send(action, self) }
130
+ # cache_list
131
+ with_paths_backup{ cache(edge).send(action, self) }
46
132
  end
47
133
  else
48
- raise Tengine::Job::Signal::Error, "leaving unsupported object: #{obj.inspect}"
134
+ raise Tengine::Job::Runtime::Signal::Error, "leaving unsupported object: #{obj.inspect}"
49
135
  end
50
- rescue Tengine::Job::Signal::Error => e
136
+ rescue Tengine::Job::Runtime::Signal::Error => e
51
137
  puts "[#{e.class.name}] #{e.message}\nsignal.paths: #{@paths.inspect}"
52
138
  raise e
53
139
  end
@@ -77,7 +163,7 @@ class Tengine::Job::Signal
77
163
 
78
164
  def fire(source, event_type_name, properties = {}, options = {})
79
165
  case source
80
- when Tengine::Job::Execution then
166
+ when Tengine::Job::Runtime::Execution then
81
167
  properties[:execution_id] ||= source.id.to_s
82
168
  properties[:root_jobnet_id] ||= source.root_jobnet.id.to_s
83
169
  properties[:root_jobnet_name_path] ||= source.root_jobnet.name_path
@@ -106,8 +192,8 @@ class Tengine::Job::Signal
106
192
  def activate(signal); raise NotImplementedError; end
107
193
 
108
194
  def complete_origin_edge(signal, options = {})
109
- origin_edge = signal.paths.last
110
- origin_edge ||= prev_edges.first
195
+ origin_edge = signal.cache(signal.paths.last)
196
+ origin_edge ||= signal.cache(prev_edges.first)
111
197
  begin
112
198
  return if options[:except_closed] && origin_edge.closed?
113
199
  origin_edge.complete(signal)
@@ -0,0 +1,486 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'tengine/job/runtime'
3
+
4
+ require 'tengine/resource/net_ssh'
5
+
6
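+ # Job Vertex whose script is run on a remote server over SSH. (The original Japanese comment described the template vertex that expands a root jobnet inside another jobnet; it appears to have been copied here by mistake.)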
7
+ class Tengine::Job::Runtime::SshJob < Tengine::Job::Runtime::JobBase
8
+
# Error raised for failures while running a job over SSH.
class Error < StandardError; end
11
+
12
+ include Tengine::Core::CollectionAccessible
13
+ include Tengine::Job::Template::SshJob::Settings
14
+
15
+ include Tengine::Job::Runtime::StateTransition
16
+
17
+ field :executing_pid, :type => String # PID of the process being executed
18
+ field :exit_status , :type => String # exit status returned by the finished process
19
+ field :error_messages, :type => Array # messages recorded when the job fails; messages from a re-run are appended to the end
20
+ array_text_accessor :error_messages, :delimeter => "\n"
21
+
22
+ before_validation :prepare_server_and_credential
23
+
# before_validation hook: fills a blank server_name / credential_name from
# the template vertex's actual_server_name / actual_credential_name. Does
# nothing when there is no template vertex.
def prepare_server_and_credential
  template = template_vertex
  return unless template
  self.server_name = template.actual_server_name if server_name.blank?
  self.credential_name = template.actual_credential_name if credential_name.blank?
end
30
+
# Builds the command for this job, runs it over SSH and extracts the PID
# that the command prints.
#
# The output must contain exactly one line consisting only of digits;
# otherwise an error message is recorded on the job and Error is raised.
# When the execution carries a signal, a callback is queued on it that
# stores the PID in signal.data and calls ack on a freshly fetched copy of
# this job.
#
# Any exception is logged with its backtrace and re-raised.
#
# NOTE(review): @acked_pid is never assigned in the code visible here, so the
# first line looks like a leftover — confirm.
def run(execution)
  return ack(@acked_pid) if @acked_pid
  command = build_command(execution)
  # puts "cmd:\n" << cmd
  execute(command) do |_channel, output|
    found = output.strip.scan(/^\d+$/)
    if found.empty?
      add_error_message("expected a set of numeric charactors but not found in: #{output.inspect}")
      raise Error, "Failure to execute #{self.name_path} via SSH. expected numeric charactors but got: #{output}"
    elsif found.length > 1
      add_error_message("expected a set of numeric charactors but got #{found.length} sets of numeric charactoers #{found.inspect} in #{output.inspect}")
      raise Error, "Failure to execute #{self.name_path} via SSH. expected numeric charactors but got: #{output}"
    end

    pid = found.first.strip
    Tengine.logger.info("got pid: #{pid.inspect}")

    signal = execution.signal
    if signal
      signal.call_later do
        signal.data = {:executing_pid => pid}

        # This block may be run several times by update_with_lock. An object
        # that was not reloaded still carries the changes of the first run,
        # so working on it again is wrong:
        #   self.ack(signal) # NG
        #
        # Each time the block runs, the object with the same id as self has
        # to be fetched from root anew.
        fresh_job = root.vertex(self.id)
        fresh_job.ack(signal)
      end
    end
  end
rescue Exception => failure
  Tengine.logger.error("[#{failure.class}] #{failure.message}\n " << failure.backtrace.join("\n "))
  raise
end
68
+
# Drives an interactive login shell over an SSH channel: it sends one
# preparatory command, then the script followed by an echo of a marker token,
# collects the output lines up to the marker, hands them to the callback and
# finally sends "exit".
#
# The channel only needs to respond to on_data, on_process and send_data.
class ShellClient
  # channel  - the shell channel to talk to.
  # script   - the command line to run. It is never modified.
  # callback - object responding to call(channel, result), or nil.
  def initialize(channel, script, callback)
    @channel, @script, @callback = channel, script, callback
    @status = :preparing # one of :preparing, :waiting, :exiting
  end

  # Installs the channel handlers: incoming data is appended to a buffer and
  # logged; on every processing tick each complete line in the buffer
  # (including its trailing "\n") is removed and passed to dispatch.
  def setup
    @data = "".dup
    @result = nil

    @channel.on_data do |_ch, data|
      @data << data
      Tengine.logger.info("got STDOUT data: #{data.inspect}")
    end

    @channel.on_process do |_ch|
      while (newline_at = @data.index("\n"))
        dispatch(@data.slice!(0..newline_at))
      end
    end
  end

  # Handles one received line according to the current status:
  #   :preparing - the first line (whatever it contains) triggers execute
  #   :waiting   - the marker token ends the run, any other line is collected
  #   :exiting   - ignored
  # Raises Error for an unknown status.
  def dispatch(line)
    case @status
    when :preparing
      execute
    when :waiting
      if line.strip == one_time_token
        returns
      else
        @result << line
      end
    when :exiting
      # nothing to do
    else
      raise Error, "Unknown shell channel status: #{@status.inspect}"
    end
  end

  # Starts the conversation; execute and returns are called from dispatch.
  def start
    prepare
  end

  # Sends the preparatory command that clears the shell prompt (PS1).
  def prepare
    cmd = "export PS1=;"
    Tengine.logger.info("now exec on ssh: \"#{cmd}\"")
    @channel.send_data("#{cmd}\n")
  end

  # Sends the script, followed by an echo of the marker token, and starts
  # collecting output. The script is sent as binary; a copy is re-tagged so
  # that the caller's string is left untouched and frozen scripts work.
  def execute
    actual = @script.dup.force_encoding("binary")
    Tengine.logger.info("now exec on ssh: #{@script}")
    @result = "".dup
    @status = :waiting
    @channel.send_data(actual + "; echo \"#{one_time_token}\"\n")
  end

  # Hands the collected output to the callback (if any) and closes the shell.
  def returns
    @callback.call(@channel, @result) if @callback
    @status = :exiting
    @channel.send_data("exit\n")
  end

  # Marker echoed after the script to detect the end of its output.
  # NOTE(review): despite the name this is a fixed string, so a script that
  # prints a line equal to it would end collection early — confirm whether a
  # random token was intended.
  def one_time_token
    "one_time_token"
  end
end
139
+
# Opens an SSH session to actual_server, requests a pty, starts a login
# shell and lets a ShellClient run cmd in it; block is handed to the
# ShellClient as its callback. Waits until the channel is finished.
#
# Anything written to the extended data stream is recorded as an error
# message and raises Error. Mongoid::Errors::DocumentNotFound, SocketError
# and Net::SSH::AuthenticationFailed are re-raised as Error (keeping the
# original backtrace); every other exception propagates unchanged.
#
# NOTE(review): the shell path comes from ENV['SHELL'] of the local process,
# not from the remote account — confirm that this is intended.
# See http://net-ssh.github.com/ssh/v2/api/classes/Net/SSH/Connection/Channel.html
def execute(cmd, &block)
  raise "actual_server not found for #{self.name_path.inspect}" unless actual_server
  Tengine.logger.info("connecting to #{actual_server.hostname_or_ipv4}")
  ssh_options = {
    :port => actual_server.properties["ssh_port"] || 22,
    :logger => Tengine.logger,
    :keys_only => (actual_credential.auth_type_cd == :ssh_public_key),
  }
  Net::SSH.start(actual_server.hostname_or_ipv4, actual_credential, ssh_options) do |session|
    opened = session.open_channel do |raw_channel|
      raw_channel.request_pty do |pty_channel, pty_granted|
        raise Error, "failed to request_pty" unless pty_granted

        pty_channel.exec("#{ENV['SHELL']} -l") do |shell_channel, shell_started|
          raise Error, "failed to \"#{ENV['SHELL']} -l\"" unless shell_started

          shell_channel.on_extended_data do |_ch, _type, text|
            add_error_message(text)
            raise Error, "Failure to execute #{self.name_path} via SSH: #{text}"
          end

          shell_client = ShellClient.new(shell_channel, cmd, block)
          shell_channel[:client] = shell_client
          shell_client.setup
          shell_client.start
        end
      end
    end
    opened.wait
  end
rescue Mongoid::Errors::DocumentNotFound, SocketError, Net::SSH::AuthenticationFailed => cause
  wrapped = Error.new("[#{cause.class.name}] #{cause.message}")
  wrapped.set_backtrace(cause.backtrace)
  raise wrapped
end
178
+
# Runs tengine_job_agent_kill over SSH with the job's PID, the interval
# between killing signals and the comma-separated list of killing signals.
# A warning (with the caller's backtrace) is logged when the PID is blank;
# the command is sent regardless. The execution argument is not used.
def kill(execution)
  if self.executing_pid.blank?
    Tengine.logger.warn("PID is blank when kill!!\n#{self.inspect}\n " << caller.join("\n "))
  end

  kill_arguments = [
    self.executing_pid,
    self.actual_killing_signal_interval,
    self.actual_killing_signals.join(","),
  ]
  kill_command = executable_command("tengine_job_agent_kill %s %d %s" % kill_arguments)
  execute([kill_command].join(' && '))
end
195
+
196
+ # def ack(pid)
197
+ # @acked_pid = pid
198
+ # self.executing_pid = pid
199
+ # self.phase_key = :running
200
+ # self.previous_edges.each{|edge| edge.status_key = :transmitted}
201
+ # end
202
+
# Assembles the shell command line for this job. The parts are joined with
# " && " in this order:
#   1. "export ..." with the variables from build_mm_env (plus the Hadoop
#      job list for jobnets of type :hadoop_job_run),
#   2. the result of the template job's preparation block, if it is not blank,
#   3. the execution's preparation_command, if it is not blank,
#   4. the runner followed by the job's script.
#
# Raises when a template exists but does not contain this job.
def build_command(execution)
  exports = build_mm_env(execution).map { |name, value| "#{name}=#{value}" }.join(" ")
  # Hadoop jobs get an additional environment variable.
  if is_a?(Tengine::Job::Runtime::Jobnet) && (jobnet_type_key == :hadoop_job_run)
    exports = "#{exports} #{hadoop_job_env}"
  end
  parts = ["export #{exports}"]

  template_root = parent ? root_or_expansion.template : nil
  if template_root
    template_job = template_root.vertex_by_name_path(self.name_path_until_expansion)
    if template_job.nil?
      raise "job not found #{self.name_path_until_expansion.inspect} in #{template_root.inspect}"
    end
    loader = Tengine::Job::Dsl::Loader
    stored_block = loader.template_block_store[loader.template_block_store_key(template_job, :preparation)]
    if stored_block
      prepared = instance_eval(&stored_block)
      parts << prepared unless prepared.blank?
    end
  end

  execution_preparation = execution.preparation_command
  parts << execution_preparation unless execution_preparation.blank?

  # The user's script is not run directly: it is handed to
  # tengine_job_agent/bin/tengine_job_agent_run so that the process can be
  # monitored and force-stopped. tengine_job_agent_run prints the PID of the
  # process it started to standard output.
  runner = ENV["MM_RUNNER_PATH"] || executable_command("tengine_job_agent_run")
  # Whether to implement runner options is still undecided:
  #   " --stdout" if execution.keeping_stdout
  #   " --stderr" if execution.keeping_stderr
  # Runner options, once supported, have to be placed before "--".
  parts << "#{runner} #{self.script}"
  parts.join(" && ")
end
243
+
# Prepends the value of the MM_CMD_PREFIX environment variable and a space
# to command; returns command unchanged when the variable is not set.
def executable_command(command)
  prefix = ENV["MM_CMD_PREFIX"]
  return command unless prefix
  "#{prefix} #{command}"
end
251
+
# Returns the Hash of environment variables handed to the shell script that
# MM runs.
#   MM_ACTUAL_JOB_ID                : ID in MM of the leaf job being run
#   MM_ACTUAL_JOB_ANCESTOR_IDS      : IDs in MM of the leaf job's ancestors, joined with semicolons (per template job)
#   MM_FULL_ACTUAL_JOB_ANCESTOR_IDS : IDs in MM of the leaf job's ancestors, joined with semicolons (as expanded from expansions)
#   MM_ACTUAL_JOB_NAME_PATH         : name_path of the leaf job being run
#   MM_ACTUAL_JOB_SECURITY_TOKEN    : one-time security token for calling the public API
#   MM_TEMPLATE_JOB_ID              : ID of the template job (the job the leaf job was created from)
#   MM_TEMPLATE_JOB_ANCESTOR_IDS    : IDs in MM of the template job's ancestors, joined with semicolons
#   MM_SCHEDULE_ID                  : ID of the execution schedule
#   MM_SCHEDULE_ESTIMATED_TIME      : estimated time of the execution schedule, in minutes
#   MM_SCHEDULE_ESTIMATED_END       : estimated end time of the execution schedule as YYYYMMDDHHMMSS
#                                     (a standard format that can express the time zone, such as ISO 8601, might be better?)
#   MM_MASTER_SCHEDULE_ID           : ID of the master schedule, if any; not set when there is none
#
# Not implemented:
#   MM_FAILED_JOB_ID           : ID in MM of the failed job, set while running jobs inside recover/finally after a failure
#   MM_FAILED_JOB_ANCESTOR_IDS : IDs in MM of the failed job's ancestors joined with semicolons, set in the same situation
#
# Raises when a root template exists but the template job cannot be found.
def build_mm_env(execution)
  quoted_ids = lambda do |vertexes|
    '"%s"' % vertexes.map { |vertex| vertex.id.to_s }.join(';')
  end

  env = {
    "MM_SERVER_NAME" => actual_server_name, # [added as a Tengine feature] name of the server that runs the job
    "MM_ROOT_JOBNET_ID" => root.id.to_s,
    "MM_TARGET_JOBNET_ID" => (parent ? parent.id.to_s : nil),
    "MM_ACTUAL_JOB_ID" => id.to_s,
    "MM_ACTUAL_JOB_ANCESTOR_IDS" => quoted_ids.call(ancestors_until_expansion),
    "MM_FULL_ACTUAL_JOB_ANCESTOR_IDS" => quoted_ids.call(ancestors),
    "MM_ACTUAL_JOB_NAME_PATH" => name_path.dump,
    "MM_ACTUAL_JOB_SECURITY_TOKEN" => "", # TODO: generate the token
    "MM_SCHEDULE_ID" => execution.id.to_s,
    "MM_SCHEDULE_ESTIMATED_TIME" => execution.estimated_time,
  }

  estimated_end = execution.actual_estimated_end
  if estimated_end
    env["MM_SCHEDULE_ESTIMATED_END"] = estimated_end.strftime("%Y%m%d%H%M%S")
  end

  root_template = parent ? root.template : nil
  if root_template
    template_job = root_template.find_descendant_by_name_path(self.name_path)
    if template_job.nil?
      # Fall back to the root template named by the first segment of the
      # name path up to the expansion, in the same DSL version.
      segments = self.name_path_until_expansion.
        split(Tengine::Job::Structure::NamePath::SEPARATOR).
        reject { |segment| segment.empty? }
      root_template = Tengine::Job::Template::RootJobnet.find_by_name(segments.first, :version => root_template.dsl_version)
      unless root_template
        raise "Tengine::Job::Template::RootJobnet not found #{self.name_path_until_expansion.inspect}"
      end
      template_job = root_template.find_descendant_by_name_path(self.name_path_until_expansion)
      raise "template job #{name_path.inspect} not found in #{root_template.inspect}" unless template_job
    end
    env.update({
        "MM_TEMPLATE_JOB_ID" => template_job.id.to_s,
        "MM_TEMPLATE_JOB_ANCESTOR_IDS" => quoted_ids.call(template_job.ancestors),
      })
  end
  # if ms = execution.master_schedule
  #   env.update({
  #     "MM_MASTER_SCHEDULE_ID" => ms.id.to_s,
  #   })
  # end
  env
end
308
+
# Builds the MM_HADOOP_JOBS assignment: for every child that is a Jobnet of
# type :hadoop_job, its name and id separated by a literal "\t" and followed
# by a literal "\n" (backslash sequences, not control characters).
def hadoop_job_env
  hadoop_jobs = children.select do |child|
    child.is_a?(Tengine::Job::Runtime::Jobnet) && (child.jobnet_type_key == :hadoop_job)
  end
  listing = hadoop_jobs.map { |job| "#{job.name}\\t#{job.id.to_s}\\n" }.join
  "MM_HADOOP_JOBS=\"#{listing}\""
end
314
+
# Appends msg to error_messages, starting from an empty list when none has
# been recorded yet. A new Array is assigned rather than mutating the
# existing one.
def add_error_message(msg)
  recorded = self.error_messages || []
  self.error_messages = recorded + [msg]
end
319
+
320
+
321
+ ## State transition actions
322
+
# Handled by: jobnet control driver.
# Puts the job into :ready, records the event time as started_at and fires
# start.job.job.tengine for this job and its parent jobnet.
def transmit(signal)
  self.phase_key = :ready
  self.started_at = signal.event.occurred_at
  properties = {
    :target_jobnet_id => parent.id,
    :target_jobnet_name_path => parent.name_path,
    :target_job_id => self.id,
    :target_job_name_path => self.name_path,
  }
  signal.fire(self, :"start.job.job.tengine", properties)
end
available(:transmit,
  :on => :initialized,
  :ignored => [:ready, :starting, :running, :dying, :success, :error, :stuck])
336
+
# Handled by: job control driver.
#
# :initialized - special rule "stop right before starting": when a job that
#                was put back to :initialized receives the event that
#                transmit fired on entering :ready, its phase does not
#                change, but the following edge is transmitted. (If that
#                edge is closed, the transition goes to the End of the
#                jobnet.)
# :ready       - moves to :starting and queues the actual start, see below.
# :starting    - raises.
#
# The queued start completes the origin edge, acknowledges either the
# execution or the parent (see the retry rules in the code) and queues a
# second callback that runs the script over SSH. An SshJob::Error raised by
# run is logged and turned into a queued call to fail.
def activate(signal)
  case phase_key
  when :initialized
    next_edges.first.transmit(signal)
  when :ready
    self.phase_key = :starting
    self.started_at = signal.event.occurred_at

    signal.call_later do
      complete_origin_edge(signal)
      execution = signal.execution
      if !execution.retry
        parent.ack(signal) # not a re-run
      elsif execution.target_actual_ids.include?(self.id.to_s)
        execution.ack(signal)
      elsif execution.target_actuals.map{|t| t.parent.id.to_s if t.parent }.include?(self.parent.id.to_s)
        # No ack when this job shares its parent with a job/jobnet listed in
        # Tengine::Job::Runtime::Execution#target_actual_ids.
      else
        parent.ack(signal)
      end
      # This callback is queued so that job_control_driver calls it again
      # from outside update_with_lock.
      signal.call_later do
        # Actually run the script over SSH.
        running_execution = signal.execution
        running_execution.signal = signal # workaround so that ack can be called back
        begin
          run(running_execution)
        rescue Tengine::Job::Runtime::SshJob::Error => e
          Tengine.logger.warn("error on run\nerror: #{e.inspect}\njob: #{self.inspect}\nexecution: #{running_execution.inspect}")
          signal.call_later do
            self.fail(signal, :message => e.message)
          end
        end
      end
    end
  when :starting
    raise "something wrong! #{self.inspect}"
  end
end
available(:activate,
  :on => [:initialized, :ready, :starting],
  :ignored => [:running, :dying, :success, :error, :stuck])
386
+
# Handled by: job control driver.
# Called once the PID of the script's process is known: stores
# signal.data[:executing_pid] (nil when there is no data) and moves the job
# to :running.
def ack(signal)
  payload = signal.data || {}
  self.executing_pid = payload[:executing_pid]
  self.phase_key = :running
end
available(:ack,
  :on => :starting,
  :ignored => [:running, :dying, :success, :error, :stuck])
395
+
# Records the exit status and finish time carried by the event, then calls
# succeed when the exit status is "0" and fail otherwise.
def finish(signal)
  self.exit_status = signal.event[:exit_status]
  self.finished_at = signal.event.occurred_at
  if self.exit_status.to_s == '0'
    succeed(signal)
  else
    fail(signal)
  end
end
403
+
# Handled by: job control driver.
# Marks the job as :success, records the event time as finished_at and fires
# success.job.job.tengine with the exit status.
def succeed(signal)
  self.phase_key = :success
  self.finished_at = signal.event.occurred_at
  properties = {
    :exit_status => self.exit_status,
    :target_jobnet_id => parent.id,
    :target_jobnet_name_path => parent.name_path,
    :target_job_id => self.id,
    :target_job_name_path => self.name_path,
  }
  signal.fire(self, :"success.job.job.tengine", properties)
end
available(:succeed, :on => [:starting, :running, :dying, :stuck], :ignored => [:success])
417
+
# Handled by: job control driver.
# Marks the job as :error, records the message carried by the event and the
# one given in options (if any), sets finished_at and fires
# error.job.job.tengine. All entries of options are merged into the event
# properties.
#
# options - optional Hash; :message is additionally recorded as an error
#           message.
def fail(signal, options = nil)
  self.phase_key = :error

  event_message = signal.event[:message]
  add_error_message(event_message) if event_message

  option_message = options && options[:message]
  add_error_message(option_message) if option_message

  self.finished_at = signal.event.occurred_at
  properties = {
    :exit_status => self.exit_status,
    :target_jobnet_id => parent.id,
    :target_jobnet_name_path => parent.name_path,
    :target_job_id => self.id,
    :target_job_name_path => self.name_path,
  }
  properties.update(options) if options
  signal.fire(self, :"error.job.job.tengine", properties)
end
available(:fail, :on => [:starting, :running, :dying], :ignored => [:error, :stuck])
439
+
# Fires stop.job.job.tengine for this job, passing on the stop reason of the
# event being handled.
def fire_stop(signal)
  properties = {
    :stop_reason => signal.event[:stop_reason],
    :target_jobnet_id => parent.id,
    :target_jobnet_name_path => parent.name_path,
    :target_job_id => self.id,
    :target_job_name_path => self.name_path,
  }
  signal.fire(self, :"stop.job.job.tengine", properties)
end
available(:fire_stop,
  :on => [:ready, :starting, :running],
  :ignored => [:initialized, :dying, :success, :error, :stuck])
450
+
# Stops the job according to its current phase.
#
# :ready    - goes back to :initialized, records stop time and reason, and
#             transmits the first following edge.
# :starting - polls the stored job every 0.1 seconds until it is no longer
#             :starting, then calls stop on that freshly loaded object. The
#             given block is called on every poll (it exists for tests).
#             NOTE(review): the poll has no timeout — confirm that the phase
#             is guaranteed to change.
# :running  - moves to :dying, records stop time and reason, and queues a
#             callback that kills the process.
def stop(signal, &block)
  case phase_key
  when :ready
    self.phase_key = :initialized
    self.stopped_at = signal.event.occurred_at
    self.stop_reason = signal.event[:stop_reason]
    next_edges.first.transmit(signal)
  when :starting
    latest = nil
    loop do
      # root = self.root.reload # class.find(self.root.id)
      # job = root.find_descendant(self.id)
      latest = self.class.find(self.id)
      break if latest.phase_key != :starting
      yield if block_given? # yielded for the sake of tests
      sleep(0.1)
    end
    latest.stop(signal, &block)
  when :running
    self.phase_key = :dying
    self.stopped_at = signal.event.occurred_at
    self.stop_reason = signal.event[:stop_reason]
    signal.call_later do
      kill(signal.execution)
    end
  end
end
available(:stop,
  :on => [:ready, :starting, :running],
  :ignored => [:initialized, :dying, :success, :error, :stuck])
479
+
# Puts the job back to :initialized and schedules a reset of everything
# that follows it (reset_followings). The block argument is not used.
def reset(signal, &block)
  self.phase_key = :initialized
  reset_followings(signal)
end
available(:reset, :on => [:initialized, :ready, :success, :error, :stuck])
485
+
486
+ end