tengine_job 0.6.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (133) hide show
  1. data/.document +5 -0
  2. data/.rspec +1 -0
  3. data/Gemfile +23 -0
  4. data/Gemfile.lock +109 -0
  5. data/README.rdoc +20 -0
  6. data/Rakefile +42 -0
  7. data/VERSION +1 -0
  8. data/examples/0004_retry_one_layer.rb +24 -0
  9. data/examples/0004_retry_one_layer.sh +38 -0
  10. data/examples/0005_retry_two_layer.rb +54 -0
  11. data/examples/0005_retry_two_layer.sh +80 -0
  12. data/examples/0006_retry_three_layer.rb +58 -0
  13. data/examples/0006_retry_three_layer.sh +74 -0
  14. data/examples/0007_simple_jobnet.rb +7 -0
  15. data/examples/0021_dynamic_env.rb +20 -0
  16. data/examples/VERSION +1 -0
  17. data/examples/tengine_job_test.sh +10 -0
  18. data/lib/tengine/job.rb +94 -0
  19. data/lib/tengine/job/category.rb +54 -0
  20. data/lib/tengine/job/connectable.rb +43 -0
  21. data/lib/tengine/job/drivers/job_control_driver.rb +82 -0
  22. data/lib/tengine/job/drivers/job_execution_driver.rb +30 -0
  23. data/lib/tengine/job/drivers/jobnet_control_driver.rb +117 -0
  24. data/lib/tengine/job/drivers/schedule_driver.rb +30 -0
  25. data/lib/tengine/job/dsl_binder.rb +12 -0
  26. data/lib/tengine/job/dsl_evaluator.rb +18 -0
  27. data/lib/tengine/job/dsl_loader.rb +180 -0
  28. data/lib/tengine/job/edge.rb +150 -0
  29. data/lib/tengine/job/element_selector_notation.rb +169 -0
  30. data/lib/tengine/job/end.rb +32 -0
  31. data/lib/tengine/job/executable.rb +74 -0
  32. data/lib/tengine/job/execution.rb +141 -0
  33. data/lib/tengine/job/expansion.rb +37 -0
  34. data/lib/tengine/job/fork.rb +6 -0
  35. data/lib/tengine/job/job.rb +23 -0
  36. data/lib/tengine/job/jobnet.rb +173 -0
  37. data/lib/tengine/job/jobnet/builder.rb +150 -0
  38. data/lib/tengine/job/jobnet/job_state_transition.rb +167 -0
  39. data/lib/tengine/job/jobnet/jobnet_state_transition.rb +110 -0
  40. data/lib/tengine/job/jobnet/state_transition.rb +37 -0
  41. data/lib/tengine/job/jobnet_actual.rb +55 -0
  42. data/lib/tengine/job/jobnet_template.rb +10 -0
  43. data/lib/tengine/job/join.rb +6 -0
  44. data/lib/tengine/job/junction.rb +29 -0
  45. data/lib/tengine/job/killing.rb +30 -0
  46. data/lib/tengine/job/mm_compatibility.rb +6 -0
  47. data/lib/tengine/job/mm_compatibility/connectable.rb +13 -0
  48. data/lib/tengine/job/name_path.rb +31 -0
  49. data/lib/tengine/job/root.rb +16 -0
  50. data/lib/tengine/job/root_jobnet_actual.rb +39 -0
  51. data/lib/tengine/job/root_jobnet_template.rb +49 -0
  52. data/lib/tengine/job/script_executable.rb +235 -0
  53. data/lib/tengine/job/signal.rb +121 -0
  54. data/lib/tengine/job/start.rb +20 -0
  55. data/lib/tengine/job/stoppable.rb +15 -0
  56. data/lib/tengine/job/vertex.rb +172 -0
  57. data/lib/tengine_job.rb +3 -0
  58. data/spec/fixtures/rjn_0001_simple_jobnet_builder.rb +42 -0
  59. data/spec/fixtures/rjn_0002_simple_parallel_jobnet_builder.rb +42 -0
  60. data/spec/fixtures/rjn_0003_fork_join_jobnet_builder.rb +61 -0
  61. data/spec/fixtures/rjn_0004_parallel_jobnet_with_finally_fixture.rb +62 -0
  62. data/spec/fixtures/rjn_0005_retry_two_layer_fixture.rb +153 -0
  63. data/spec/fixtures/rjn_0008_expansion_fixture.rb +32 -0
  64. data/spec/fixtures/rjn_0009_tree_sequential_jobnet_builder.rb +174 -0
  65. data/spec/fixtures/rjn_0010_2jobs_and_1job_parallel_jobnet_builder.rb +39 -0
  66. data/spec/fixtures/rjn_0011_nested_fork_jobnet_builder.rb +96 -0
  67. data/spec/fixtures/rjn_0012_nested_and_finally_builder.rb +157 -0
  68. data/spec/fixtures/rjn_1004_hadoop_job_in_jobnet_fixture.rb +105 -0
  69. data/spec/fixtures/rjn_means_root_jobnet +0 -0
  70. data/spec/fixtures/test_credential_fixture.rb +12 -0
  71. data/spec/fixtures/test_server_fixture.rb +28 -0
  72. data/spec/mongoid.yml +35 -0
  73. data/spec/spec_helper.rb +56 -0
  74. data/spec/sshd/.gitignore +1 -0
  75. data/spec/sshd/id_rsa +51 -0
  76. data/spec/sshd/id_rsa.pub +1 -0
  77. data/spec/sshd/ssh_host_rsa_key +51 -0
  78. data/spec/sshd/ssh_host_rsa_key.pub +1 -0
  79. data/spec/sshd/sshd_config +10 -0
  80. data/spec/sshd/sshd_config.erb +11 -0
  81. data/spec/sshd/tengine_job_test.sh +6 -0
  82. data/spec/support/jobnet_fixture_builder.rb +145 -0
  83. data/spec/support/mongo_index_key_log.rb +91 -0
  84. data/spec/tengine/job/category_spec.rb +193 -0
  85. data/spec/tengine/job/connectable_spec.rb +94 -0
  86. data/spec/tengine/job/drivers/job_controll_driver/connection_error_spec.rb +236 -0
  87. data/spec/tengine/job/drivers/job_controll_driver/duplicated_job_start_spec.rb +302 -0
  88. data/spec/tengine/job/drivers/job_controll_driver/expansion_spec.rb +120 -0
  89. data/spec/tengine/job/drivers/job_controll_driver/stop_spec.rb +159 -0
  90. data/spec/tengine/job/drivers/job_controll_driver_spec.rb +623 -0
  91. data/spec/tengine/job/drivers/job_execution_driver_spec.rb +88 -0
  92. data/spec/tengine/job/drivers/jobnet_control_driver/nested_and_finally_spec.rb +472 -0
  93. data/spec/tengine/job/drivers/jobnet_control_driver/nested_jobnet_spec.rb +231 -0
  94. data/spec/tengine/job/drivers/jobnet_control_driver/stop_jobnet_spec.rb +202 -0
  95. data/spec/tengine/job/drivers/jobnet_control_driver_spec.rb +446 -0
  96. data/spec/tengine/job/drivers/schedule_driver_spec.rb +202 -0
  97. data/spec/tengine/job/dsl_binder_spec.rb +36 -0
  98. data/spec/tengine/job/dsl_loader_spec.rb +403 -0
  99. data/spec/tengine/job/dsls/0013_hadoop_job_run.rb +29 -0
  100. data/spec/tengine/job/dsls/0014_join_and_join.rb +19 -0
  101. data/spec/tengine/job/dsls/0015_fork_and_fork.rb +18 -0
  102. data/spec/tengine/job/dsls/0016_complex_fork_and_join.rb +20 -0
  103. data/spec/tengine/job/dsls/0017_finally.rb +15 -0
  104. data/spec/tengine/job/dsls/0018_expansion.rb +23 -0
  105. data/spec/tengine/job/dsls/0019_execute_job_on_event.rb +16 -0
  106. data/spec/tengine/job/dsls/0020_duplicated_jobnet_name.rb +16 -0
  107. data/spec/tengine/job/dsls/1060_test_dir1/1060_test_dir2/0013_hadoop_job_run.rb +29 -0
  108. data/spec/tengine/job/dsls/2003_expansion/expansion_5.rb +11 -0
  109. data/spec/tengine/job/dsls/VERSION +1 -0
  110. data/spec/tengine/job/dynamic_env_spec.rb +95 -0
  111. data/spec/tengine/job/edge_spec.rb +241 -0
  112. data/spec/tengine/job/element_selector_notation_spec.rb +354 -0
  113. data/spec/tengine/job/examples_spec.rb +62 -0
  114. data/spec/tengine/job/execution_spec.rb +100 -0
  115. data/spec/tengine/job/expansion_spec.rb +116 -0
  116. data/spec/tengine/job/hadoop_job_run_spec.rb +65 -0
  117. data/spec/tengine/job/job_spec.rb +4 -0
  118. data/spec/tengine/job/jobnet/1015_complecated_jobnet_spec.rb +72 -0
  119. data/spec/tengine/job/jobnet_actual_spec.rb +175 -0
  120. data/spec/tengine/job/jobnet_spec.rb +399 -0
  121. data/spec/tengine/job/jobnet_template_spec.rb +240 -0
  122. data/spec/tengine/job/killing_spec.rb +91 -0
  123. data/spec/tengine/job/reset_spec.rb +958 -0
  124. data/spec/tengine/job/reset_spec/4056_1_dump.txt +1 -0
  125. data/spec/tengine/job/root_jobnet_actual_spec.rb +89 -0
  126. data/spec/tengine/job/root_jobnet_template_spec.rb +248 -0
  127. data/spec/tengine/job/script_executable_spec.rb +132 -0
  128. data/spec/tengine/job/stoppable_spec.rb +176 -0
  129. data/spec/tengine/job/vertex_spec.rb +25 -0
  130. data/spec/tengine_job_spec.rb +4 -0
  131. data/tengine_job.gemspec +197 -0
  132. data/tmp/log/.gitignore +1 -0
  133. metadata +296 -0
@@ -0,0 +1,94 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+
4
+ describe Tengine::Job::Connectable do
5
+
6
# Verifies actual_credential_name / actual_server_name for every element of
# the jobnet built by Rjn0009TreeSequentialJobnetBuilder, for both the actual
# and the template variants. (The description previously named
# Rjn0001SimpleJobnetBuilder although that builder is not used here.)
context "Rjn0009TreeSequentialJobnetBuilderを使う場合" do
  [:actual, :template].each do |jobnet_type|
    context "#{jobnet_type}の場合" do

      # Build the jobnet once per variant; the examples below only read from it.
      before(:all) do
        builder = Rjn0009TreeSequentialJobnetBuilder.new
        builder.send(:"create_#{jobnet_type}")
        @ctx = builder.context
      end

      # job name => [expected actual_credential_name, expected actual_server_name]
      {
        "rjn0009" => [nil, nil],
        "j1100" => ["test_credential1", "test_server1"],
        "j1110" => ["test_credential1", "test_server1"],
        "j1120" => ["test_credential1", "test_server1"],
        "j1200" => ["test_credential1", nil],
        "j1210" => ["test_credential1", "mysql_master"],
        "j1300" => [nil, "mysql_master"],
        "j1310" => ["test_credential1", "mysql_master"],
        "j1400" => [nil, nil],
        "j1410" => ["test_credential1", "mysql_master"],
        "j1500" => ["test_credential1", "mysql_master"],
        "j1510" => ["test_credential1", "mysql_master"],
        "j1511" => ["test_credential1", "mysql_master"],
        "j1600" => ["test_credential1", "mysql_master"],
        "j1610" => ["test_credential1", "mysql_master"],
        "j1611" => ["test_credential1", "test_server1"],
        "j1612" => ["gohan_ssh_pk", "mysql_master"],
        "j1620" => ["test_credential1", "test_server1"],
        "j1621" => ["test_credential1", "test_server1"],
        "j1630" => ["gohan_ssh_pk", "mysql_master"],
        "j1631" => ["gohan_ssh_pk", "mysql_master"],
      }.each do |job_name, (credential_name, server_name)|
        context job_name do
          subject{ @ctx[job_name.to_sym] }
          its(:actual_credential_name){ should == credential_name }
          its(:actual_server_name){ should == server_name }
        end
      end

    end
  end

end
50
+
51
# Looks up the credential named by credential_name through #actual_credential.
describe :actual_credential do
  # NOTE(review): the fixture call is goku_ssh_pw while the examples look up
  # "test_credential1"; whether this fixture creates that record is not
  # visible from this file - confirm.
  before { GokuAtEc2ApNortheast.new.goku_ssh_pw }

  it "存在するCredentialの場合" do
    template = Tengine::Job::JobnetTemplate.new(:credential_name => "test_credential1")
    found = template.actual_credential
    found.should be_a(Tengine::Resource::Credential)
    found.name.should == "test_credential1"
  end

  it "存在しないCredentialの場合" do
    template = Tengine::Job::JobnetTemplate.new(:credential_name => "unexist_credential")
    expect { template.actual_credential }.to raise_error(Mongoid::Errors::DocumentNotFound)
  end
end
71
+
72
+
73
# Looks up the server named by server_name through #actual_server.
describe :actual_server do
  # NOTE(review): the fixture call is hadoop_master_node while the examples
  # look up "test_server1"; whether this fixture creates that record is not
  # visible from this file - confirm.
  before { GokuAtEc2ApNortheast.new.hadoop_master_node }

  it "存在するServerの場合" do
    template = Tengine::Job::JobnetTemplate.new(:server_name => "test_server1")
    found = template.actual_server
    found.should be_a(Tengine::Resource::Server)
    found.name.should == "test_server1"
  end

  it "存在しないServerの場合" do
    template = Tengine::Job::JobnetTemplate.new(:server_name => "unexist_server")
    expect { template.actual_server }.to raise_error(Mongoid::Errors::DocumentNotFound)
  end
end
93
+
94
+ end
@@ -0,0 +1,236 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+ require 'tengine/rspec'
4
+
5
+ describe 'connection error' do
6
+ include Tengine::RSpec::Extension
7
+
8
+ target_dsl File.expand_path("../../../../../lib/tengine/job/drivers/job_control_driver.rb", File.dirname(__FILE__))
9
+ driver :job_control_driver
10
+
11
+ let :ssh_dir do
12
+ File.expand_path("../../../../../sshd", __FILE__)
13
+ end
14
+
15
+ before :all do
16
+ raise "WRONG" if $_pid
17
+
18
+ uid = Etc.getlogin
19
+ case uid
20
+ when "root"
21
+ pending "rootは危険なのでこのテストを実行できません"
22
+ when NilClass
23
+ raise "who am i?"
24
+ end
25
+
26
+ # 1. sshdをさがす
27
+ sshd = nil
28
+ ENV["PATH"].split(/:/).find do |dir|
29
+ Dir.glob("#{dir}/sshd") do |path|
30
+ if File.executable?(path)
31
+ sshd = path
32
+ break
33
+ end
34
+ end
35
+ end
36
+
37
+ raise "sshd not found" unless sshd
38
+
39
+ # 2. sshd_configの生成
40
+ template = File.expand_path("sshd_config.erb", ssh_dir)
41
+ hostkey = File.expand_path("ssh_host_rsa_key", ssh_dir)
42
+ clientkey = File.expand_path("id_rsa", ssh_dir)
43
+ File.chmod(0400, hostkey, clientkey)
44
+ File.chmod(0700, ssh_dir)
45
+ $_port = nil
46
+
47
+ # 指定したポートはもう使われているかもしれないので、その際は
48
+ # sshdが起動に失敗するので、何回かポートを変えて試す。
49
+ catch(:return) do
50
+ n = 0
51
+ @port = rand(32768)
52
+ begin
53
+ Tempfile.open("sshd_config", ssh_dir) do |conf|
54
+ File.open(template, "rb") do |tmpl|
55
+ conf.write ERB.new(tmpl.read).result(binding)
56
+ end
57
+ conf.flush
58
+ conf.close(false) # no unlink
59
+ argv = [sshd, "-Def", conf.path, "-h", hostkey]
60
+ $_pid = Process.spawn(*argv)
61
+ x = Time.now
62
+ while Time.now < x + 16.0 do # まあこんくらい待てばいいでしょ
63
+ sleep 0.1
64
+ Process.waitpid2($_pid, Process::WNOHANG)
65
+ Process.kill 0, $_pid
66
+ # netstat -an は Linux / BSD ともに有効
67
+ # どちらかに限ればもう少し効率的な探し方はある。たとえば Linux 限定でよければ netstat -lnt ...
68
+ y = `netstat -an | fgrep LISTEN | fgrep #{@port}`
69
+ if y.lines.to_a.size > 1
70
+ $_port = @port
71
+ throw :return
72
+ end
73
+ end
74
+ pending "failed to invoke sshd in 16 secs."
75
+ end
76
+ rescue Errno::ECHILD, Errno::ESRCH
77
+ if (n += 1) > 10
78
+ pending "10 attempts to invoke sshd failed."
79
+ else
80
+ @port = rand(32768)
81
+ retry
82
+ end
83
+ end
84
+ end
85
+ end
86
+
87
+ after :all do
88
+ if $_pid
89
+ begin
90
+ Process.kill "INT", $_pid
91
+ Process.waitpid $_pid
92
+ rescue Errno::ECHILD
93
+ end
94
+ end
95
+ end
96
+
97
+ # in [rjn0001]
98
+ # (S1) --e1-->(j11)--e2-->(j12)--e3-->(E1)
99
+ #
100
+ context "rjn0001" do
101
+ before do
102
+ Tengine::Job::Vertex.delete_all
103
+ builder = Rjn0001SimpleJobnetBuilder.new
104
+ @root = builder.create_actual
105
+ @ctx = builder.context
106
+ @execution = Tengine::Job::Execution.create!({
107
+ :root_jobnet_id => @root.id,
108
+ })
109
+ @base_props = {
110
+ :execution_id => @execution.id.to_s,
111
+ :root_jobnet_id => @root.id.to_s,
112
+ :target_jobnet_id => @root.id.to_s,
113
+ }
114
+ Tengine::Resource::Server.find_by_name("test_server1").update_attributes :properties => { :ssh_port => $_port }
115
+ end
116
+
117
+ after do
118
+ # 中身を書き換えてしまうので他のテストに影響しないように削除します
119
+ Tengine::Resource::Credential.delete_all
120
+ Tengine::Resource::Server.delete_all
121
+ end
122
+
123
+ context "credential not found" do
124
+ it "対象のジョブはerrorになりエラーイベントが発火される" do
125
+ Tengine::Resource::Credential.delete_all
126
+ @root.phase_key = :starting
127
+ @ctx.edge(:e1).phase_key = :transmitting
128
+ @ctx.vertex(:j11).phase_key = :ready
129
+ @root.save!
130
+ @root.reload
131
+ tengine.should_fire(:"error.job.job.tengine", an_instance_of(Hash))
132
+ tengine.receive("start.job.job.tengine", :properties => {
133
+ :execution_id => @execution.id.to_s,
134
+ :root_jobnet_id => @root.id.to_s,
135
+ :root_jobnet_name_path => @root.name_path,
136
+ :target_jobnet_id => @root.id.to_s,
137
+ :target_jobnet_name_path => @root.name_path,
138
+ :target_job_id => @ctx.vertex(:j11).id.to_s,
139
+ :target_job_name_path => @ctx.vertex(:j11).name_path,
140
+ })
141
+ @root.reload
142
+ @ctx.edge(:e1).phase_key.should == :transmitted
143
+ @ctx.edge(:e2).phase_key.should == :active
144
+ @ctx.vertex(:j11).phase_key.should == :error
145
+ end
146
+ end
147
+
148
+
149
+ context "wrong credential" do
150
+ it "対象のジョブはerrorになりエラーイベントが発火される" do
151
+ credential = Tengine::Resource::Credential.find_by_name("test_credential1")
152
+ hash = credential.auth_values.dup
153
+ hash['username'] = "piccolo"
154
+ credential.auth_values = hash
155
+ credential.save!
156
+ @root.phase_key = :starting
157
+ @ctx.edge(:e1).phase_key = :transmitting
158
+ @ctx.vertex(:j11).phase_key = :ready
159
+ @root.save!
160
+ @root.reload
161
+ tengine.should_fire(:"error.job.job.tengine", an_instance_of(Hash))
162
+ tengine.receive("start.job.job.tengine", :properties => {
163
+ :execution_id => @execution.id.to_s,
164
+ :root_jobnet_id => @root.id.to_s,
165
+ :root_jobnet_name_path => @root.name_path,
166
+ :target_jobnet_id => @root.id.to_s,
167
+ :target_jobnet_name_path => @root.name_path,
168
+ :target_job_id => @ctx.vertex(:j11).id.to_s,
169
+ :target_job_name_path => @ctx.vertex(:j11).name_path,
170
+ })
171
+ @root.reload
172
+ @ctx.edge(:e1).phase_key.should == :transmitted
173
+ @ctx.edge(:e2).phase_key.should == :active
174
+ @ctx.vertex(:j11).phase_key.should == :error
175
+ end
176
+ end
177
+
178
+ context "server not found" do
179
+ it "対象のジョブはerrorになりエラーイベントが発火される" do
180
+ Tengine::Resource::Server.delete_all
181
+ @root.phase_key = :starting
182
+ @ctx.edge(:e1).phase_key = :transmitting
183
+ @ctx.vertex(:j11).phase_key = :ready
184
+ @root.save!
185
+ @root.reload
186
+ tengine.should_fire(:"error.job.job.tengine", an_instance_of(Hash))
187
+ tengine.receive("start.job.job.tengine", :properties => {
188
+ :execution_id => @execution.id.to_s,
189
+ :root_jobnet_id => @root.id.to_s,
190
+ :root_jobnet_name_path => @root.name_path,
191
+ :target_jobnet_id => @root.id.to_s,
192
+ :target_jobnet_name_path => @root.name_path,
193
+ :target_job_id => @ctx.vertex(:j11).id.to_s,
194
+ :target_job_name_path => @ctx.vertex(:j11).name_path,
195
+ })
196
+ @root.reload
197
+ @ctx.edge(:e1).phase_key.should == :transmitted
198
+ @ctx.edge(:e2).phase_key.should == :active
199
+ @ctx.vertex(:j11).phase_key.should == :error
200
+ end
201
+ end
202
+
203
+
204
+ context "wrong server IP" do
205
+ it "対象のジョブはerrorになりエラーイベントが発火される" do
206
+ server = Tengine::Resource::Server.find_by_name("test_server1")
207
+ server.addresses = {'private_ip_address' => "unexist_ip"}
208
+ server.save!
209
+ @root.phase_key = :starting
210
+ @ctx.edge(:e1).phase_key = :transmitting
211
+ @ctx.vertex(:j11).phase_key = :ready
212
+ @root.save!
213
+ @root.reload
214
+ tengine.should_fire(:"error.job.job.tengine", an_instance_of(Hash))
215
+ tengine.receive("start.job.job.tengine", :properties => {
216
+ :execution_id => @execution.id.to_s,
217
+ :root_jobnet_id => @root.id.to_s,
218
+ :root_jobnet_name_path => @root.name_path,
219
+ :target_jobnet_id => @root.id.to_s,
220
+ :target_jobnet_name_path => @root.name_path,
221
+ :target_job_id => @ctx.vertex(:j11).id.to_s,
222
+ :target_job_name_path => @ctx.vertex(:j11).name_path,
223
+ })
224
+ @root.reload
225
+ @ctx.edge(:e1).phase_key.should == :transmitted
226
+ @ctx.edge(:e2).phase_key.should == :active
227
+ @ctx.vertex(:j11).phase_key.should == :error
228
+ end
229
+ end
230
+
231
+
232
+ end
233
+
234
+ end
235
+
236
+
@@ -0,0 +1,302 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'spec_helper'
3
+ require 'tengine/rspec'
4
+
5
+ require 'net/ssh'
6
+
7
+
8
+ # 背景
9
+ # 以下の2つの条件が満たされる場合
10
+ # * 2つのtenginedプロセスが動いている
11
+ # * 並列で実行されるジョブを持つジョブネットが実行される(例えばrjn0002)
12
+ #
13
+ # 問題の詳細
14
+ # プロセス1がstart.job.job.tengineイベントによって起動したj11のプロセスのPIDを得る前に、
15
+ # プロセス2がstart.job.job.tengineイベントによってj12を起動することで、それらのルートジョブネットの
16
+ # versionが更新されてしまい、j11のPIDを得てルートジョブネットを更新する際にversionが
17
+ # 異なってしまっているため、update_with_lockメソッドによって実行に失敗したものと見なされて、
18
+ # 再度update_with_lockのブロックが実行されて、j11のプロセスが実行されてしまう。
19
+ #
20
+ # 本来どうあるべきか?
21
+ # update_with_lock内ではSSHなどの繰り返し実行することを想定していない処理や、
22
+ # イベントの送信を行ってはいけないので、それらの重複が起こらない仕組みになっているべき。
23
+ #
24
+ describe "<BUG>tengindのプロセスを二つ起動した際に並列ジョブがある際にジョブが2度実行される" do
25
+ include Tengine::RSpec::Extension
26
+
27
+ driver_path = File.expand_path("../../../../../lib/tengine/job/drivers/job_control_driver.rb", File.dirname(__FILE__))
28
+
29
+ # in [rjn0002]
30
+ # |--e2-->(j11)--e4-->|
31
+ # (S1)--e1-->[F1] [J1]--e6-->(E1)
32
+ # |--e3-->(j12)--e5-->|
33
+ context "rjn0002" do
34
# Rebuilds the rjn0002 actual jobnet (j12 moved to test_server2), creates an
# execution for it, and boots two independent kernels with the job control driver.
before do
  Tengine::Resource::Server.delete_all
  Tengine::Job::Execution.delete_all
  Tengine::Job::Vertex.delete_all
  TestCredentialFixture.test_credential1
  TestServerFixture.test_server1
  TestServerFixture.test_server2
  builder = Rjn0002SimpleParallelJobnetBuilder.new
  @root = builder.create_actual
  @root.element("j12").server_name = "test_server2"
  @root.save!

  @ctx = builder.context
  @execution = Tengine::Job::Execution.create!(:root_jobnet_id => @root.id)
  @base_props = {
    :execution_id => @execution.id.to_s,
    :root_jobnet_id => @root.id.to_s,
    :root_jobnet_name_path => @root.name_path,
    :target_jobnet_id => @root.id.to_s,
    :target_jobnet_name_path => @root.name_path,
  }

  # Instead of two processes, two kernels are used, each driven from its own Fiber.
  # Boots one kernel (bind + evaluate) and wraps it for use from the examples.
  boot_kernel = lambda do
    bootstrap = Tengine::Core::Bootstrap.new(:tengined => { :load_path => driver_path })
    kernel = bootstrap.kernel
    kernel.bind
    kernel.evaluate
    [bootstrap, Tengine::RSpec::ContextWrapper.new(bootstrap.kernel)]
  end
  @bootstrap1, @tengine1 = boot_kernel.call
  @bootstrap2, @tengine2 = boot_kernel.call
end
68
+
69
+ # tengine1が起動したプロセスのPIDを得る前にtengine2がプロセスを起動することはできません。
70
+ #
71
+ # job_control_driverでのstart.job.job.tengineの処理の概略以下の通りです
72
+ #
73
+ # start.job.job.tengine
74
+ # 1. be starting
75
+ # 2. root_jobnet.update_with_lock
76
+ # 3. execute job with SSH
77
+ # 4. be running
78
+ # 5. root_jobnet.update_with_lock
79
+ #
80
+ # パターン1 (ほぼ同時に1に突入する)
81
+ # ||f1 ||f2 ||DB |
82
+ # ||ver|step||ver|step||ver|
83
+ # ---------------------------------------------------------
84
+ # || 0 | 1 || - | - || 0| f1 starting
85
+ # || 0 | 1 || 0 | 1 || 0| f2 starting 1st
86
+ # || 1 | 2 || 0 | 1 || 1| f1 update_with_lock success
87
+ # || 1 | 2 || 0 | 2 || 1| f2 update_with_lock fail & retry
88
+ # || 1 | 2 || 1 | 1 || 1| f2 starting 2nd
89
+ # || 1 | 2 || 2 | 2 || 2| f2 update_with_lock success
90
+ # || 2 | 3 || 2 | 2 || 2| f1 refresh & SSH starting
91
+ # || 2 | 3 || 2 | 3 || 2| f2 refresh & SSH starting
92
+ # || 2 | 4 || 2 | 3 || 2| f1 running
93
+ # || 3 | 5 || 2 | 3 || 3| f1 update_with_lock success
94
+ # || 3 | 5 || 2 | 4 || 3| f2 running 1st
95
+ # || 3 | 5 || 2 | 5 || 3| f2 update_with_lock fail & retry
96
+ # || 3 | 5 || 3 | 4 || 3| f2 running 2nd
97
+ # || 3 | 5 || 4 | 5 || 4| f2 update_with_lock success
98
+
99
# Puts the jobnet into "F1 passed, j11/j12 ready" at version 0 and prepares one
# Fiber per kernel; each fiber delivers start.job.job.tengine for its job with
# Net::SSH replaced by mocks, and evaluates to :end once the event is handled.
before do
  @ctx[:e1].phase_key = :transmitted
  @ctx[:e2].phase_key = :transmitting
  @ctx[:e3].phase_key = :transmitting
  @ctx[:j11].phase_key = :ready
  @ctx[:j12].phase_key = :ready
  @root.phase_key = :starting
  @root.version = 0
  @root.save!

  @pid = Process.pid.to_s

  # Builds the fiber for kernel number `index` (1 or 2), expecting an SSH
  # connection to `host` and starting the job registered under `job_key`.
  job_start_fiber = lambda do |index, host, job_key|
    Fiber.new do
      ssh = mock(:"ssh#{index}")
      Net::SSH.should_receive(:start).with(host,
        an_instance_of(Tengine::Resource::Credential),
        an_instance_of(Hash)).once.and_yield(ssh)
      channel = mock(:"channel#{index}")
      ssh.stub(:open_channel).and_yield(channel)
      channel.stub(:exec).with(any_args).and_yield(channel, true)
      # Suspend the fiber here, before on_data gets called.
      channel.should_receive(:on_close) do
        Tengine.logger.debug( ("!" * 100) << "\non_close: Fiber.yield #{Process.pid} #{__FILE__}##{__LINE__}")
        Fiber.yield
      end
      channel.should_receive(:on_data).and_yield(channel, @pid)
      channel.stub(:on_extended_data)
      # Resolved when the fiber runs, like the direct @tengineN reference it replaces.
      wrapper = instance_variable_get(:"@tengine#{index}")
      wrapper.receive("start.job.job.tengine", :properties => {
        :target_job_id => @ctx.vertex(job_key).id.to_s,
        :target_job_name_path => @ctx.vertex(job_key).name_path,
      }.update(@base_props))
      :end
    end
  end

  @f1 = job_start_fiber.call(1, "localhost", :j11)
  @f2 = job_start_fiber.call(2, "192.168.1.2", :j12)

  @j11 = @root.element("j11")
  @j12 = @root.element("j12")

  @root.reload
  @root.version.should == 0
  Tengine::Job.test_harness_clear
end
160
+
161
# Walks both fibers through the interleaving of the "パターン1" table above.
# Each step sets the test_harness calls expected until the next suspension,
# resumes one fiber, then checks the persisted version and job phases.
it "パターン1" do
  # Reloads the root jobnet and checks its version and the phases of j11/j12.
  # A job expected to be :running must also have an executing_pid recorded.
  verify_state = lambda do |version, j11_phase, j12_phase|
    @root.reload
    @root.version.should == version
    [["j11", j11_phase], ["j12", j12_phase]].each do |name, phase|
      job = @root.element(name)
      job.phase_key.should == phase
      job.executing_pid.should_not be_nil if phase == :running
    end
  end

  # f1-1. f1 starting
  Tengine.logger.info("1" * 100)
  Tengine::Job.should_receive(:test_harness).with(1, "before yield in update_with_lock").once
  Tengine::Job.should_receive(:test_harness).with(2, "after yield in update_with_lock").once { Fiber.yield }
  @f1.resume.should_not == :end
  verify_state.call(0, :ready, :ready)

  # f2-1. 1st: f2 starting
  Tengine.logger.info("2" * 100)
  Tengine::Job.should_receive(:test_harness).with(3, "before yield in update_with_lock").once
  Tengine::Job.should_receive(:test_harness).with(4, "after yield in update_with_lock").once { Fiber.yield }
  @f2.resume.should_not == :end
  verify_state.call(0, :ready, :ready)

  # f1-2. f1 update_with_lock succeeds
  Tengine.logger.info("3" * 100)
  Tengine::Job.should_receive(:test_harness).with(5, "after update_with_lock").once { Fiber.yield }
  @f1.resume.should_not == :end
  verify_state.call(1, :starting, :ready)

  # f2-1. 2nd: f2 update_with_lock fails and its block is run again
  Tengine.logger.info("4" * 100)
  Tengine::Job.should_receive(:test_harness).with(6, "before yield in update_with_lock").once
  Tengine::Job.should_receive(:test_harness).with(7, "after yield in update_with_lock").once { Fiber.yield }
  @f2.resume.should_not == :end
  verify_state.call(1, :starting, :ready)

  # f2-2. f2 update_with_lock succeeds
  Tengine.logger.info("5" * 100)
  Tengine::Job.should_receive(:test_harness).with(8, "after update_with_lock").once { Fiber.yield }
  @f2.resume.should_not == :end
  verify_state.call(2, :starting, :starting)

  # f1-3. f1 refresh & SSH starting (no test_harness call is expected in this step)
  Tengine.logger.info("6" * 100)
  @f1.resume.should_not == :end
  verify_state.call(2, :starting, :starting)

  # f2-3. f2 refresh & SSH starting (no test_harness call is expected in this step)
  Tengine.logger.info("7" * 100)
  @f2.resume.should_not == :end
  verify_state.call(2, :starting, :starting)

  # f1-4. f1 running
  Tengine.logger.info("8" * 100)
  Tengine::Job.should_receive(:test_harness).with(9, "before yield in update_with_lock").once
  Tengine::Job.should_receive(:test_harness).with(10, "after yield in update_with_lock").once { Fiber.yield }
  @f1.resume.should_not == :end
  verify_state.call(2, :starting, :starting)

  # f1-5. f1 update_with_lock succeeds and f1 finishes
  Tengine.logger.info("9" * 100)
  Tengine::Job.should_receive(:test_harness).with(11, "after update_with_lock").once
  @f1.resume.should == :end
  verify_state.call(3, :running, :starting)

  # f2-4. 1st: f2 running
  Tengine.logger.info("a" * 100)
  Tengine::Job.should_receive(:test_harness).with(12, "before yield in update_with_lock").once
  Tengine::Job.should_receive(:test_harness).with(13, "after yield in update_with_lock").once { Fiber.yield }
  @f2.resume.should_not == :end
  verify_state.call(3, :running, :starting)

  # f2-5. 1st: f2 update_with_lock fails and its block is entered again
  Tengine.logger.info("b" * 100)
  Tengine::Job.should_receive(:test_harness).with(14, "before yield in update_with_lock").once { Fiber.yield }
  @f2.resume.should_not == :end
  verify_state.call(3, :running, :starting)

  # f2-4. 2nd: f2 running
  Tengine.logger.info("c" * 100)
  Tengine::Job.should_receive(:test_harness).with(15, "after yield in update_with_lock").once { Fiber.yield }
  @f2.resume.should_not == :end
  verify_state.call(3, :running, :starting)

  # f2-5. 2nd: f2 update_with_lock succeeds and f2 finishes
  Tengine.logger.info("d" * 100)
  Tengine::Job.should_receive(:test_harness).with(16, "after update_with_lock").once
  @f2.resume.should == :end
  verify_state.call(4, :running, :running)
end
300
+
301
+ end
302
+ end