arvados-cli 0.1.20150530020106 → 0.1.20150603135055
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/crunch-job +55 -31
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b2fd398e685fe06211ac7885274e8e8de2d99f4
|
4
|
+
data.tar.gz: 5b5b5b764f9a1853443fde6dc8957f94dfe4ebad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9accb2d87fe4993a89c4cd9ddb4790a605d165185fd17d908ad9e2b3bf6e45fafddb5432d6caa9e7dcf45aeff832683e3ed79dfd0e2b09805624b1e13688a301
|
7
|
+
data.tar.gz: 5c194163fa247bdee9531d7403db995af6d21aba1d1b73961e29204da3c89a5f9574e1a229feee1ce0cef8f97e24140ccff87654f5d0664b3cbe514d118dd435
|
data/bin/crunch-job
CHANGED
@@ -139,7 +139,7 @@ if (defined $job_api_token) {
|
|
139
139
|
$ENV{ARVADOS_API_TOKEN} = $job_api_token;
|
140
140
|
}
|
141
141
|
|
142
|
-
my $have_slurm = exists $ENV{
|
142
|
+
my $have_slurm = exists $ENV{SLURM_JOB_ID} && exists $ENV{SLURM_NODELIST};
|
143
143
|
|
144
144
|
|
145
145
|
$SIG{'USR1'} = sub
|
@@ -342,8 +342,7 @@ my @jobstep_todo = ();
|
|
342
342
|
my @jobstep_done = ();
|
343
343
|
my @jobstep_tomerge = ();
|
344
344
|
my $jobstep_tomerge_level = 0;
|
345
|
-
my $squeue_checked;
|
346
|
-
my $squeue_kill_checked;
|
345
|
+
my $squeue_checked = 0;
|
347
346
|
my $latest_refresh = scalar time;
|
348
347
|
|
349
348
|
|
@@ -1254,29 +1253,45 @@ sub check_refresh_wanted
|
|
1254
1253
|
|
1255
1254
|
sub check_squeue
|
1256
1255
|
{
|
1257
|
-
|
1258
|
-
if (defined $squeue_kill_checked && $squeue_kill_checked > time - 4)
|
1259
|
-
{
|
1260
|
-
return;
|
1261
|
-
}
|
1262
|
-
$squeue_kill_checked = time;
|
1256
|
+
my $last_squeue_check = $squeue_checked;
|
1263
1257
|
|
1264
|
-
#
|
1265
|
-
|
1258
|
+
# Do not call `squeue` or check the kill list more than once every
|
1259
|
+
# 15 seconds.
|
1260
|
+
return if $last_squeue_check > time - 15;
|
1261
|
+
$squeue_checked = time;
|
1262
|
+
|
1263
|
+
# Look for children from which we haven't received stderr data since
|
1264
|
+
# the last squeue check. If no such children exist, all procs are
|
1265
|
+
# alive and there's no need to even look at squeue.
|
1266
|
+
#
|
1267
|
+
# As long as the crunchstat poll interval (10s) is shorter than the
|
1268
|
+
# squeue check interval (15s) this should make the squeue check an
|
1269
|
+
# infrequent event.
|
1270
|
+
my $silent_procs = 0;
|
1271
|
+
for my $jobstep (values %proc)
|
1266
1272
|
{
|
1267
|
-
if (
|
1268
|
-
&& $proc{$_}->{killtime} <= time)
|
1273
|
+
if ($jobstep->{stderr_at} < $last_squeue_check)
|
1269
1274
|
{
|
1270
|
-
|
1275
|
+
$silent_procs++;
|
1271
1276
|
}
|
1272
1277
|
}
|
1278
|
+
return if $silent_procs == 0;
|
1273
1279
|
|
1274
|
-
#
|
1275
|
-
|
1280
|
+
# use killem() on procs whose killtime is reached
|
1281
|
+
while (my ($pid, $jobstep) = each %proc)
|
1276
1282
|
{
|
1277
|
-
|
1283
|
+
if (exists $jobstep->{killtime}
|
1284
|
+
&& $jobstep->{killtime} <= time
|
1285
|
+
&& $jobstep->{stderr_at} < $last_squeue_check)
|
1286
|
+
{
|
1287
|
+
my $sincewhen = "";
|
1288
|
+
if ($jobstep->{stderr_at}) {
|
1289
|
+
$sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s";
|
1290
|
+
}
|
1291
|
+
Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
|
1292
|
+
killem ($pid);
|
1293
|
+
}
|
1278
1294
|
}
|
1279
|
-
$squeue_checked = time;
|
1280
1295
|
|
1281
1296
|
if (!$have_slurm)
|
1282
1297
|
{
|
@@ -1285,13 +1300,13 @@ sub check_squeue
|
|
1285
1300
|
}
|
1286
1301
|
|
1287
1302
|
# get a list of steps still running
|
1288
|
-
my @squeue = `squeue
|
1289
|
-
|
1290
|
-
if ($squeue[-1] ne "ok")
|
1303
|
+
my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOB_ID}\E --steps --format='%i %j' --noheader`;
|
1304
|
+
if ($? != 0)
|
1291
1305
|
{
|
1306
|
+
Log(undef, "warning: squeue exit status $? ($!)");
|
1292
1307
|
return;
|
1293
1308
|
}
|
1294
|
-
|
1309
|
+
chop @squeue;
|
1295
1310
|
|
1296
1311
|
# which of my jobsteps are running, according to squeue?
|
1297
1312
|
my %ok;
|
@@ -1299,22 +1314,30 @@ sub check_squeue
|
|
1299
1314
|
{
|
1300
1315
|
if (/^(\d+)\.(\d+) (\S+)/)
|
1301
1316
|
{
|
1302
|
-
if ($1 eq $ENV{
|
1317
|
+
if ($1 eq $ENV{SLURM_JOB_ID})
|
1303
1318
|
{
|
1304
1319
|
$ok{$3} = 1;
|
1305
1320
|
}
|
1306
1321
|
}
|
1307
1322
|
}
|
1308
1323
|
|
1309
|
-
#
|
1310
|
-
|
1324
|
+
# Check for child procs >60s old and not mentioned by squeue.
|
1325
|
+
while (my ($pid, $jobstep) = each %proc)
|
1311
1326
|
{
|
1312
|
-
if ($
|
1313
|
-
|
1314
|
-
|
1327
|
+
if ($jobstep->{time} < time - 60
|
1328
|
+
&& $jobstep->{jobstepname}
|
1329
|
+
&& !exists $ok{$jobstep->{jobstepname}}
|
1330
|
+
&& !exists $jobstep->{killtime})
|
1315
1331
|
{
|
1316
|
-
#
|
1317
|
-
|
1332
|
+
# According to slurm, this task has ended (successfully or not)
|
1333
|
+
# -- but our srun child hasn't exited. First we must wait (30
|
1334
|
+
# seconds) in case this is just a race between communication
|
1335
|
+
# channels. Then, if our srun child process still hasn't
|
1336
|
+
# terminated, we'll conclude some slurm communication
|
1337
|
+
# error/delay has caused the task to die without notifying srun,
|
1338
|
+
# and we'll kill srun ourselves.
|
1339
|
+
$jobstep->{killtime} = time + 30;
|
1340
|
+
Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
|
1318
1341
|
}
|
1319
1342
|
}
|
1320
1343
|
}
|
@@ -1325,7 +1348,7 @@ sub release_allocation
|
|
1325
1348
|
if ($have_slurm)
|
1326
1349
|
{
|
1327
1350
|
Log (undef, "release job allocation");
|
1328
|
-
system "scancel $ENV{
|
1351
|
+
system "scancel $ENV{SLURM_JOB_ID}";
|
1329
1352
|
}
|
1330
1353
|
}
|
1331
1354
|
|
@@ -1339,6 +1362,7 @@ sub readfrompipes
|
|
1339
1362
|
while (0 < sysread ($reader{$job}, $buf, 8192))
|
1340
1363
|
{
|
1341
1364
|
print STDERR $buf if $ENV{CRUNCH_DEBUG};
|
1365
|
+
$jobstep[$job]->{stderr_at} = time;
|
1342
1366
|
$jobstep[$job]->{stderr} .= $buf;
|
1343
1367
|
preprocess_stderr ($job);
|
1344
1368
|
if (length ($jobstep[$job]->{stderr}) > 16384)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arvados-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.20150603135055
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arvados Authors
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: arvados
|
@@ -178,7 +178,7 @@ dependencies:
|
|
178
178
|
- - "<"
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: 1.0.0
|
181
|
-
description: Arvados command line tools, git commit
|
181
|
+
description: Arvados command line tools, git commit c4c8977ef25cc6805f2cca1dedfc83faecc0bc23
|
182
182
|
email: gem-dev@curoverse.com
|
183
183
|
executables:
|
184
184
|
- arv
|