arvados-cli 0.1.20150530020106 → 0.1.20150603135055
This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registries.
- checksums.yaml +4 -4
- data/bin/crunch-job +55 -31
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6b2fd398e685fe06211ac7885274e8e8de2d99f4
|
4
|
+
data.tar.gz: 5b5b5b764f9a1853443fde6dc8957f94dfe4ebad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9accb2d87fe4993a89c4cd9ddb4790a605d165185fd17d908ad9e2b3bf6e45fafddb5432d6caa9e7dcf45aeff832683e3ed79dfd0e2b09805624b1e13688a301
|
7
|
+
data.tar.gz: 5c194163fa247bdee9531d7403db995af6d21aba1d1b73961e29204da3c89a5f9574e1a229feee1ce0cef8f97e24140ccff87654f5d0664b3cbe514d118dd435
|
data/bin/crunch-job
CHANGED
@@ -139,7 +139,7 @@ if (defined $job_api_token) {
|
|
139
139
|
$ENV{ARVADOS_API_TOKEN} = $job_api_token;
|
140
140
|
}
|
141
141
|
|
142
|
-
my $have_slurm = exists $ENV{
|
142
|
+
my $have_slurm = exists $ENV{SLURM_JOB_ID} && exists $ENV{SLURM_NODELIST};
|
143
143
|
|
144
144
|
|
145
145
|
$SIG{'USR1'} = sub
|
@@ -342,8 +342,7 @@ my @jobstep_todo = ();
|
|
342
342
|
my @jobstep_done = ();
|
343
343
|
my @jobstep_tomerge = ();
|
344
344
|
my $jobstep_tomerge_level = 0;
|
345
|
-
my $squeue_checked;
|
346
|
-
my $squeue_kill_checked;
|
345
|
+
my $squeue_checked = 0;
|
347
346
|
my $latest_refresh = scalar time;
|
348
347
|
|
349
348
|
|
@@ -1254,29 +1253,45 @@ sub check_refresh_wanted
|
|
1254
1253
|
|
1255
1254
|
sub check_squeue
|
1256
1255
|
{
|
1257
|
-
|
1258
|
-
if (defined $squeue_kill_checked && $squeue_kill_checked > time - 4)
|
1259
|
-
{
|
1260
|
-
return;
|
1261
|
-
}
|
1262
|
-
$squeue_kill_checked = time;
|
1256
|
+
my $last_squeue_check = $squeue_checked;
|
1263
1257
|
|
1264
|
-
#
|
1265
|
-
|
1258
|
+
# Do not call `squeue` or check the kill list more than once every
|
1259
|
+
# 15 seconds.
|
1260
|
+
return if $last_squeue_check > time - 15;
|
1261
|
+
$squeue_checked = time;
|
1262
|
+
|
1263
|
+
# Look for children from which we haven't received stderr data since
|
1264
|
+
# the last squeue check. If no such children exist, all procs are
|
1265
|
+
# alive and there's no need to even look at squeue.
|
1266
|
+
#
|
1267
|
+
# As long as the crunchstat poll interval (10s) is shorter than the
|
1268
|
+
# squeue check interval (15s) this should make the squeue check an
|
1269
|
+
# infrequent event.
|
1270
|
+
my $silent_procs = 0;
|
1271
|
+
for my $jobstep (values %proc)
|
1266
1272
|
{
|
1267
|
-
if (
|
1268
|
-
&& $proc{$_}->{killtime} <= time)
|
1273
|
+
if ($jobstep->{stderr_at} < $last_squeue_check)
|
1269
1274
|
{
|
1270
|
-
|
1275
|
+
$silent_procs++;
|
1271
1276
|
}
|
1272
1277
|
}
|
1278
|
+
return if $silent_procs == 0;
|
1273
1279
|
|
1274
|
-
#
|
1275
|
-
|
1280
|
+
# use killem() on procs whose killtime is reached
|
1281
|
+
while (my ($pid, $jobstep) = each %proc)
|
1276
1282
|
{
|
1277
|
-
|
1283
|
+
if (exists $jobstep->{killtime}
|
1284
|
+
&& $jobstep->{killtime} <= time
|
1285
|
+
&& $jobstep->{stderr_at} < $last_squeue_check)
|
1286
|
+
{
|
1287
|
+
my $sincewhen = "";
|
1288
|
+
if ($jobstep->{stderr_at}) {
|
1289
|
+
$sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s";
|
1290
|
+
}
|
1291
|
+
Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
|
1292
|
+
killem ($pid);
|
1293
|
+
}
|
1278
1294
|
}
|
1279
|
-
$squeue_checked = time;
|
1280
1295
|
|
1281
1296
|
if (!$have_slurm)
|
1282
1297
|
{
|
@@ -1285,13 +1300,13 @@ sub check_squeue
|
|
1285
1300
|
}
|
1286
1301
|
|
1287
1302
|
# get a list of steps still running
|
1288
|
-
my @squeue = `squeue
|
1289
|
-
|
1290
|
-
if ($squeue[-1] ne "ok")
|
1303
|
+
my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOB_ID}\E --steps --format='%i %j' --noheader`;
|
1304
|
+
if ($? != 0)
|
1291
1305
|
{
|
1306
|
+
Log(undef, "warning: squeue exit status $? ($!)");
|
1292
1307
|
return;
|
1293
1308
|
}
|
1294
|
-
|
1309
|
+
chop @squeue;
|
1295
1310
|
|
1296
1311
|
# which of my jobsteps are running, according to squeue?
|
1297
1312
|
my %ok;
|
@@ -1299,22 +1314,30 @@ sub check_squeue
|
|
1299
1314
|
{
|
1300
1315
|
if (/^(\d+)\.(\d+) (\S+)/)
|
1301
1316
|
{
|
1302
|
-
if ($1 eq $ENV{
|
1317
|
+
if ($1 eq $ENV{SLURM_JOB_ID})
|
1303
1318
|
{
|
1304
1319
|
$ok{$3} = 1;
|
1305
1320
|
}
|
1306
1321
|
}
|
1307
1322
|
}
|
1308
1323
|
|
1309
|
-
#
|
1310
|
-
|
1324
|
+
# Check for child procs >60s old and not mentioned by squeue.
|
1325
|
+
while (my ($pid, $jobstep) = each %proc)
|
1311
1326
|
{
|
1312
|
-
if ($
|
1313
|
-
|
1314
|
-
|
1327
|
+
if ($jobstep->{time} < time - 60
|
1328
|
+
&& $jobstep->{jobstepname}
|
1329
|
+
&& !exists $ok{$jobstep->{jobstepname}}
|
1330
|
+
&& !exists $jobstep->{killtime})
|
1315
1331
|
{
|
1316
|
-
#
|
1317
|
-
|
1332
|
+
# According to slurm, this task has ended (successfully or not)
|
1333
|
+
# -- but our srun child hasn't exited. First we must wait (30
|
1334
|
+
# seconds) in case this is just a race between communication
|
1335
|
+
# channels. Then, if our srun child process still hasn't
|
1336
|
+
# terminated, we'll conclude some slurm communication
|
1337
|
+
# error/delay has caused the task to die without notifying srun,
|
1338
|
+
# and we'll kill srun ourselves.
|
1339
|
+
$jobstep->{killtime} = time + 30;
|
1340
|
+
Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
|
1318
1341
|
}
|
1319
1342
|
}
|
1320
1343
|
}
|
@@ -1325,7 +1348,7 @@ sub release_allocation
|
|
1325
1348
|
if ($have_slurm)
|
1326
1349
|
{
|
1327
1350
|
Log (undef, "release job allocation");
|
1328
|
-
system "scancel $ENV{
|
1351
|
+
system "scancel $ENV{SLURM_JOB_ID}";
|
1329
1352
|
}
|
1330
1353
|
}
|
1331
1354
|
|
@@ -1339,6 +1362,7 @@ sub readfrompipes
|
|
1339
1362
|
while (0 < sysread ($reader{$job}, $buf, 8192))
|
1340
1363
|
{
|
1341
1364
|
print STDERR $buf if $ENV{CRUNCH_DEBUG};
|
1365
|
+
$jobstep[$job]->{stderr_at} = time;
|
1342
1366
|
$jobstep[$job]->{stderr} .= $buf;
|
1343
1367
|
preprocess_stderr ($job);
|
1344
1368
|
if (length ($jobstep[$job]->{stderr}) > 16384)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arvados-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.20150603135055
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arvados Authors
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: arvados
|
@@ -178,7 +178,7 @@ dependencies:
|
|
178
178
|
- - "<"
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: 1.0.0
|
181
|
-
description: Arvados command line tools, git commit
|
181
|
+
description: Arvados command line tools, git commit c4c8977ef25cc6805f2cca1dedfc83faecc0bc23
|
182
182
|
email: gem-dev@curoverse.com
|
183
183
|
executables:
|
184
184
|
- arv
|