arvados-cli 0.1.20150530020106 → 0.1.20150603135055

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/bin/crunch-job +55 -31
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5b3e64d8005f4637858cda67b6523494fe599bd9
4
- data.tar.gz: a84f4cb466f5bd6efe8e6cdd6cd383957cfa38ad
3
+ metadata.gz: 6b2fd398e685fe06211ac7885274e8e8de2d99f4
4
+ data.tar.gz: 5b5b5b764f9a1853443fde6dc8957f94dfe4ebad
5
5
  SHA512:
6
- metadata.gz: 6cc34268e017d6458a7563ae246d136c1017aeae337f7fe890feeaabdaa7bc48dae5781388ac3dfdc4fa847b749c3a62c9f8df5c4979cfe0cb63a87e7adcd6f8
7
- data.tar.gz: c936f24c11dff77a9882c435eb45a1f4a0d57b381ed939c0ca8a49ac83fd79d529dd031b5a307bf25d67f4574217f08bb9fdb97ef932c8e2e054ccdb1eea97d7
6
+ metadata.gz: 9accb2d87fe4993a89c4cd9ddb4790a605d165185fd17d908ad9e2b3bf6e45fafddb5432d6caa9e7dcf45aeff832683e3ed79dfd0e2b09805624b1e13688a301
7
+ data.tar.gz: 5c194163fa247bdee9531d7403db995af6d21aba1d1b73961e29204da3c89a5f9574e1a229feee1ce0cef8f97e24140ccff87654f5d0664b3cbe514d118dd435
data/bin/crunch-job CHANGED
@@ -139,7 +139,7 @@ if (defined $job_api_token) {
139
139
  $ENV{ARVADOS_API_TOKEN} = $job_api_token;
140
140
  }
141
141
 
142
- my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
142
+ my $have_slurm = exists $ENV{SLURM_JOB_ID} && exists $ENV{SLURM_NODELIST};
143
143
 
144
144
 
145
145
  $SIG{'USR1'} = sub
@@ -342,8 +342,7 @@ my @jobstep_todo = ();
342
342
  my @jobstep_done = ();
343
343
  my @jobstep_tomerge = ();
344
344
  my $jobstep_tomerge_level = 0;
345
- my $squeue_checked;
346
- my $squeue_kill_checked;
345
+ my $squeue_checked = 0;
347
346
  my $latest_refresh = scalar time;
348
347
 
349
348
 
@@ -1254,29 +1253,45 @@ sub check_refresh_wanted
1254
1253
 
1255
1254
  sub check_squeue
1256
1255
  {
1257
- # return if the kill list was checked <4 seconds ago
1258
- if (defined $squeue_kill_checked && $squeue_kill_checked > time - 4)
1259
- {
1260
- return;
1261
- }
1262
- $squeue_kill_checked = time;
1256
+ my $last_squeue_check = $squeue_checked;
1263
1257
 
1264
- # use killem() on procs whose killtime is reached
1265
- for (keys %proc)
1258
+ # Do not call `squeue` or check the kill list more than once every
1259
+ # 15 seconds.
1260
+ return if $last_squeue_check > time - 15;
1261
+ $squeue_checked = time;
1262
+
1263
+ # Look for children from which we haven't received stderr data since
1264
+ # the last squeue check. If no such children exist, all procs are
1265
+ # alive and there's no need to even look at squeue.
1266
+ #
1267
+ # As long as the crunchstat poll interval (10s) is shorter than the
1268
+ # squeue check interval (15s) this should make the squeue check an
1269
+ # infrequent event.
1270
+ my $silent_procs = 0;
1271
+ for my $jobstep (values %proc)
1266
1272
  {
1267
- if (exists $proc{$_}->{killtime}
1268
- && $proc{$_}->{killtime} <= time)
1273
+ if ($jobstep->{stderr_at} < $last_squeue_check)
1269
1274
  {
1270
- killem ($_);
1275
+ $silent_procs++;
1271
1276
  }
1272
1277
  }
1278
+ return if $silent_procs == 0;
1273
1279
 
1274
- # return if the squeue was checked <60 seconds ago
1275
- if (defined $squeue_checked && $squeue_checked > time - 60)
1280
+ # use killem() on procs whose killtime is reached
1281
+ while (my ($pid, $jobstep) = each %proc)
1276
1282
  {
1277
- return;
1283
+ if (exists $jobstep->{killtime}
1284
+ && $jobstep->{killtime} <= time
1285
+ && $jobstep->{stderr_at} < $last_squeue_check)
1286
+ {
1287
+ my $sincewhen = "";
1288
+ if ($jobstep->{stderr_at}) {
1289
+ $sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s";
1290
+ }
1291
+ Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
1292
+ killem ($pid);
1293
+ }
1278
1294
  }
1279
- $squeue_checked = time;
1280
1295
 
1281
1296
  if (!$have_slurm)
1282
1297
  {
@@ -1285,13 +1300,13 @@ sub check_squeue
1285
1300
  }
1286
1301
 
1287
1302
  # get a list of steps still running
1288
- my @squeue = `squeue -s -h -o '%i %j' && echo ok`;
1289
- chop @squeue;
1290
- if ($squeue[-1] ne "ok")
1303
+ my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOB_ID}\E --steps --format='%i %j' --noheader`;
1304
+ if ($? != 0)
1291
1305
  {
1306
+ Log(undef, "warning: squeue exit status $? ($!)");
1292
1307
  return;
1293
1308
  }
1294
- pop @squeue;
1309
+ chop @squeue;
1295
1310
 
1296
1311
  # which of my jobsteps are running, according to squeue?
1297
1312
  my %ok;
@@ -1299,22 +1314,30 @@ sub check_squeue
1299
1314
  {
1300
1315
  if (/^(\d+)\.(\d+) (\S+)/)
1301
1316
  {
1302
- if ($1 eq $ENV{SLURM_JOBID})
1317
+ if ($1 eq $ENV{SLURM_JOB_ID})
1303
1318
  {
1304
1319
  $ok{$3} = 1;
1305
1320
  }
1306
1321
  }
1307
1322
  }
1308
1323
 
1309
- # which of my active child procs (>60s old) were not mentioned by squeue?
1310
- foreach (keys %proc)
1324
+ # Check for child procs >60s old and not mentioned by squeue.
1325
+ while (my ($pid, $jobstep) = each %proc)
1311
1326
  {
1312
- if ($proc{$_}->{time} < time - 60
1313
- && !exists $ok{$proc{$_}->{jobstepname}}
1314
- && !exists $proc{$_}->{killtime})
1327
+ if ($jobstep->{time} < time - 60
1328
+ && $jobstep->{jobstepname}
1329
+ && !exists $ok{$jobstep->{jobstepname}}
1330
+ && !exists $jobstep->{killtime})
1315
1331
  {
1316
- # kill this proc if it hasn't exited in 30 seconds
1317
- $proc{$_}->{killtime} = time + 30;
1332
+ # According to slurm, this task has ended (successfully or not)
1333
+ # -- but our srun child hasn't exited. First we must wait (30
1334
+ # seconds) in case this is just a race between communication
1335
+ # channels. Then, if our srun child process still hasn't
1336
+ # terminated, we'll conclude some slurm communication
1337
+ # error/delay has caused the task to die without notifying srun,
1338
+ # and we'll kill srun ourselves.
1339
+ $jobstep->{killtime} = time + 30;
1340
+ Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
1318
1341
  }
1319
1342
  }
1320
1343
  }
@@ -1325,7 +1348,7 @@ sub release_allocation
1325
1348
  if ($have_slurm)
1326
1349
  {
1327
1350
  Log (undef, "release job allocation");
1328
- system "scancel $ENV{SLURM_JOBID}";
1351
+ system "scancel $ENV{SLURM_JOB_ID}";
1329
1352
  }
1330
1353
  }
1331
1354
 
@@ -1339,6 +1362,7 @@ sub readfrompipes
1339
1362
  while (0 < sysread ($reader{$job}, $buf, 8192))
1340
1363
  {
1341
1364
  print STDERR $buf if $ENV{CRUNCH_DEBUG};
1365
+ $jobstep[$job]->{stderr_at} = time;
1342
1366
  $jobstep[$job]->{stderr} .= $buf;
1343
1367
  preprocess_stderr ($job);
1344
1368
  if (length ($jobstep[$job]->{stderr}) > 16384)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arvados-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.20150530020106
4
+ version: 0.1.20150603135055
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arvados Authors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-30 00:00:00.000000000 Z
11
+ date: 2015-06-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: arvados
@@ -178,7 +178,7 @@ dependencies:
178
178
  - - "<"
179
179
  - !ruby/object:Gem::Version
180
180
  version: 1.0.0
181
- description: Arvados command line tools, git commit 178d3f36265e0e9e9cc0bb6ac8c7c47a9c701687
181
+ description: Arvados command line tools, git commit c4c8977ef25cc6805f2cca1dedfc83faecc0bc23
182
182
  email: gem-dev@curoverse.com
183
183
  executables:
184
184
  - arv