arvados-cli 0.1.20150530020106 → 0.1.20150603135055

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/bin/crunch-job +55 -31
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5b3e64d8005f4637858cda67b6523494fe599bd9
4
- data.tar.gz: a84f4cb466f5bd6efe8e6cdd6cd383957cfa38ad
3
+ metadata.gz: 6b2fd398e685fe06211ac7885274e8e8de2d99f4
4
+ data.tar.gz: 5b5b5b764f9a1853443fde6dc8957f94dfe4ebad
5
5
  SHA512:
6
- metadata.gz: 6cc34268e017d6458a7563ae246d136c1017aeae337f7fe890feeaabdaa7bc48dae5781388ac3dfdc4fa847b749c3a62c9f8df5c4979cfe0cb63a87e7adcd6f8
7
- data.tar.gz: c936f24c11dff77a9882c435eb45a1f4a0d57b381ed939c0ca8a49ac83fd79d529dd031b5a307bf25d67f4574217f08bb9fdb97ef932c8e2e054ccdb1eea97d7
6
+ metadata.gz: 9accb2d87fe4993a89c4cd9ddb4790a605d165185fd17d908ad9e2b3bf6e45fafddb5432d6caa9e7dcf45aeff832683e3ed79dfd0e2b09805624b1e13688a301
7
+ data.tar.gz: 5c194163fa247bdee9531d7403db995af6d21aba1d1b73961e29204da3c89a5f9574e1a229feee1ce0cef8f97e24140ccff87654f5d0664b3cbe514d118dd435
data/bin/crunch-job CHANGED
@@ -139,7 +139,7 @@ if (defined $job_api_token) {
139
139
  $ENV{ARVADOS_API_TOKEN} = $job_api_token;
140
140
  }
141
141
 
142
- my $have_slurm = exists $ENV{SLURM_JOBID} && exists $ENV{SLURM_NODELIST};
142
+ my $have_slurm = exists $ENV{SLURM_JOB_ID} && exists $ENV{SLURM_NODELIST};
143
143
 
144
144
 
145
145
  $SIG{'USR1'} = sub
@@ -342,8 +342,7 @@ my @jobstep_todo = ();
342
342
  my @jobstep_done = ();
343
343
  my @jobstep_tomerge = ();
344
344
  my $jobstep_tomerge_level = 0;
345
- my $squeue_checked;
346
- my $squeue_kill_checked;
345
+ my $squeue_checked = 0;
347
346
  my $latest_refresh = scalar time;
348
347
 
349
348
 
@@ -1254,29 +1253,45 @@ sub check_refresh_wanted
1254
1253
 
1255
1254
  sub check_squeue
1256
1255
  {
1257
- # return if the kill list was checked <4 seconds ago
1258
- if (defined $squeue_kill_checked && $squeue_kill_checked > time - 4)
1259
- {
1260
- return;
1261
- }
1262
- $squeue_kill_checked = time;
1256
+ my $last_squeue_check = $squeue_checked;
1263
1257
 
1264
- # use killem() on procs whose killtime is reached
1265
- for (keys %proc)
1258
+ # Do not call `squeue` or check the kill list more than once every
1259
+ # 15 seconds.
1260
+ return if $last_squeue_check > time - 15;
1261
+ $squeue_checked = time;
1262
+
1263
+ # Look for children from which we haven't received stderr data since
1264
+ # the last squeue check. If no such children exist, all procs are
1265
+ # alive and there's no need to even look at squeue.
1266
+ #
1267
+ # As long as the crunchstat poll interval (10s) is shorter than the
1268
+ # squeue check interval (15s) this should make the squeue check an
1269
+ # infrequent event.
1270
+ my $silent_procs = 0;
1271
+ for my $jobstep (values %proc)
1266
1272
  {
1267
- if (exists $proc{$_}->{killtime}
1268
- && $proc{$_}->{killtime} <= time)
1273
+ if ($jobstep->{stderr_at} < $last_squeue_check)
1269
1274
  {
1270
- killem ($_);
1275
+ $silent_procs++;
1271
1276
  }
1272
1277
  }
1278
+ return if $silent_procs == 0;
1273
1279
 
1274
- # return if the squeue was checked <60 seconds ago
1275
- if (defined $squeue_checked && $squeue_checked > time - 60)
1280
+ # use killem() on procs whose killtime is reached
1281
+ while (my ($pid, $jobstep) = each %proc)
1276
1282
  {
1277
- return;
1283
+ if (exists $jobstep->{killtime}
1284
+ && $jobstep->{killtime} <= time
1285
+ && $jobstep->{stderr_at} < $last_squeue_check)
1286
+ {
1287
+ my $sincewhen = "";
1288
+ if ($jobstep->{stderr_at}) {
1289
+ $sincewhen = " in last " . (time - $jobstep->{stderr_at}) . "s";
1290
+ }
1291
+ Log($jobstep->{jobstep}, "killing orphaned srun process $pid (task not in slurm queue, no stderr received$sincewhen)");
1292
+ killem ($pid);
1293
+ }
1278
1294
  }
1279
- $squeue_checked = time;
1280
1295
 
1281
1296
  if (!$have_slurm)
1282
1297
  {
@@ -1285,13 +1300,13 @@ sub check_squeue
1285
1300
  }
1286
1301
 
1287
1302
  # get a list of steps still running
1288
- my @squeue = `squeue -s -h -o '%i %j' && echo ok`;
1289
- chop @squeue;
1290
- if ($squeue[-1] ne "ok")
1303
+ my @squeue = `squeue --jobs=\Q$ENV{SLURM_JOB_ID}\E --steps --format='%i %j' --noheader`;
1304
+ if ($? != 0)
1291
1305
  {
1306
+ Log(undef, "warning: squeue exit status $? ($!)");
1292
1307
  return;
1293
1308
  }
1294
- pop @squeue;
1309
+ chop @squeue;
1295
1310
 
1296
1311
  # which of my jobsteps are running, according to squeue?
1297
1312
  my %ok;
@@ -1299,22 +1314,30 @@ sub check_squeue
1299
1314
  {
1300
1315
  if (/^(\d+)\.(\d+) (\S+)/)
1301
1316
  {
1302
- if ($1 eq $ENV{SLURM_JOBID})
1317
+ if ($1 eq $ENV{SLURM_JOB_ID})
1303
1318
  {
1304
1319
  $ok{$3} = 1;
1305
1320
  }
1306
1321
  }
1307
1322
  }
1308
1323
 
1309
- # which of my active child procs (>60s old) were not mentioned by squeue?
1310
- foreach (keys %proc)
1324
+ # Check for child procs >60s old and not mentioned by squeue.
1325
+ while (my ($pid, $jobstep) = each %proc)
1311
1326
  {
1312
- if ($proc{$_}->{time} < time - 60
1313
- && !exists $ok{$proc{$_}->{jobstepname}}
1314
- && !exists $proc{$_}->{killtime})
1327
+ if ($jobstep->{time} < time - 60
1328
+ && $jobstep->{jobstepname}
1329
+ && !exists $ok{$jobstep->{jobstepname}}
1330
+ && !exists $jobstep->{killtime})
1315
1331
  {
1316
- # kill this proc if it hasn't exited in 30 seconds
1317
- $proc{$_}->{killtime} = time + 30;
1332
+ # According to slurm, this task has ended (successfully or not)
1333
+ # -- but our srun child hasn't exited. First we must wait (30
1334
+ # seconds) in case this is just a race between communication
1335
+ # channels. Then, if our srun child process still hasn't
1336
+ # terminated, we'll conclude some slurm communication
1337
+ # error/delay has caused the task to die without notifying srun,
1338
+ # and we'll kill srun ourselves.
1339
+ $jobstep->{killtime} = time + 30;
1340
+ Log($jobstep->{jobstep}, "notice: task is not in slurm queue but srun process $pid has not exited");
1318
1341
  }
1319
1342
  }
1320
1343
  }
@@ -1325,7 +1348,7 @@ sub release_allocation
1325
1348
  if ($have_slurm)
1326
1349
  {
1327
1350
  Log (undef, "release job allocation");
1328
- system "scancel $ENV{SLURM_JOBID}";
1351
+ system "scancel $ENV{SLURM_JOB_ID}";
1329
1352
  }
1330
1353
  }
1331
1354
 
@@ -1339,6 +1362,7 @@ sub readfrompipes
1339
1362
  while (0 < sysread ($reader{$job}, $buf, 8192))
1340
1363
  {
1341
1364
  print STDERR $buf if $ENV{CRUNCH_DEBUG};
1365
+ $jobstep[$job]->{stderr_at} = time;
1342
1366
  $jobstep[$job]->{stderr} .= $buf;
1343
1367
  preprocess_stderr ($job);
1344
1368
  if (length ($jobstep[$job]->{stderr}) > 16384)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arvados-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.20150530020106
4
+ version: 0.1.20150603135055
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arvados Authors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-30 00:00:00.000000000 Z
11
+ date: 2015-06-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: arvados
@@ -178,7 +178,7 @@ dependencies:
178
178
  - - "<"
179
179
  - !ruby/object:Gem::Version
180
180
  version: 1.0.0
181
- description: Arvados command line tools, git commit 178d3f36265e0e9e9cc0bb6ac8c7c47a9c701687
181
+ description: Arvados command line tools, git commit c4c8977ef25cc6805f2cca1dedfc83faecc0bc23
182
182
  email: gem-dev@curoverse.com
183
183
  executables:
184
184
  - arv