arvados-cli 0.1.20160608142315 → 0.1.20160913014253

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/bin/crunch-job +33 -1
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a5fc889cbdad1ec1ac89ed8674cf1499e90c20d4
4
- data.tar.gz: aed390277dc320517d93fd8ab7b14da48b3b0a2c
3
+ metadata.gz: 2a3dae7abb2565c96c930378ff0804dee060904f
4
+ data.tar.gz: ae346cc798ae73cc9e97299c245af5133f53cf29
5
5
  SHA512:
6
- metadata.gz: 92372e41f978b51561327fba6bf9107e1291aac4d8dd36de024e8714399a1a9cd904f4713a28bdf663a34049de94b82c4c67e718b27c277b686a4ed0e28f601e
7
- data.tar.gz: 11bef7ffe6f6f12c0c10e3739727d7376457069b547ce1d80953d16c7079b00fa4518ff68cf668eb09719d6df404fc6e7a8142feecb390f9f4ed244bf6d5958c
6
+ metadata.gz: 9185b99de69e9576411307ce24383734a9f03875a9f9723b238c4cb6d5049341132215360956d9060f3b91732378158bce64149281d3ffb2d7b55cc9238cc259
7
+ data.tar.gz: 4fd3ab654fc817827d1cfb79c03519112c7397b5f9c66f33e6d49d0f9f0ad9b4bac7a03a94d52c36442f5f5d0898e7f09004e74860d5c894276fd8f1acea7e3b
data/bin/crunch-job CHANGED
@@ -355,6 +355,7 @@ my @jobstep_done = ();
355
355
  my @jobstep_tomerge = ();
356
356
  my $jobstep_tomerge_level = 0;
357
357
  my $squeue_checked = 0;
358
+ my $sinfo_checked = 0;
358
359
  my $latest_refresh = scalar time;
359
360
 
360
361
 
@@ -1401,6 +1402,37 @@ sub check_squeue
1401
1402
  }
1402
1403
  }
1403
1404
 
1405
sub check_sinfo
{
  # Purpose: detect allocated nodes that have dropped out of the "alloc"
  # state and, if any are found, set $main::please_freeze.
  #
  # If a node fails in a multi-node "srun" call during job setup, the call
  # may hang instead of exiting with a nonzero code. This function checks
  # "sinfo" for the health of the nodes that were allocated and ensures that
  # they are all still in the "alloc" state. If a node that is allocated to
  # this job is not in "alloc" state, then set please_freeze.
  #
  # This is only called from srun_sync() for node configuration. If a
  # node fails doing actual work, there are other recovery mechanisms.
  #
  # Takes no arguments; the return value is not meaningful. Reads
  # $ENV{SLURM_NODELIST}, and reads and updates the file-level
  # $sinfo_checked timestamp used for rate limiting.

  # Do not call `sinfo` more than once every 15 seconds.
  return if $sinfo_checked > time - 15;
  $sinfo_checked = time;

  # The output format "%t" means output node states.
  # \Q...\E backslash-escapes shell metacharacters in the node list.
  my @sinfo = `sinfo --nodes=\Q$ENV{SLURM_NODELIST}\E --noheader -o "%t"`;
  if ($? != 0)
  {
    # Treat a failed sinfo as "unknown" rather than as a node failure.
    Log(undef, "warning: sinfo exit status $? ($!)");
    return;
  }
  # chomp (not chop): strip only a trailing newline, so a final line
  # without one is not truncated.
  chomp @sinfo;

  foreach my $state (@sinfo)
  {
    # Node states are strings, so they must be compared with "ne".
    # The numeric "!=" treats both operands as 0 and never matches,
    # which would make this whole check a no-op.
    if ($state ne "alloc" && $state ne "alloc*") {
      $main::please_freeze = 1;
      last;
    }
  }
}
1404
1436
 
1405
1437
  sub release_allocation
1406
1438
  {
@@ -1906,7 +1938,6 @@ sub freezeunquote
1906
1938
  return $s;
1907
1939
  }
1908
1940
 
1909
-
1910
1941
  sub srun_sync
1911
1942
  {
1912
1943
  my $srunargs = shift;
@@ -1961,6 +1992,7 @@ sub srun_sync
1961
1992
  if (!$busy || ($latest_refresh + 2 < scalar time)) {
1962
1993
  check_refresh_wanted();
1963
1994
  check_squeue();
1995
+ check_sinfo();
1964
1996
  }
1965
1997
  if (!$busy) {
1966
1998
  select(undef, undef, undef, 0.1);
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arvados-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.20160608142315
4
+ version: 0.1.20160913014253
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arvados Authors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-08 00:00:00.000000000 Z
11
+ date: 2016-09-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: arvados
@@ -164,7 +164,7 @@ dependencies:
164
164
  - - "~>"
165
165
  - !ruby/object:Gem::Version
166
166
  version: '0.8'
167
- description: Arvados command line tools, git commit 548e0c54db524cb7317850d4dfd8f3ee0b93cdb0
167
+ description: Arvados command line tools, git commit b54478ea1b7c8aaeaf565d591f32769bcdc09b8f
168
168
  email: gem-dev@curoverse.com
169
169
  executables:
170
170
  - arv