arvados-cli 0.1.20170711213448 → 0.1.20170726144433
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/crunch-job +25 -12
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 42613b210d35b7a656101e0b9a32ccf9fddaf985
|
4
|
+
data.tar.gz: a74110bd649f4a8c6b3d93131b016b22d9c1a8d4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2051c249708e46df2ddbdef8932220ac85271ba757a74398d54c2454de1469b62d82475619bce2ee009086ab61891d492a71f803e84dc0ee9ceb936c54c7bb20
|
7
|
+
data.tar.gz: 73409dda1ed16857f8db7fe4fd581f2aa9864eb6d7ae6c09ef99f436810d235f097053ae9a14e156f1ae3ff9e2cb145b8dd800c8c6925b5378c543e174176d7a
|
data/bin/crunch-job
CHANGED
@@ -189,7 +189,7 @@ if (($Job || $local_job)->{docker_image_locator}) {
|
|
189
189
|
$cmd = [$docker_bin, 'ps', '-q'];
|
190
190
|
}
|
191
191
|
Log(undef, "Sanity check is `@$cmd`");
|
192
|
-
my ($exited, $stdout, $stderr) = srun_sync(
|
192
|
+
my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
|
193
193
|
["srun", "--nodes=\Q$ENV{SLURM_NNODES}\E", "--ntasks-per-node=1"],
|
194
194
|
$cmd,
|
195
195
|
{label => "sanity check"});
|
@@ -397,7 +397,7 @@ if (!defined $no_clear_tmp) {
|
|
397
397
|
# Find FUSE mounts under $CRUNCH_TMP and unmount them. Then clean
|
398
398
|
# up work directories crunch_tmp/work, crunch_tmp/opt,
|
399
399
|
# crunch_tmp/src*.
|
400
|
-
my ($exited, $stdout, $stderr) = srun_sync(
|
400
|
+
my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
|
401
401
|
["srun", "--nodelist=$nodelist", "-D", $ENV{'TMPDIR'}],
|
402
402
|
['bash', '-ec', q{
|
403
403
|
arv-mount --unmount-timeout 10 --unmount-all ${CRUNCH_TMP}
|
@@ -405,7 +405,7 @@ rm -rf ${JOB_WORK} ${CRUNCH_INSTALL} ${CRUNCH_TMP}/task ${CRUNCH_TMP}/src* ${CRU
|
|
405
405
|
}],
|
406
406
|
{label => "clean work dirs"});
|
407
407
|
if ($exited != 0) {
|
408
|
-
|
408
|
+
exit_retry_unlocked();
|
409
409
|
}
|
410
410
|
}
|
411
411
|
|
@@ -439,20 +439,23 @@ fi
|
|
439
439
|
echo >&2 "image loaded successfully"
|
440
440
|
};
|
441
441
|
|
442
|
-
my ($exited, $stdout, $stderr) = srun_sync(
|
442
|
+
my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
|
443
443
|
["srun", "--nodelist=" . join(',', @node)],
|
444
444
|
["/bin/bash", "-o", "pipefail", "-ec", $docker_install_script],
|
445
445
|
{label => "load docker image"});
|
446
446
|
if ($exited != 0)
|
447
447
|
{
|
448
|
-
|
448
|
+
exit_retry_unlocked();
|
449
449
|
}
|
450
450
|
|
451
451
|
# Determine whether this version of Docker supports memory+swap limits.
|
452
|
-
($exited, $stdout, $stderr) = srun_sync(
|
452
|
+
($exited, $stdout, $stderr, $tempfail) = srun_sync(
|
453
453
|
["srun", "--nodes=1"],
|
454
454
|
[$docker_bin, 'run', '--help'],
|
455
455
|
{label => "check --memory-swap feature"});
|
456
|
+
if ($tempfail) {
|
457
|
+
exit_retry_unlocked();
|
458
|
+
}
|
456
459
|
$docker_limitmem = ($stdout =~ /--memory-swap/);
|
457
460
|
|
458
461
|
# Find a non-root Docker user to use.
|
@@ -472,7 +475,7 @@ echo >&2 "image loaded successfully"
|
|
472
475
|
$label = "check whether user '$try_user' is UID 0";
|
473
476
|
$try_user_arg = "--user=$try_user";
|
474
477
|
}
|
475
|
-
my ($exited, $stdout, $stderr) = srun_sync(
|
478
|
+
my ($exited, $stdout, $stderr, $tempfail) = srun_sync(
|
476
479
|
["srun", "--nodes=1"],
|
477
480
|
["/bin/sh", "-ec",
|
478
481
|
"$docker_bin run $docker_run_args $try_user_arg $docker_hash id --user"],
|
@@ -486,6 +489,8 @@ echo >&2 "image loaded successfully"
|
|
486
489
|
Log(undef, "Container will run with $dockeruserarg");
|
487
490
|
}
|
488
491
|
last;
|
492
|
+
} elsif ($tempfail) {
|
493
|
+
exit_retry_unlocked();
|
489
494
|
}
|
490
495
|
}
|
491
496
|
|
@@ -678,11 +683,14 @@ else {
|
|
678
683
|
"mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
|
679
684
|
|
680
685
|
$ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
|
681
|
-
my ($stdout, $stderr);
|
682
|
-
($exited, $stdout, $stderr) = srun_sync(
|
686
|
+
my ($stdout, $stderr, $tempfail);
|
687
|
+
($exited, $stdout, $stderr, $tempfail) = srun_sync(
|
683
688
|
\@srunargs, \@execargs,
|
684
689
|
{label => "run install script on all workers"},
|
685
|
-
|
690
|
+
$build_script . $git_archive);
|
691
|
+
if ($tempfail) {
|
692
|
+
exit_retry_unlocked();
|
693
|
+
}
|
686
694
|
|
687
695
|
my $stderr_anything_from_script = 0;
|
688
696
|
for my $line (split(/\n/, $stderr)) {
|
@@ -1117,7 +1125,7 @@ if (!defined $main::success)
|
|
1117
1125
|
} elsif ($working_slot_count < 1) {
|
1118
1126
|
save_output_collection();
|
1119
1127
|
save_meta();
|
1120
|
-
|
1128
|
+
exit_retry_unlocked();
|
1121
1129
|
} elsif ($thisround_succeeded == 0 &&
|
1122
1130
|
($thisround_failed == 0 || $thisround_failed > 4)) {
|
1123
1131
|
my $message = "stop because $thisround_failed tasks failed and none succeeded";
|
@@ -2044,7 +2052,7 @@ sub srun_sync
|
|
2044
2052
|
if ($main::please_freeze || $j->{tempfail}) {
|
2045
2053
|
$exited ||= 255;
|
2046
2054
|
}
|
2047
|
-
return ($exited, $j->{stdout_captured}, $j->{stderr_captured});
|
2055
|
+
return ($exited, $j->{stdout_captured}, $j->{stderr_captured}, $j->{tempfail});
|
2048
2056
|
}
|
2049
2057
|
|
2050
2058
|
|
@@ -2132,6 +2140,11 @@ sub find_docker_image {
|
|
2132
2140
|
}
|
2133
2141
|
}
|
2134
2142
|
|
2143
|
+
sub exit_retry_unlocked {
|
2144
|
+
Log(undef, "Transient failure with lock acquired; asking for re-dispatch by exiting ".EX_RETRY_UNLOCKED);
|
2145
|
+
exit(EX_RETRY_UNLOCKED);
|
2146
|
+
}
|
2147
|
+
|
2135
2148
|
sub retry_count {
|
2136
2149
|
# Calculate the number of times an operation should be retried,
|
2137
2150
|
# assuming exponential backoff, and that we're willing to retry as
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arvados-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.20170711213448
|
4
|
+
version: 0.1.20170726144433
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arvados Authors
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-07-
|
11
|
+
date: 2017-07-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: arvados
|
@@ -164,7 +164,7 @@ dependencies:
|
|
164
164
|
- - "~>"
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0.8'
|
167
|
-
description: Arvados command line tools, git commit
|
167
|
+
description: Arvados command line tools, git commit 8cbabbbe014628574a10a48148d179c14137d61f
|
168
168
|
email: gem-dev@curoverse.com
|
169
169
|
executables:
|
170
170
|
- arv
|