arvados-cli 0.1.20141006212502 → 0.1.20141007134429
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/crunch-job +84 -38
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ec72b4c6433914d92d8f712603c11bf014e0dcea
|
4
|
+
data.tar.gz: 5c38938f5a1123631f0b9da225b8e895756788df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a234f80b2daf38847e9a655a3ee040e0978e348dff65011c04e1765944376fdd7ad89c46bf7b905ebb07f9afaa85602e527e6d5d8657c79ba88a4e1586936aac
|
7
|
+
data.tar.gz: f0b12cbf890165ed7ef486d6af389f43fb3f55b6cf398b1b5d07abc6c65cd5719e0199780ed9299b53796a608d50f1d9a74d72f3aa4436a2d3bccb8ebd15e9fc
|
data/bin/crunch-job
CHANGED
@@ -141,22 +141,26 @@ $SIG{'USR2'} = sub
|
|
141
141
|
|
142
142
|
my $arv = Arvados->new('apiVersion' => 'v1');
|
143
143
|
|
144
|
-
my $User = $arv->{'users'}->{'current'}->execute;
|
145
|
-
|
146
144
|
my $Job;
|
147
145
|
my $job_id;
|
148
146
|
my $dbh;
|
149
147
|
my $sth;
|
148
|
+
my @jobstep;
|
149
|
+
|
150
|
+
my $User = retry_op(sub { $arv->{'users'}->{'current'}->execute; });
|
151
|
+
|
150
152
|
if ($jobspec =~ /^[-a-z\d]+$/)
|
151
153
|
{
|
152
154
|
# $jobspec is an Arvados UUID, not a JSON job specification
|
153
|
-
$Job =
|
155
|
+
$Job = retry_op(sub {
|
156
|
+
$arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
|
157
|
+
});
|
154
158
|
if (!$force_unlock) {
|
155
159
|
# Claim this job, and make sure nobody else does
|
156
|
-
eval {
|
160
|
+
eval { retry_op(sub {
|
157
161
|
# lock() sets is_locked_by_uuid and changes state to Running.
|
158
162
|
$arv->{'jobs'}->{'lock'}->execute('uuid' => $Job->{'uuid'})
|
159
|
-
};
|
163
|
+
}); };
|
160
164
|
if ($@) {
|
161
165
|
Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
|
162
166
|
exit EX_TEMPFAIL;
|
@@ -177,7 +181,7 @@ else
|
|
177
181
|
$Job->{'started_at'} = gmtime;
|
178
182
|
$Job->{'state'} = 'Running';
|
179
183
|
|
180
|
-
$Job = $arv->{'jobs'}->{'create'}->execute('job' => $Job);
|
184
|
+
$Job = retry_op(sub { $arv->{'jobs'}->{'create'}->execute('job' => $Job); });
|
181
185
|
}
|
182
186
|
$job_id = $Job->{'uuid'};
|
183
187
|
|
@@ -290,7 +294,6 @@ $ENV{"CRUNCH_JOB_UUID"} = $job_id;
|
|
290
294
|
$ENV{"JOB_UUID"} = $job_id;
|
291
295
|
|
292
296
|
|
293
|
-
my @jobstep;
|
294
297
|
my @jobstep_todo = ();
|
295
298
|
my @jobstep_done = ();
|
296
299
|
my @jobstep_tomerge = ();
|
@@ -308,12 +311,14 @@ if (defined $Job->{thawedfromkey})
|
|
308
311
|
}
|
309
312
|
else
|
310
313
|
{
|
311
|
-
my $first_task =
|
312
|
-
'
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
314
|
+
my $first_task = retry_op(sub {
|
315
|
+
$arv->{'job_tasks'}->{'create'}->execute('job_task' => {
|
316
|
+
'job_uuid' => $Job->{'uuid'},
|
317
|
+
'sequence' => 0,
|
318
|
+
'qsequence' => 0,
|
319
|
+
'parameters' => {},
|
320
|
+
});
|
321
|
+
});
|
317
322
|
push @jobstep, { 'level' => 0,
|
318
323
|
'failures' => 0,
|
319
324
|
'arvados_task' => $first_task,
|
@@ -408,9 +413,10 @@ else {
|
|
408
413
|
} else {
|
409
414
|
# $repo is none of the above. It must be the name of a hosted
|
410
415
|
# repository.
|
411
|
-
my $arv_repo_list =
|
412
|
-
'
|
413
|
-
)->{'items'};
|
416
|
+
my $arv_repo_list = retry_op(sub {
|
417
|
+
$arv->{'repositories'}->{'list'}->execute(
|
418
|
+
'filters' => [['name','=',$repo]])->{'items'};
|
419
|
+
});
|
414
420
|
my $n_found = scalar @{$arv_repo_list};
|
415
421
|
if ($n_found > 0) {
|
416
422
|
Log(undef, "Repository '$repo' -> "
|
@@ -898,8 +904,9 @@ else {
|
|
898
904
|
while (my $manifest_line = <$orig_manifest>) {
|
899
905
|
$orig_manifest_text .= $manifest_line;
|
900
906
|
}
|
901
|
-
my $output =
|
902
|
-
'
|
907
|
+
my $output = retry_op(sub {
|
908
|
+
$arv->{'collections'}->{'create'}->execute(
|
909
|
+
'collection' => {'manifest_text' => $orig_manifest_text});
|
903
910
|
});
|
904
911
|
Log(undef, "output uuid " . $output->{uuid});
|
905
912
|
Log(undef, "output hash " . $output->{portable_data_hash});
|
@@ -1034,13 +1041,15 @@ sub reapchildren
|
|
1034
1041
|
my $newtask_list = [];
|
1035
1042
|
my $newtask_results;
|
1036
1043
|
do {
|
1037
|
-
$newtask_results =
|
1038
|
-
'
|
1039
|
-
'
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1044
|
+
$newtask_results = retry_op(sub {
|
1045
|
+
$arv->{'job_tasks'}->{'list'}->execute(
|
1046
|
+
'where' => {
|
1047
|
+
'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
|
1048
|
+
},
|
1049
|
+
'order' => 'qsequence',
|
1050
|
+
'offset' => scalar(@$newtask_list),
|
1051
|
+
);
|
1052
|
+
});
|
1044
1053
|
push(@$newtask_list, @{$newtask_results->{items}});
|
1045
1054
|
} while (@{$newtask_results->{items}});
|
1046
1055
|
foreach my $arvados_task (@$newtask_list) {
|
@@ -1063,7 +1072,9 @@ sub check_refresh_wanted
|
|
1063
1072
|
my @stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
|
1064
1073
|
if (@stat && $stat[9] > $latest_refresh) {
|
1065
1074
|
$latest_refresh = scalar time;
|
1066
|
-
my $Job2 =
|
1075
|
+
my $Job2 = retry_op(sub {
|
1076
|
+
$arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
|
1077
|
+
});
|
1067
1078
|
for my $attr ('cancelled_at',
|
1068
1079
|
'cancelled_by_user_uuid',
|
1069
1080
|
'cancelled_by_client_uuid',
|
@@ -1244,7 +1255,7 @@ sub collate_output
|
|
1244
1255
|
|
1245
1256
|
my ($child_out, $child_in);
|
1246
1257
|
my $pid = open2($child_out, $child_in, 'arv-put', '--raw',
|
1247
|
-
'--retries',
|
1258
|
+
'--retries', retry_count());
|
1248
1259
|
my $joboutput;
|
1249
1260
|
for (@jobstep)
|
1250
1261
|
{
|
@@ -1574,7 +1585,10 @@ sub find_docker_image {
|
|
1574
1585
|
# If not, return undef for both values.
|
1575
1586
|
my $locator = shift;
|
1576
1587
|
my ($streamname, $filename);
|
1577
|
-
|
1588
|
+
my $image = retry_op(sub {
|
1589
|
+
$arv->{collections}->{get}->execute(uuid => $locator);
|
1590
|
+
});
|
1591
|
+
if ($image) {
|
1578
1592
|
foreach my $line (split(/\n/, $image->{manifest_text})) {
|
1579
1593
|
my @tokens = split(/\s+/, $line);
|
1580
1594
|
next if (!@tokens);
|
@@ -1595,20 +1609,52 @@ sub find_docker_image {
|
|
1595
1609
|
}
|
1596
1610
|
}
|
1597
1611
|
|
1598
|
-
sub
|
1599
|
-
# Calculate
|
1600
|
-
#
|
1601
|
-
|
1602
|
-
my $starttime
|
1603
|
-
|
1604
|
-
|
1605
|
-
|
1606
|
-
|
1607
|
-
|
1612
|
+
sub retry_count {
|
1613
|
+
# Calculate the number of times an operation should be retried,
|
1614
|
+
# assuming exponential backoff, and that we're willing to retry as
|
1615
|
+
# long as tasks have been running. Enforce a minimum of 3 retries.
|
1616
|
+
my ($starttime, $endtime, $timediff, $retries);
|
1617
|
+
if (@jobstep) {
|
1618
|
+
$starttime = $jobstep[0]->{starttime};
|
1619
|
+
$endtime = $jobstep[-1]->{finishtime};
|
1620
|
+
}
|
1621
|
+
if (!defined($starttime)) {
|
1622
|
+
$timediff = 0;
|
1623
|
+
} elsif (!defined($endtime)) {
|
1624
|
+
$timediff = time - $starttime;
|
1625
|
+
} else {
|
1626
|
+
$timediff = ($endtime - $starttime) - (time - $endtime);
|
1627
|
+
}
|
1628
|
+
if ($timediff > 0) {
|
1629
|
+
$retries = int(log($timediff) / log(2));
|
1630
|
+
} else {
|
1631
|
+
$retries = 1; # Use the minimum.
|
1608
1632
|
}
|
1609
1633
|
return ($retries > 3) ? $retries : 3;
|
1610
1634
|
}
|
1611
1635
|
|
1636
|
+
sub retry_op {
|
1637
|
+
# Given a function reference, call it with the remaining arguments. If
|
1638
|
+
# it dies, retry it with exponential backoff until it succeeds, or until
|
1639
|
+
# the current retry_count is exhausted.
|
1640
|
+
my $operation = shift;
|
1641
|
+
my $retries = retry_count();
|
1642
|
+
foreach my $try_count (0..$retries) {
|
1643
|
+
my $next_try = time + (2 ** $try_count);
|
1644
|
+
my $result = eval { $operation->(@_); };
|
1645
|
+
if (!$@) {
|
1646
|
+
return $result;
|
1647
|
+
} elsif ($try_count < $retries) {
|
1648
|
+
my $sleep_time = $next_try - time;
|
1649
|
+
sleep($sleep_time) if ($sleep_time > 0);
|
1650
|
+
}
|
1651
|
+
}
|
1652
|
+
# Ensure the error message ends in a newline, so Perl doesn't add
|
1653
|
+
# retry_op's line number to it.
|
1654
|
+
chomp($@);
|
1655
|
+
die($@ . "\n");
|
1656
|
+
}
|
1657
|
+
|
1612
1658
|
sub exit_status_s {
|
1613
1659
|
# Given a $?, return a human-readable exit code string like "0" or
|
1614
1660
|
# "1" or "0 with signal 1" or "1 with signal 11".
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arvados-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.20141007134429
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arvados Authors
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: arvados
|
@@ -178,7 +178,7 @@ dependencies:
|
|
178
178
|
- - "<"
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: 1.0.0
|
181
|
-
description: Arvados command line tools, git commit
|
181
|
+
description: Arvados command line tools, git commit 344c6dcdbae76310879c85a736e4e6cce05d5645
|
182
182
|
email: gem-dev@curoverse.com
|
183
183
|
executables:
|
184
184
|
- arv
|