arvados-cli 0.1.20141006212502 → 0.1.20141007134429

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/bin/crunch-job +84 -38
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b84e06d6def466e49e5c53c836efe17babcc9267
4
- data.tar.gz: 957bec18268b60540dd207a9b53d3b1bb45164de
3
+ metadata.gz: ec72b4c6433914d92d8f712603c11bf014e0dcea
4
+ data.tar.gz: 5c38938f5a1123631f0b9da225b8e895756788df
5
5
  SHA512:
6
- metadata.gz: a6dd627d31352cdb3a0958516090466fc0bcf1e5b2d85f66189023642a7258f89d25dce7b90d4b53223b5bf3176e716bbe2082a7ecc6adafa9edfc4f6ce22bcd
7
- data.tar.gz: 9a2321cb216a5c8bd596137a78e4c846cee688eacde21322486a81986fd46c799c0f765816f8effbb2467296cfefd0d7b397a5e408ae6e035df379ea75fea54c
6
+ metadata.gz: a234f80b2daf38847e9a655a3ee040e0978e348dff65011c04e1765944376fdd7ad89c46bf7b905ebb07f9afaa85602e527e6d5d8657c79ba88a4e1586936aac
7
+ data.tar.gz: f0b12cbf890165ed7ef486d6af389f43fb3f55b6cf398b1b5d07abc6c65cd5719e0199780ed9299b53796a608d50f1d9a74d72f3aa4436a2d3bccb8ebd15e9fc
data/bin/crunch-job CHANGED
@@ -141,22 +141,26 @@ $SIG{'USR2'} = sub
141
141
 
142
142
  my $arv = Arvados->new('apiVersion' => 'v1');
143
143
 
144
- my $User = $arv->{'users'}->{'current'}->execute;
145
-
146
144
  my $Job;
147
145
  my $job_id;
148
146
  my $dbh;
149
147
  my $sth;
148
+ my @jobstep;
149
+
150
+ my $User = retry_op(sub { $arv->{'users'}->{'current'}->execute; });
151
+
150
152
  if ($jobspec =~ /^[-a-z\d]+$/)
151
153
  {
152
154
  # $jobspec is an Arvados UUID, not a JSON job specification
153
- $Job = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
155
+ $Job = retry_op(sub {
156
+ $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
157
+ });
154
158
  if (!$force_unlock) {
155
159
  # Claim this job, and make sure nobody else does
156
- eval {
160
+ eval { retry_op(sub {
157
161
  # lock() sets is_locked_by_uuid and changes state to Running.
158
162
  $arv->{'jobs'}->{'lock'}->execute('uuid' => $Job->{'uuid'})
159
- };
163
+ }); };
160
164
  if ($@) {
161
165
  Log(undef, "Error while locking job, exiting ".EX_TEMPFAIL);
162
166
  exit EX_TEMPFAIL;
@@ -177,7 +181,7 @@ else
177
181
  $Job->{'started_at'} = gmtime;
178
182
  $Job->{'state'} = 'Running';
179
183
 
180
- $Job = $arv->{'jobs'}->{'create'}->execute('job' => $Job);
184
+ $Job = retry_op(sub { $arv->{'jobs'}->{'create'}->execute('job' => $Job); });
181
185
  }
182
186
  $job_id = $Job->{'uuid'};
183
187
 
@@ -290,7 +294,6 @@ $ENV{"CRUNCH_JOB_UUID"} = $job_id;
290
294
  $ENV{"JOB_UUID"} = $job_id;
291
295
 
292
296
 
293
- my @jobstep;
294
297
  my @jobstep_todo = ();
295
298
  my @jobstep_done = ();
296
299
  my @jobstep_tomerge = ();
@@ -308,12 +311,14 @@ if (defined $Job->{thawedfromkey})
308
311
  }
309
312
  else
310
313
  {
311
- my $first_task = $arv->{'job_tasks'}->{'create'}->execute('job_task' => {
312
- 'job_uuid' => $Job->{'uuid'},
313
- 'sequence' => 0,
314
- 'qsequence' => 0,
315
- 'parameters' => {},
316
- });
314
+ my $first_task = retry_op(sub {
315
+ $arv->{'job_tasks'}->{'create'}->execute('job_task' => {
316
+ 'job_uuid' => $Job->{'uuid'},
317
+ 'sequence' => 0,
318
+ 'qsequence' => 0,
319
+ 'parameters' => {},
320
+ });
321
+ });
317
322
  push @jobstep, { 'level' => 0,
318
323
  'failures' => 0,
319
324
  'arvados_task' => $first_task,
@@ -408,9 +413,10 @@ else {
408
413
  } else {
409
414
  # $repo is none of the above. It must be the name of a hosted
410
415
  # repository.
411
- my $arv_repo_list = $arv->{'repositories'}->{'list'}->execute(
412
- 'filters' => [['name','=',$repo]]
413
- )->{'items'};
416
+ my $arv_repo_list = retry_op(sub {
417
+ $arv->{'repositories'}->{'list'}->execute(
418
+ 'filters' => [['name','=',$repo]])->{'items'};
419
+ });
414
420
  my $n_found = scalar @{$arv_repo_list};
415
421
  if ($n_found > 0) {
416
422
  Log(undef, "Repository '$repo' -> "
@@ -898,8 +904,9 @@ else {
898
904
  while (my $manifest_line = <$orig_manifest>) {
899
905
  $orig_manifest_text .= $manifest_line;
900
906
  }
901
- my $output = $arv->{'collections'}->{'create'}->execute('collection' => {
902
- 'manifest_text' => $orig_manifest_text,
907
+ my $output = retry_op(sub {
908
+ $arv->{'collections'}->{'create'}->execute(
909
+ 'collection' => {'manifest_text' => $orig_manifest_text});
903
910
  });
904
911
  Log(undef, "output uuid " . $output->{uuid});
905
912
  Log(undef, "output hash " . $output->{portable_data_hash});
@@ -1034,13 +1041,15 @@ sub reapchildren
1034
1041
  my $newtask_list = [];
1035
1042
  my $newtask_results;
1036
1043
  do {
1037
- $newtask_results = $arv->{'job_tasks'}->{'list'}->execute(
1038
- 'where' => {
1039
- 'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
1040
- },
1041
- 'order' => 'qsequence',
1042
- 'offset' => scalar(@$newtask_list),
1043
- );
1044
+ $newtask_results = retry_op(sub {
1045
+ $arv->{'job_tasks'}->{'list'}->execute(
1046
+ 'where' => {
1047
+ 'created_by_job_task_uuid' => $Jobstep->{'arvados_task'}->{uuid}
1048
+ },
1049
+ 'order' => 'qsequence',
1050
+ 'offset' => scalar(@$newtask_list),
1051
+ );
1052
+ });
1044
1053
  push(@$newtask_list, @{$newtask_results->{items}});
1045
1054
  } while (@{$newtask_results->{items}});
1046
1055
  foreach my $arvados_task (@$newtask_list) {
@@ -1063,7 +1072,9 @@ sub check_refresh_wanted
1063
1072
  my @stat = stat $ENV{"CRUNCH_REFRESH_TRIGGER"};
1064
1073
  if (@stat && $stat[9] > $latest_refresh) {
1065
1074
  $latest_refresh = scalar time;
1066
- my $Job2 = $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
1075
+ my $Job2 = retry_op(sub {
1076
+ $arv->{'jobs'}->{'get'}->execute('uuid' => $jobspec);
1077
+ });
1067
1078
  for my $attr ('cancelled_at',
1068
1079
  'cancelled_by_user_uuid',
1069
1080
  'cancelled_by_client_uuid',
@@ -1244,7 +1255,7 @@ sub collate_output
1244
1255
 
1245
1256
  my ($child_out, $child_in);
1246
1257
  my $pid = open2($child_out, $child_in, 'arv-put', '--raw',
1247
- '--retries', put_retry_count());
1258
+ '--retries', retry_count());
1248
1259
  my $joboutput;
1249
1260
  for (@jobstep)
1250
1261
  {
@@ -1574,7 +1585,10 @@ sub find_docker_image {
1574
1585
  # If not, return undef for both values.
1575
1586
  my $locator = shift;
1576
1587
  my ($streamname, $filename);
1577
- if (my $image = $arv->{collections}->{get}->execute(uuid => $locator)) {
1588
+ my $image = retry_op(sub {
1589
+ $arv->{collections}->{get}->execute(uuid => $locator);
1590
+ });
1591
+ if ($image) {
1578
1592
  foreach my $line (split(/\n/, $image->{manifest_text})) {
1579
1593
  my @tokens = split(/\s+/, $line);
1580
1594
  next if (!@tokens);
@@ -1595,20 +1609,52 @@ sub find_docker_image {
1595
1609
  }
1596
1610
  }
1597
1611
 
1598
- sub put_retry_count {
1599
- # Calculate a --retries argument for arv-put that will have it try
1600
- # approximately as long as this Job has been running.
1601
- my $stoptime = shift || time;
1602
- my $starttime = $jobstep[0]->{starttime};
1603
- my $timediff = defined($starttime) ? ($stoptime - $starttime) : 1;
1604
- my $retries = 0;
1605
- while ($timediff >= 2) {
1606
- $retries++;
1607
- $timediff /= 2;
1612
+ sub retry_count {
1613
+ # Calculate the number of times an operation should be retried,
1614
+ # assuming exponential backoff, and that we're willing to retry as
1615
+ # long as tasks have been running. Enforce a minimum of 3 retries.
1616
+ my ($starttime, $endtime, $timediff, $retries);
1617
+ if (@jobstep) {
1618
+ $starttime = $jobstep[0]->{starttime};
1619
+ $endtime = $jobstep[-1]->{finishtime};
1620
+ }
1621
+ if (!defined($starttime)) {
1622
+ $timediff = 0;
1623
+ } elsif (!defined($endtime)) {
1624
+ $timediff = time - $starttime;
1625
+ } else {
1626
+ $timediff = ($endtime - $starttime) - (time - $endtime);
1627
+ }
1628
+ if ($timediff > 0) {
1629
+ $retries = int(log($timediff) / log(2));
1630
+ } else {
1631
+ $retries = 1; # Use the minimum.
1608
1632
  }
1609
1633
  return ($retries > 3) ? $retries : 3;
1610
1634
  }
1611
1635
 
1636
+ sub retry_op {
1637
+ # Given a function reference, call it with the remaining arguments. If
1638
+ # it dies, retry it with exponential backoff until it succeeds, or until
1639
+ # the current retry_count is exhausted.
1640
+ my $operation = shift;
1641
+ my $retries = retry_count();
1642
+ foreach my $try_count (0..$retries) {
1643
+ my $next_try = time + (2 ** $try_count);
1644
+ my $result = eval { $operation->(@_); };
1645
+ if (!$@) {
1646
+ return $result;
1647
+ } elsif ($try_count < $retries) {
1648
+ my $sleep_time = $next_try - time;
1649
+ sleep($sleep_time) if ($sleep_time > 0);
1650
+ }
1651
+ }
1652
+ # Ensure the error message ends in a newline, so Perl doesn't add
1653
+ # retry_op's line number to it.
1654
+ chomp($@);
1655
+ die($@ . "\n");
1656
+ }
1657
+
1612
1658
  sub exit_status_s {
1613
1659
  # Given a $?, return a human-readable exit code string like "0" or
1614
1660
  # "1" or "0 with signal 1" or "1 with signal 11".
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arvados-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.20141006212502
4
+ version: 0.1.20141007134429
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arvados Authors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-06 00:00:00.000000000 Z
11
+ date: 2014-10-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: arvados
@@ -178,7 +178,7 @@ dependencies:
178
178
  - - "<"
179
179
  - !ruby/object:Gem::Version
180
180
  version: 1.0.0
181
- description: Arvados command line tools, git commit 38cc5c0a51657c6b60f3d3f32c566845988dfb6b
181
+ description: Arvados command line tools, git commit 344c6dcdbae76310879c85a736e4e6cce05d5645
182
182
  email: gem-dev@curoverse.com
183
183
  executables:
184
184
  - arv