arvados-cli 0.1.20150612180532 → 0.1.20150622205518

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3):
  1. checksums.yaml +4 -4
  2. data/bin/crunch-job +106 -44
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e7b56a00e393a7f5d9fa259e633b661b2b720a8a
4
- data.tar.gz: b5ddabf741422dc9da49e4253e0d5e3d050d133d
3
+ metadata.gz: e321a7b7d04de4e2a7cd3f83e2ddf629b4c15d66
4
+ data.tar.gz: fda717493fb47d9b36ab5e37cc5c4873ccbba676
5
5
  SHA512:
6
- metadata.gz: 002519e2ea167ae32f7c5ab91c0bdc8bf17e5744c4131526d5ad9990c2749267afcbaff30a23dc0b0b74e9686fe78736054424613a9f464717cb86b40caf8f6b
7
- data.tar.gz: ceae829320fcef6bd92f1ea4a7a2d4945897c7284d19cd358ff583d512b086e20285a43406abb34b569a6b8e106eb5d7568acfe3b0abfecc0b83f3e3a6fb1c4e
6
+ metadata.gz: 713b7f792692442ae2e59935994bb187feafcea88bbb7c3fdc7b0f9005049208edbba9c260b68eae723734c43dab96f5414a2a4ec0c8e9873c385ebe9706b2d5
7
+ data.tar.gz: aec453b13239436976570f1fca4bcf935dc823889780ecac1cb749a4fb3409074dccfd5c814593586bfef7483e9f505e9fccc1e5ac3a95f0402da33b92830225
data/bin/crunch-job CHANGED
@@ -98,6 +98,7 @@ use File::Path qw( make_path remove_tree );
98
98
 
99
99
  use constant TASK_TEMPFAIL => 111;
100
100
  use constant EX_TEMPFAIL => 75;
101
+ use constant EX_RETRY_UNLOCKED => 93;
101
102
 
102
103
  $ENV{"TMPDIR"} ||= "/tmp";
103
104
  unless (defined $ENV{"CRUNCH_TMP"}) {
@@ -292,9 +293,16 @@ foreach (@sinfo)
292
293
  {
293
294
  Log (undef, "node $nodename - $ncpus slots");
294
295
  my $node = { name => $nodename,
295
- ncpus => $ncpus,
296
- losing_streak => 0,
297
- hold_until => 0 };
296
+ ncpus => $ncpus,
297
+ # The number of consecutive times a task has been dispatched
298
+ # to this node and failed.
299
+ losing_streak => 0,
300
+ # The number of consecutive times that SLURM has reported
301
+ # a node failure since the last successful task.
302
+ fail_count => 0,
303
+ # Don't dispatch work to this node until this time
304
+ # (in seconds since the epoch) has passed.
305
+ hold_until => 0 };
298
306
  foreach my $cpu (1..$ncpus)
299
307
  {
300
308
  push @slot, { node => $node,
@@ -721,6 +729,7 @@ ONELEVEL:
721
729
  my $thisround_succeeded = 0;
722
730
  my $thisround_failed = 0;
723
731
  my $thisround_failed_multiple = 0;
732
+ my $working_slot_count = scalar(@slot);
724
733
 
725
734
  @jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
726
735
  or $a <=> $b } @jobstep_todo;
@@ -950,6 +959,7 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
950
959
  $Jobstep->{slotindex} = $childslot;
951
960
  delete $Jobstep->{stderr};
952
961
  delete $Jobstep->{finishtime};
962
+ delete $Jobstep->{tempfail};
953
963
 
954
964
  $Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
955
965
  $Jobstep->{'arvados_task'}->save;
@@ -986,6 +996,8 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
986
996
  {
987
997
  update_progress_stats();
988
998
  }
999
+ $working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
1000
+ $_->{node}->{hold_count} < 4 } @slot);
989
1001
  if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
990
1002
  ($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
991
1003
  {
@@ -1009,10 +1021,8 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
1009
1021
  }
1010
1022
 
1011
1023
  # give up if no nodes are succeeding
1012
- if (!grep { $_->{node}->{losing_streak} == 0 &&
1013
- $_->{node}->{hold_count} < 4 } @slot) {
1014
- my $message = "Every node has failed -- giving up on this round";
1015
- Log (undef, $message);
1024
+ if ($working_slot_count < 1) {
1025
+ Log(undef, "Every node has failed -- giving up");
1016
1026
  last THISROUND;
1017
1027
  }
1018
1028
  }
@@ -1048,18 +1058,18 @@ freeze_if_want_freeze();
1048
1058
 
1049
1059
  if (!defined $main::success)
1050
1060
  {
1051
- if (@jobstep_todo &&
1052
- $thisround_succeeded == 0 &&
1053
- ($thisround_failed == 0 || $thisround_failed > 4))
1054
- {
1061
+ if (!@jobstep_todo) {
1062
+ $main::success = 1;
1063
+ } elsif ($working_slot_count < 1) {
1064
+ save_output_collection();
1065
+ save_meta();
1066
+ exit(EX_RETRY_UNLOCKED);
1067
+ } elsif ($thisround_succeeded == 0 &&
1068
+ ($thisround_failed == 0 || $thisround_failed > 4)) {
1055
1069
  my $message = "stop because $thisround_failed tasks failed and none succeeded";
1056
1070
  Log (undef, $message);
1057
1071
  $main::success = 0;
1058
1072
  }
1059
- if (!@jobstep_todo)
1060
- {
1061
- $main::success = 1;
1062
- }
1063
1073
  }
1064
1074
 
1065
1075
  goto ONELEVEL if !defined $main::success;
@@ -1067,16 +1077,7 @@ goto ONELEVEL if !defined $main::success;
1067
1077
 
1068
1078
  release_allocation();
1069
1079
  freeze();
1070
- my $collated_output = &create_output_collection();
1071
-
1072
- if (!$collated_output) {
1073
- Log (undef, "Failed to write output collection");
1074
- }
1075
- else {
1076
- Log(undef, "job output $collated_output");
1077
- $Job->update_attributes('output' => $collated_output);
1078
- }
1079
-
1080
+ my $collated_output = save_output_collection();
1080
1081
  Log (undef, "finish");
1081
1082
 
1082
1083
  save_meta();
@@ -1141,7 +1142,7 @@ sub reapchildren
1141
1142
  if (!$task_success)
1142
1143
  {
1143
1144
  my $temporary_fail;
1144
- $temporary_fail ||= $Jobstep->{node_fail};
1145
+ $temporary_fail ||= $Jobstep->{tempfail};
1145
1146
  $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
1146
1147
 
1147
1148
  ++$thisround_failed;
@@ -1179,6 +1180,7 @@ sub reapchildren
1179
1180
  ++$thisround_succeeded;
1180
1181
  $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
1181
1182
  $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
1183
+ $slot[$proc{$pid}->{slot}]->{node}->{fail_count} = 0;
1182
1184
  push @jobstep_done, $jobstepid;
1183
1185
  Log ($jobstepid, "success in $elapsed seconds");
1184
1186
  }
@@ -1389,10 +1391,19 @@ sub preprocess_stderr
1389
1391
  # whoa.
1390
1392
  $main::please_freeze = 1;
1391
1393
  }
1392
- elsif ($line =~ /(srun: error: (Node failure on|Unable to create job step|.*: Communication connection failure))|arvados.errors.Keep/) {
1393
- $jobstep[$job]->{node_fail} = 1;
1394
+ elsif ($line =~ /srun: error: Node failure on/) {
1395
+ my $job_slot_index = $jobstep[$job]->{slotindex};
1396
+ $slot[$job_slot_index]->{node}->{fail_count}++;
1397
+ $jobstep[$job]->{tempfail} = 1;
1398
+ ban_node_by_slot($job_slot_index);
1399
+ }
1400
+ elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
1401
+ $jobstep[$job]->{tempfail} = 1;
1394
1402
  ban_node_by_slot($jobstep[$job]->{slotindex});
1395
1403
  }
1404
+ elsif ($line =~ /arvados\.errors\.Keep/) {
1405
+ $jobstep[$job]->{tempfail} = 1;
1406
+ }
1396
1407
  }
1397
1408
  }
1398
1409
 
@@ -1511,6 +1522,20 @@ print (arvados.api("v1").collections().
1511
1522
  return $joboutput;
1512
1523
  }
1513
1524
 
1525
+ # Calls create_output_collection, logs the result, and returns it.
1526
+ # If that was successful, save that as the output in the job record.
1527
+ sub save_output_collection {
1528
+ my $collated_output = create_output_collection();
1529
+
1530
+ if (!$collated_output) {
1531
+ Log(undef, "Failed to write output collection");
1532
+ }
1533
+ else {
1534
+ Log(undef, "job output $collated_output");
1535
+ $Job->update_attributes('output' => $collated_output);
1536
+ }
1537
+ return $collated_output;
1538
+ }
1514
1539
 
1515
1540
  sub killem
1516
1541
  {
@@ -1556,6 +1581,8 @@ sub fhbits
1556
1581
  # Send log output to Keep via arv-put.
1557
1582
  #
1558
1583
  # $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
1584
+ # $log_pipe_out_buf is a string containing all output read from arv-put so far.
1585
+ # $log_pipe_out_select is an IO::Select object around $log_pipe_out.
1559
1586
  # $log_pipe_pid is the pid of the arv-put subprocess.
1560
1587
  #
1561
1588
  # The only functions that should access these variables directly are:
@@ -1564,6 +1591,13 @@ sub fhbits
1564
1591
  # Starts an arv-put pipe, reading data on stdin and writing it to
1565
1592
  # a $logfilename file in an output collection.
1566
1593
  #
1594
+ # log_writer_read_output([$timeout])
1595
+ # Read output from $log_pipe_out and append it to $log_pipe_out_buf.
1596
+ # Passes $timeout to the select() call, with a default of 0.01.
1597
+ # Returns the result of the last read() call on $log_pipe_out, or
1598
+ # -1 if read() wasn't called because select() timed out.
1599
+ # Only other log_writer_* functions should need to call this.
1600
+ #
1567
1601
  # log_writer_send($txt)
1568
1602
  # Writes $txt to the output log collection.
1569
1603
  #
@@ -1574,25 +1608,40 @@ sub fhbits
1574
1608
  # Returns a true value if there is currently a live arv-put
1575
1609
  # process, false otherwise.
1576
1610
  #
1577
- my ($log_pipe_in, $log_pipe_out, $log_pipe_pid);
1611
+ my ($log_pipe_in, $log_pipe_out, $log_pipe_out_buf, $log_pipe_out_select,
1612
+ $log_pipe_pid);
1578
1613
 
1579
1614
  sub log_writer_start($)
1580
1615
  {
1581
1616
  my $logfilename = shift;
1582
1617
  $log_pipe_pid = open2($log_pipe_out, $log_pipe_in,
1583
1618
  'arv-put',
1584
- '--portable-data-hash',
1585
- '--project-uuid', $Job->{owner_uuid},
1619
+ '--stream',
1586
1620
  '--retries', '3',
1587
- '--name', $logfilename,
1588
1621
  '--filename', $logfilename,
1589
1622
  '-');
1623
+ $log_pipe_out_buf = "";
1624
+ $log_pipe_out_select = IO::Select->new($log_pipe_out);
1625
+ }
1626
+
1627
+ sub log_writer_read_output {
1628
+ my $timeout = shift || 0.01;
1629
+ my $read = -1;
1630
+ while ($read && $log_pipe_out_select->can_read($timeout)) {
1631
+ $read = read($log_pipe_out, $log_pipe_out_buf, 65536,
1632
+ length($log_pipe_out_buf));
1633
+ }
1634
+ if (!defined($read)) {
1635
+ Log(undef, "error reading log manifest from arv-put: $!");
1636
+ }
1637
+ return $read;
1590
1638
  }
1591
1639
 
1592
1640
  sub log_writer_send($)
1593
1641
  {
1594
1642
  my $txt = shift;
1595
1643
  print $log_pipe_in $txt;
1644
+ log_writer_read_output();
1596
1645
  }
1597
1646
 
1598
1647
  sub log_writer_finish()
@@ -1600,22 +1649,24 @@ sub log_writer_finish()
1600
1649
  return unless $log_pipe_pid;
1601
1650
 
1602
1651
  close($log_pipe_in);
1603
- my $arv_put_output;
1604
1652
 
1605
- my $s = IO::Select->new($log_pipe_out);
1606
- if ($s->can_read(120)) {
1607
- sysread($log_pipe_out, $arv_put_output, 1024);
1608
- chomp($arv_put_output);
1609
- } else {
1653
+ my $read_result = log_writer_read_output(120);
1654
+ if ($read_result == -1) {
1610
1655
  Log (undef, "timed out reading from 'arv-put'");
1656
+ } elsif ($read_result != 0) {
1657
+ Log(undef, "failed to read arv-put log manifest to EOF");
1611
1658
  }
1612
1659
 
1613
1660
  waitpid($log_pipe_pid, 0);
1614
- $log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
1615
1661
  if ($?) {
1616
- Log("log_writer_finish: arv-put exited ".exit_status_s($?))
1662
+ Log(undef, "log_writer_finish: arv-put exited " . exit_status_s($?))
1617
1663
  }
1618
1664
 
1665
+ close($log_pipe_out);
1666
+ my $arv_put_output = $log_pipe_out_buf;
1667
+ $log_pipe_pid = $log_pipe_in = $log_pipe_out = $log_pipe_out_buf =
1668
+ $log_pipe_out_select = undef;
1669
+
1619
1670
  return $arv_put_output;
1620
1671
  }
1621
1672
 
@@ -1679,10 +1730,21 @@ sub save_meta
1679
1730
  return if $justcheckpoint; # checkpointing is not relevant post-Warehouse.pm
1680
1731
  return unless log_writer_is_active();
1681
1732
 
1682
- my $loglocator = log_writer_finish();
1683
- Log (undef, "log manifest is $loglocator");
1684
- $Job->{'log'} = $loglocator;
1685
- $Job->update_attributes('log', $loglocator);
1733
+ my $log_manifest = "";
1734
+ if ($Job->{log}) {
1735
+ my $prev_log_coll = api_call("collections/get", uuid => $Job->{log});
1736
+ $log_manifest .= $prev_log_coll->{manifest_text};
1737
+ }
1738
+ $log_manifest .= log_writer_finish();
1739
+
1740
+ my $log_coll = api_call(
1741
+ "collections/create", ensure_unique_name => 1, collection => {
1742
+ manifest_text => $log_manifest,
1743
+ owner_uuid => $Job->{owner_uuid},
1744
+ name => sprintf("Log from %s job %s", $Job->{script}, $Job->{uuid}),
1745
+ });
1746
+ Log(undef, "log collection is " . $log_coll->{portable_data_hash});
1747
+ $Job->update_attributes('log' => $log_coll->{portable_data_hash});
1686
1748
  }
1687
1749
 
1688
1750
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arvados-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.20150612180532
4
+ version: 0.1.20150622205518
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arvados Authors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-12 00:00:00.000000000 Z
11
+ date: 2015-06-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: arvados
@@ -178,7 +178,7 @@ dependencies:
178
178
  - - "<"
179
179
  - !ruby/object:Gem::Version
180
180
  version: 1.0.0
181
- description: Arvados command line tools, git commit f0a92e384da79e0a17efb42de17031f45f006e44
181
+ description: Arvados command line tools, git commit ad7679cfe57733940f8461097ee01bfd97997ce6
182
182
  email: gem-dev@curoverse.com
183
183
  executables:
184
184
  - arv