arvados-cli 0.1.20150612180532 → 0.1.20150622205518

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/bin/crunch-job +106 -44
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e7b56a00e393a7f5d9fa259e633b661b2b720a8a
-  data.tar.gz: b5ddabf741422dc9da49e4253e0d5e3d050d133d
+  metadata.gz: e321a7b7d04de4e2a7cd3f83e2ddf629b4c15d66
+  data.tar.gz: fda717493fb47d9b36ab5e37cc5c4873ccbba676
 SHA512:
-  metadata.gz: 002519e2ea167ae32f7c5ab91c0bdc8bf17e5744c4131526d5ad9990c2749267afcbaff30a23dc0b0b74e9686fe78736054424613a9f464717cb86b40caf8f6b
-  data.tar.gz: ceae829320fcef6bd92f1ea4a7a2d4945897c7284d19cd358ff583d512b086e20285a43406abb34b569a6b8e106eb5d7568acfe3b0abfecc0b83f3e3a6fb1c4e
+  metadata.gz: 713b7f792692442ae2e59935994bb187feafcea88bbb7c3fdc7b0f9005049208edbba9c260b68eae723734c43dab96f5414a2a4ec0c8e9873c385ebe9706b2d5
+  data.tar.gz: aec453b13239436976570f1fca4bcf935dc823889780ecac1cb749a4fb3409074dccfd5c814593586bfef7483e9f505e9fccc1e5ac3a95f0402da33b92830225
data/bin/crunch-job CHANGED
@@ -98,6 +98,7 @@ use File::Path qw( make_path remove_tree );
 
 use constant TASK_TEMPFAIL => 111;
 use constant EX_TEMPFAIL => 75;
+use constant EX_RETRY_UNLOCKED => 93;
 
 $ENV{"TMPDIR"} ||= "/tmp";
 unless (defined $ENV{"CRUNCH_TMP"}) {
@@ -292,9 +293,16 @@ foreach (@sinfo)
 {
   Log (undef, "node $nodename - $ncpus slots");
   my $node = { name => $nodename,
-               ncpus => $ncpus,
-               losing_streak => 0,
-               hold_until => 0 };
+               ncpus => $ncpus,
+               # The number of consecutive times a task has been dispatched
+               # to this node and failed.
+               losing_streak => 0,
+               # The number of consecutive times that SLURM has reported
+               # a node failure since the last successful task.
+               fail_count => 0,
+               # Don't dispatch work to this node until this time
+               # (in seconds since the epoch) has passed.
+               hold_until => 0 };
   foreach my $cpu (1..$ncpus)
   {
     push @slot, { node => $node,
@@ -721,6 +729,7 @@ ONELEVEL:
 my $thisround_succeeded = 0;
 my $thisround_failed = 0;
 my $thisround_failed_multiple = 0;
+my $working_slot_count = scalar(@slot);
 
 @jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
                        or $a <=> $b } @jobstep_todo;
@@ -950,6 +959,7 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
   $Jobstep->{slotindex} = $childslot;
   delete $Jobstep->{stderr};
   delete $Jobstep->{finishtime};
+  delete $Jobstep->{tempfail};
 
   $Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
   $Jobstep->{'arvados_task'}->save;
@@ -986,6 +996,8 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
   {
     update_progress_stats();
   }
+  $working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
+                                      $_->{node}->{hold_count} < 4 } @slot);
   if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
       ($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
   {
@@ -1009,10 +1021,8 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
   }
 
   # give up if no nodes are succeeding
-  if (!grep { $_->{node}->{losing_streak} == 0 &&
-              $_->{node}->{hold_count} < 4 } @slot) {
-    my $message = "Every node has failed -- giving up on this round";
-    Log (undef, $message);
+  if ($working_slot_count < 1) {
+    Log(undef, "Every node has failed -- giving up");
     last THISROUND;
   }
 }
@@ -1048,18 +1058,18 @@ freeze_if_want_freeze();
 
 if (!defined $main::success)
 {
-  if (@jobstep_todo &&
-      $thisround_succeeded == 0 &&
-      ($thisround_failed == 0 || $thisround_failed > 4))
-  {
+  if (!@jobstep_todo) {
+    $main::success = 1;
+  } elsif ($working_slot_count < 1) {
+    save_output_collection();
+    save_meta();
+    exit(EX_RETRY_UNLOCKED);
+  } elsif ($thisround_succeeded == 0 &&
+           ($thisround_failed == 0 || $thisround_failed > 4)) {
     my $message = "stop because $thisround_failed tasks failed and none succeeded";
     Log (undef, $message);
     $main::success = 0;
   }
-  if (!@jobstep_todo)
-  {
-    $main::success = 1;
-  }
 }
 
 goto ONELEVEL if !defined $main::success;
@@ -1067,16 +1077,7 @@ goto ONELEVEL if !defined $main::success;
 
 release_allocation();
 freeze();
-my $collated_output = &create_output_collection();
-
-if (!$collated_output) {
-  Log (undef, "Failed to write output collection");
-}
-else {
-  Log(undef, "job output $collated_output");
-  $Job->update_attributes('output' => $collated_output);
-}
-
+my $collated_output = save_output_collection();
 Log (undef, "finish");
 
 save_meta();
@@ -1141,7 +1142,7 @@ sub reapchildren
   if (!$task_success)
   {
     my $temporary_fail;
-    $temporary_fail ||= $Jobstep->{node_fail};
+    $temporary_fail ||= $Jobstep->{tempfail};
     $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
 
     ++$thisround_failed;
@@ -1179,6 +1180,7 @@ sub reapchildren
     ++$thisround_succeeded;
     $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
     $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
+    $slot[$proc{$pid}->{slot}]->{node}->{fail_count} = 0;
     push @jobstep_done, $jobstepid;
     Log ($jobstepid, "success in $elapsed seconds");
   }
@@ -1389,10 +1391,19 @@ sub preprocess_stderr
       # whoa.
       $main::please_freeze = 1;
     }
-    elsif ($line =~ /(srun: error: (Node failure on|Unable to create job step|.*: Communication connection failure))|arvados.errors.Keep/) {
-      $jobstep[$job]->{node_fail} = 1;
+    elsif ($line =~ /srun: error: Node failure on/) {
+      my $job_slot_index = $jobstep[$job]->{slotindex};
+      $slot[$job_slot_index]->{node}->{fail_count}++;
+      $jobstep[$job]->{tempfail} = 1;
+      ban_node_by_slot($job_slot_index);
+    }
+    elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
+      $jobstep[$job]->{tempfail} = 1;
       ban_node_by_slot($jobstep[$job]->{slotindex});
     }
+    elsif ($line =~ /arvados\.errors\.Keep/) {
+      $jobstep[$job]->{tempfail} = 1;
+    }
   }
 }
 
@@ -1511,6 +1522,20 @@ print (arvados.api("v1").collections().
   return $joboutput;
 }
 
+# Calls create_output_collection, logs the result, and returns it.
+# If that was successful, save that as the output in the job record.
+sub save_output_collection {
+  my $collated_output = create_output_collection();
+
+  if (!$collated_output) {
+    Log(undef, "Failed to write output collection");
+  }
+  else {
+    Log(undef, "job output $collated_output");
+    $Job->update_attributes('output' => $collated_output);
+  }
+  return $collated_output;
+}
 
 sub killem
 {
@@ -1556,6 +1581,8 @@ sub fhbits
 # Send log output to Keep via arv-put.
 #
 # $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
+# $log_pipe_out_buf is a string containing all output read from arv-put so far.
+# $log_pipe_out_select is an IO::Select object around $log_pipe_out.
 # $log_pipe_pid is the pid of the arv-put subprocess.
 #
 # The only functions that should access these variables directly are:
@@ -1564,6 +1591,13 @@ sub fhbits
 #   Starts an arv-put pipe, reading data on stdin and writing it to
 #   a $logfilename file in an output collection.
 #
+# log_writer_read_output([$timeout])
+#   Read output from $log_pipe_out and append it to $log_pipe_out_buf.
+#   Passes $timeout to the select() call, with a default of 0.01.
+#   Returns the result of the last read() call on $log_pipe_out, or
+#   -1 if read() wasn't called because select() timed out.
+#   Only other log_writer_* functions should need to call this.
+#
 # log_writer_send($txt)
 #   Writes $txt to the output log collection.
 #
@@ -1574,25 +1608,40 @@ sub fhbits
 #   Returns a true value if there is currently a live arv-put
 #   process, false otherwise.
 #
-my ($log_pipe_in, $log_pipe_out, $log_pipe_pid);
+my ($log_pipe_in, $log_pipe_out, $log_pipe_out_buf, $log_pipe_out_select,
+    $log_pipe_pid);
 
 sub log_writer_start($)
 {
   my $logfilename = shift;
   $log_pipe_pid = open2($log_pipe_out, $log_pipe_in,
                         'arv-put',
-                        '--portable-data-hash',
-                        '--project-uuid', $Job->{owner_uuid},
+                        '--stream',
                         '--retries', '3',
-                        '--name', $logfilename,
                         '--filename', $logfilename,
                         '-');
+  $log_pipe_out_buf = "";
+  $log_pipe_out_select = IO::Select->new($log_pipe_out);
+}
+
+sub log_writer_read_output {
+  my $timeout = shift || 0.01;
+  my $read = -1;
+  while ($read && $log_pipe_out_select->can_read($timeout)) {
+    $read = read($log_pipe_out, $log_pipe_out_buf, 65536,
+                 length($log_pipe_out_buf));
+  }
+  if (!defined($read)) {
+    Log(undef, "error reading log manifest from arv-put: $!");
+  }
+  return $read;
 }
 
 sub log_writer_send($)
 {
   my $txt = shift;
   print $log_pipe_in $txt;
+  log_writer_read_output();
 }
 
 sub log_writer_finish()
@@ -1600,22 +1649,24 @@ sub log_writer_finish()
   return unless $log_pipe_pid;
 
   close($log_pipe_in);
-  my $arv_put_output;
 
-  my $s = IO::Select->new($log_pipe_out);
-  if ($s->can_read(120)) {
-    sysread($log_pipe_out, $arv_put_output, 1024);
-    chomp($arv_put_output);
-  } else {
+  my $read_result = log_writer_read_output(120);
+  if ($read_result == -1) {
     Log (undef, "timed out reading from 'arv-put'");
+  } elsif ($read_result != 0) {
+    Log(undef, "failed to read arv-put log manifest to EOF");
   }
 
   waitpid($log_pipe_pid, 0);
-  $log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
   if ($?) {
-    Log("log_writer_finish: arv-put exited ".exit_status_s($?))
+    Log(undef, "log_writer_finish: arv-put exited " . exit_status_s($?))
   }
 
+  close($log_pipe_out);
+  my $arv_put_output = $log_pipe_out_buf;
+  $log_pipe_pid = $log_pipe_in = $log_pipe_out = $log_pipe_out_buf =
+      $log_pipe_out_select = undef;
+
   return $arv_put_output;
 }
 
@@ -1679,10 +1730,21 @@ sub save_meta
   return if $justcheckpoint;  # checkpointing is not relevant post-Warehouse.pm
   return unless log_writer_is_active();
 
-  my $loglocator = log_writer_finish();
-  Log (undef, "log manifest is $loglocator");
-  $Job->{'log'} = $loglocator;
-  $Job->update_attributes('log', $loglocator);
+  my $log_manifest = "";
+  if ($Job->{log}) {
+    my $prev_log_coll = api_call("collections/get", uuid => $Job->{log});
+    $log_manifest .= $prev_log_coll->{manifest_text};
+  }
+  $log_manifest .= log_writer_finish();
+
+  my $log_coll = api_call(
+    "collections/create", ensure_unique_name => 1, collection => {
+      manifest_text => $log_manifest,
+      owner_uuid => $Job->{owner_uuid},
+      name => sprintf("Log from %s job %s", $Job->{script}, $Job->{uuid}),
+    });
+  Log(undef, "log collection is " . $log_coll->{portable_data_hash});
+  $Job->update_attributes('log' => $log_coll->{portable_data_hash});
 }
 
 
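Note on the new exit status: this version adds EX_RETRY_UNLOCKED (93), which crunch-job returns after every working slot has been lost; before exiting it saves whatever output and log it has (save_output_collection, save_meta), leaving it to the caller to decide whether to requeue the whole job. The sketch below shows one way a caller could distinguish that status from the existing EX_TEMPFAIL (75). It is a minimal illustration, not code from this gem: the wrapper, its messages, and the retry policy are assumptions; only the two constants come from the diff above.

#!/usr/bin/env perl
# Sketch only: interpreting crunch-job exit statuses after this change.
use strict;
use warnings;

use constant EX_TEMPFAIL       => 75;  # temporary failure; retry the same attempt
use constant EX_RETRY_UNLOCKED => 93;  # new in this version: all nodes failed,
                                       # partial output/log saved, retry from scratch

system('crunch-job', @ARGV);           # assumes crunch-job is on PATH
my $status = $? >> 8;

if ($status == EX_RETRY_UNLOCKED) {
    warn "crunch-job gave up after losing every node; requeueing the job\n";
    # ...requeue logic would go here (illustrative)...
} elsif ($status == EX_TEMPFAIL) {
    warn "temporary failure reported by crunch-job; retrying later\n";
}
exit $status;
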
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arvados-cli
 version: !ruby/object:Gem::Version
-  version: 0.1.20150612180532
+  version: 0.1.20150622205518
 platform: ruby
 authors:
 - Arvados Authors
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-06-12 00:00:00.000000000 Z
+date: 2015-06-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: arvados
@@ -178,7 +178,7 @@ dependencies:
     - - "<"
       - !ruby/object:Gem::Version
         version: 1.0.0
-description: Arvados command line tools, git commit f0a92e384da79e0a17efb42de17031f45f006e44
+description: Arvados command line tools, git commit ad7679cfe57733940f8461097ee01bfd97997ce6
 email: gem-dev@curoverse.com
 executables:
 - arv