arvados-cli 0.1.20150612180532 → 0.1.20150622205518
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/crunch-job +106 -44
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e321a7b7d04de4e2a7cd3f83e2ddf629b4c15d66
|
4
|
+
data.tar.gz: fda717493fb47d9b36ab5e37cc5c4873ccbba676
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 713b7f792692442ae2e59935994bb187feafcea88bbb7c3fdc7b0f9005049208edbba9c260b68eae723734c43dab96f5414a2a4ec0c8e9873c385ebe9706b2d5
|
7
|
+
data.tar.gz: aec453b13239436976570f1fca4bcf935dc823889780ecac1cb749a4fb3409074dccfd5c814593586bfef7483e9f505e9fccc1e5ac3a95f0402da33b92830225
|
data/bin/crunch-job
CHANGED
@@ -98,6 +98,7 @@ use File::Path qw( make_path remove_tree );
|
|
98
98
|
|
99
99
|
use constant TASK_TEMPFAIL => 111;
|
100
100
|
use constant EX_TEMPFAIL => 75;
|
101
|
+
use constant EX_RETRY_UNLOCKED => 93;
|
101
102
|
|
102
103
|
$ENV{"TMPDIR"} ||= "/tmp";
|
103
104
|
unless (defined $ENV{"CRUNCH_TMP"}) {
|
@@ -292,9 +293,16 @@ foreach (@sinfo)
|
|
292
293
|
{
|
293
294
|
Log (undef, "node $nodename - $ncpus slots");
|
294
295
|
my $node = { name => $nodename,
|
295
|
-
|
296
|
-
|
297
|
-
|
296
|
+
ncpus => $ncpus,
|
297
|
+
# The number of consecutive times a task has been dispatched
|
298
|
+
# to this node and failed.
|
299
|
+
losing_streak => 0,
|
300
|
+
# The number of consecutive times that SLURM has reported
|
301
|
+
# a node failure since the last successful task.
|
302
|
+
fail_count => 0,
|
303
|
+
# Don't dispatch work to this node until this time
|
304
|
+
# (in seconds since the epoch) has passed.
|
305
|
+
hold_until => 0 };
|
298
306
|
foreach my $cpu (1..$ncpus)
|
299
307
|
{
|
300
308
|
push @slot, { node => $node,
|
@@ -721,6 +729,7 @@ ONELEVEL:
|
|
721
729
|
my $thisround_succeeded = 0;
|
722
730
|
my $thisround_failed = 0;
|
723
731
|
my $thisround_failed_multiple = 0;
|
732
|
+
my $working_slot_count = scalar(@slot);
|
724
733
|
|
725
734
|
@jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
|
726
735
|
or $a <=> $b } @jobstep_todo;
|
@@ -950,6 +959,7 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
|
|
950
959
|
$Jobstep->{slotindex} = $childslot;
|
951
960
|
delete $Jobstep->{stderr};
|
952
961
|
delete $Jobstep->{finishtime};
|
962
|
+
delete $Jobstep->{tempfail};
|
953
963
|
|
954
964
|
$Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
|
955
965
|
$Jobstep->{'arvados_task'}->save;
|
@@ -986,6 +996,8 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
|
|
986
996
|
{
|
987
997
|
update_progress_stats();
|
988
998
|
}
|
999
|
+
$working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
|
1000
|
+
$_->{node}->{hold_count} < 4 } @slot);
|
989
1001
|
if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
|
990
1002
|
($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
|
991
1003
|
{
|
@@ -1009,10 +1021,8 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
|
|
1009
1021
|
}
|
1010
1022
|
|
1011
1023
|
# give up if no nodes are succeeding
|
1012
|
-
if (
|
1013
|
-
|
1014
|
-
my $message = "Every node has failed -- giving up on this round";
|
1015
|
-
Log (undef, $message);
|
1024
|
+
if ($working_slot_count < 1) {
|
1025
|
+
Log(undef, "Every node has failed -- giving up");
|
1016
1026
|
last THISROUND;
|
1017
1027
|
}
|
1018
1028
|
}
|
@@ -1048,18 +1058,18 @@ freeze_if_want_freeze();
|
|
1048
1058
|
|
1049
1059
|
if (!defined $main::success)
|
1050
1060
|
{
|
1051
|
-
if (
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1061
|
+
if (!@jobstep_todo) {
|
1062
|
+
$main::success = 1;
|
1063
|
+
} elsif ($working_slot_count < 1) {
|
1064
|
+
save_output_collection();
|
1065
|
+
save_meta();
|
1066
|
+
exit(EX_RETRY_UNLOCKED);
|
1067
|
+
} elsif ($thisround_succeeded == 0 &&
|
1068
|
+
($thisround_failed == 0 || $thisround_failed > 4)) {
|
1055
1069
|
my $message = "stop because $thisround_failed tasks failed and none succeeded";
|
1056
1070
|
Log (undef, $message);
|
1057
1071
|
$main::success = 0;
|
1058
1072
|
}
|
1059
|
-
if (!@jobstep_todo)
|
1060
|
-
{
|
1061
|
-
$main::success = 1;
|
1062
|
-
}
|
1063
1073
|
}
|
1064
1074
|
|
1065
1075
|
goto ONELEVEL if !defined $main::success;
|
@@ -1067,16 +1077,7 @@ goto ONELEVEL if !defined $main::success;
|
|
1067
1077
|
|
1068
1078
|
release_allocation();
|
1069
1079
|
freeze();
|
1070
|
-
my $collated_output =
|
1071
|
-
|
1072
|
-
if (!$collated_output) {
|
1073
|
-
Log (undef, "Failed to write output collection");
|
1074
|
-
}
|
1075
|
-
else {
|
1076
|
-
Log(undef, "job output $collated_output");
|
1077
|
-
$Job->update_attributes('output' => $collated_output);
|
1078
|
-
}
|
1079
|
-
|
1080
|
+
my $collated_output = save_output_collection();
|
1080
1081
|
Log (undef, "finish");
|
1081
1082
|
|
1082
1083
|
save_meta();
|
@@ -1141,7 +1142,7 @@ sub reapchildren
|
|
1141
1142
|
if (!$task_success)
|
1142
1143
|
{
|
1143
1144
|
my $temporary_fail;
|
1144
|
-
$temporary_fail ||= $Jobstep->{
|
1145
|
+
$temporary_fail ||= $Jobstep->{tempfail};
|
1145
1146
|
$temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
|
1146
1147
|
|
1147
1148
|
++$thisround_failed;
|
@@ -1179,6 +1180,7 @@ sub reapchildren
|
|
1179
1180
|
++$thisround_succeeded;
|
1180
1181
|
$slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
|
1181
1182
|
$slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
|
1183
|
+
$slot[$proc{$pid}->{slot}]->{node}->{fail_count} = 0;
|
1182
1184
|
push @jobstep_done, $jobstepid;
|
1183
1185
|
Log ($jobstepid, "success in $elapsed seconds");
|
1184
1186
|
}
|
@@ -1389,10 +1391,19 @@ sub preprocess_stderr
|
|
1389
1391
|
# whoa.
|
1390
1392
|
$main::please_freeze = 1;
|
1391
1393
|
}
|
1392
|
-
elsif ($line =~ /
|
1393
|
-
$jobstep[$job]->{
|
1394
|
+
elsif ($line =~ /srun: error: Node failure on/) {
|
1395
|
+
my $job_slot_index = $jobstep[$job]->{slotindex};
|
1396
|
+
$slot[$job_slot_index]->{node}->{fail_count}++;
|
1397
|
+
$jobstep[$job]->{tempfail} = 1;
|
1398
|
+
ban_node_by_slot($job_slot_index);
|
1399
|
+
}
|
1400
|
+
elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
|
1401
|
+
$jobstep[$job]->{tempfail} = 1;
|
1394
1402
|
ban_node_by_slot($jobstep[$job]->{slotindex});
|
1395
1403
|
}
|
1404
|
+
elsif ($line =~ /arvados\.errors\.Keep/) {
|
1405
|
+
$jobstep[$job]->{tempfail} = 1;
|
1406
|
+
}
|
1396
1407
|
}
|
1397
1408
|
}
|
1398
1409
|
|
@@ -1511,6 +1522,20 @@ print (arvados.api("v1").collections().
|
|
1511
1522
|
return $joboutput;
|
1512
1523
|
}
|
1513
1524
|
|
1525
|
+
# Calls create_output_collection, logs the result, and returns it.
|
1526
|
+
# If that was successful, save that as the output in the job record.
|
1527
|
+
sub save_output_collection {
|
1528
|
+
my $collated_output = create_output_collection();
|
1529
|
+
|
1530
|
+
if (!$collated_output) {
|
1531
|
+
Log(undef, "Failed to write output collection");
|
1532
|
+
}
|
1533
|
+
else {
|
1534
|
+
Log(undef, "job output $collated_output");
|
1535
|
+
$Job->update_attributes('output' => $collated_output);
|
1536
|
+
}
|
1537
|
+
return $collated_output;
|
1538
|
+
}
|
1514
1539
|
|
1515
1540
|
sub killem
|
1516
1541
|
{
|
@@ -1556,6 +1581,8 @@ sub fhbits
|
|
1556
1581
|
# Send log output to Keep via arv-put.
|
1557
1582
|
#
|
1558
1583
|
# $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
|
1584
|
+
# $log_pipe_out_buf is a string containing all output read from arv-put so far.
|
1585
|
+
# $log_pipe_out_select is an IO::Select object around $log_pipe_out.
|
1559
1586
|
# $log_pipe_pid is the pid of the arv-put subprocess.
|
1560
1587
|
#
|
1561
1588
|
# The only functions that should access these variables directly are:
|
@@ -1564,6 +1591,13 @@ sub fhbits
|
|
1564
1591
|
# Starts an arv-put pipe, reading data on stdin and writing it to
|
1565
1592
|
# a $logfilename file in an output collection.
|
1566
1593
|
#
|
1594
|
+
# log_writer_read_output([$timeout])
|
1595
|
+
# Read output from $log_pipe_out and append it to $log_pipe_out_buf.
|
1596
|
+
# Passes $timeout to the select() call, with a default of 0.01.
|
1597
|
+
# Returns the result of the last read() call on $log_pipe_out, or
|
1598
|
+
# -1 if read() wasn't called because select() timed out.
|
1599
|
+
# Only other log_writer_* functions should need to call this.
|
1600
|
+
#
|
1567
1601
|
# log_writer_send($txt)
|
1568
1602
|
# Writes $txt to the output log collection.
|
1569
1603
|
#
|
@@ -1574,25 +1608,40 @@ sub fhbits
|
|
1574
1608
|
# Returns a true value if there is currently a live arv-put
|
1575
1609
|
# process, false otherwise.
|
1576
1610
|
#
|
1577
|
-
my ($log_pipe_in, $log_pipe_out, $
|
1611
|
+
my ($log_pipe_in, $log_pipe_out, $log_pipe_out_buf, $log_pipe_out_select,
|
1612
|
+
$log_pipe_pid);
|
1578
1613
|
|
1579
1614
|
sub log_writer_start($)
|
1580
1615
|
{
|
1581
1616
|
my $logfilename = shift;
|
1582
1617
|
$log_pipe_pid = open2($log_pipe_out, $log_pipe_in,
|
1583
1618
|
'arv-put',
|
1584
|
-
'--
|
1585
|
-
'--project-uuid', $Job->{owner_uuid},
|
1619
|
+
'--stream',
|
1586
1620
|
'--retries', '3',
|
1587
|
-
'--name', $logfilename,
|
1588
1621
|
'--filename', $logfilename,
|
1589
1622
|
'-');
|
1623
|
+
$log_pipe_out_buf = "";
|
1624
|
+
$log_pipe_out_select = IO::Select->new($log_pipe_out);
|
1625
|
+
}
|
1626
|
+
|
1627
|
+
sub log_writer_read_output {
|
1628
|
+
my $timeout = shift || 0.01;
|
1629
|
+
my $read = -1;
|
1630
|
+
while ($read && $log_pipe_out_select->can_read($timeout)) {
|
1631
|
+
$read = read($log_pipe_out, $log_pipe_out_buf, 65536,
|
1632
|
+
length($log_pipe_out_buf));
|
1633
|
+
}
|
1634
|
+
if (!defined($read)) {
|
1635
|
+
Log(undef, "error reading log manifest from arv-put: $!");
|
1636
|
+
}
|
1637
|
+
return $read;
|
1590
1638
|
}
|
1591
1639
|
|
1592
1640
|
sub log_writer_send($)
|
1593
1641
|
{
|
1594
1642
|
my $txt = shift;
|
1595
1643
|
print $log_pipe_in $txt;
|
1644
|
+
log_writer_read_output();
|
1596
1645
|
}
|
1597
1646
|
|
1598
1647
|
sub log_writer_finish()
|
@@ -1600,22 +1649,24 @@ sub log_writer_finish()
|
|
1600
1649
|
return unless $log_pipe_pid;
|
1601
1650
|
|
1602
1651
|
close($log_pipe_in);
|
1603
|
-
my $arv_put_output;
|
1604
1652
|
|
1605
|
-
my $
|
1606
|
-
if ($
|
1607
|
-
sysread($log_pipe_out, $arv_put_output, 1024);
|
1608
|
-
chomp($arv_put_output);
|
1609
|
-
} else {
|
1653
|
+
my $read_result = log_writer_read_output(120);
|
1654
|
+
if ($read_result == -1) {
|
1610
1655
|
Log (undef, "timed out reading from 'arv-put'");
|
1656
|
+
} elsif ($read_result != 0) {
|
1657
|
+
Log(undef, "failed to read arv-put log manifest to EOF");
|
1611
1658
|
}
|
1612
1659
|
|
1613
1660
|
waitpid($log_pipe_pid, 0);
|
1614
|
-
$log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
|
1615
1661
|
if ($?) {
|
1616
|
-
Log("log_writer_finish: arv-put exited ".exit_status_s($?))
|
1662
|
+
Log(undef, "log_writer_finish: arv-put exited " . exit_status_s($?))
|
1617
1663
|
}
|
1618
1664
|
|
1665
|
+
close($log_pipe_out);
|
1666
|
+
my $arv_put_output = $log_pipe_out_buf;
|
1667
|
+
$log_pipe_pid = $log_pipe_in = $log_pipe_out = $log_pipe_out_buf =
|
1668
|
+
$log_pipe_out_select = undef;
|
1669
|
+
|
1619
1670
|
return $arv_put_output;
|
1620
1671
|
}
|
1621
1672
|
|
@@ -1679,10 +1730,21 @@ sub save_meta
|
|
1679
1730
|
return if $justcheckpoint; # checkpointing is not relevant post-Warehouse.pm
|
1680
1731
|
return unless log_writer_is_active();
|
1681
1732
|
|
1682
|
-
my $
|
1683
|
-
|
1684
|
-
|
1685
|
-
|
1733
|
+
my $log_manifest = "";
|
1734
|
+
if ($Job->{log}) {
|
1735
|
+
my $prev_log_coll = api_call("collections/get", uuid => $Job->{log});
|
1736
|
+
$log_manifest .= $prev_log_coll->{manifest_text};
|
1737
|
+
}
|
1738
|
+
$log_manifest .= log_writer_finish();
|
1739
|
+
|
1740
|
+
my $log_coll = api_call(
|
1741
|
+
"collections/create", ensure_unique_name => 1, collection => {
|
1742
|
+
manifest_text => $log_manifest,
|
1743
|
+
owner_uuid => $Job->{owner_uuid},
|
1744
|
+
name => sprintf("Log from %s job %s", $Job->{script}, $Job->{uuid}),
|
1745
|
+
});
|
1746
|
+
Log(undef, "log collection is " . $log_coll->{portable_data_hash});
|
1747
|
+
$Job->update_attributes('log' => $log_coll->{portable_data_hash});
|
1686
1748
|
}
|
1687
1749
|
|
1688
1750
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arvados-cli
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.20150622205518
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Arvados Authors
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-06-
|
11
|
+
date: 2015-06-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: arvados
|
@@ -178,7 +178,7 @@ dependencies:
|
|
178
178
|
- - "<"
|
179
179
|
- !ruby/object:Gem::Version
|
180
180
|
version: 1.0.0
|
181
|
-
description: Arvados command line tools, git commit
|
181
|
+
description: Arvados command line tools, git commit ad7679cfe57733940f8461097ee01bfd97997ce6
|
182
182
|
email: gem-dev@curoverse.com
|
183
183
|
executables:
|
184
184
|
- arv
|