arvados-cli 0.1.20150612180532 → 0.1.20150622205518
- checksums.yaml +4 -4
- data/bin/crunch-job +106 -44
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e321a7b7d04de4e2a7cd3f83e2ddf629b4c15d66
+  data.tar.gz: fda717493fb47d9b36ab5e37cc5c4873ccbba676
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 713b7f792692442ae2e59935994bb187feafcea88bbb7c3fdc7b0f9005049208edbba9c260b68eae723734c43dab96f5414a2a4ec0c8e9873c385ebe9706b2d5
+  data.tar.gz: aec453b13239436976570f1fca4bcf935dc823889780ecac1cb749a4fb3409074dccfd5c814593586bfef7483e9f505e9fccc1e5ac3a95f0402da33b92830225
data/bin/crunch-job
CHANGED
@@ -98,6 +98,7 @@ use File::Path qw( make_path remove_tree );
 
 use constant TASK_TEMPFAIL => 111;
 use constant EX_TEMPFAIL => 75;
+use constant EX_RETRY_UNLOCKED => 93;
 
 $ENV{"TMPDIR"} ||= "/tmp";
 unless (defined $ENV{"CRUNCH_TMP"}) {
@@ -292,9 +293,16 @@ foreach (@sinfo)
 {
   Log (undef, "node $nodename - $ncpus slots");
   my $node = { name => $nodename,
-
-
-
+               ncpus => $ncpus,
+               # The number of consecutive times a task has been dispatched
+               # to this node and failed.
+               losing_streak => 0,
+               # The number of consecutive times that SLURM has reported
+               # a node failure since the last successful task.
+               fail_count => 0,
+               # Don't dispatch work to this node until this time
+               # (in seconds since the epoch) has passed.
+               hold_until => 0 };
   foreach my $cpu (1..$ncpus)
   {
     push @slot, { node => $node,
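The comments added above document the new per-node bookkeeping fields. As a rough, self-contained sketch (the node data and names below are invented for illustration, not part of the gem), this is how fields like fail_count and hold_until can gate whether a node is currently dispatchable:

    #!/usr/bin/env perl
    # Illustrative only: per-node bookkeeping of the kind added here
    # (fail_count, hold_until) used to decide which nodes are usable.
    use strict;
    use warnings;

    my $now = time();
    my @nodes = (
        { name => 'compute0', ncpus => 8, losing_streak => 0, fail_count => 0, hold_until => 0 },
        { name => 'compute1', ncpus => 8, losing_streak => 3, fail_count => 2, hold_until => $now + 300 },
    );

    # A node is dispatchable if SLURM has not reported it failed and its
    # hold period (seconds since the epoch) has already passed.
    my @usable = grep { $_->{fail_count} == 0 && $_->{hold_until} <= $now } @nodes;

    printf "%d of %d nodes usable: %s\n",
        scalar(@usable), scalar(@nodes), join(', ', map { $_->{name} } @usable);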
@@ -721,6 +729,7 @@ ONELEVEL:
 my $thisround_succeeded = 0;
 my $thisround_failed = 0;
 my $thisround_failed_multiple = 0;
+my $working_slot_count = scalar(@slot);
 
 @jobstep_todo = sort { $jobstep[$a]->{level} <=> $jobstep[$b]->{level}
                     or $a <=> $b } @jobstep_todo;
@@ -950,6 +959,7 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
     $Jobstep->{slotindex} = $childslot;
     delete $Jobstep->{stderr};
     delete $Jobstep->{finishtime};
+    delete $Jobstep->{tempfail};
 
     $Jobstep->{'arvados_task'}->{started_at} = strftime "%Y-%m-%dT%H:%M:%SZ", gmtime($Jobstep->{starttime});
     $Jobstep->{'arvados_task'}->save;
@@ -986,6 +996,8 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
   {
     update_progress_stats();
   }
+  $working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
+                                      $_->{node}->{hold_count} < 4 } @slot);
   if (($thisround_failed_multiple >= 8 && $thisround_succeeded == 0) ||
       ($thisround_failed_multiple >= 16 && $thisround_failed_multiple > $thisround_succeeded))
   {
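Each @slot entry points at a shared per-node hash, so the count above is the number of slots whose underlying node still looks healthy. A tiny standalone sketch of the same grep (the sample slot/node data is invented):

    #!/usr/bin/env perl
    # Illustrative sketch of the working-slot computation: a slot counts as
    # "working" while its node has no reported failures and a low hold count.
    use strict;
    use warnings;

    my $node_ok   = { name => 'compute0', fail_count => 0, hold_count => 0 };
    my $node_down = { name => 'compute1', fail_count => 1, hold_count => 2 };

    my @slot = (
        { node => $node_ok,   cpu => 0 },
        { node => $node_ok,   cpu => 1 },
        { node => $node_down, cpu => 0 },
    );

    my $working_slot_count = scalar(grep { $_->{node}->{fail_count} == 0 &&
                                           $_->{node}->{hold_count} < 4 } @slot);
    print "working slots: $working_slot_count\n";   # prints 2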
@@ -1009,10 +1021,8 @@ for (my $todo_ptr = 0; $todo_ptr <= $#jobstep_todo; $todo_ptr ++)
   }
 
   # give up if no nodes are succeeding
-  if (
-
-    my $message = "Every node has failed -- giving up on this round";
-    Log (undef, $message);
+  if ($working_slot_count < 1) {
+    Log(undef, "Every node has failed -- giving up");
     last THISROUND;
   }
 }
@@ -1048,18 +1058,18 @@ freeze_if_want_freeze();
 
 if (!defined $main::success)
 {
-  if (
-
-
-
+  if (!@jobstep_todo) {
+    $main::success = 1;
+  } elsif ($working_slot_count < 1) {
+    save_output_collection();
+    save_meta();
+    exit(EX_RETRY_UNLOCKED);
+  } elsif ($thisround_succeeded == 0 &&
+           ($thisround_failed == 0 || $thisround_failed > 4)) {
     my $message = "stop because $thisround_failed tasks failed and none succeeded";
     Log (undef, $message);
     $main::success = 0;
   }
-  if (!@jobstep_todo)
-  {
-    $main::success = 1;
-  }
 }
 
 goto ONELEVEL if !defined $main::success;
@@ -1067,16 +1077,7 @@ goto ONELEVEL if !defined $main::success;
 
 release_allocation();
 freeze();
-my $collated_output =
-
-if (!$collated_output) {
-  Log (undef, "Failed to write output collection");
-}
-else {
-  Log(undef, "job output $collated_output");
-  $Job->update_attributes('output' => $collated_output);
-}
-
+my $collated_output = save_output_collection();
 Log (undef, "finish");
 
 save_meta();
@@ -1141,7 +1142,7 @@ sub reapchildren
   if (!$task_success)
   {
     my $temporary_fail;
-    $temporary_fail ||= $Jobstep->{
+    $temporary_fail ||= $Jobstep->{tempfail};
     $temporary_fail ||= ($exitvalue == TASK_TEMPFAIL);
 
     ++$thisround_failed;
@@ -1179,6 +1180,7 @@ sub reapchildren
     ++$thisround_succeeded;
     $slot[$proc{$pid}->{slot}]->{node}->{losing_streak} = 0;
     $slot[$proc{$pid}->{slot}]->{node}->{hold_until} = 0;
+    $slot[$proc{$pid}->{slot}]->{node}->{fail_count} = 0;
     push @jobstep_done, $jobstepid;
     Log ($jobstepid, "success in $elapsed seconds");
   }
@@ -1389,10 +1391,19 @@ sub preprocess_stderr
       # whoa.
       $main::please_freeze = 1;
     }
-    elsif ($line =~ /
-      $jobstep[$job]->{
+    elsif ($line =~ /srun: error: Node failure on/) {
+      my $job_slot_index = $jobstep[$job]->{slotindex};
+      $slot[$job_slot_index]->{node}->{fail_count}++;
+      $jobstep[$job]->{tempfail} = 1;
+      ban_node_by_slot($job_slot_index);
+    }
+    elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
+      $jobstep[$job]->{tempfail} = 1;
       ban_node_by_slot($jobstep[$job]->{slotindex});
     }
+    elsif ($line =~ /arvados\.errors\.Keep/) {
+      $jobstep[$job]->{tempfail} = 1;
+    }
   }
 }
 
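The new branches classify srun and Keep error lines as temporary failures. A standalone sketch of the same pattern matching (the sample stderr lines below are invented for illustration):

    #!/usr/bin/env perl
    # Illustrative sketch: classify stderr lines with the regexes added above;
    # node failures and connection problems become temporary failures.
    use strict;
    use warnings;

    my @lines = (
        "srun: error: Node failure on compute3",
        "srun: error: Unable to create job step",
        "arvados.errors.KeepWriteError: failed to write block",
        "some ordinary task output",
    );

    for my $line (@lines) {
        my $tempfail = 0;
        if ($line =~ /srun: error: Node failure on/) {
            $tempfail = 1;   # crunch-job also bumps the node's fail_count here
        }
        elsif ($line =~ /srun: error: (Unable to create job step|.*: Communication connection failure)/) {
            $tempfail = 1;
        }
        elsif ($line =~ /arvados\.errors\.Keep/) {
            $tempfail = 1;
        }
        printf "%-55s tempfail=%d\n", $line, $tempfail;
    }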
@@ -1511,6 +1522,20 @@ print (arvados.api("v1").collections().
   return $joboutput;
 }
 
+# Calls create_output_collection, logs the result, and returns it.
+# If that was successful, save that as the output in the job record.
+sub save_output_collection {
+  my $collated_output = create_output_collection();
+
+  if (!$collated_output) {
+    Log(undef, "Failed to write output collection");
+  }
+  else {
+    Log(undef, "job output $collated_output");
+    $Job->update_attributes('output' => $collated_output);
+  }
+  return $collated_output;
+}
 
 sub killem
 {
|
|
1556
1581
|
# Send log output to Keep via arv-put.
|
1557
1582
|
#
|
1558
1583
|
# $log_pipe_in and $log_pipe_out are the input and output filehandles to the arv-put pipe.
|
1584
|
+
# $log_pipe_out_buf is a string containing all output read from arv-put so far.
|
1585
|
+
# $log_pipe_out_select is an IO::Select object around $log_pipe_out.
|
1559
1586
|
# $log_pipe_pid is the pid of the arv-put subprocess.
|
1560
1587
|
#
|
1561
1588
|
# The only functions that should access these variables directly are:
|
@@ -1564,6 +1591,13 @@ sub fhbits
 # Starts an arv-put pipe, reading data on stdin and writing it to
 # a $logfilename file in an output collection.
 #
+# log_writer_read_output([$timeout])
+# Read output from $log_pipe_out and append it to $log_pipe_out_buf.
+# Passes $timeout to the select() call, with a default of 0.01.
+# Returns the result of the last read() call on $log_pipe_out, or
+# -1 if read() wasn't called because select() timed out.
+# Only other log_writer_* functions should need to call this.
+#
 # log_writer_send($txt)
 # Writes $txt to the output log collection.
 #
@@ -1574,25 +1608,40 @@ sub fhbits
 # Returns a true value if there is currently a live arv-put
 # process, false otherwise.
 #
-my ($log_pipe_in, $log_pipe_out, $
+my ($log_pipe_in, $log_pipe_out, $log_pipe_out_buf, $log_pipe_out_select,
+    $log_pipe_pid);
 
 sub log_writer_start($)
 {
   my $logfilename = shift;
   $log_pipe_pid = open2($log_pipe_out, $log_pipe_in,
                         'arv-put',
-                        '--
-                        '--project-uuid', $Job->{owner_uuid},
+                        '--stream',
                         '--retries', '3',
-                        '--name', $logfilename,
                         '--filename', $logfilename,
                         '-');
+  $log_pipe_out_buf = "";
+  $log_pipe_out_select = IO::Select->new($log_pipe_out);
+}
+
+sub log_writer_read_output {
+  my $timeout = shift || 0.01;
+  my $read = -1;
+  while ($read && $log_pipe_out_select->can_read($timeout)) {
+    $read = read($log_pipe_out, $log_pipe_out_buf, 65536,
+                 length($log_pipe_out_buf));
+  }
+  if (!defined($read)) {
+    Log(undef, "error reading log manifest from arv-put: $!");
+  }
+  return $read;
 }
 
 sub log_writer_send($)
 {
   my $txt = shift;
   print $log_pipe_in $txt;
+  log_writer_read_output();
 }
 
 sub log_writer_finish()
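The idea behind log_writer_read_output is to drain the arv-put pipe opportunistically, accumulating the manifest in $log_pipe_out_buf so the child never blocks on a full pipe. A self-contained sketch of the same IO::Select/read() pattern against an ordinary subprocess (here `echo`, purely for illustration):

    #!/usr/bin/env perl
    # Illustrative sketch of the drain pattern: poll a child's stdout with
    # IO::Select and append whatever is readable to a buffer without blocking.
    use strict;
    use warnings;
    use IPC::Open2;
    use IO::Select;

    my ($child_out, $child_in);
    my $pid = open2($child_out, $child_in, 'echo', 'pretend manifest text');
    close($child_in);

    my $buf    = "";
    my $select = IO::Select->new($child_out);

    my $read = -1;
    while ($read && $select->can_read(0.5)) {
        # Append up to 64 KiB at the current end of the buffer, as crunch-job does.
        $read = read($child_out, $buf, 65536, length($buf));
    }
    warn "error reading from child: $!\n" if !defined($read);

    waitpid($pid, 0);
    print "captured: $buf";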
@@ -1600,22 +1649,24 @@ sub log_writer_finish()
   return unless $log_pipe_pid;
 
   close($log_pipe_in);
-  my $arv_put_output;
 
-  my $
-  if ($
-    sysread($log_pipe_out, $arv_put_output, 1024);
-    chomp($arv_put_output);
-  } else {
+  my $read_result = log_writer_read_output(120);
+  if ($read_result == -1) {
     Log (undef, "timed out reading from 'arv-put'");
+  } elsif ($read_result != 0) {
+    Log(undef, "failed to read arv-put log manifest to EOF");
   }
 
   waitpid($log_pipe_pid, 0);
-  $log_pipe_pid = $log_pipe_in = $log_pipe_out = undef;
   if ($?) {
-    Log("log_writer_finish: arv-put exited ".exit_status_s($?))
+    Log(undef, "log_writer_finish: arv-put exited " . exit_status_s($?))
   }
 
+  close($log_pipe_out);
+  my $arv_put_output = $log_pipe_out_buf;
+  $log_pipe_pid = $log_pipe_in = $log_pipe_out = $log_pipe_out_buf =
+      $log_pipe_out_select = undef;
+
   return $arv_put_output;
 }
 
@@ -1679,10 +1730,21 @@ sub save_meta
   return if $justcheckpoint;  # checkpointing is not relevant post-Warehouse.pm
   return unless log_writer_is_active();
 
-  my $
-
-
-
+  my $log_manifest = "";
+  if ($Job->{log}) {
+    my $prev_log_coll = api_call("collections/get", uuid => $Job->{log});
+    $log_manifest .= $prev_log_coll->{manifest_text};
+  }
+  $log_manifest .= log_writer_finish();
+
+  my $log_coll = api_call(
+    "collections/create", ensure_unique_name => 1, collection => {
+      manifest_text => $log_manifest,
+      owner_uuid => $Job->{owner_uuid},
+      name => sprintf("Log from %s job %s", $Job->{script}, $Job->{uuid}),
+    });
+  Log(undef, "log collection is " . $log_coll->{portable_data_hash});
+  $Job->update_attributes('log' => $log_coll->{portable_data_hash});
 }
 
 
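save_meta now appends the fresh manifest from log_writer_finish() onto any manifest already recorded in the job's log collection, then stores the result as a new collection whose portable data hash becomes the job's log. A small sketch of how that payload is assembled (the job fields and manifest text below are made-up values; the real script submits this hash through its api_call helper rather than printing it):

    #!/usr/bin/env perl
    # Illustrative sketch of assembling the "collections/create" payload that
    # save_meta builds; job fields and manifest text are invented.
    use strict;
    use warnings;

    my $Job = {
        uuid       => 'zzzzz-8i9sb-0123456789abcde',
        script     => 'hash',
        owner_uuid => 'zzzzz-tpzed-0123456789abcde',
        log        => '',   # would hold the previous log collection, if any
    };

    my $previous_manifest = "";   # fetched via collections/get when $Job->{log} is set
    my $new_manifest      = ". d41d8cd98f00b204e9800998ecf8427e+0 0:0:crunch-job.txt\n";

    my $log_manifest = $previous_manifest . $new_manifest;

    my %create_request = (
        ensure_unique_name => 1,
        collection         => {
            manifest_text => $log_manifest,
            owner_uuid    => $Job->{owner_uuid},
            name          => sprintf("Log from %s job %s", $Job->{script}, $Job->{uuid}),
        },
    );

    print "would create collection named: $create_request{collection}{name}\n";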
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arvados-cli
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.20150622205518
 platform: ruby
 authors:
 - Arvados Authors
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-06-
+date: 2015-06-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: arvados
@@ -178,7 +178,7 @@ dependencies:
   - - "<"
     - !ruby/object:Gem::Version
       version: 1.0.0
-description: Arvados command line tools, git commit
+description: Arvados command line tools, git commit ad7679cfe57733940f8461097ee01bfd97997ce6
 email: gem-dev@curoverse.com
 executables:
 - arv