arvados-cli 0.1.20150526143156 → 0.1.20150527201024

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3):
  1. checksums.yaml +4 -4
  2. data/bin/crunch-job +76 -43
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: df49631b2b1c7ae33eb0d5d9e988e50a2fc0a311
4
- data.tar.gz: fe0a40bf178a62a8518fec9c8ca699188d2a8d32
3
+ metadata.gz: bafef169f074ca61d533298a5e462905d9764846
4
+ data.tar.gz: bd2e304a1cef255c8362650d82ed490d88d7a56d
5
5
  SHA512:
6
- metadata.gz: 59df31ffb182cdcc78b960b8ba439bed2f8b2d916d740ace0b92eb76a3de96169c193de30e3cedfb14e0df96cdda9a4eea32562735ff509f4e9a6b273028d1bb
7
- data.tar.gz: b2afa69c6273326c575cc6e3c4a8c166c544288cb4d90eb6709ae813b69c953e4fad3aa6444797cba3f7318e3c6304f63afa8b198c635199c94a8422b28d5d65
6
+ metadata.gz: fb00c43e5d7d57f3b17f82efaf494426766b12d5af483195ce89252ec11d402c9dcc5149c3d5dc340f393a125d37c53d93e6ef90d7a60808f2124aa44e7c6688
7
+ data.tar.gz: d7879e42888f1bb9a0b4305f4d1b1d2e24acb2b0bbf64fde6be45b33f560aee935c1ab936d62e21ddd330467b05cb234032214e47deee6dc76432b775ea3ad9a
data/bin/crunch-job CHANGED
@@ -118,6 +118,7 @@ $ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt";
118
118
  $ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
119
119
  mkdir ($ENV{"JOB_WORK"});
120
120
 
121
+ my %proc;
121
122
  my $force_unlock;
122
123
  my $git_dir;
123
124
  my $jobspec;
@@ -589,56 +590,89 @@ if (!defined $git_archive) {
589
590
  }
590
591
  }
591
592
  else {
592
- Log(undef, "Run install script on all workers");
593
-
594
- my @srunargs = ("srun",
595
- "--nodelist=$nodelist",
596
- "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
597
- my @execargs = ("sh", "-c",
598
- "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
599
-
600
- $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
601
- my ($install_stderr_r, $install_stderr_w);
602
- pipe $install_stderr_r, $install_stderr_w or croak("pipe() failed: $!");
603
- set_nonblocking($install_stderr_r);
604
- my $installpid = fork();
605
- if ($installpid == 0)
606
- {
607
- close($install_stderr_r);
608
- fcntl($install_stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
609
- open(STDOUT, ">&", $install_stderr_w);
610
- open(STDERR, ">&", $install_stderr_w);
611
- srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
612
- exit (1);
613
- }
614
- close($install_stderr_w);
615
- my $stderr_buf = '';
616
- while ($installpid != waitpid(-1, WNOHANG)) {
617
- freeze_if_want_freeze ($installpid);
618
- # Wait up to 0.1 seconds for something to appear on stderr, then
619
- # do a non-blocking read.
620
- my $bits = fhbits($install_stderr_r);
621
- select ($bits, undef, $bits, 0.1);
622
- if (0 < sysread ($install_stderr_r, $stderr_buf, 8192, length($stderr_buf)))
593
+ my $install_exited;
594
+ my $install_script_tries_left = 3;
595
+ for (my $attempts = 0; $attempts < 3; $attempts++) {
596
+ Log(undef, "Run install script on all workers");
597
+
598
+ my @srunargs = ("srun",
599
+ "--nodelist=$nodelist",
600
+ "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
601
+ my @execargs = ("sh", "-c",
602
+ "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
603
+
604
+ $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
605
+ my ($install_stderr_r, $install_stderr_w);
606
+ pipe $install_stderr_r, $install_stderr_w or croak("pipe() failed: $!");
607
+ set_nonblocking($install_stderr_r);
608
+ my $installpid = fork();
609
+ if ($installpid == 0)
623
610
  {
624
- while ($stderr_buf =~ /^(.*?)\n/) {
625
- my $line = $1;
626
- substr $stderr_buf, 0, 1+length($line), "";
627
- Log(undef, "stderr $line");
611
+ close($install_stderr_r);
612
+ fcntl($install_stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
613
+ open(STDOUT, ">&", $install_stderr_w);
614
+ open(STDERR, ">&", $install_stderr_w);
615
+ srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
616
+ exit (1);
617
+ }
618
+ close($install_stderr_w);
619
+ # Tell freeze_if_want_freeze how to kill the child, otherwise the
620
+ # "waitpid(installpid)" loop won't get interrupted by a freeze:
621
+ $proc{$installpid} = {};
622
+ my $stderr_buf = '';
623
+ # Track whether anything appears on stderr other than slurm errors
624
+ # ("srun: ...") and the "starting: ..." message printed by the
625
+ # srun subroutine itself:
626
+ my $stderr_anything_from_script = 0;
627
+ my $match_our_own_errors = '^(srun: error: |starting: \[)';
628
+ while ($installpid != waitpid(-1, WNOHANG)) {
629
+ freeze_if_want_freeze ($installpid);
630
+ # Wait up to 0.1 seconds for something to appear on stderr, then
631
+ # do a non-blocking read.
632
+ my $bits = fhbits($install_stderr_r);
633
+ select ($bits, undef, $bits, 0.1);
634
+ if (0 < sysread ($install_stderr_r, $stderr_buf, 8192, length($stderr_buf)))
635
+ {
636
+ while ($stderr_buf =~ /^(.*?)\n/) {
637
+ my $line = $1;
638
+ substr $stderr_buf, 0, 1+length($line), "";
639
+ Log(undef, "stderr $line");
640
+ if ($line !~ /$match_our_own_errors/) {
641
+ $stderr_anything_from_script = 1;
642
+ }
643
+ }
628
644
  }
629
645
  }
630
- }
631
- my $install_exited = $?;
632
- close($install_stderr_r);
633
- if (length($stderr_buf) > 0) {
634
- Log(undef, "stderr $stderr_buf")
646
+ delete $proc{$installpid};
647
+ $install_exited = $?;
648
+ close($install_stderr_r);
649
+ if (length($stderr_buf) > 0) {
650
+ if ($stderr_buf !~ /$match_our_own_errors/) {
651
+ $stderr_anything_from_script = 1;
652
+ }
653
+ Log(undef, "stderr $stderr_buf")
654
+ }
655
+
656
+ Log (undef, "Install script exited ".exit_status_s($install_exited));
657
+ last if $install_exited == 0 || $main::please_freeze;
658
+ # If the install script fails but doesn't print an error message,
659
+ # the next thing anyone is likely to do is just run it again in
660
+ # case it was a transient problem like "slurm communication fails
661
+ # because the network isn't reliable enough". So we'll just do
662
+ # that ourselves (up to 3 attempts in total). OTOH, if there is an
663
+ # error message, the problem is more likely to have a real fix and
664
+ # we should fail the job so the fixing process can start, instead
665
+ # of doing 2 more attempts.
666
+ last if $stderr_anything_from_script;
635
667
  }
636
668
 
637
- Log (undef, "Install script exited ".exit_status_s($install_exited));
638
669
  foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) {
639
670
  unlink($tar_filename);
640
671
  }
641
- exit (1) if $install_exited != 0;
672
+
673
+ if ($install_exited != 0) {
674
+ croak("Giving up");
675
+ }
642
676
  }
643
677
 
644
678
  foreach (qw (script script_version script_parameters runtime_constraints))
@@ -704,7 +738,6 @@ for (my $ii = $#freeslot; $ii >= 0; $ii--) {
704
738
  }
705
739
 
706
740
  Log(undef, "start level $level with $round_num_freeslots slots");
707
- my %proc;
708
741
  my @holdslot;
709
742
  my %reader;
710
743
  my $progress_is_dirty = 1;
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arvados-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.20150526143156
4
+ version: 0.1.20150527201024
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arvados Authors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-26 00:00:00.000000000 Z
11
+ date: 2015-05-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: arvados
@@ -178,7 +178,7 @@ dependencies:
178
178
  - - "<"
179
179
  - !ruby/object:Gem::Version
180
180
  version: 1.0.0
181
- description: Arvados command line tools, git commit 9b67268ff45d2a552d21fa39f5086180f537ab4a
181
+ description: Arvados command line tools, git commit 7a53d874994a5a9af273cee1329d9635b7e03edb
182
182
  email: gem-dev@curoverse.com
183
183
  executables:
184
184
  - arv