arvados-cli 0.1.20150526143156 → 0.1.20150527201024

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/bin/crunch-job +76 -43
  3. metadata +3 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: df49631b2b1c7ae33eb0d5d9e988e50a2fc0a311
4
- data.tar.gz: fe0a40bf178a62a8518fec9c8ca699188d2a8d32
3
+ metadata.gz: bafef169f074ca61d533298a5e462905d9764846
4
+ data.tar.gz: bd2e304a1cef255c8362650d82ed490d88d7a56d
5
5
  SHA512:
6
- metadata.gz: 59df31ffb182cdcc78b960b8ba439bed2f8b2d916d740ace0b92eb76a3de96169c193de30e3cedfb14e0df96cdda9a4eea32562735ff509f4e9a6b273028d1bb
7
- data.tar.gz: b2afa69c6273326c575cc6e3c4a8c166c544288cb4d90eb6709ae813b69c953e4fad3aa6444797cba3f7318e3c6304f63afa8b198c635199c94a8422b28d5d65
6
+ metadata.gz: fb00c43e5d7d57f3b17f82efaf494426766b12d5af483195ce89252ec11d402c9dcc5149c3d5dc340f393a125d37c53d93e6ef90d7a60808f2124aa44e7c6688
7
+ data.tar.gz: d7879e42888f1bb9a0b4305f4d1b1d2e24acb2b0bbf64fde6be45b33f560aee935c1ab936d62e21ddd330467b05cb234032214e47deee6dc76432b775ea3ad9a
data/bin/crunch-job CHANGED
@@ -118,6 +118,7 @@ $ENV{"CRUNCH_INSTALL"} = "$ENV{CRUNCH_TMP}/opt";
118
118
  $ENV{"CRUNCH_WORK"} = $ENV{"JOB_WORK"}; # deprecated
119
119
  mkdir ($ENV{"JOB_WORK"});
120
120
 
121
+ my %proc;
121
122
  my $force_unlock;
122
123
  my $git_dir;
123
124
  my $jobspec;
@@ -589,56 +590,89 @@ if (!defined $git_archive) {
589
590
  }
590
591
  }
591
592
  else {
592
- Log(undef, "Run install script on all workers");
593
-
594
- my @srunargs = ("srun",
595
- "--nodelist=$nodelist",
596
- "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
597
- my @execargs = ("sh", "-c",
598
- "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
599
-
600
- $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
601
- my ($install_stderr_r, $install_stderr_w);
602
- pipe $install_stderr_r, $install_stderr_w or croak("pipe() failed: $!");
603
- set_nonblocking($install_stderr_r);
604
- my $installpid = fork();
605
- if ($installpid == 0)
606
- {
607
- close($install_stderr_r);
608
- fcntl($install_stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
609
- open(STDOUT, ">&", $install_stderr_w);
610
- open(STDERR, ">&", $install_stderr_w);
611
- srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
612
- exit (1);
613
- }
614
- close($install_stderr_w);
615
- my $stderr_buf = '';
616
- while ($installpid != waitpid(-1, WNOHANG)) {
617
- freeze_if_want_freeze ($installpid);
618
- # Wait up to 0.1 seconds for something to appear on stderr, then
619
- # do a non-blocking read.
620
- my $bits = fhbits($install_stderr_r);
621
- select ($bits, undef, $bits, 0.1);
622
- if (0 < sysread ($install_stderr_r, $stderr_buf, 8192, length($stderr_buf)))
593
+ my $install_exited;
594
+ my $install_script_tries_left = 3;
595
+ for (my $attempts = 0; $attempts < 3; $attempts++) {
596
+ Log(undef, "Run install script on all workers");
597
+
598
+ my @srunargs = ("srun",
599
+ "--nodelist=$nodelist",
600
+ "-D", $ENV{'TMPDIR'}, "--job-name=$job_id");
601
+ my @execargs = ("sh", "-c",
602
+ "mkdir -p $ENV{CRUNCH_INSTALL} && cd $ENV{CRUNCH_TMP} && perl -");
603
+
604
+ $ENV{"CRUNCH_GIT_ARCHIVE_HASH"} = md5_hex($git_archive);
605
+ my ($install_stderr_r, $install_stderr_w);
606
+ pipe $install_stderr_r, $install_stderr_w or croak("pipe() failed: $!");
607
+ set_nonblocking($install_stderr_r);
608
+ my $installpid = fork();
609
+ if ($installpid == 0)
623
610
  {
624
- while ($stderr_buf =~ /^(.*?)\n/) {
625
- my $line = $1;
626
- substr $stderr_buf, 0, 1+length($line), "";
627
- Log(undef, "stderr $line");
611
+ close($install_stderr_r);
612
+ fcntl($install_stderr_w, F_SETFL, 0) or croak($!); # no close-on-exec
613
+ open(STDOUT, ">&", $install_stderr_w);
614
+ open(STDERR, ">&", $install_stderr_w);
615
+ srun (\@srunargs, \@execargs, {}, $build_script . $git_archive);
616
+ exit (1);
617
+ }
618
+ close($install_stderr_w);
619
+ # Tell freeze_if_want_freeze how to kill the child, otherwise the
620
+ # "waitpid(installpid)" loop won't get interrupted by a freeze:
621
+ $proc{$installpid} = {};
622
+ my $stderr_buf = '';
623
+ # Track whether anything appears on stderr other than slurm errors
624
+ # ("srun: ...") and the "starting: ..." message printed by the
625
+ # srun subroutine itself:
626
+ my $stderr_anything_from_script = 0;
627
+ my $match_our_own_errors = '^(srun: error: |starting: \[)';
628
+ while ($installpid != waitpid(-1, WNOHANG)) {
629
+ freeze_if_want_freeze ($installpid);
630
+ # Wait up to 0.1 seconds for something to appear on stderr, then
631
+ # do a non-blocking read.
632
+ my $bits = fhbits($install_stderr_r);
633
+ select ($bits, undef, $bits, 0.1);
634
+ if (0 < sysread ($install_stderr_r, $stderr_buf, 8192, length($stderr_buf)))
635
+ {
636
+ while ($stderr_buf =~ /^(.*?)\n/) {
637
+ my $line = $1;
638
+ substr $stderr_buf, 0, 1+length($line), "";
639
+ Log(undef, "stderr $line");
640
+ if ($line !~ /$match_our_own_errors/) {
641
+ $stderr_anything_from_script = 1;
642
+ }
643
+ }
628
644
  }
629
645
  }
630
- }
631
- my $install_exited = $?;
632
- close($install_stderr_r);
633
- if (length($stderr_buf) > 0) {
634
- Log(undef, "stderr $stderr_buf")
646
+ delete $proc{$installpid};
647
+ $install_exited = $?;
648
+ close($install_stderr_r);
649
+ if (length($stderr_buf) > 0) {
650
+ if ($stderr_buf !~ /$match_our_own_errors/) {
651
+ $stderr_anything_from_script = 1;
652
+ }
653
+ Log(undef, "stderr $stderr_buf")
654
+ }
655
+
656
+ Log (undef, "Install script exited ".exit_status_s($install_exited));
657
+ last if $install_exited == 0 || $main::please_freeze;
658
+ # If the install script fails but doesn't print an error message,
659
+ # the next thing anyone is likely to do is just run it again in
660
+ # case it was a transient problem like "slurm communication fails
661
+ # because the network isn't reliable enough". So we'll just do
662
+ # that ourselves (up to 3 attempts in total). OTOH, if there is an
663
+ # error message, the problem is more likely to have a real fix and
664
+ # we should fail the job so the fixing process can start, instead
665
+ # of doing 2 more attempts.
666
+ last if $stderr_anything_from_script;
635
667
  }
636
668
 
637
- Log (undef, "Install script exited ".exit_status_s($install_exited));
638
669
  foreach my $tar_filename (map { tar_filename_n($_); } (1..$git_tar_count)) {
639
670
  unlink($tar_filename);
640
671
  }
641
- exit (1) if $install_exited != 0;
672
+
673
+ if ($install_exited != 0) {
674
+ croak("Giving up");
675
+ }
642
676
  }
643
677
 
644
678
  foreach (qw (script script_version script_parameters runtime_constraints))
@@ -704,7 +738,6 @@ for (my $ii = $#freeslot; $ii >= 0; $ii--) {
704
738
  }
705
739
 
706
740
  Log(undef, "start level $level with $round_num_freeslots slots");
707
- my %proc;
708
741
  my @holdslot;
709
742
  my %reader;
710
743
  my $progress_is_dirty = 1;
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arvados-cli
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.20150526143156
4
+ version: 0.1.20150527201024
5
5
  platform: ruby
6
6
  authors:
7
7
  - Arvados Authors
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-05-26 00:00:00.000000000 Z
11
+ date: 2015-05-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: arvados
@@ -178,7 +178,7 @@ dependencies:
178
178
  - - "<"
179
179
  - !ruby/object:Gem::Version
180
180
  version: 1.0.0
181
- description: Arvados command line tools, git commit 9b67268ff45d2a552d21fa39f5086180f537ab4a
181
+ description: Arvados command line tools, git commit 7a53d874994a5a9af273cee1329d9635b7e03edb
182
182
  email: gem-dev@curoverse.com
183
183
  executables:
184
184
  - arv