cheripic 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ace583d5063ea92f69faa4430b71f0bb0f654528
4
- data.tar.gz: 00ae530b7c5c162aa0e699cae2c5dcaa9d159673
3
+ metadata.gz: 458f681424a73ea58acb8aefa73d68019ad0854d
4
+ data.tar.gz: 23547939b1fead465d06d2f6d8e45ce4172b1cb1
5
5
  SHA512:
6
- metadata.gz: c179df9e44bdff364c8c9c7dd2a779609b05ed3d2f0ef5385c5c5ebb4910a98c35a05d861851dea3f34b14ab8d188994e6b7f254dde2b9356e73dbff08386cf0
7
- data.tar.gz: f21ee021e4594bacaf319170746ad7655b0c579cb0c49495bf33dbef9dabd059f8d2e81edf44d64d24cea1f86a3e6315782d87fd8e475cf5698d35dce1bd3079
6
+ metadata.gz: 2e3af0df95197769c542b4aab76444a6b14842890b46a97d6be10101f267db5f5df7d1ed8d67083ac8890a866e1cab678a9b23c5dc03b1edb7b8fc2150b35097
7
+ data.tar.gz: 9aa159df9086102679bd6359d4a5bf94dfe72f52d9c11e66259ffd40754f767001bb2a67e996b958018a009db0a6aa558c7ebe4001c2f6bce9b8993bdfd66091
data/.gitignore CHANGED
@@ -8,4 +8,5 @@
8
8
  /spec/reports/
9
9
  /tmp/
10
10
  .idea
11
+ /packaging/
11
12
 
data/Gemfile CHANGED
@@ -1,5 +1,4 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
- gem 'bio-samtools', :git => 'git://github.com/helios/bioruby-samtools.git', :tag => 'v2.3.5'
4
3
  # Specify your gem's dependencies in cheripic.gemspec
5
4
  gemspec
data/bin/cheripic ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ require 'cheripic'
3
+
4
+ # rescue errors to get clean error messages through the logger
5
+ # backtrace can be accessed by setting --loglevel to 'debug' option
6
+ begin
7
+ submission = Cheripic::Cmd.new ARGV
8
+ submission.run
9
+ rescue Cheripic::CheripicError => e
10
+ logger.error e.message
11
+ logger.debug e.backtrace unless e.backtrace.nil?
12
+ exit 1
13
+ end
data/cheripic.gemspec CHANGED
@@ -22,12 +22,12 @@ Gem::Specification.new do |spec|
22
22
  spec.add_runtime_dependency 'yell', '~> 2.0', '>= 2.0.5'
23
23
  spec.add_runtime_dependency 'trollop', '~> 2.1', '>= 2.1.2'
24
24
  spec.add_runtime_dependency 'bio', '~> 1.5', '>= 1.5.0'
25
- # spec.add_dependency 'bio-samtools', '~> 2.3.3'
25
+ spec.add_dependency 'bio-samtools', '~> 2.4.0'
26
26
  spec.add_dependency 'bio-gngm', '~> 0.2.1'
27
27
  spec.add_runtime_dependency 'rinruby', '~> 2.0', '>= 2.0.3'
28
28
 
29
29
  spec.add_development_dependency 'activesupport', '~> 4.2.6'
30
- spec.add_development_dependency 'bundler', '~> 1.10'
30
+ spec.add_development_dependency 'bundler', '~> 1.7.6'
31
31
  spec.add_development_dependency 'rake', '~> 10.0'
32
32
  spec.add_development_dependency 'minitest'
33
33
  spec.add_development_dependency 'minitest-reporters', '>= 1.0.17'
data/lib/cheripic.rb CHANGED
@@ -1,11 +1,17 @@
1
1
 
2
- # set up a golbal logger object to access across module
2
+ # sets up a global logger object to access across module
3
3
  require 'yell'
4
+
5
+ # Computing Homozygosity Enriched Regions In genomes to Prioritize Identification of Candidate variants (CHERIPIC)
6
+ # Cheripic module provides tools and pipeline to extract potential candidate mutations
7
+ # in and around the region of the genome hosting the causative mutation behind the phenotype of interest.
4
8
  module Cheripic
5
9
 
6
10
  # custom error handling
7
11
  class CheripicError < StandardError; end
12
+ # custom error handling for IO
8
13
  class CheripicIOError < CheripicError; end
14
+ # custom error handling for Arg
9
15
  class CheripicArgError < CheripicError; end
10
16
 
11
17
  # Define a logger and pass `Object` as name.
data/lib/cheripic/bfr.rb CHANGED
@@ -2,16 +2,26 @@
2
2
 
3
3
  module Cheripic
4
4
 
5
+ # Custom error handling for Bfr class
5
6
  class BfrError < CheripicError; end
6
7
 
8
+ # A class to calculate bulk frequency ratio (bfr) using one or two hashes of base fractions resulting from pileup
9
+ #
10
+ # @!attribute [rw] bfr_adj
11
+ # @return [Float] a float value to adjust the bfr calculation
7
12
  class Bfr
8
13
 
9
14
  attr_accessor :bfr_adj
10
15
 
11
- # get bulk frequency ratio (bfr) for marked hemi snps only
12
- # ignore positions with complex variants
16
+ # A method to get bulk frequency ratio (bfr) for selected hemi snps.
17
+ # This is done by selecting which hash (mutant or background) to use for bfr calculation
18
+ # either calculates fraction or bfr
19
+ # and ignores positions with complex variants.
20
+ # @param mut_hash [Hash] a hash of base fractions from pileup of mutant bulk
21
+ # @param bg_hash [Hash] a hash of base fractions from pileup of background bulk
22
+ # @return [Float] a ratio calculated
13
23
  def self.get_bfr(mut_hash, bg_hash='')
14
- @bfr_adj = Options.params.bfr_adjust
24
+ @bfr_adj = Options.bfr_adjust
15
25
  if bg_hash != ''
16
26
  # checking if only two vars in base hash and that includes ref
17
27
  # checking if only one var in hemi snp
@@ -37,9 +47,12 @@ module Cheripic
37
47
  bfr
38
48
  end
39
49
 
40
- # calculate bfr using both mutant and background bulk information
50
+ # A method to calculate bfr using a base fraction hash with hemi-snp
51
+ # @param two_key_hash [Hash] a hash of base fractions from pileup with 2 keys (a ref and variant base)
52
+ # @param other_hash [Hash] a hash of base fractions from pileup
53
+ # @return [Float] a ratio calculated
41
54
  def self.calculate_bfr(two_key_hash, other_hash)
42
- # fix :ref value if absent due to below noise depth
55
+ # if :ref is absent such as below noise depth, then set to zero
43
56
  unless two_key_hash.key?(:ref)
44
57
  two_key_hash[:ref] = 0
45
58
  end
@@ -63,6 +76,9 @@ module Cheripic
63
76
  bfr
64
77
  end
65
78
 
79
+ # A method to calculate ratio using a base fraction hash
80
+ # @param hash [Hash] a hash of base fractions from pileup with 2 or 1 keys
81
+ # @return [Array(Float, String)] an array of the calculated ratio and the base character
66
82
  def self.calc_fraction(hash)
67
83
  unless hash.key?(:ref)
68
84
  hash[:ref] = 0
data/lib/cheripic/cmd.rb CHANGED
@@ -2,6 +2,10 @@
2
2
 
3
3
  module Cheripic
4
4
 
5
+ # A command line option and processing object to handle input options
6
+ #
7
+ # @!attribute [rw] options
8
+ # @return [Hash] a hash of trollop option names as keys and user or default setting as values
5
9
  class Cmd
6
10
 
7
11
  require 'trollop'
@@ -10,11 +14,16 @@ module Cheripic
10
14
 
11
15
  attr_accessor :options
12
16
 
17
+ # creates a Cmd object using input string entry
18
+ # @param args [Array<String>] command line arguments (e.g. ARGV)
13
19
  def initialize(args)
14
20
  @options = parse_arguments(args)
15
21
  check_arguments
16
22
  end
17
23
 
24
+ # method to check input command string and run appropriate
25
+ # method of the object (help or examples or parsing arguments)
26
+ # @param args [Array<String>] command line arguments (e.g. ARGV)
18
27
  def parse_arguments(args)
19
28
  Trollop::with_standard_exception_handling argument_parser do
20
29
  if args.empty? || args.include?('-h') || args.include?('--help')
@@ -26,6 +35,8 @@ module Cheripic
26
35
  end
27
36
  end
28
37
 
38
+ # trollop argument_parser for the input args
39
+ # @return [Hash] a hash of trollop option names as keys and user or default setting as values
29
40
  def argument_parser
30
41
  cmds = self
31
42
  Trollop::Parser.new do
@@ -106,40 +117,44 @@ module Cheripic
106
117
  end
107
118
  end
108
119
 
120
+ # help message to display from command line
109
121
  def help_message
110
- <<-EOS
122
+ msg = <<-EOS
111
123
 
112
- Cheripic v#{Cheripic::VERSION.dup}
113
- Authors: Shyam Rallapalli and Dan MacLean
124
+ Cheripic v#{Cheripic::VERSION.dup}
125
+ Authors: Shyam Rallapalli and Dan MacLean
114
126
 
115
- Description: Candidate mutation and closely linked marker selection for non reference genomes
116
- Uses bulk segregant data from non-reference sequence genomes
127
+ Description: Candidate mutation and closely linked marker selection for non reference genomes
128
+ Uses bulk segregant data from non-reference sequence genomes
117
129
 
118
- Inputs:
119
- 1. Needs a reference fasta file of asssembly use for variant analysis
120
- 2. Pileup files for mutant (phenotype of interest) bulks and background (wildtype phenotype) bulks
121
- 3. If polyploid species, include of pileup from one or both parents
130
+ Inputs:
131
+ 1. Needs a reference fasta file of asssembly use for variant analysis
132
+ 2. Pileup files for mutant (phenotype of interest) bulks and background (wildtype phenotype) bulks
133
+ 3. If polyploid species, include of pileup from one or both parents
122
134
 
123
- USAGE:
124
- cheripic <options>
135
+ USAGE:
136
+ cheripic <options>
125
137
 
126
- OPTIONS:
138
+ OPTIONS:
127
139
 
128
140
  EOS
141
+ msg.split("\n").map{ |line| line.lstrip }.join("\n")
129
142
  end
130
143
 
144
+ # examples to display from command line
131
145
  def print_examples
132
146
  msg = <<-EOS
133
147
 
134
- Cheripic v#{Cheripic::VERSION.dup}
148
+ Cheripic v#{Cheripic::VERSION.dup}
135
149
 
136
- EXAMPLE COMMANDS:
150
+ EXAMPLE COMMANDS:
137
151
 
138
152
  EOS
139
153
  puts msg.split("\n").map{ |line| line.lstrip }.join("\n")
140
154
  exit(0)
141
155
  end
142
156
 
157
+ # calls other methods to check if command line inputs are valid
143
158
  def check_arguments
144
159
  check_output_dir
145
160
  check_log_level
@@ -153,6 +168,7 @@ OPTIONS:
153
168
  # end
154
169
  # end
155
170
 
171
+ # checks if input files are valid
156
172
  def check_input_files
157
173
  if @options[:polyploidy]
158
174
  inputfiles = %i{assembly mut_bulk bg_bulk mut_parent bg_parent}
@@ -173,6 +189,7 @@ OPTIONS:
173
189
  end
174
190
  end
175
191
 
192
+ # checks if output directory already exists
176
193
  def check_output_dir
177
194
  if Dir.exist?(@options[:output])
178
195
  raise CheripicArgError.new "#{@options[:output]} directory exists" +
@@ -180,6 +197,7 @@ OPTIONS:
180
197
  end
181
198
  end
182
199
 
200
+ # checks and sets logger level
183
201
  def check_log_level
184
202
  unless %w(error info warn debug).include?(@options[:loglevel])
185
203
  raise CheripicArgError.new "Loglevel #{@options[:loglevel]} is not valid. " +
@@ -188,6 +206,10 @@ OPTIONS:
188
206
  logger.level = Yell::Level.new @options[:loglevel].to_sym
189
207
  end
190
208
 
209
+ # Initializes an Implementer object using input options
210
+ # and calls run method of the Implementer to start the pipeline
211
+ # A hash of trollop option names as keys and user or default
212
+ # setting as values is passed to Implementer object
191
213
  def run
192
214
  @options[:output] = File.expand_path @options[:output]
193
215
  analysis = Implementer.new(@options)
@@ -4,16 +4,29 @@ require 'forwardable'
4
4
 
5
5
  module Cheripic
6
6
 
7
+ # Custom error handling for Contig class
7
8
  class ContigError < CheripicError; end
8
9
 
10
+ # A contig object from assembly that stores positions of
11
+ # homozygous, heterozygous and hemi-variants
12
+ #
13
+ # @!attribute [rw] hm_pos
14
+ # @return [Hash] a hash of homozygous variant positions as keys and allele frequency as values
15
+ # @!attribute [rw] ht_pos
16
+ # @return [Hash] a hash of heterozygous variant positions as keys and allele frequency as values
17
+ # @!attribute [rw] hemi_pos
18
+ # @return [Hash] a hash of hemi-variant positions as keys and allele frequency as values
19
+ # @!attribute [r] id
20
+ # @return [String] id of the contig in assembly taken from fasta file
21
+ # @!attribute [r] length
22
+ # @return [Integer] length of contig in bases
9
23
  class Contig
10
24
 
11
- include Enumerable
12
- extend Forwardable
13
- # delegate [:size, :length] => :@contig
14
- # def_delegator :@contig, :entry_id, :id
15
- attr_accessor :hm_pos, :ht_pos, :hemi_pos, :id, :length
25
+ attr_accessor :hm_pos, :ht_pos, :hemi_pos
26
+ attr_reader :id, :length
16
27
 
28
+ # creates a Contig object using fasta entry
29
+ # @param fasta [Bio::FastaFormat] an individual fasta entry from input assembly file
17
30
  def initialize (fasta)
18
31
  @id = fasta.entry_id
19
32
  @length = fasta.length
@@ -22,16 +35,23 @@ module Cheripic
22
35
  @hemi_pos = {}
23
36
  end
24
37
 
38
+ # Number of homozygous variants identified in the contig
39
+ # @return [Integer]
25
40
  def hm_num
26
41
  self.hm_pos.length
27
42
  end
28
43
 
44
+ # Number of heterozygous variants identified in the contig
45
+ # @return [Integer]
29
46
  def ht_num
30
47
  self.ht_pos.length
31
48
  end
32
49
 
50
+ # Homozygosity enrichment score calculated using
51
+ # hm_num and ht_num of the contig object
52
+ # @return [Float]
33
53
  def hme_score
34
- hmes_adjust = Options.params.hmes_adjust
54
+ hmes_adjust = Options.hmes_adjust
35
55
  if self.hm_num == 0 and self.ht_num == 0
36
56
  0.0
37
57
  else
@@ -39,10 +59,15 @@ module Cheripic
39
59
  end
40
60
  end
41
61
 
62
+ # Number of hemi-variants identified in the contig
63
+ # @return [Integer]
42
64
  def hemi_num
43
65
  self.hemi_pos.length
44
66
  end
45
67
 
68
+ # Mean of bulk frequency ratios (bfr) calculated using
69
+ # bfr values of all hemi_pos of the contig
70
+ # @return [Float]
46
71
  def bfr_score
47
72
  if self.hemi_pos.values.empty?
48
73
  0.0
@@ -51,7 +76,9 @@ module Cheripic
51
76
  end
52
77
  end
53
78
 
54
- # geometric mean of an array of numbers
79
+ # Calculates the arithmetic mean of an array of numbers (despite the method name)
80
+ # @param array [Array] an array of bfr values from hemi_snp
81
+ # @return [Float] mean value as float
55
82
  def geom_mean(array)
56
83
  return array[0].to_f if array.length == 1
57
84
  array.reduce(:+) / array.size.to_f
@@ -4,8 +4,25 @@ require 'forwardable'
4
4
 
5
5
  module Cheripic
6
6
 
7
+ # Custom error handling for ContigPileups class
7
8
  class ContigPileupsError < CheripicError; end
8
9
 
10
+ # A ContigPileup object for each contig from assembly that stores
11
+ # pileup file information and variants are selected from analysis of pileup files
12
+ # selected variants from pileup files are stored as hashes
13
+ #
14
+ # @!attribute [rw] id
15
+ # @return [String] id of the contig in assembly taken from fasta file
16
+ # @!attribute [rw] mut_bulk
17
+ # @return [Hash] a hash of variant positions from mut_bulk as keys and pileup info as values
18
+ # @!attribute [rw] bg_bulk
19
+ # @return [Hash] a hash of variant positions from bg_bulk as keys and pileup info as values
20
+ # @!attribute [rw] mut_parent
21
+ # @return [Hash] a hash of variant positions from mut_parent as keys and pileup info as values
22
+ # @!attribute [rw] bg_parent
23
+ # @return [Hash] a hash of variant positions from bg_parent as keys and pileup info as values
24
+ # @!attribute [rw] parent_hemi
25
+ # @return [Hash] a hash of hemi-variant positions as keys and bfr calculated from parent bulks as values
9
26
  class ContigPileups
10
27
 
11
28
  include Enumerable
@@ -17,6 +34,8 @@ module Cheripic
17
34
  attr_accessor :id, :parent_hemi
18
35
  attr_accessor :mut_bulk, :bg_bulk, :mut_parent, :bg_parent
19
36
 
37
+ # creates a ContigPileup object using fasta entry id
38
+ # @param fasta [String] a contig id from fasta entry
20
39
  def initialize (fasta)
21
40
  @id = fasta
22
41
  @mut_bulk = {}
@@ -26,12 +45,15 @@ module Cheripic
26
45
  @parent_hemi = {}
27
46
  end
28
47
 
48
+ # bulk pileups are compared and variant positions are selected
49
+ # @return [Array<Hash>] variant positions are stored in hashes
50
+ # for homozygous, heterozygous and hemi-variant positions
29
51
  def bulks_compared
30
52
  @hm_pos = {}
31
53
  @ht_pos = {}
32
54
  @hemi_pos = {}
33
55
  @mut_bulk.each_key do | pos |
34
- if Options.params.polyploidy and @parent_hemi.key?(pos)
56
+ if Options.polyploidy and @parent_hemi.key?(pos)
35
57
  bg_bases = ''
36
58
  if @bg_bulk.key?(pos)
37
59
  bg_bases = @bg_bulk[pos].var_base_frac
@@ -46,9 +68,11 @@ module Cheripic
46
68
  [@hm_pos, @ht_pos, @hemi_pos]
47
69
  end
48
70
 
49
- # we are only dealing with single element hashes
50
- # so discard hashes with more than one element and empty hashes
51
- # empty hash results from position below selected coverage or bases freq below noise
71
+ # mut_bulk and bg_bulk pileups are compared at selected position of the contig.
72
+ # Empty hash results from position below selected coverage
73
+ # or bases freq below noise and such positions are deleted.
74
+ # @param pos [Integer] position in the contig
75
+ # stores variant type, position and allele fraction to either @hm_pos or @ht_pos hashes
52
76
  def compare_pileup(pos)
53
77
  base_hash = @mut_bulk[pos].var_base_frac
54
78
  base_hash.delete(:ref)
@@ -56,22 +80,43 @@ module Cheripic
56
80
  # we could ignore complex loci or
57
81
  # take the variant type based on predominant base
58
82
  if base_hash.length > 1
59
- mut_type, ratio = var_mode(base_hash.values.max)
83
+ fraction = base_hash.values.max
84
+ mut_type = var_mode(fraction)
60
85
  else
61
- base = base_hash.keys[0]
62
- mut_type, ratio = var_mode(base_hash[base])
86
+ fraction = base_hash[base_hash.keys[0]]
87
+ mut_type = var_mode(fraction)
63
88
  end
64
89
  if @bg_bulk.key?(pos)
65
90
  bg_type = bg_bulk_var(pos)
66
91
  mut_type = compare_var_type(mut_type, bg_type)
67
92
  end
68
93
  unless mut_type == nil
69
- categorise_pos(mut_type, pos, ratio)
94
+ categorise_pos(mut_type, pos, fraction)
70
95
  end
71
96
  end
72
97
 
73
- # if both bulks have homozygous var at this position
74
- # then ignore the position
98
+ # Categorizes variant zygosity based on the allele fraction provided.
99
+ # Uses lower and upper limit set for heterozygosity in the options.
100
+ # @note consider increasing the range of heterozygosity limits for RNA-seq data
101
+ # @param fraction [Float] allele fraction
102
+ # @return [Symbol, String] :het or :hom for heterozygous or homozygous respectively, or an empty string when the fraction is below the lower heterozygosity limit
103
+ def var_mode(fraction)
104
+ ht_low = Options.htlow
105
+ ht_high = Options.hthigh
106
+ mode = ''
107
+ if fraction.between?(ht_low, ht_high)
108
+ mode = :het
109
+ elsif fraction > ht_high
110
+ mode = :hom
111
+ end
112
+ mode
113
+ end
114
+
115
+ # Simple comparison of variant type of mut and bg bulks at a position
116
+ # If both bulks have homozygous variant at selected position then it is ignored
117
+ # @param muttype [Symbol] values are either :hom or :het
118
+ # @param bgtype [Symbol] values are either :hom or :het
119
+ # @return [Symbol] variant mode of the mut bulk (:hom or :het) at the position or nil
75
120
  def compare_var_type(muttype, bgtype)
76
121
  if muttype == :hom and bgtype == :hom
77
122
  nil
@@ -80,17 +125,26 @@ module Cheripic
80
125
  end
81
126
  end
82
127
 
128
+ # Method to extract var_mode from pileup information at a position in contig
129
+ #
130
+ # @param pos [Integer] position in the contig
131
+ # @return [Symbol] variant mode of the background bulk (:hom or :het) at the position
83
132
  def bg_bulk_var(pos)
84
133
  bg_base_hash = @bg_bulk[pos].var_base_frac
85
134
  if bg_base_hash.length > 1
86
135
  # taking only var mode
87
- var_mode(bg_base_hash.values.max)[0]
136
+ var_mode(bg_base_hash.values.max)
88
137
  else
89
138
  # taking only var mode
90
- var_mode(bg_base_hash[0])[0]
139
+ var_mode(bg_base_hash[0])
91
140
  end
92
141
  end
93
142
 
143
+ # method stores pos as key and allele fraction as value
144
+ # to @hm_pos or @ht_pos hash based on variant type
145
+ # @param var_type [Symbol] values are either :hom or :het
146
+ # @param pos [Integer] position in the contig
147
+ # @param ratio [Float] allele fraction
94
148
  def categorise_pos(var_type, pos, ratio)
95
149
  if var_type == :hom
96
150
  @hm_pos[pos] = ratio
@@ -99,20 +153,10 @@ module Cheripic
99
153
  end
100
154
  end
101
155
 
102
- # calculate var zygosity for non-polyploid variants
103
- # increased range is used for heterozygosity for RNA-seq data
104
- def var_mode(ratio)
105
- ht_low = Options.params.htlow
106
- ht_high = Options.params.hthigh
107
- mode = ''
108
- if ratio.between?(ht_low, ht_high)
109
- mode = :het
110
- elsif ratio > ht_high
111
- mode = :hom
112
- end
113
- [mode, ratio]
114
- end
115
-
156
+ # Compares parental pileups for the contig and identifies positions
157
+ # that indicate variants from homoeologues, called hemi-snps,
158
+ # and calculates bulk frequency ratio (bfr)
159
+ # @return [Hash] parent_hemi hash with position as key and bfr as value
116
160
  def hemisnps_in_parent
117
161
  # mark all the hemi snp based on both parents
118
162
  self.mut_parent.each_key do |pos|