cheripic 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ace583d5063ea92f69faa4430b71f0bb0f654528
4
- data.tar.gz: 00ae530b7c5c162aa0e699cae2c5dcaa9d159673
3
+ metadata.gz: 458f681424a73ea58acb8aefa73d68019ad0854d
4
+ data.tar.gz: 23547939b1fead465d06d2f6d8e45ce4172b1cb1
5
5
  SHA512:
6
- metadata.gz: c179df9e44bdff364c8c9c7dd2a779609b05ed3d2f0ef5385c5c5ebb4910a98c35a05d861851dea3f34b14ab8d188994e6b7f254dde2b9356e73dbff08386cf0
7
- data.tar.gz: f21ee021e4594bacaf319170746ad7655b0c579cb0c49495bf33dbef9dabd059f8d2e81edf44d64d24cea1f86a3e6315782d87fd8e475cf5698d35dce1bd3079
6
+ metadata.gz: 2e3af0df95197769c542b4aab76444a6b14842890b46a97d6be10101f267db5f5df7d1ed8d67083ac8890a866e1cab678a9b23c5dc03b1edb7b8fc2150b35097
7
+ data.tar.gz: 9aa159df9086102679bd6359d4a5bf94dfe72f52d9c11e66259ffd40754f767001bb2a67e996b958018a009db0a6aa558c7ebe4001c2f6bce9b8993bdfd66091
data/.gitignore CHANGED
@@ -8,4 +8,5 @@
8
8
  /spec/reports/
9
9
  /tmp/
10
10
  .idea
11
+ /packaging/
11
12
 
data/Gemfile CHANGED
@@ -1,5 +1,4 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
- gem 'bio-samtools', :git => 'git://github.com/helios/bioruby-samtools.git', :tag => 'v2.3.5'
4
3
  # Specify your gem's dependencies in cheripic.gemspec
5
4
  gemspec
data/bin/cheripic ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ require 'cheripic'
3
+
4
+ # rescue errors to get clean error messages through the logger
5
+ # backtrace can be accessed by setting --loglevel to 'debug' option
6
+ begin
7
+ submission = Cheripic::Cmd.new ARGV
8
+ submission.run
9
+ rescue Cheripic::CheripicError => e
10
+ logger.error e.message
11
+ logger.debug e.backtrace unless e.backtrace.nil?
12
+ exit 1
13
+ end
data/cheripic.gemspec CHANGED
@@ -22,12 +22,12 @@ Gem::Specification.new do |spec|
22
22
  spec.add_runtime_dependency 'yell', '~> 2.0', '>= 2.0.5'
23
23
  spec.add_runtime_dependency 'trollop', '~> 2.1', '>= 2.1.2'
24
24
  spec.add_runtime_dependency 'bio', '~> 1.5', '>= 1.5.0'
25
- # spec.add_dependency 'bio-samtools', '~> 2.3.3'
25
+ spec.add_dependency 'bio-samtools', '~> 2.4.0'
26
26
  spec.add_dependency 'bio-gngm', '~> 0.2.1'
27
27
  spec.add_runtime_dependency 'rinruby', '~> 2.0', '>= 2.0.3'
28
28
 
29
29
  spec.add_development_dependency 'activesupport', '~> 4.2.6'
30
- spec.add_development_dependency 'bundler', '~> 1.10'
30
+ spec.add_development_dependency 'bundler', '~> 1.7.6'
31
31
  spec.add_development_dependency 'rake', '~> 10.0'
32
32
  spec.add_development_dependency 'minitest'
33
33
  spec.add_development_dependency 'minitest-reporters', '>= 1.0.17'
data/lib/cheripic.rb CHANGED
@@ -1,11 +1,17 @@
1
1
 
2
- # set up a golbal logger object to access across module
2
+ # sets up a global logger object to access across module
3
3
  require 'yell'
4
+
5
+ # Computing Homozygosity Enriched Regions In genomes to Prioritize Identification of Candidate variants (CHERIPIC)
6
+ # Cheripic module provides tools and pipeline to extract potential candidate mutations
7
+ # in and around the region of the genome hosting the causative mutation behind the phenotype of interest.
4
8
  module Cheripic
5
9
 
6
10
  # custom error handling
7
11
  class CheripicError < StandardError; end
12
+ # custom error handling for IO
8
13
  class CheripicIOError < CheripicError; end
14
+ # custom error handling for Arg
9
15
  class CheripicArgError < CheripicError; end
10
16
 
11
17
  # Define a logger and pass `Object` as name.
data/lib/cheripic/bfr.rb CHANGED
@@ -2,16 +2,26 @@
2
2
 
3
3
  module Cheripic
4
4
 
5
+ # Custom error handling for Bfr class
5
6
  class BfrError < CheripicError; end
6
7
 
8
+ # A class to calculate bulk frequency ratio (bfr) using one or two hashes of base fractions resulting from pileup
9
+ #
10
+ # @!attribute [rw] bfr_adj
11
+ # @return [Float] a float value to adjust the bfr calculation
7
12
  class Bfr
8
13
 
9
14
  attr_accessor :bfr_adj
10
15
 
11
- # get bulk frequency ratio (bfr) for marked hemi snps only
12
- # ignore positions with complex variants
16
+ # A method to get bulk frequency ratio (bfr) for selected hemi snps.
17
+ # This is done by selecting which hash (mutant or background) to use for bfr calculation
18
+ # either calculates fraction or bfr
19
+ # and ignores positions with complex variants.
20
+ # @param mut_hash [Hash] a hash of base fractions from pileup of mutant bulk
21
+ # @param bg_hash [Hash] a hash of base fractions from pileup of background bulk
22
+ # @return [Float] a ratio calculated
13
23
  def self.get_bfr(mut_hash, bg_hash='')
14
- @bfr_adj = Options.params.bfr_adjust
24
+ @bfr_adj = Options.bfr_adjust
15
25
  if bg_hash != ''
16
26
  # checking if only two vars in base hash and that includes ref
17
27
  # checking if only one var in hemi snp
@@ -37,9 +47,12 @@ module Cheripic
37
47
  bfr
38
48
  end
39
49
 
40
- # calculate bfr using both mutant and background bulk information
50
+ # A method to calculate bfr using a base fraction hash with hemi-snp
51
+ # @param two_key_hash [Hash] a hash of base fractions from pileup with 2 keys (a ref and variant base)
52
+ # @param other_hash [Hash] a hash of base fractions from pileup
53
+ # @return [Float] a ratio calculated
41
54
  def self.calculate_bfr(two_key_hash, other_hash)
42
- # fix :ref value if absent due to below noise depth
55
+ # if :ref is absent such as below noise depth, then set to zero
43
56
  unless two_key_hash.key?(:ref)
44
57
  two_key_hash[:ref] = 0
45
58
  end
@@ -63,6 +76,9 @@ module Cheripic
63
76
  bfr
64
77
  end
65
78
 
79
+ # A method to calculate ratio using a base fraction hash
80
+ # @param hash [Hash] a hash of base fractions from pileup with 2 or 1 keys
81
+ # @return [Array<Float><String>] an array of ratio calculated and base character
66
82
  def self.calc_fraction(hash)
67
83
  unless hash.key?(:ref)
68
84
  hash[:ref] = 0
data/lib/cheripic/cmd.rb CHANGED
@@ -2,6 +2,10 @@
2
2
 
3
3
  module Cheripic
4
4
 
5
+ # A command line option and processing object to handle input options
6
+ #
7
+ # @!attribute [rw] options
8
+ # @return [Hash] a hash of trollop option names as keys and user or default setting as values
5
9
  class Cmd
6
10
 
7
11
  require 'trollop'
@@ -10,11 +14,16 @@ module Cheripic
10
14
 
11
15
  attr_accessor :options
12
16
 
17
+ # creates a Cmd object using input string entry
18
+ # @param args [Array<String>] command line arguments (e.g. ARGV)
13
19
  def initialize(args)
14
20
  @options = parse_arguments(args)
15
21
  check_arguments
16
22
  end
17
23
 
24
+ # method to check input command string and run appropriate
25
+ # method of the object (help or examples or parsing arguments)
26
+ # @param args [Array<String>] command line arguments (e.g. ARGV)
18
27
  def parse_arguments(args)
19
28
  Trollop::with_standard_exception_handling argument_parser do
20
29
  if args.empty? || args.include?('-h') || args.include?('--help')
@@ -26,6 +35,8 @@ module Cheripic
26
35
  end
27
36
  end
28
37
 
38
+ # trollop argument_parser for input args string
39
+ # @return [Hash] a hash of trollop option names as keys and user or default setting as values
29
40
  def argument_parser
30
41
  cmds = self
31
42
  Trollop::Parser.new do
@@ -106,40 +117,44 @@ module Cheripic
106
117
  end
107
118
  end
108
119
 
120
+ # help message to display from command line
109
121
  def help_message
110
- <<-EOS
122
+ msg = <<-EOS
111
123
 
112
- Cheripic v#{Cheripic::VERSION.dup}
113
- Authors: Shyam Rallapalli and Dan MacLean
124
+ Cheripic v#{Cheripic::VERSION.dup}
125
+ Authors: Shyam Rallapalli and Dan MacLean
114
126
 
115
- Description: Candidate mutation and closely linked marker selection for non reference genomes
116
- Uses bulk segregant data from non-reference sequence genomes
127
+ Description: Candidate mutation and closely linked marker selection for non reference genomes
128
+ Uses bulk segregant data from non-reference sequence genomes
117
129
 
118
- Inputs:
119
- 1. Needs a reference fasta file of asssembly use for variant analysis
120
- 2. Pileup files for mutant (phenotype of interest) bulks and background (wildtype phenotype) bulks
121
- 3. If polyploid species, include of pileup from one or both parents
130
+ Inputs:
131
+ 1. Needs a reference fasta file of asssembly use for variant analysis
132
+ 2. Pileup files for mutant (phenotype of interest) bulks and background (wildtype phenotype) bulks
133
+ 3. If polyploid species, include of pileup from one or both parents
122
134
 
123
- USAGE:
124
- cheripic <options>
135
+ USAGE:
136
+ cheripic <options>
125
137
 
126
- OPTIONS:
138
+ OPTIONS:
127
139
 
128
140
  EOS
141
+ msg.split("\n").map{ |line| line.lstrip }.join("\n")
129
142
  end
130
143
 
144
+ # examples to display from command line
131
145
  def print_examples
132
146
  msg = <<-EOS
133
147
 
134
- Cheripic v#{Cheripic::VERSION.dup}
148
+ Cheripic v#{Cheripic::VERSION.dup}
135
149
 
136
- EXAMPLE COMMANDS:
150
+ EXAMPLE COMMANDS:
137
151
 
138
152
  EOS
139
153
  puts msg.split("\n").map{ |line| line.lstrip }.join("\n")
140
154
  exit(0)
141
155
  end
142
156
 
157
+ # calls other methods to check if command line inputs are valid
143
158
  def check_arguments
144
159
  check_output_dir
145
160
  check_log_level
@@ -153,6 +168,7 @@ OPTIONS:
153
168
  # end
154
169
  # end
155
170
 
171
+ # checks if input files are valid
156
172
  def check_input_files
157
173
  if @options[:polyploidy]
158
174
  inputfiles = %i{assembly mut_bulk bg_bulk mut_parent bg_parent}
@@ -173,6 +189,7 @@ OPTIONS:
173
189
  end
174
190
  end
175
191
 
192
+ # checks if output directory already exists
176
193
  def check_output_dir
177
194
  if Dir.exist?(@options[:output])
178
195
  raise CheripicArgError.new "#{@options[:output]} directory exists" +
@@ -180,6 +197,7 @@ OPTIONS:
180
197
  end
181
198
  end
182
199
 
200
+ # checks and sets logger level
183
201
  def check_log_level
184
202
  unless %w(error info warn debug).include?(@options[:loglevel])
185
203
  raise CheripicArgError.new "Loglevel #{@options[:loglevel]} is not valid. " +
@@ -188,6 +206,10 @@ OPTIONS:
188
206
  logger.level = Yell::Level.new @options[:loglevel].to_sym
189
207
  end
190
208
 
209
+ # Initializes an Implementer object using input options
210
+ # and calls run method of the Implementer to start the pipeline
211
+ # A hash of trollop option names as keys and user or default
212
+ # setting as values is passed to Implementer object
191
213
  def run
192
214
  @options[:output] = File.expand_path @options[:output]
193
215
  analysis = Implementer.new(@options)
@@ -4,16 +4,29 @@ require 'forwardable'
4
4
 
5
5
  module Cheripic
6
6
 
7
+ # Custom error handling for Contig class
7
8
  class ContigError < CheripicError; end
8
9
 
10
+ # A contig object from assembly that stores positions of
11
+ # homozygous, heterozygous and hemi-variants
12
+ #
13
+ # @!attribute [rw] hm_pos
14
+ # @return [Hash] a hash of homozygous variant positions as keys and allele frequency as values
15
+ # @!attribute [rw] ht_pos
16
+ # @return [Hash] a hash of heterozygous variant positions as keys and allele frequency as values
17
+ # @!attribute [rw] hemi_pos
18
+ # @return [Hash] a hash of hemi-variant positions as keys and allele frequency as values
19
+ # @!attribute [r] id
20
+ # @return [String] id of the contig in assembly taken from fasta file
21
+ # @!attribute [r] length
22
+ # @return [Integer] length of contig in bases
9
23
  class Contig
10
24
 
11
- include Enumerable
12
- extend Forwardable
13
- # delegate [:size, :length] => :@contig
14
- # def_delegator :@contig, :entry_id, :id
15
- attr_accessor :hm_pos, :ht_pos, :hemi_pos, :id, :length
25
+ attr_accessor :hm_pos, :ht_pos, :hemi_pos
26
+ attr_reader :id, :length
16
27
 
28
+ # creates a Contig object using fasta entry
29
+ # @param fasta [Bio::FastaFormat] an individual fasta entry from input assembly file
17
30
  def initialize (fasta)
18
31
  @id = fasta.entry_id
19
32
  @length = fasta.length
@@ -22,16 +35,23 @@ module Cheripic
22
35
  @hemi_pos = {}
23
36
  end
24
37
 
38
+ # Number of homozygous variants identified in the contig
39
+ # @return [Integer]
25
40
  def hm_num
26
41
  self.hm_pos.length
27
42
  end
28
43
 
44
+ # Number of heterozygous variants identified in the contig
45
+ # @return [Integer]
29
46
  def ht_num
30
47
  self.ht_pos.length
31
48
  end
32
49
 
50
+ # Homozygosity enrichment score calculated using
51
+ # hm_num and ht_num of the contig object
52
+ # @return [Float]
33
53
  def hme_score
34
- hmes_adjust = Options.params.hmes_adjust
54
+ hmes_adjust = Options.hmes_adjust
35
55
  if self.hm_num == 0 and self.ht_num == 0
36
56
  0.0
37
57
  else
@@ -39,10 +59,15 @@ module Cheripic
39
59
  end
40
60
  end
41
61
 
62
+ # Number of hemi-variants identified in the contig
63
+ # @return [Integer]
42
64
  def hemi_num
43
65
  self.hemi_pos.length
44
66
  end
45
67
 
68
+ # Mean of bulk frequency ratios (bfr) calculated using
69
+ # bfr values of all hemi_pos of the contig
70
+ # @return [Float]
46
71
  def bfr_score
47
72
  if self.hemi_pos.values.empty?
48
73
  0.0
@@ -51,7 +76,9 @@ module Cheripic
51
76
  end
52
77
  end
53
78
 
54
- # geometric mean of an array of numbers
79
+ # Calculates the arithmetic mean of an array of numbers (despite the method name)
80
+ # @param array [Array] an array of bfr values from hemi_snp
81
+ # @return [Float] mean value as float
55
82
  def geom_mean(array)
56
83
  return array[0].to_f if array.length == 1
57
84
  array.reduce(:+) / array.size.to_f
@@ -4,8 +4,25 @@ require 'forwardable'
4
4
 
5
5
  module Cheripic
6
6
 
7
+ # Custom error handling for ContigPileups class
7
8
  class ContigPileupsError < CheripicError; end
8
9
 
10
+ # A ContigPileups object for each contig from assembly that stores
11
+ # pileup file information and variants are selected from analysis of pileup files
12
+ # selected variants from pileup files are stored as hashes
13
+ #
14
+ # @!attribute [rw] id
15
+ # @return [String] id of the contig in assembly taken from fasta file
16
+ # @!attribute [rw] mut_bulk
17
+ # @return [Hash] a hash of variant positions from mut_bulk as keys and pileup info as values
18
+ # @!attribute [rw] bg_bulk
19
+ # @return [Hash] a hash of variant positions from bg_bulk as keys and pileup info as values
20
+ # @!attribute [rw] mut_parent
21
+ # @return [Hash] a hash of variant positions from mut_parent as keys and pileup info as values
22
+ # @!attribute [rw] bg_parent
23
+ # @return [Hash] a hash of variant positions from bg_parent as keys and pileup info as values
24
+ # @!attribute [rw] parent_hemi
25
+ # @return [Hash] a hash of hemi-variant positions as keys and bfr calculated from parent bulks as values
9
26
  class ContigPileups
10
27
 
11
28
  include Enumerable
@@ -17,6 +34,8 @@ module Cheripic
17
34
  attr_accessor :id, :parent_hemi
18
35
  attr_accessor :mut_bulk, :bg_bulk, :mut_parent, :bg_parent
19
36
 
37
+ # creates a ContigPileups object using fasta entry id
38
+ # @param fasta [String] a contig id from fasta entry
20
39
  def initialize (fasta)
21
40
  @id = fasta
22
41
  @mut_bulk = {}
@@ -26,12 +45,15 @@ module Cheripic
26
45
  @parent_hemi = {}
27
46
  end
28
47
 
48
+ # bulk pileups are compared and variant positions are selected
49
+ # @return [Array<Hash>] variant positions are stored in hashes
50
+ # for homozygous, heterozygous and hemi-variant positions
29
51
  def bulks_compared
30
52
  @hm_pos = {}
31
53
  @ht_pos = {}
32
54
  @hemi_pos = {}
33
55
  @mut_bulk.each_key do | pos |
34
- if Options.params.polyploidy and @parent_hemi.key?(pos)
56
+ if Options.polyploidy and @parent_hemi.key?(pos)
35
57
  bg_bases = ''
36
58
  if @bg_bulk.key?(pos)
37
59
  bg_bases = @bg_bulk[pos].var_base_frac
@@ -46,9 +68,11 @@ module Cheripic
46
68
  [@hm_pos, @ht_pos, @hemi_pos]
47
69
  end
48
70
 
49
- # we are only dealing with single element hashes
50
- # so discard hashes with more than one element and empty hashes
51
- # empty hash results from position below selected coverage or bases freq below noise
71
+ # mut_bulk and bg_bulk pileups are compared at selected position of the contig.
72
+ # Empty hash results from position below selected coverage
73
+ # or bases freq below noise and such positions are deleted.
74
+ # @param pos [Integer] position in the contig
75
+ # stores variant type, position and allele fraction to either @hm_pos or @ht_pos hashes
52
76
  def compare_pileup(pos)
53
77
  base_hash = @mut_bulk[pos].var_base_frac
54
78
  base_hash.delete(:ref)
@@ -56,22 +80,43 @@ module Cheripic
56
80
  # we could ignore complex loci or
57
81
  # take the variant type based on predominant base
58
82
  if base_hash.length > 1
59
- mut_type, ratio = var_mode(base_hash.values.max)
83
+ fraction = base_hash.values.max
84
+ mut_type = var_mode(fraction)
60
85
  else
61
- base = base_hash.keys[0]
62
- mut_type, ratio = var_mode(base_hash[base])
86
+ fraction = base_hash[base_hash.keys[0]]
87
+ mut_type = var_mode(fraction)
63
88
  end
64
89
  if @bg_bulk.key?(pos)
65
90
  bg_type = bg_bulk_var(pos)
66
91
  mut_type = compare_var_type(mut_type, bg_type)
67
92
  end
68
93
  unless mut_type == nil
69
- categorise_pos(mut_type, pos, ratio)
94
+ categorise_pos(mut_type, pos, fraction)
70
95
  end
71
96
  end
72
97
 
73
- # if both bulks have homozygous var at this position
74
- # then ignore the position
98
+ # Categorizes variant zygosity based on the allele fraction provided.
99
+ # Uses lower and upper limit set for heterozygosity in the options.
100
+ # @note consider increasing the range of heterozygosity limits for RNA-seq data
101
+ # @param fraction [Float] allele fraction
102
+ # @return [Symbol, String] :het or :hom for heterozygous or homozygous respectively, or an empty string when the fraction is below the lower heterozygosity limit
103
+ def var_mode(fraction)
104
+ ht_low = Options.htlow
105
+ ht_high = Options.hthigh
106
+ mode = ''
107
+ if fraction.between?(ht_low, ht_high)
108
+ mode = :het
109
+ elsif fraction > ht_high
110
+ mode = :hom
111
+ end
112
+ mode
113
+ end
114
+
115
+ # Simple comparison of variant type of mut and bg bulks at a position
116
+ # If both bulks have homozygous variant at selected position then it is ignored
117
+ # @param muttype [Symbol] values are either :hom or :het
118
+ # @param bgtype [Symbol] values are either :hom or :het
119
+ # @return [Symbol] variant mode of the mut bulk (:hom or :het) at the position or nil
75
120
  def compare_var_type(muttype, bgtype)
76
121
  if muttype == :hom and bgtype == :hom
77
122
  nil
@@ -80,17 +125,26 @@ module Cheripic
80
125
  end
81
126
  end
82
127
 
128
+ # Method to extract var_mode from pileup information at a position in contig
129
+ #
130
+ # @param pos [Integer] position in the contig
131
+ # @return [Symbol] variant mode of the background bulk (:hom or :het) at the position
83
132
  def bg_bulk_var(pos)
84
133
  bg_base_hash = @bg_bulk[pos].var_base_frac
85
134
  if bg_base_hash.length > 1
86
135
  # taking only var mode
87
- var_mode(bg_base_hash.values.max)[0]
136
+ var_mode(bg_base_hash.values.max)
88
137
  else
89
138
  # taking only var mode
90
- var_mode(bg_base_hash[0])[0]
139
+ var_mode(bg_base_hash[0])
91
140
  end
92
141
  end
93
142
 
143
+ # method stores pos as key and allele fraction as value
144
+ # to @hm_pos or @ht_pos hash based on variant type
145
+ # @param var_type [Symbol] values are either :hom or :het
146
+ # @param pos [Integer] position in the contig
147
+ # @param ratio [Float] allele fraction
94
148
  def categorise_pos(var_type, pos, ratio)
95
149
  if var_type == :hom
96
150
  @hm_pos[pos] = ratio
@@ -99,20 +153,10 @@ module Cheripic
99
153
  end
100
154
  end
101
155
 
102
- # calculate var zygosity for non-polyploid variants
103
- # increased range is used for heterozygosity for RNA-seq data
104
- def var_mode(ratio)
105
- ht_low = Options.params.htlow
106
- ht_high = Options.params.hthigh
107
- mode = ''
108
- if ratio.between?(ht_low, ht_high)
109
- mode = :het
110
- elsif ratio > ht_high
111
- mode = :hom
112
- end
113
- [mode, ratio]
114
- end
115
-
156
+ # Compares parental pileups for the contig and identifies positions
157
+ # that indicate variants from homeologues called hemi-snps
158
+ # and calculates bulk frequency ratio (bfr)
159
+ # @return [Hash] parent_hemi hash with position as key and bfr as value
116
160
  def hemisnps_in_parent
117
161
  # mark all the hemi snp based on both parents
118
162
  self.mut_parent.each_key do |pos|