cheripic 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,14 +2,24 @@
2
2
 
3
3
  module Cheripic
4
4
 
5
+ # Custom error handling for Implementer class
5
6
  class ImplementerError < CheripicError; end
6
7
 
8
+ # An Implementer object for running pipeline from Cmd object options
9
+ #
10
+ # @!attribute [r] options
11
+ # @return [Hash] a hash of required input file names as keys and
12
+ # user provided file paths as values taken from Cmd object
13
+ # @!attribute [r] variants
14
+ # @return [<Cheripic::Variants>] a Variants object initialized using options from Cmd object
7
15
  class Implementer
8
16
 
9
17
  require 'ostruct'
10
18
  require 'fileutils'
11
- attr_accessor :options, :variants
19
+ attr_reader :options, :variants, :has_run
12
20
 
21
+ # Initializes an Implementer object using inputs from cmd object
22
+ # @param inputs [Hash] a hash of trollop option names as keys and user or default setting as values from Cmd object
13
23
  def initialize(inputs)
14
24
  set1 = %i{assembly
15
25
  input_format
@@ -38,13 +48,20 @@ module Cheripic
38
48
  settings = inputs.select { |k| set2.include?(k) }
39
49
  Options.update(settings)
40
50
  FileUtils.mkdir_p @options.output
51
+ @vars_extracted = false
52
+ @has_run = false
41
53
  end
42
54
 
55
+ # Initializes a Variants object using input options (files).
56
+ # Each pileup file is processed and bulks are compared
43
57
  def extract_vars
44
58
  @variants = Variants.new(@options)
45
59
  @variants.compare_pileups
60
+ @vars_extracted = true
46
61
  end
47
62
 
63
+ # Extracted variants from bulk comparison are re-analysed
64
+ # and selected variants are written to a file
48
65
  def process_variants
49
66
  @variants.verify_bg_bulk_pileup
50
67
  # print selected variants that could be potential markers or mutation
@@ -63,16 +80,20 @@ module Cheripic
63
80
  out_file.close
64
81
  end
65
82
 
83
+ # Wrapper to extract and isolate selected variants
84
+ # implements extract_vars and process_variants and
85
+ # if data is from polyploids extracts contigs with high bfr
66
86
  def run
67
- unless defined?(@variants.has_run)
87
+ unless @vars_extracted
68
88
  self.extract_vars
69
89
  end
70
- if Options.params.polyploidy
90
+ if Options.polyploidy
71
91
  self.process_variants
72
92
  @variants.bfr_frags
73
93
  else
74
94
  self.process_variants
75
95
  end
96
+ @has_run = true
76
97
  end
77
98
 
78
99
  end
@@ -2,12 +2,12 @@
2
2
 
3
3
  module Cheripic
4
4
 
5
+ # A class to get default settings and update user settings for parameters
6
+ # and facilitate retrieval of settings anywhere in the module
5
7
  class Options
6
8
 
7
- require 'ostruct'
8
- # class << self; attr_accessor :params end
9
-
10
- @defaults = {
9
+ # Default parameter settings
10
+ @def_settings = {
11
11
  :hmes_adjust => 0.5,
12
12
  :htlow => 0.2,
13
13
  :hthigh => 0.9,
@@ -25,16 +25,116 @@ module Cheripic
25
25
  :bfr_adjust => 0.05,
26
26
  :sel_seq_len => 50
27
27
  }
28
- # @params = OpenStruct.new(@defaults)
29
28
 
29
+ # set defaults as user settings
30
+ @user_settings = @def_settings
31
+
32
+ # A value to adjust calculation of Homozygosity Enrichment Score (HMES)
33
+ # @return [Float]
34
+ def self.hmes_adjust
35
+ @user_settings[:hmes_adjust]
36
+ end
37
+
38
+ # Lower cutoff of allele fraction for categorizing a variant as heterozygous
39
+ # @return [Float]
40
+ def self.htlow
41
+ @user_settings[:htlow]
42
+ end
43
+
44
+ # Upper cutoff of allele fraction for categorizing a variant as heterozygous
45
+ # @return [Float]
46
+ def self.hthigh
47
+ @user_settings[:hthigh]
48
+ end
49
+
50
+ # Minimum read coverage at the variant position to be considered for analysis
51
+ # @return [Integer]
52
+ def self.mindepth
53
+ @user_settings[:mindepth]
54
+ end
55
+
56
+ # Minimum non reference count at the variant position to be considered for analysis
57
+ # @return [Integer]
58
+ def self.min_non_ref_count
59
+ @user_settings[:min_non_ref_count]
60
+ end
61
+
62
+ # Minimum reads supporting an indel at the variant position to be considered for analysis as indel
63
+ # @return [Integer]
64
+ def self.min_indel_count_support
65
+ @user_settings[:min_indel_count_support]
66
+ end
67
+
68
+ # Option for whether to ignore or consider reference positions that are ambiguous
69
+ # @return [Boolean]
70
+ def self.ignore_reference_n
71
+ @user_settings[:ignore_reference_n]
72
+ end
73
+
74
+ # Minimum alignment mapping quality of the read to be used for bam files
75
+ # @return [Integer]
76
+ def self.mapping_quality
77
+ @user_settings[:mapping_quality]
78
+ end
79
+
80
+ # Minimum aligned base quality at the variant position to be considered for analysis
81
+ # @return [Integer]
82
+ def self.base_quality
83
+ @user_settings[:base_quality]
84
+ end
85
+
86
+ # Fraction of read bases at a variant position at or below which a base is ignored as noise
87
+ # @return [Float]
88
+ def self.noise
89
+ @user_settings[:noise]
90
+ end
91
+
92
+ # Option for cross type used for generating bulk population
93
+ # @note options are either 'back' or 'out'
94
+ # @return [String]
95
+ def self.cross_type
96
+ @user_settings[:cross_type]
97
+ end
98
+
99
+ # Option for whether to ignore or consider the contigs without any variants
100
+ # @return [Boolean]
101
+ def self.only_frag_with_vars
102
+ @user_settings[:only_frag_with_vars]
103
+ end
104
+
105
+ # Option for whether to ignore or consider the contigs with a low HME score
106
+ # @return [Boolean]
107
+ def self.filter_out_low_hmes
108
+ @user_settings[:filter_out_low_hmes]
109
+ end
110
+
111
+ # Option indicating whether the input data is from a polyploid or not
112
+ # @return [Boolean]
113
+ def self.polyploidy
114
+ @user_settings[:polyploidy]
115
+ end
116
+
117
+ # A value to adjust calculation of bulk frequency ratio (bfr)
118
+ # @return [Float]
119
+ def self.bfr_adjust
120
+ @user_settings[:bfr_adjust]
121
+ end
122
+
123
+ # Number of nucleotides of sequence to select from each side of the selected variant
124
+ # @return [Integer]
125
+ def self.sel_seq_len
126
+ @user_settings[:sel_seq_len]
127
+ end
128
+
129
+ # Updates the values of options using a hash generated from user inputs
130
+ # @param newset [Hash] a hash of option names as keys and user settings as values
30
131
  def self.update(newset)
31
- @defaults.merge!(newset)
32
- self.params
33
- # @params = OpenStruct.new(@defaults)
132
+ @user_settings = @def_settings.merge(newset)
34
133
  end
35
134
 
36
- def self.params
37
- OpenStruct.new(@defaults)
135
+ # Resets the values of options to defaults
136
+ def self.defaults
137
+ @user_settings = @def_settings
38
138
  end
39
139
 
40
140
  end
@@ -1,186 +1,177 @@
1
1
  # encoding: utf-8
2
- require 'bio'
3
- require 'bio-samtools'
4
- require 'bio/db/pileup'
2
+ module Cheripic
5
3
 
6
- class Pileup < Bio::DB::Pileup
4
+ # Custom error handling for Pileup class
5
+ class PileupError < CheripicError; end
7
6
 
8
- attr_accessor :defaults
7
+ require 'bio-samtools'
8
+ require 'bio/db/pileup'
9
9
 
10
- def initialize(string, opts={})
11
- super(string)
12
- set_defaults(opts)
13
- adj_read_bases
14
- @indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN'
15
- end
16
-
17
- def set_defaults(opts)
18
- @defaults = {
19
- noise: 0.1, # noise level for read depth
20
- ht_low: 0.2, # min allele freq for heterozygosity
21
- ht_high: 0.9, # max allele freq for heterozygosity
22
- min_depth: 6, # minimum coverage for variant
23
- min_non_ref_count: 3,
24
- ignore_reference_n: true,
25
- min_indel_count_support: 3,
26
- }
27
- @defaults.merge(opts)
28
- end
29
-
30
- # removes mapping quality information
31
- def adj_read_bases
32
- # mapping quality after '^' symbol is substituted
33
- # to avoid splitting at non indel + or - characters
34
- # read ends marking by '$' symbol is substituted
35
- # insertion and deletion marking by '*' symbol is substituted
36
- self.read_bases.gsub!(/\^./, '')
37
- self.read_bases.delete! '$'
38
- self.read_bases.delete! '*'
39
- # warn about reads with ambiguous codes
40
- # if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/)
41
- # warn "Ambiguous nucleotide\t#{self.read_bases}"
42
- # end
43
- end
10
+ # An extension of Bio::DB::Pileup object to process pileup information at a given position
11
+ class Pileup < Bio::DB::Pileup
44
12
 
45
- # count bases matching reference and non-reference
46
- # from snp variant and make a hash of bases with counts
47
- # for indels return the read bases information instead
48
- def bases_hash
49
- if self.read_bases =~ /\+/
50
- bases_hash = indels_to_hash('+')
51
- elsif self.read_bases =~ /\-/
52
- bases_hash = indels_to_hash('-')
53
- else
54
- bases_hash = snp_base_hash(self.read_bases)
13
+ # creates a Pileup object using a pileup information as string
14
+ # @param string [String] pileup information line for a given position
15
+ def initialize(string)
16
+ super(string)
17
+ adj_read_bases
18
+ @indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN'
55
19
  end
56
- # some indels will have ref base in the read and using
57
- # sum of hash values is going to give wrong additional coverage
58
- # from indels so including actual coverage from pileup
59
- # bases_hash keys are :A, :C, :G, :T, :N, :ref, :indel and :cov
60
- bases_hash[:cov] = self.coverage
61
- bases_hash
62
- end
63
20
 
64
- # count bases from indels
65
- # array of pileup bases is split at + / -
66
- # and number after each + / - is counted
67
- def count_indel_bases(delimiter)
68
- array = self.read_bases.split(delimiter)
69
- number = 0
70
- array.shift
71
- array.each do |element|
72
- # deletions in reference could contain ambiguous codes,
73
- number += /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
21
+ # removes mapping quality information
22
+ def adj_read_bases
23
+ # mapping quality after '^' symbol is substituted
24
+ # to avoid splitting at non indel + or - characters
25
+ # read ends marking by '$' symbol is substituted
26
+ # insertion and deletion marking by '*' symbol is substituted
27
+ self.read_bases.gsub!(/\^./, '')
28
+ self.read_bases.delete! '$'
29
+ self.read_bases.delete! '*'
30
+ # warn about reads with ambiguous codes
31
+ # if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/)
32
+ # warn "Ambiguous nucleotide\t#{self.read_bases}"
33
+ # end
74
34
  end
75
- number
76
- end
77
35
 
78
- # count bases matching reference and non-reference
79
- # and calculate ratio of non_ref allele to total bases
80
- def non_ref_count
81
- read_bases = self.read_bases
82
- if read_bases =~ /\+/
83
- non_ref_count = indel_non_ref_count('+')
84
- elsif read_bases =~ /\-/
85
- non_ref_count = indel_non_ref_count('-')
86
- else
87
- non_ref_count = read_bases.count('atgcATGC')
36
+ # count bases matching reference and non-reference
37
+ # from snp variant and make a hash of bases with counts
38
+ # for indels return the read bases information instead
39
+ def bases_hash
40
+ if self.read_bases =~ /\+/
41
+ bases_hash = indels_to_hash('+')
42
+ elsif self.read_bases =~ /-/
43
+ bases_hash = indels_to_hash('-')
44
+ else
45
+ bases_hash = snp_base_hash(self.read_bases)
46
+ end
47
+ # some indels will have ref base in the read and using
48
+ # sum of hash values is going to give wrong additional coverage
49
+ # from indels so including actual coverage from pileup
50
+ # bases_hash keys are :ref, :A, :C, :G, :T and, when enough reads support it, :indel
51
+ bases_hash
88
52
  end
89
- non_ref_count
90
- end
91
53
 
92
- # check if the pileup has the parameters we are looking for
93
- def is_var
94
- ignore_reference_n = @defaults[:ignore_reference_n]
95
- min_depth = @defaults[:min_depth]
96
- min_non_ref_count = @defaults[:min_non_ref_count]
54
+ # count bases from indels
55
+ # array of pileup bases is split at + / -
56
+ # and number after each + / - is counted
57
+ def count_indel_bases(delimiter)
58
+ array = self.read_bases.split(delimiter)
59
+ number = 0
60
+ array.shift
61
+ array.each do |element|
62
+ # deletions in reference could contain ambiguous codes,
63
+ number += /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
64
+ end
65
+ number
66
+ end
97
67
 
98
- return false if self.ref_base == '*'
99
- return false if ignore_reference_n and self.ref_base =~ /^[nN]$/
100
- return true if self.coverage >= min_depth and self.non_ref_count >= min_non_ref_count
101
- false
102
- end
68
+ # count bases matching reference and non-reference
69
+ # and calculate ratio of non_ref allele to total bases
70
+ def non_ref_count
71
+ read_bases = self.read_bases
72
+ if read_bases =~ /\+/
73
+ non_ref_count = indel_non_ref_count('+')
74
+ elsif read_bases =~ /-/
75
+ non_ref_count = indel_non_ref_count('-')
76
+ else
77
+ non_ref_count = read_bases.count('atgcATGC')
78
+ end
79
+ non_ref_count
80
+ end
103
81
 
104
- # count bases matching reference and non-reference
105
- # and calculate ratio of non_ref allele to total bases
106
- def non_ref_ratio
107
- self.non_ref_count.to_f / self.coverage.to_f
108
- end
82
+ # check if the pileup has the parameters we are looking for
83
+ def is_var
84
+ ignore_reference_n = Options.ignore_reference_n
85
+ min_depth = Options.mindepth
86
+ min_non_ref_count = Options.min_non_ref_count
109
87
 
110
- # calculate var zygosity for non-polyploid variants
111
- # increased range is used for heterozygosity for RNA-seq data
112
- def var_mode
113
- ht_low = @defaults[:ht_low]
114
- ht_high = @defaults[:ht_high]
115
- mode = ''
116
- if self.non_ref_ratio.between?(ht_low, ht_high)
117
- mode = :het
118
- elsif self.non_ref_ratio > ht_high
119
- mode = :hom
88
+ return false if self.ref_base == '*'
89
+ return false if ignore_reference_n and self.ref_base =~ /^[nN]$/
90
+ return true if self.coverage >= min_depth and self.non_ref_count >= min_non_ref_count
91
+ false
120
92
  end
121
- mode
122
- end
123
93
 
124
- # form hash of base information, [ATGC] counts for snp
125
- # a hash of base proportion is calculated
126
- # base proportion hash below a selected depth is empty
127
- # base proportion below or equal to a noise factor are discarded
128
- def var_base_frac
129
- hash = self.bases_hash
130
- snp_hash = {}
131
- coverage = hash[:cov]
132
- return snp_hash if coverage < @defaults[:min_depth]
133
- # calculate proportion of each base in coverage
134
- hash.each_key do | base |
135
- next if base == :cov
136
- freq = hash[base].to_f/coverage.to_f
137
- next if freq <= @defaults[:noise]
138
- snp_hash[base] = freq
94
+ # count bases matching reference and non-reference
95
+ # and calculate ratio of non_ref allele to total bases
96
+ def non_ref_ratio
97
+ self.non_ref_count.to_f / self.coverage.to_f
139
98
  end
140
- snp_hash
141
- end
142
99
 
100
+ # calculate var zygosity for non-polyploid variants
101
+ # increased range is used for heterozygosity for RNA-seq data
102
+ # def var_mode
103
+ # ht_low = @defaults[:ht_low]
104
+ # ht_high = @defaults[:ht_high]
105
+ # mode = ''
106
+ # if self.non_ref_ratio.between?(ht_low, ht_high)
107
+ # mode = :het
108
+ # elsif self.non_ref_ratio > ht_high
109
+ # mode = :hom
110
+ # end
111
+ # mode
112
+ # end
143
113
 
144
- private
114
+ # form hash of base information, [ATGC] counts for snp
115
+ # a hash of base proportion is calculated
116
+ # base proportion hash below a selected depth is empty
117
+ # base proportion below or equal to a noise factor are discarded
118
+ def var_base_frac
119
+ hash = self.bases_hash
120
+ snp_hash = {}
121
+ coverage = self.coverage
122
+ return snp_hash if coverage < Options.mindepth
123
+ # calculate proportion of each base in coverage
124
+ hash.each_key do | base |
125
+ freq = hash[base].to_f/coverage.to_f
126
+ next if freq <= Options.noise
127
+ snp_hash[base] = freq
128
+ end
129
+ snp_hash
130
+ end
145
131
 
146
- # count number of indels and number non-indel base
147
- # and return a hash with bases and indel counts
148
- def indels_to_hash(delimiter)
149
- non_indel_bases = String.new
150
- array = self.read_bases.split(delimiter)
151
- non_indel_bases << array.shift
152
- array.each do |element|
153
- # get number of nucleotides inserted or deleted
154
- number = /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
155
- # capture remaining nucleotides
156
- non_indel_bases << element.gsub(/^#{number}\w{#{number}}/, '')
132
+
133
+ private
134
+
135
+ # count number of indels and number non-indel base
136
+ # and return a hash with bases and indel counts
137
+ def indels_to_hash(delimiter)
138
+ non_indel_bases = String.new
139
+ array = self.read_bases.split(delimiter)
140
+ non_indel_bases << array.shift
141
+ array.each do |element|
142
+ # get number of nucleotides inserted or deleted
143
+ number = /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
144
+ # capture remaining nucleotides
145
+ non_indel_bases << element.gsub(/^#{number}\w{#{number}}/, '')
146
+ end
147
+ bases_hash = snp_base_hash(non_indel_bases)
148
+ # check at least three reads are supporting indel
149
+ indel_count = self.read_bases.count(delimiter)
150
+ if indel_count >= Options.min_indel_count_support
151
+ bases_hash[:indel] = indel_count
152
+ end
153
+ bases_hash
157
154
  end
158
- bases_hash = snp_base_hash(non_indel_bases)
159
- # check at least three reads are supporting indel
160
- indel_count = self.read_bases.count(delimiter)
161
- if indel_count >= @defaults[:min_indel_count_support]
162
- bases_hash[:indel] = indel_count
155
+
156
+ def snp_base_hash(readbases)
157
+ non_indel_base_hash = {}
158
+ non_indel_base_hash[:ref] = readbases.count('.,')
159
+ non_indel_base_hash[:A] = readbases.count('aA')
160
+ non_indel_base_hash[:C] = readbases.count('cC')
161
+ non_indel_base_hash[:G] = readbases.count('gG')
162
+ non_indel_base_hash[:T] = readbases.count('tT')
163
+ # non_indel_base_hash[:N] = read_bases.count('nN')
164
+ non_indel_base_hash
163
165
  end
164
- bases_hash
165
- end
166
166
 
167
- def snp_base_hash(readbases)
168
- non_indel_base_hash = {}
169
- non_indel_base_hash[:ref] = readbases.count('.,')
170
- non_indel_base_hash[:A] = readbases.count('aA')
171
- non_indel_base_hash[:C] = readbases.count('cC')
172
- non_indel_base_hash[:G] = readbases.count('gG')
173
- non_indel_base_hash[:T] = readbases.count('tT')
174
- # non_indel_base_hash[:N] = read_bases.count('nN')
175
- non_indel_base_hash
176
- end
167
+ def indel_non_ref_count(delimitter)
168
+ read_bases = self.read_bases
169
+ non_ref_count = read_bases.count(@indelbases)
170
+ indelcounts = read_bases.count(delimitter)
171
+ indel_bases = count_indel_bases(delimitter)
172
+ non_ref_count + indelcounts - indel_bases
173
+ end
177
174
 
178
- def indel_non_ref_count(delimitter)
179
- read_bases = self.read_bases
180
- non_ref_count = read_bases.count(@indelbases)
181
- indelcounts = read_bases.count(delimitter)
182
- indel_bases = count_indel_bases(delimitter)
183
- non_ref_count + indelcounts - indel_bases
184
175
  end
185
176
 
186
177
  end