cheripic 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile +0 -1
- data/bin/cheripic +13 -0
- data/cheripic.gemspec +2 -2
- data/lib/cheripic.rb +7 -1
- data/lib/cheripic/bfr.rb +21 -5
- data/lib/cheripic/cmd.rb +36 -14
- data/lib/cheripic/contig.rb +34 -7
- data/lib/cheripic/contig_pileups.rb +70 -26
- data/lib/cheripic/implementer.rb +24 -3
- data/lib/cheripic/options.rb +110 -10
- data/lib/cheripic/pileup.rb +150 -159
- data/lib/cheripic/regions.rb +20 -4
- data/lib/cheripic/variants.rb +59 -12
- data/lib/cheripic/version.rb +5 -1
- metadata +20 -5
data/lib/cheripic/implementer.rb
CHANGED
@@ -2,14 +2,24 @@
|
|
2
2
|
|
3
3
|
module Cheripic
|
4
4
|
|
5
|
+
# Custom error handling for Implementer class
|
5
6
|
class ImplementerError < CheripicError; end
|
6
7
|
|
8
|
+
# An Implementer object for running pipeline from Cmd object options
|
9
|
+
#
|
10
|
+
# @!attribute [r] options
|
11
|
+
# @return [Hash] a hash of required input files names as keys and
|
12
|
+
# user provided file paths as values taken from Cmd object
|
13
|
+
# @!attribute [r] variants
|
14
|
+
# @return [Cheripic::Variants] a Variants object initialized using options from Cmd object
|
7
15
|
class Implementer
|
8
16
|
|
9
17
|
require 'ostruct'
|
10
18
|
require 'fileutils'
|
11
|
-
|
19
|
+
attr_reader :options, :variants, :has_run
|
12
20
|
|
21
|
+
# Initializes an Implementer object using inputs from cmd object
|
22
|
+
# @param inputs [Hash] a hash of trollop option names as keys and user or default setting as values from Cmd object
|
13
23
|
def initialize(inputs)
|
14
24
|
set1 = %i{assembly
|
15
25
|
input_format
|
@@ -38,13 +48,20 @@ module Cheripic
|
|
38
48
|
settings = inputs.select { |k| set2.include?(k) }
|
39
49
|
Options.update(settings)
|
40
50
|
FileUtils.mkdir_p @options.output
|
51
|
+
@vars_extracted = false
|
52
|
+
@has_run = false
|
41
53
|
end
|
42
54
|
|
55
|
+
# Initializes a Variants object using input options (files).
|
56
|
+
# Each pileup file is processed and bulks are compared
|
43
57
|
def extract_vars
|
44
58
|
@variants = Variants.new(@options)
|
45
59
|
@variants.compare_pileups
|
60
|
+
@vars_extracted = true
|
46
61
|
end
|
47
62
|
|
63
|
+
# Extracted variants from bulk comparison are re-analysed
|
64
|
+
# and selected variants are written to a file
|
48
65
|
def process_variants
|
49
66
|
@variants.verify_bg_bulk_pileup
|
50
67
|
# print selected variants that could be potential markers or mutation
|
@@ -63,16 +80,20 @@ module Cheripic
|
|
63
80
|
out_file.close
|
64
81
|
end
|
65
82
|
|
83
|
+
# Wrapper to extract and isolate selected variants
|
84
|
+
# implements extract_vars and process_variants and
|
85
|
+
# if data is from polyploids extracts contigs with high bfr
|
66
86
|
def run
|
67
|
-
unless
|
87
|
+
unless @vars_extracted
|
68
88
|
self.extract_vars
|
69
89
|
end
|
70
|
-
if Options.
|
90
|
+
if Options.polyploidy
|
71
91
|
self.process_variants
|
72
92
|
@variants.bfr_frags
|
73
93
|
else
|
74
94
|
self.process_variants
|
75
95
|
end
|
96
|
+
@has_run = true
|
76
97
|
end
|
77
98
|
|
78
99
|
end
|
data/lib/cheripic/options.rb
CHANGED
@@ -2,12 +2,12 @@
|
|
2
2
|
|
3
3
|
module Cheripic
|
4
4
|
|
5
|
+
# A class to get default settings and update user settings for parameters
|
6
|
+
# and facilitate retrieval of settings anywhere in the module
|
5
7
|
class Options
|
6
8
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
@defaults = {
|
9
|
+
# Default parameter settings
|
10
|
+
@def_settings = {
|
11
11
|
:hmes_adjust => 0.5,
|
12
12
|
:htlow => 0.2,
|
13
13
|
:hthigh => 0.9,
|
@@ -25,16 +25,116 @@ module Cheripic
|
|
25
25
|
:bfr_adjust => 0.05,
|
26
26
|
:sel_seq_len => 50
|
27
27
|
}
|
28
|
-
# @params = OpenStruct.new(@defaults)
|
29
28
|
|
29
|
+
# set defaults as user settings
|
30
|
+
@user_settings = @def_settings
|
31
|
+
|
32
|
+
# A value to adjust calculation of Homozygosity Enrichment Score (HMES)
|
33
|
+
# @return [Float]
|
34
|
+
def self.hmes_adjust
|
35
|
+
@user_settings[:hmes_adjust]
|
36
|
+
end
|
37
|
+
|
38
|
+
# Lower cut off of allele fraction for categorization of a variant as heterozygous
|
39
|
+
# @return [Float]
|
40
|
+
def self.htlow
|
41
|
+
@user_settings[:htlow]
|
42
|
+
end
|
43
|
+
|
44
|
+
# Higher cut off of allele fraction for categorization of a variant as heterozygous
|
45
|
+
# @return [Float]
|
46
|
+
def self.hthigh
|
47
|
+
@user_settings[:hthigh]
|
48
|
+
end
|
49
|
+
|
50
|
+
# Minimum read coverage at the variant position to be considered for analysis
|
51
|
+
# @return [Integer]
|
52
|
+
def self.mindepth
|
53
|
+
@user_settings[:mindepth]
|
54
|
+
end
|
55
|
+
|
56
|
+
# Minimum non reference count at the variant position to be considered for analysis
|
57
|
+
# @return [Integer]
|
58
|
+
def self.min_non_ref_count
|
59
|
+
@user_settings[:min_non_ref_count]
|
60
|
+
end
|
61
|
+
|
62
|
+
# Minimum reads supporting an indel at the variant position to be considered for analysis as indel
|
63
|
+
# @return [Integer]
|
64
|
+
def self.min_indel_count_support
|
65
|
+
@user_settings[:min_indel_count_support]
|
66
|
+
end
|
67
|
+
|
68
|
+
# Option for whether to ignore or consider the reference positions which are ambiguous
|
69
|
+
# @return [Boolean]
|
70
|
+
def self.ignore_reference_n
|
71
|
+
@user_settings[:ignore_reference_n]
|
72
|
+
end
|
73
|
+
|
74
|
+
# Minimum alignment mapping quality of the read to be used for bam files
|
75
|
+
# @return [Integer]
|
76
|
+
def self.mapping_quality
|
77
|
+
@user_settings[:mapping_quality]
|
78
|
+
end
|
79
|
+
|
80
|
+
# Minimum aligned base quality at the variant position to be considered for analysis
|
81
|
+
# @return [Integer]
|
82
|
+
def self.base_quality
|
83
|
+
@user_settings[:base_quality]
|
84
|
+
end
|
85
|
+
|
86
|
+
# Threshold for fraction of read bases at variant position below which are ignored as noise
|
87
|
+
# @return [Float]
|
88
|
+
def self.noise
|
89
|
+
@user_settings[:noise]
|
90
|
+
end
|
91
|
+
|
92
|
+
# Option for cross type used for generating bulk population
|
93
|
+
# @note options are either 'back' or 'out'
|
94
|
+
# @return [String]
|
95
|
+
def self.cross_type
|
96
|
+
@user_settings[:cross_type]
|
97
|
+
end
|
98
|
+
|
99
|
+
# Option for whether to ignore or consider the contigs without any variants
|
100
|
+
# @return [Boolean]
|
101
|
+
def self.only_frag_with_vars
|
102
|
+
@user_settings[:only_frag_with_vars]
|
103
|
+
end
|
104
|
+
|
105
|
+
# Option for whether to ignore or consider the contigs with low HME score
|
106
|
+
# @return [Boolean]
|
107
|
+
def self.filter_out_low_hmes
|
108
|
+
@user_settings[:filter_out_low_hmes]
|
109
|
+
end
|
110
|
+
|
111
|
+
# Option indicating whether the input data is from a polyploid or not
|
112
|
+
# @return [Boolean]
|
113
|
+
def self.polyploidy
|
114
|
+
@user_settings[:polyploidy]
|
115
|
+
end
|
116
|
+
|
117
|
+
# A value to adjust calculation of bulk frequency ratio (bfr)
|
118
|
+
# @return [Float]
|
119
|
+
def self.bfr_adjust
|
120
|
+
@user_settings[:bfr_adjust]
|
121
|
+
end
|
122
|
+
|
123
|
+
# Number of nucleotides of sequence to select from each side of the selected variant
|
124
|
+
# @return [Integer]
|
125
|
+
def self.sel_seq_len
|
126
|
+
@user_settings[:sel_seq_len]
|
127
|
+
end
|
128
|
+
|
129
|
+
# Updates the values of options using a hash generated from user inputs
|
130
|
+
# @param newset [Hash] a hash of option names as keys and user settings as values
|
30
131
|
def self.update(newset)
|
31
|
-
@
|
32
|
-
self.params
|
33
|
-
# @params = OpenStruct.new(@defaults)
|
132
|
+
@user_settings = @def_settings.merge(newset)
|
34
133
|
end
|
35
134
|
|
36
|
-
|
37
|
-
|
135
|
+
# Resets the values of options to defaults
|
136
|
+
def self.defaults
|
137
|
+
@user_settings = @def_settings
|
38
138
|
end
|
39
139
|
|
40
140
|
end
|
data/lib/cheripic/pileup.rb
CHANGED
@@ -1,186 +1,177 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'bio-samtools'
|
4
|
-
require 'bio/db/pileup'
|
2
|
+
module Cheripic
|
5
3
|
|
6
|
-
|
4
|
+
# Custom error handling for Pileup class
|
5
|
+
class PileupError < CheripicError; end
|
7
6
|
|
8
|
-
|
7
|
+
require 'bio-samtools'
|
8
|
+
require 'bio/db/pileup'
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
set_defaults(opts)
|
13
|
-
adj_read_bases
|
14
|
-
@indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN'
|
15
|
-
end
|
16
|
-
|
17
|
-
def set_defaults(opts)
|
18
|
-
@defaults = {
|
19
|
-
noise: 0.1, # noise level for read depth
|
20
|
-
ht_low: 0.2, # min allele freq for heterozygosity
|
21
|
-
ht_high: 0.9, # max allele freq for heterozygosity
|
22
|
-
min_depth: 6, # minimum coverage for variant
|
23
|
-
min_non_ref_count: 3,
|
24
|
-
ignore_reference_n: true,
|
25
|
-
min_indel_count_support: 3,
|
26
|
-
}
|
27
|
-
@defaults.merge(opts)
|
28
|
-
end
|
29
|
-
|
30
|
-
# removes mapping quality information
|
31
|
-
def adj_read_bases
|
32
|
-
# mapping quality after '^' symbol is substituted
|
33
|
-
# to avoid splitting at non indel + or - characters
|
34
|
-
# read ends marking by '$' symbol is substituted
|
35
|
-
# insertion and deletion marking by '*' symbol is substituted
|
36
|
-
self.read_bases.gsub!(/\^./, '')
|
37
|
-
self.read_bases.delete! '$'
|
38
|
-
self.read_bases.delete! '*'
|
39
|
-
# warn about reads with ambiguous codes
|
40
|
-
# if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/)
|
41
|
-
# warn "Ambiguous nucleotide\t#{self.read_bases}"
|
42
|
-
# end
|
43
|
-
end
|
10
|
+
# An extension of Bio::DB::Pileup object to process pileup information at a given position
|
11
|
+
class Pileup < Bio::DB::Pileup
|
44
12
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
elsif self.read_bases =~ /\-/
|
52
|
-
bases_hash = indels_to_hash('-')
|
53
|
-
else
|
54
|
-
bases_hash = snp_base_hash(self.read_bases)
|
13
|
+
# creates a Pileup object from a string of pileup information
|
14
|
+
# @param string [String] pileup information line for a given position
|
15
|
+
def initialize(string)
|
16
|
+
super(string)
|
17
|
+
adj_read_bases
|
18
|
+
@indelbases = 'acgtryswkmbdhvnACGTRYSWKMBDHVN'
|
55
19
|
end
|
56
|
-
# some indels will have ref base in the read and using
|
57
|
-
# sum of hash values is going to give wrong additional coverage
|
58
|
-
# from indels so including actual coverage from pileup
|
59
|
-
# bases_hash keys are :A, :C, :G, :T, :N, :ref, :indel and :cov
|
60
|
-
bases_hash[:cov] = self.coverage
|
61
|
-
bases_hash
|
62
|
-
end
|
63
20
|
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
21
|
+
# removes mapping quality information
|
22
|
+
def adj_read_bases
|
23
|
+
# mapping quality after '^' symbol is substituted
|
24
|
+
# to avoid splitting at non indel + or - characters
|
25
|
+
# read ends marking by '$' symbol is substituted
|
26
|
+
# insertion and deletion marking by '*' symbol is substituted
|
27
|
+
self.read_bases.gsub!(/\^./, '')
|
28
|
+
self.read_bases.delete! '$'
|
29
|
+
self.read_bases.delete! '*'
|
30
|
+
# warn about reads with ambiguous codes
|
31
|
+
# if self.read_bases.match(/[^atgcATGC,\.\+\-0-9]/)
|
32
|
+
# warn "Ambiguous nucleotide\t#{self.read_bases}"
|
33
|
+
# end
|
74
34
|
end
|
75
|
-
number
|
76
|
-
end
|
77
35
|
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
36
|
+
# count bases matching reference and non-reference
|
37
|
+
# from snp variant and make a hash of bases with counts
|
38
|
+
# for indels return the read bases information instead
|
39
|
+
def bases_hash
|
40
|
+
if self.read_bases =~ /\+/
|
41
|
+
bases_hash = indels_to_hash('+')
|
42
|
+
elsif self.read_bases =~ /-/
|
43
|
+
bases_hash = indels_to_hash('-')
|
44
|
+
else
|
45
|
+
bases_hash = snp_base_hash(self.read_bases)
|
46
|
+
end
|
47
|
+
# some indels will have ref base in the read and using
|
48
|
+
# sum of hash values is going to give wrong additional coverage
|
49
|
+
# from indels so including actual coverage from pileup
|
50
|
+
# bases_hash keys are :A, :C, :G, :T, :N, :ref and :indel
|
51
|
+
bases_hash
|
88
52
|
end
|
89
|
-
non_ref_count
|
90
|
-
end
|
91
53
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
54
|
+
# count bases from indels
|
55
|
+
# array of pileup bases is split at + / -
|
56
|
+
# and number after each + / - is counted
|
57
|
+
def count_indel_bases(delimiter)
|
58
|
+
array = self.read_bases.split(delimiter)
|
59
|
+
number = 0
|
60
|
+
array.shift
|
61
|
+
array.each do |element|
|
62
|
+
# deletions in reference could contain ambiguous codes,
|
63
|
+
number += /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
|
64
|
+
end
|
65
|
+
number
|
66
|
+
end
|
97
67
|
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
68
|
+
# count bases matching reference and non-reference
|
69
|
+
# and return the non-reference count (the ratio to total bases is calculated in non_ref_ratio)
|
70
|
+
def non_ref_count
|
71
|
+
read_bases = self.read_bases
|
72
|
+
if read_bases =~ /\+/
|
73
|
+
non_ref_count = indel_non_ref_count('+')
|
74
|
+
elsif read_bases =~ /-/
|
75
|
+
non_ref_count = indel_non_ref_count('-')
|
76
|
+
else
|
77
|
+
non_ref_count = read_bases.count('atgcATGC')
|
78
|
+
end
|
79
|
+
non_ref_count
|
80
|
+
end
|
103
81
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
82
|
+
# check if the pileup has the parameters we are looking for
|
83
|
+
def is_var
|
84
|
+
ignore_reference_n = Options.ignore_reference_n
|
85
|
+
min_depth = Options.mindepth
|
86
|
+
min_non_ref_count = Options.min_non_ref_count
|
109
87
|
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
ht_high = @defaults[:ht_high]
|
115
|
-
mode = ''
|
116
|
-
if self.non_ref_ratio.between?(ht_low, ht_high)
|
117
|
-
mode = :het
|
118
|
-
elsif self.non_ref_ratio > ht_high
|
119
|
-
mode = :hom
|
88
|
+
return false if self.ref_base == '*'
|
89
|
+
return false if ignore_reference_n and self.ref_base =~ /^[nN]$/
|
90
|
+
return true if self.coverage >= min_depth and self.non_ref_count >= min_non_ref_count
|
91
|
+
false
|
120
92
|
end
|
121
|
-
mode
|
122
|
-
end
|
123
93
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
def var_base_frac
|
129
|
-
hash = self.bases_hash
|
130
|
-
snp_hash = {}
|
131
|
-
coverage = hash[:cov]
|
132
|
-
return snp_hash if coverage < @defaults[:min_depth]
|
133
|
-
# calculate proportion of each base in coverage
|
134
|
-
hash.each_key do | base |
|
135
|
-
next if base == :cov
|
136
|
-
freq = hash[base].to_f/coverage.to_f
|
137
|
-
next if freq <= @defaults[:noise]
|
138
|
-
snp_hash[base] = freq
|
94
|
+
# count bases matching reference and non-reference
|
95
|
+
# and calculate ratio of non_ref allele to total bases
|
96
|
+
def non_ref_ratio
|
97
|
+
self.non_ref_count.to_f / self.coverage.to_f
|
139
98
|
end
|
140
|
-
snp_hash
|
141
|
-
end
|
142
99
|
|
100
|
+
# calculate var zygosity for non-polyploid variants
|
101
|
+
# increased range is used for heterozygosity for RNA-seq data
|
102
|
+
# def var_mode
|
103
|
+
# ht_low = @defaults[:ht_low]
|
104
|
+
# ht_high = @defaults[:ht_high]
|
105
|
+
# mode = ''
|
106
|
+
# if self.non_ref_ratio.between?(ht_low, ht_high)
|
107
|
+
# mode = :het
|
108
|
+
# elsif self.non_ref_ratio > ht_high
|
109
|
+
# mode = :hom
|
110
|
+
# end
|
111
|
+
# mode
|
112
|
+
# end
|
143
113
|
|
144
|
-
|
114
|
+
# form hash of base information, [ATGC] counts for snp
|
115
|
+
# a hash of base proportion is calculated
|
116
|
+
# base proportion hash below a selected depth is empty
|
117
|
+
# base proportion below or equal to a noise factor are discarded
|
118
|
+
def var_base_frac
|
119
|
+
hash = self.bases_hash
|
120
|
+
snp_hash = {}
|
121
|
+
coverage = self.coverage
|
122
|
+
return snp_hash if coverage < Options.mindepth
|
123
|
+
# calculate proportion of each base in coverage
|
124
|
+
hash.each_key do | base |
|
125
|
+
freq = hash[base].to_f/coverage.to_f
|
126
|
+
next if freq <= Options.noise
|
127
|
+
snp_hash[base] = freq
|
128
|
+
end
|
129
|
+
snp_hash
|
130
|
+
end
|
145
131
|
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
132
|
+
|
133
|
+
private
|
134
|
+
|
135
|
+
# count the number of indels and the number of non-indel bases
|
136
|
+
# and return a hash with bases and indel counts
|
137
|
+
def indels_to_hash(delimiter)
|
138
|
+
non_indel_bases = String.new
|
139
|
+
array = self.read_bases.split(delimiter)
|
140
|
+
non_indel_bases << array.shift
|
141
|
+
array.each do |element|
|
142
|
+
# get number of nucleotides inserted or deleted
|
143
|
+
number = /^(\d+)[#{@indelbases}]/.match(element)[1].to_i
|
144
|
+
# capture remaining nucleotides
|
145
|
+
non_indel_bases << element.gsub(/^#{number}\w{#{number}}/, '')
|
146
|
+
end
|
147
|
+
bases_hash = snp_base_hash(non_indel_bases)
|
148
|
+
# check that at least Options.min_indel_count_support reads are supporting the indel
|
149
|
+
indel_count = self.read_bases.count(delimiter)
|
150
|
+
if indel_count >= Options.min_indel_count_support
|
151
|
+
bases_hash[:indel] = indel_count
|
152
|
+
end
|
153
|
+
bases_hash
|
157
154
|
end
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
155
|
+
|
156
|
+
def snp_base_hash(readbases)
|
157
|
+
non_indel_base_hash = {}
|
158
|
+
non_indel_base_hash[:ref] = readbases.count('.,')
|
159
|
+
non_indel_base_hash[:A] = readbases.count('aA')
|
160
|
+
non_indel_base_hash[:C] = readbases.count('cC')
|
161
|
+
non_indel_base_hash[:G] = readbases.count('gG')
|
162
|
+
non_indel_base_hash[:T] = readbases.count('tT')
|
163
|
+
# non_indel_base_hash[:N] = read_bases.count('nN')
|
164
|
+
non_indel_base_hash
|
163
165
|
end
|
164
|
-
bases_hash
|
165
|
-
end
|
166
166
|
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
# non_indel_base_hash[:N] = read_bases.count('nN')
|
175
|
-
non_indel_base_hash
|
176
|
-
end
|
167
|
+
def indel_non_ref_count(delimitter)
|
168
|
+
read_bases = self.read_bases
|
169
|
+
non_ref_count = read_bases.count(@indelbases)
|
170
|
+
indelcounts = read_bases.count(delimitter)
|
171
|
+
indel_bases = count_indel_bases(delimitter)
|
172
|
+
non_ref_count + indelcounts - indel_bases
|
173
|
+
end
|
177
174
|
|
178
|
-
def indel_non_ref_count(delimitter)
|
179
|
-
read_bases = self.read_bases
|
180
|
-
non_ref_count = read_bases.count(@indelbases)
|
181
|
-
indelcounts = read_bases.count(delimitter)
|
182
|
-
indel_bases = count_indel_bases(delimitter)
|
183
|
-
non_ref_count + indelcounts - indel_bases
|
184
175
|
end
|
185
176
|
|
186
177
|
end
|