cheripic 1.2.5 → 1.2.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 824a8c68d3707ad02cf0d3b7d567191244d1a5a6
4
- data.tar.gz: 6d2b3c7bef04aba06b5206968d1a0d69996a25b0
3
+ metadata.gz: 4b848b1b52f01215fabf404f7387e6f621a2285b
4
+ data.tar.gz: e434cbad53ac982974c30cf2a940668862a6d2bd
5
5
  SHA512:
6
- metadata.gz: 6ae5a85c30a0b1ea19f118409ddec95a6c7c3e11f00663e9769a5642770e90cb2ab5b0200f9d9eaa4ed8c6873492ac7f5f3acd568dc0cc14ddf8ccaac5012435
7
- data.tar.gz: efe77b2ccafd0ad7ed4eeb47b497207cacf3dbaee058779b46af7a5ca34597991a3949dbeb0f3807bba69ea7dca0dd7e24fb58c040bf0065fef2ca1e4e3424fc
6
+ metadata.gz: ef7eb0c0008b8d8ed45d6e1deeeb4c59ef187a8af3a92fbc91dba33d91af03b475c337c2e69dcdcb8c34c3cfeea3f9f5d54e42fadba8d0270e0a776c96609c79
7
+ data.tar.gz: beb3fccf28f54a30fc355a93261043d27dfd4d2ebb34043245404608d1608d2185aae57b8263192ca5973caf1a99c9b9b1614187e5d253656efc9ec9fab9aa50
@@ -6,8 +6,15 @@ All significant changes to this project at each release are documented in this f
6
6
  #### Future changes to include
7
7
 
8
8
  1. option to take multiple background pileup files
9
- 2. replace output directory with output file name tag, since we only write to one file
10
- 3. option to take bam file or pileup file as inputs of bulks
9
+
10
+ #### [1.2.6] - 2016-10-26
11
+
12
+ 1. option to run using only vcf files as bulk inputs to increase speed of analysis for larger genomes
13
+
14
+ #### [1.2.5] - 2016-10-17
15
+
16
+ 1. Updated methods to take a bam file (along with a vcf file) or a pileup file as inputs of bulks
17
+ 2. Replaced output directory with output file name tag, since we only write to one file
11
18
 
12
19
  #### [1.2.0] - 2016-08-11
13
20
 
@@ -18,4 +25,4 @@ All significant changes to this project at each release are documented in this f
18
25
 
19
26
  #### [1.1.0] - 2016-07-26
20
27
 
21
- first release of the binaries for Linux 64 bit and OSX 64bit
28
+ first release of the binaries for Linux 64 bit and OSX 64bit
data/README.md CHANGED
@@ -45,7 +45,7 @@ Running `cheripic` without any input at command line interface shows following h
45
45
 
46
46
  ```
47
47
 
48
- Cheripic v1.2.0
48
+ Cheripic v1.2.6
49
49
  Authors: Shyam Rallapalli and Dan MacLean
50
50
 
51
51
  Description: Candidate mutation and closely linked marker selection for non reference genomes
@@ -53,35 +53,45 @@ Uses bulk segregant data from non-reference sequence genomes
53
53
 
54
54
  Inputs:
55
55
  1. Needs a reference fasta file of asssembly use for variant analysis
56
- 2. Pileup files for mutant (phenotype of interest) bulks and background (wildtype phenotype) bulks
57
- 3. If polyploid species, include of pileup from one or both parents
56
+ 2. Pileup/Bam files for mutant (phenotype of interest) bulks and background (wildtype phenotype) bulks
57
+ 3. If providing bam files, you have to include vcf files for the respective bulks
58
+ 4. If polyploid species, include pileup/bam files from one or both parents
58
59
 
59
60
  USAGE:
60
61
  cheripic <options>
61
62
 
62
63
  OPTIONS:
63
64
  -f, --assembly=<s> Assembly file in FASTA format
64
- -F, --input-format=<s> bulk and parent alignment file format types - set either pileup or bam (default: pileup)
65
+ -F, --input-format=<s> bulk and parent alignment file format types - set either pileup or bam or vcf (default: pileup)
65
66
  -a, --mut-bulk=<s> Pileup or sorted BAM file alignments from mutant/trait of interest bulk 1
67
+ --mut-bulk-vcf=<s> vcf file for variants from mutant/trait of interest bulk 1 (default: )
66
68
  -b, --bg-bulk=<s> Pileup or sorted BAM file alignments from background/wildtype bulk 2
67
- --output=<s> Directory to store results, will be created if not existing (default: cheripic_results)
69
+ --bg-bulk-vcf=<s> vcf file for variants from background/wildtype bulk 2 (default: )
70
+ --output=<s> custom name tag to include in the output file name (default: cheripic_results)
68
71
  --loglevel=<s> Choose any one of "info / warn / debug" level for logs generated (default: debug)
69
72
  --hmes-adjust=<f> factor added to snp count of each contig to adjust for hme score calculations (default: 0.5)
70
73
  --htlow=<f> lower level for categorizing heterozygosity (default: 0.2)
71
74
  --hthigh=<f> high level for categorizing heterozygosity (default: 0.9)
72
- --mindepth=<i> minimum read depth to conisder a position for variant calls (default: 6)
75
+ --mindepth=<i> minimum read depth at a position to consider for variant calls (default: 6)
76
+ --max-d-multiple=<i> multiplication factor for average coverage to calculate maximum read coverage
77
+ if set zero no calculation will be made from bam file.
78
+ setting this value will override user set max depth (Default: 5)
79
+ --maxdepth=<i> maximum read depth at a position to consider for variant calls
80
+ if set to zero no user max depth will be used (default: 0)
73
81
  --min-non-ref-count=<i> minimum read depth supporting non reference base at each position (default: 3)
74
82
  --min-indel-count-support=<i> minimum read depth supporting an indel at each position (default: 3)
75
- --ambiguous-ref-bases including variant at completely ambiguous bases in the reference
83
+ --ambiguous-ref-bases=<s> including variant at completely ambiguous bases in the reference (default: false)
76
84
  -q, --mapping-quality=<i> minimum mapping quality of read covering the position (default: 20)
77
85
  -Q, --base-quality=<i> minimum base quality of bases covering the position (default: 15)
78
86
  --noise=<f> praportion of reads for a variant to conisder as noise (default: 0.1)
79
87
  --cross-type=<s> type of cross used to generated mapping population - back or out (default: back)
80
- --use-all-contigs option to select all contigs or only contigs containing variants for analysis
81
- --include-low-hmes option to include or discard variants from contigs with low hme-score or bfr score to list in the final output
82
- --polyploidy Set if the data input is from polyploids
88
+ --use-all-contigs=<s> option to select all contigs or only contigs containing variants for analysis (default: false)
89
+ --include-low-hmes=<s> option to include or discard variants from contigs with
90
+ low hme-score or bfr score to list in the final output (default: false)
91
+ --polyploidy=<s> Set if the data input is from polyploids (default: false)
83
92
  -p, --mut-parent=<s> Pileup or sorted BAM file alignments from mutant/trait of interest parent (default: )
84
93
  -r, --bg-parent=<s> Pileup or sorted BAM file alignments from background/wildtype parent (default: )
94
+ -R, --repeats-file=<s> repeat masker output file for the assembly (default: )
85
95
  --bfr-adjust=<f> factor added to hemi snp frequency of each parent to adjust for bfr calculations (default: 0.05)
86
96
  --sel-seq-len=<i> sequence length to print from either side of selected variants (default: 50)
87
97
  --examples shows some example commands with explanation
data/Rakefile CHANGED
@@ -61,8 +61,8 @@ def create_package(target)
61
61
  package_dir = "packaging/#{package_dest}"
62
62
  sh "rm -rf #{package_dir}"
63
63
  sh "mkdir #{package_dir}"
64
- sh "mkdir -p #{package_dir}/lib/app"
65
- sh "cp -R bin #{package_dir}/lib/app/"
64
+ sh "mkdir -p #{package_dir}/lib/app/bin"
65
+ sh "cp bin/cheripic #{package_dir}/lib/app/bin/"
66
66
  sh "cp -R lib #{package_dir}/lib/app/"
67
67
  sh "mkdir #{package_dir}/lib/app/ruby"
68
68
  sh "tar -xzf packaging/traveling-ruby-#{TRAVELING_RUBY_VERSION}-#{target}.tar.gz -C #{package_dir}/lib/app/ruby"
@@ -1,21 +1,25 @@
1
- <tool id="cheripic" name="CHERIPIC" version="1.2.0">
1
+ <tool id="cheripic" name="CHERIPIC" version="1.2.6">
2
2
 
3
3
  <description>CHERIPIC</description>
4
4
 
5
- <version_command>cheripic -v</version_command>
5
+ <version_command>/full_path_to/cheripic -v</version_command>
6
6
 
7
7
  <command>
8
8
  <![CDATA[
9
- cheripic
9
+ /full_path_to/cheripic
10
10
  --assembly $assembly
11
+ --input-format $input_format
11
12
  --mut-bulk $mut_bulk
12
13
  --bg-bulk $bg_bulk
13
- --output $output
14
+ --mut-bulk-vcf $mut_bulk_vcf
15
+ --bg-bulk-vcf $bg_bulk_vcf
14
16
  --loglevel $loglevel
15
17
  --hmes-adjust $hmes_adjust
16
18
  --htlow $ht_low
17
19
  --hthigh $ht_high
18
20
  --mindepth $min_depth
21
+ --max-d-multiple $max_d_multiple
22
+ --maxdepth $max_depth
19
23
  --min-non-ref-count $min_non_ref_count
20
24
  --min-indel-count-support $min_indel_count_support
21
25
  --ambiguous-ref-bases $ambiguous_ref_bases
@@ -28,17 +32,26 @@
28
32
  --polyploidy $polyploidy
29
33
  --mut-parent $mut_parent
30
34
  --bg-parent $bg_parent
35
+ --repeats-file $repeats_file
31
36
  --bfr-adjust $bfr_adjust
32
37
  --sel-seq-len $sel_seq_len
38
+ &> output_log.txt
33
39
  ]]>
34
40
  </command>
35
41
 
36
42
  <inputs>
37
43
  <param name="assembly" type="data" format="fasta" label="Input Assembly file" help="Select Assembly fasta file" />
38
- <param name="mut_bulk" type="data" format="pileup" label="mutant bulk pileup file" help="Select mutant bulk pileup file" />
39
- <param name="bg_bulk" type="data" format="pileup" label="background bulk pileup file" min="1" multiple="true" help="Select background bulk pileup file" />
40
- <param name="loglevel" type="select" optional="true" label="analysis log level" help="choose between info, warn and debug levels">
41
- <option value="info" selected="true">info </option>
44
+ <param name="input_format" type="select" optional="true" label="input file format" help="choose between vcf, bam and pileup format" >
45
+ <option value="vcf" selected="true">vcf</option>
46
+ <option value="bam">bam</option>
47
+ <option value="pileup">pileup</option>
48
+ </param>
49
+ <param name="mut_bulk" type="data" label="mutant bulk input file" help="Select mutant bulk input file" />
50
+ <param name="bg_bulk" type="data" label="background bulk input file" help="Select background bulk input file" />
51
+ <param name="mut_bulk_vcf" type="data" optional="true" label="mutant bulk input vcf file" help="Select mutant bulk input vcf file" />
52
+ <param name="bg_bulk_vcf" type="data" optional="true" label="background bulk input vcf file" help="Select background bulk input vcf file" />
53
+ <param name="loglevel" type="select" optional="true" label="analysis log level" help="choose between info, warn and debug levels" >
54
+ <option value="info" selected="true">info</option>
42
55
  <option value="warn">warnings</option>
43
56
  <option value="debug">debug</option>
44
57
  </param>
@@ -50,6 +63,10 @@
50
63
  label="heterozygosity high limit" help="upper limit to heterozygosity allele fraction" />
51
64
  <param name="min_depth" size="4" type="integer" optional="true" value="6" min="1" max="8000"
52
65
  label="minimum read coverage" help="minimum read depth to conisder a position for variant calls" />
66
+ <param name="max_d_multiple" size="4" type="integer" optional="true" value="5" min="0" max="100"
67
+ label="multiplication factor avg read coverage" help="multiplication factor for average coverage to calculate maximum read coverage" />
68
+ <param name="max_depth" size="4" type="integer" optional="true" value="0" min="0" max="8000"
69
+ label="maximum read coverage" help="maximum read depth to conisder a position for variant calls" />
53
70
  <param name="min_non_ref_count" size="4" type="integer" optional="true" value="3" min="1" max="8000"
54
71
  label="minimum alternate read coverage" help="minimum read depth supporting non reference base at each position" />
55
72
  <param name="min_indel_count_support" size="4" type="integer" optional="true" value="3" min="1" max="8000"
@@ -73,20 +90,22 @@
73
90
  help="option to include or discard variants from contigs with low hme-score or bfr score to list in the final output" truevalue="true" falsevalue="false" />
74
91
  <param name="polyploidy" type="boolean" optional="true" checked="false" label="polyploid data"
75
92
  help="Set if the input data is from polyploids" truevalue="true" falsevalue="false" />
76
- <param name="mut-parent" type="data" optional="true" format="pileup" label="mutant parent pileup file" help="Select mutant parent pileup file" />
77
- <param name="bg-parent" type="data" optional="true" format="pileup" label="background parent pileup file" help="Select background parent pileup file" />
93
+ <param name="mut_parent" type="data" optional="true" format="pileup" label="mutant parent pileup file" help="Select mutant parent pileup file" />
94
+ <param name="bg_parent" type="data" optional="true" format="pileup" label="background parent pileup file" help="Select background parent pileup file" />
95
+ <param name="repeats_file" type="data" optional="true" format="txt" label="Repeat masker output file" help="Repeat masker output file of repeat positions" />
78
96
 
79
97
  <param name="bfr_adjust" size="4" type="float" optional="true" value="0.05" min="0.01" max="1.0"
80
98
  label="bfr score adjuster" help="factor added to hemi snp frequency of each parent to adjust for bfr calculations (default: 0.05)" />
81
99
  <param name="sel_seq_len" size="4" type="integer" optional="true" value="50" min="10" max="250"
82
100
  label="selected variant seq length out" help="sequence length to print from either side of selected variants (default: 50)" />
83
-
84
- <param name="output" type="text" size="30" value="cheripic_results" label="tag for output filename" help="write a tag to include with output filename" />
85
101
  </inputs>
86
102
 
87
103
  <outputs>
88
- <data name="output_1" format="txt" file="${output}_selected_hme_variants.txt" />
89
- <data name="output_2" format="txt" file="${output}_selected_bfr_variants.txt" />
104
+ <data name="output1" format="txt" from_work_dir="output_log.txt" label="cheripic log file" />
105
+ <data name="output2" format="txt" from_work_dir="cheripic_results_selected_hme_variants.txt" label="selected hmes variants" />
106
+ <data name="output3" format="txt" from_work_dir="cheripic_results_selected_bfr_variants.txt" label="selected bfr variants" >
107
+ <filter>polyploidy == "true"</filter>
108
+ </data>
90
109
  </outputs>
91
110
 
92
111
  <tests>
@@ -154,15 +173,22 @@ All of the options have a default value. You can change any of them. All of the
154
173
 
155
174
  OPTIONS:
156
175
  -f, --assembly Assembly file in FASTA format
157
- -F, --input-format bulk and parent alignment file format types - set either pileup or bam (default: pileup)
176
+ -F, --input-format bulk and parent alignment file format types - set either pileup or bam or vcf (default: pileup)
158
177
  -a, --mut-bulk Pileup or sorted BAM file alignments from mutant/trait of interest bulk 1
178
+ --mut-bulk-vcf vcf file for variants from mutant/trait of interest bulk 1
159
179
  -b, --bg-bulk Pileup or sorted BAM file alignments from background/wildtype bulk 2
160
- --output Directory to store results, will be created if not existing (default: cheripic_results)
180
+ --bg-bulk-vcf vcf file for variants from background/wildtype bulk 2
181
+ --output custom name tag to include in the output file name (default: cheripic_results)
161
182
  --loglevel Choose any one of "info / warn / debug" level for logs generated (default: debug)
162
183
  --hmes-adjust factor added to snp count of each contig to adjust for hme score calculations (default: 0.5)
163
184
  --htlow lower level for categorizing heterozygosity (default: 0.2)
164
185
  --hthigh high level for categorizing heterozygosity (default: 0.9)
165
- --mindepth minimum read depth to conisder a position for variant calls (default: 6)
186
+ --mindepth minimum read depth at a position to consider for variant calls (default: 6)
187
+ --max-d-multiple multiplication factor for average coverage to calculate maximum read coverage
188
+ if set zero no calculation will be made from bam file.
189
+ setting this value will override user set max depth (Default: 5)
190
+ --maxdepth maximum read depth at a position to consider for variant calls
191
+ if set to zero no user max depth will be used (default: 0)
166
192
  --min-non-ref-count minimum read depth supporting non reference base at each position (default: 3)
167
193
  --min-indel-count-support minimum read depth supporting an indel at each position (default: 3)
168
194
  --ambiguous-ref-bases including variant at completely ambiguous bases in the reference
@@ -171,10 +197,12 @@ OPTIONS:
171
197
  --noise praportion of reads for a variant to conisder as noise (default: 0.1)
172
198
  --cross-type type of cross used to generated mapping population - back or out (default: back)
173
199
  --use-all-contigs option to select all contigs or only contigs containing variants for analysis
174
- --include-low-hmes option to include or discard variants from contigs with low hme-score or bfr score to list in the final output
200
+ --include-low-hmes option to include or discard variants from contigs with
201
+ low hme-score or bfr score to list in the final output
175
202
  --polyploidy Set if the data input is from polyploids
176
203
  -p, --mut-parent Pileup or sorted BAM file alignments from mutant/trait of interest parent (default: )
177
204
  -r, --bg-parent Pileup or sorted BAM file alignments from background/wildtype parent (default: )
205
+ -R, --repeats-file repeat masker output file for the assembly (default: )
178
206
  --bfr-adjust factor added to hemi snp frequency of each parent to adjust for bfr calculations (default: 0.05)
179
207
  --sel-seq-len sequence length to print from either side of selected variants (default: 50)
180
208
 
@@ -190,7 +218,7 @@ Shyam Rallapalli
190
218
  </help>
191
219
 
192
220
  <citations>
193
- <citation type="doi">10.1093/bioinformatics/btg1080</citation>
221
+ <citation type="doi">spaceholder</citation>
194
222
  </citations>
195
223
 
196
224
  </tool>
@@ -45,7 +45,7 @@ module Cheripic
45
45
  opt :assembly, 'Assembly file in FASTA format',
46
46
  :short => '-f',
47
47
  :type => String
48
- opt :input_format, 'bulk and parent alignment file format types - set either pileup or bam',
48
+ opt :input_format, 'bulk and parent alignment file format types - set either pileup or bam or vcf',
49
49
  :short => '-F',
50
50
  :type => String,
51
51
  :default => 'pileup'
@@ -53,16 +53,18 @@ module Cheripic
53
53
  :short => '-a',
54
54
  :type => String
55
55
  opt :mut_bulk_vcf, 'vcf file for variants from mutant/trait of interest bulk 1',
56
- :type => String
56
+ :type => String,
57
+ :default => ''
57
58
  opt :bg_bulk, 'Pileup or sorted BAM file alignments from background/wildtype bulk 2',
58
59
  :short => '-b',
59
60
  :type => String
60
61
  opt :bg_bulk_vcf, 'vcf file for variants from background/wildtype bulk 2',
61
- :type => String
62
+ :type => String,
63
+ :default => ''
62
64
  opt :output, 'custom name tag to include in the output file name',
63
65
  :default => 'cheripic_results'
64
66
  opt :loglevel, 'Choose any one of "info / warn / debug" level for logs generated',
65
- :default => 'debug'
67
+ :default => 'info'
66
68
  opt :hmes_adjust, 'factor added to snp count of each contig to adjust for hme score calculations',
67
69
  :type => Float,
68
70
  :default => 0.5
@@ -79,8 +81,8 @@ module Cheripic
79
81
  if set zero no calculation will be made from bam file.\nsetting this value will override user set max depth",
80
82
  :type => Integer,
81
83
  :default => 5
82
- opt :maxdepth, "maximum read depth at a position to consider for variant calls
83
- if set to zero no user max depth will be used",
84
+ opt :maxdepth, 'maximum read depth at a position to consider for variant calls
85
+ if set to zero no user max depth will be used',
84
86
  :type => Integer,
85
87
  :default => 0
86
88
  opt :min_non_ref_count, 'minimum read depth supporting non reference base at each position',
@@ -90,8 +92,8 @@ if set to zero no user max depth will be used",
90
92
  :type => Integer,
91
93
  :default => 3
92
94
  opt :ambiguous_ref_bases, 'including variant at completely ambiguous bases in the reference',
93
- :type => FalseClass,
94
- :default => false
95
+ :type => String,
96
+ :default => 'false'
95
97
  opt :mapping_quality, 'minimum mapping quality of read covering the position',
96
98
  :short => '-q',
97
99
  :type => Integer,
@@ -107,15 +109,15 @@ if set to zero no user max depth will be used",
107
109
  :type => String,
108
110
  :default => 'back'
109
111
  opt :use_all_contigs, 'option to select all contigs or only contigs containing variants for analysis',
110
- :type => FalseClass,
111
- :default => false
112
+ :type => String,
113
+ :default => 'false'
112
114
  opt :include_low_hmes, 'option to include or discard variants from contigs with
113
115
  low hme-score or bfr score to list in the final output',
114
- :type => FalseClass,
115
- :default => false
116
+ :type => String,
117
+ :default => 'false'
116
118
  opt :polyploidy, 'Set if the data input is from polyploids',
117
- :type => FalseClass,
118
- :default => false
119
+ :type => String,
120
+ :default => 'false'
119
121
  opt :mut_parent, 'Pileup or sorted BAM file alignments from mutant/trait of interest parent',
120
122
  :short => '-p',
121
123
  :type => String,
@@ -187,11 +189,35 @@ low hme-score or bfr score to list in the final output',
187
189
 
188
190
  # calls other methods to check if command line inputs are valid
189
191
  def check_arguments
192
+ convert_boolean_strings
190
193
  check_output
191
194
  check_log_level
195
+ check_input_entry
192
196
  check_input_types
193
197
  end
194
198
 
199
+ # convert true or false options to boolean
200
+ def convert_boolean_strings
201
+ %i{ambiguous_ref_bases use_all_contigs include_low_hmes polyploidy}.each do | symbol |
202
+ if @options.key?(symbol)
203
+ @options[symbol] = @options[symbol] == 'false' ? false : true
204
+ end
205
+ end
206
+ end
207
+
208
+ # if a file option was passed as the string 'None', reset it to '' and remove its '<option>_given' flag
209
+ def check_input_entry
210
+ %i{assembly mut_bulk bg_bulk mut_bulk_vcf bg_bulk_vcf mut_parent bg_parent repeats_file}.each do | symbol |
211
+ if @options.key?(symbol)
212
+ if @options[symbol] == 'None'
213
+ param = (symbol.to_s + '_given').to_sym
214
+ @options[symbol] = ''
215
+ @options.delete(param)
216
+ end
217
+ end
218
+ end
219
+ end
220
+
195
221
  # checks input files based on bulk file type
196
222
  def check_input_types
197
223
  inputfiles = {}
@@ -54,6 +54,7 @@ module Cheripic
54
54
  sel_seq_len}
55
55
  settings = inputs.select { |k| set2.include?(k) }
56
56
  Options.update(settings)
57
+ logger.debug "parameter values set\n#{Options.current_values.to_yaml}"
57
58
  @vars_extracted = false
58
59
  @has_run = false
59
60
  end
@@ -74,19 +75,21 @@ module Cheripic
74
75
  end
75
76
  # print selected variants that could be potential markers or mutation
76
77
  out_file = File.open(@options[pos_type], 'w')
77
- out_file.puts "Score\tAlleleFreq\tseq_id\tposition\tref_base\tcoverage\tbases\tbase_quals\tsequence_left\tAlt_seq\tsequence_right"
78
+ out_file.puts "Score\tAlleleFreq\tlength\tseq_id\tposition\tref_base\tcoverage\tbases\tbase_quals\tsequence_left\tAlt_seq\tsequence_right"
78
79
  regions = Regions.new(@options.assembly)
79
80
  @variants.send(pos_type).each_key do | frag |
80
81
  contig_obj = @variants.assembly[frag]
81
82
  if pos_type == :hmes_frags
82
83
  positions = contig_obj.hm_pos.keys
84
+ score = contig_obj.hme_score
83
85
  else
84
86
  positions = contig_obj.hemi_pos.keys
87
+ score = contig_obj.bfr_score
85
88
  end
86
89
  positions.each do | pos |
87
90
  pileup = @variants.pileups[frag].mut_bulk[pos]
88
91
  seqs = regions.fetch_seq(frag,pos)
89
- out_file.puts "#{contig_obj.hme_score}\t#{contig_obj.hm_pos[pos]}\t#{pileup.to_s.chomp}\t#{seqs[0]}\t#{pileup.consensus}\t#{seqs[1]}"
92
+ out_file.puts "#{score}\t#{contig_obj.hm_pos[pos]}\t#{contig_obj.length}\t#{pileup.to_s.chomp}\t#{seqs[0]}\t#{pileup.consensus}\t#{seqs[1]}"
90
93
  end
91
94
  end
92
95
  out_file.close
@@ -160,6 +160,11 @@ module Cheripic
160
160
  @user_settings = @def_settings
161
161
  end
162
162
 
163
+ # Resets the values of options to defaults
164
+ def self.current_values
165
+ @user_settings
166
+ end
167
+
163
168
  end
164
169
 
165
170
  end
@@ -4,35 +4,6 @@ require 'forwardable'
4
4
 
5
5
  module Cheripic
6
6
 
7
- require 'bio-samtools'
8
- require 'bio/db/sam'
9
- require 'open3'
10
-
11
- # An extension of Bio::DB::Sam object to modify depth method
12
- class Bio::DB::Sam
13
-
14
- # A method to retrieve depth information from bam object
15
- # @param opts [Hash] a hash of following input options
16
- # b [File] list of positions or regions in BED format
17
- # l [INT] minQLen
18
- # q [INT] base quality threshold
19
- # Q [INT] mapping quality threshold
20
- # r [chr:from-to] region
21
- # @returns a block with each line reporting sequence_name, position and depth
22
- def depth(opts={})
23
- command = form_opt_string(self.samtools, 'depth', opts)
24
- # capture returns string output, so careful not to give whole genome or big contigs for depth analysis
25
- stdout, stderr, status = Open3.capture3(command)
26
- unless status.success?
27
- logger.error "resulted in exit code #{status.exitstatus} using #{command}"
28
- logger.error "stderr output is: #{stderr}"
29
- raise CheripicError
30
- end
31
- # return stdout
32
- stdout
33
- end
34
-
35
- end
36
7
 
37
8
  # Custom error handling for Variants class
38
9
  class VariantsError < CheripicError; end
@@ -110,6 +81,8 @@ module Cheripic
110
81
  logger.info "processing #{input} file"
111
82
  if @params.input_format == 'pileup'
112
83
  extract_pileup(infile, input)
84
+ elsif @params.input_format == 'vcf'
85
+ extract_vcfs(infile, input)
113
86
  else
114
87
  extract_bam_pileup(infile, input)
115
88
  end
@@ -119,35 +92,21 @@ module Cheripic
119
92
  @pileups_analyzed = true
120
93
  end
121
94
 
122
- # Bam object is read and each contig mean and std deviation of depth calculated
123
- # @param bamobject [Bio::DB::Sam]
124
- # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
125
- def set_max_depth(bamobject, bamfile)
126
- logger.info "processing #{bamfile} file for depth"
127
- all_depths = []
128
- bq = Options.base_quality
129
- mq = Options.mapping_quality
130
- @assembly.each_key do | id |
131
- contig_obj = @assembly[id]
132
- len = contig_obj.length
133
- data = bamobject.depth(:r => "#{id}", :Q => bq, :q => mq)
134
- depths = []
135
- data.split("\n").each do |line|
136
- info = line.split("\t")
137
- depths << info[2].to_i
138
- end
139
- variance = 0
140
- mean_depth = depths.reduce(0, :+) / len.to_f
141
- depths.each do |value|
142
- variance += (value.to_f - mean_depth)**2
95
+ # Input vcf file is read and positions are selected that pass the thresholds
96
+ # @param vcffile [String] path to the vcf file to read
97
+ # @param sym [Symbol] Symbol of the input file used to write selected variants
98
+ # pileup information to respective ContigPileups object
99
+ def extract_vcfs(vcffile, sym)
100
+ # read vcf file and process each variant
101
+ File.foreach(vcffile) do |line|
102
+ next if line =~ /^#/
103
+ v = Bio::DB::Vcf.new(line)
104
+ unless v.alt == '.'
105
+ pileup_string = Vcf.to_pileup(v)
106
+ pileup = Pileup.new(pileup_string)
107
+ store_pileup_info(pileup, sym)
143
108
  end
144
- all_depths << mean_depth
145
- contig_obj.sd_depth = Math.sqrt(variance)
146
- contig_obj.mean_depth = mean_depth
147
109
  end
148
- # setting max depth as 3 times the average depth
149
- mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
150
- Options.maxdepth = Options.max_d_multiple * mean_coverage
151
110
  end
152
111
 
153
112
  # Input pileup file is read and positions are selected that pass the thresholds
@@ -159,8 +118,7 @@ module Cheripic
159
118
  File.foreach(pileupfile) do |line|
160
119
  pileup = Pileup.new(line)
161
120
  if pileup.is_var
162
- contig_obj = @pileups[pileup.ref_name]
163
- contig_obj.send(sym).store(pileup.pos, pileup)
121
+ store_pileup_info(pileup, sym)
164
122
  end
165
123
  end
166
124
  end
@@ -175,44 +133,93 @@ module Cheripic
175
133
  bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
176
134
  bamobject.index unless bamobject.indexed?
177
135
 
136
+ # calculate max depth from the bam file (only for mut_bulk, and only when max_d_multiple > 0)
137
+ set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk
178
138
  # check if user has set max depth or set to zero to ignore
179
139
  max_d = Options.maxdepth
180
- # or calculate from bamfile
181
- if Options.max_d_multiple > 0
182
- set_max_depth(bamobject, bamfile)
183
- max_d = Options.maxdepth
184
- logger.info "max depth used for #{sym} file\t#{max_d}"
185
- end
140
+ logger.info "max depth used for #{sym} file\t#{max_d}"
186
141
 
187
142
  @vcf_hash.each_key do | id |
188
143
  positions = @vcf_hash[id][:het].keys
189
144
  positions << @vcf_hash[id][:hom].keys
190
145
  positions.flatten!
191
146
  next if positions.empty?
192
- contig_obj = @pileups[id]
193
147
  positions.each do | pos |
194
148
  command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
195
- stdout, stderr, status = Open3.capture3(command)
196
- unless status.success?
197
- logger.error "resulted in exit code #{status.exitstatus} using #{command}"
198
- logger.error "stderr output is: #{stderr}"
199
- raise CheripicError
200
- end
201
- stdout.chomp!
149
+ stdout = capture_command(command)
202
150
  if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
203
151
  logger.info "pileup data empty for\t#{id}\t#{pos}"
204
152
  else
205
153
  pileup = Pileup.new(stdout)
206
- unless max_d == 0 or pileup.coverage <= max_d
207
- logger.info "pileup coverage is higher than max\t#{pileup.to_s}"
208
- next
209
- end
210
- contig_obj.send(sym).store(pos, pileup)
154
+ store_pileup_info(pileup, sym)
211
155
  end
212
156
  end
213
157
  end
214
158
  end
215
159
 
160
+ # Bam object is read and each contig mean and std deviation of depth calculated
161
+ # @param bamobject [Bio::DB::Sam]
162
+ # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
163
+ def set_max_depth(bamobject, bamfile)
164
+ logger.info "processing #{bamfile} file for depth"
165
+ all_depths = []
166
+ bq = Options.base_quality
167
+ mq = Options.mapping_quality
168
+ @assembly.each_key do | id |
169
+ contig_obj = @assembly[id]
170
+ len = contig_obj.length
171
+ command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}"
172
+ data = capture_command(command)
173
+ if data == ''
174
+ logger.info "depth data empty for\t#{id}"
175
+ next
176
+ end
177
+ depths = []
178
+ data.split("\n").each do |line|
179
+ info = line.split("\t")
180
+ depths << info[2].to_i
181
+ end
182
+ variance = 0
183
+ mean_depth = depths.reduce(0, :+) / len.to_f
184
+ depths.each do |value|
185
+ variance += (value.to_f - mean_depth)**2
186
+ end
187
+ all_depths << mean_depth
188
+ contig_obj.sd_depth = Math.sqrt(variance)
189
+ contig_obj.mean_depth = mean_depth
190
+ end
191
+ # setting max depth as max_d_multiple times the average depth
192
+ mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
193
+ Options.maxdepth = Options.max_d_multiple * mean_coverage
194
+ end
195
+
196
+ def capture_command(command)
197
+ stdout, stderr, status = Open3.capture3(command)
198
+ unless status.success?
199
+ logger.error "resulted in exit code #{status.exitstatus} using #{command}"
200
+ logger.error "stderr output is: #{stderr}"
201
+ raise CheripicError
202
+ end
203
+ stdout.chomp
204
+ end
205
+
206
+ # stores pileup information provided to respective contig_pileup object using sym input
207
+ # @param pileup [Pileup] Pileup objects
208
+ # @param sym [Symbol] Symbol of the input file used to write selected variants
209
+ # pileup information stored to respective ContigPileups object
210
+ def store_pileup_info(pileup, sym)
211
+ # discarding variants with higher than max depth only for mut_bulk
212
+ if sym == :mut_bulk
213
+ unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth
214
+ logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}"
215
+ return nil
216
+ end
217
+ end
218
+ contig_obj = @pileups[pileup.ref_name]
219
+ contig_obj.send(sym).store(pileup.pos, pileup)
220
+ nil
221
+ end
222
+
216
223
  # Once pileup files are analysed and variants are extracted from each bulk;
217
224
  # bulks are compared to identify and isolate variants for downstream analysis.
218
225
  # If polyploidy set to true and mut_parent and bg_parent bulks are provided
@@ -9,33 +9,38 @@ module Cheripic
9
9
 
10
10
  class Vcf
11
11
 
12
# Extracts reference and alternate allele read depths from a one-sample vcf record.
# Depths are read from whichever of these is found first:
#   DP4 in INFO (Bcftools/Samtools), RD and AD in FORMAT (VarScan),
#   comma separated AD in FORMAT (GATK), or AF together with DP in INFO.
# All branches return whole-number counts so that callers building depth
# strings or columns from them never see a Float such as 10.0.
# @param vcf_obj [Object] vcf record responding to info and samples
# @return [Array<Integer>] reference depth followed by alternate depth
# @raise [VcfError] when none of the supported fields are present
def self.get_allele_depth(vcf_obj)
  # check if the vcf is from samtools (has DP4 and AF1 fields in INFO)
  if vcf_obj.info.key?('DP4')
    # DP4 holds ref-forward, ref-reverse, alt-forward, alt-reverse read counts
    freq = vcf_obj.info['DP4'].split(',')
    ref = freq[0].to_i + freq[1].to_i
    alt = freq[2].to_i + freq[3].to_i
  # check if the vcf is from VarScan (has RD, AD and FREQ fields in FORMAT)
  elsif vcf_obj.samples['1'].key?('RD')
    ref = vcf_obj.samples['1']['RD'].to_i
    alt = vcf_obj.samples['1']['AD'].to_i
  # check if the vcf is from GATK (has AD and GT fields in FORMAT)
  elsif vcf_obj.samples['1'].key?('AD') and vcf_obj.samples['1']['AD'].include?(',')
    freq = vcf_obj.samples['1']['AD'].split(',')
    ref = freq[0].to_i
    alt = freq[1].to_i
  # check if the vcf has AF fields in INFO
  elsif vcf_obj.info.key?('AF')
    allele_freq = vcf_obj.info['AF'].to_f
    # NOTE(review): a record with AF but no DP yields depth 0 here, so both counts become 0
    depth = vcf_obj.info['DP'].to_i
    alt = (depth * allele_freq).round
    ref = depth - alt
  else
    raise VcfError.new 'not a supported vcf format (VarScan, GATK, Bcftools(Samtools), Vcf 4.0, 4.1 and 4.2)' +
      " and check that it is one sample vcf\n"
  end
  [ref, alt]
end
38
39
 
40
# Fraction of reads supporting the alternate allele in a vcf record.
# @param vcf_obj [Object] vcf record accepted by get_allele_depth
# @return [Float] alternate depth divided by the sum of reference and alternate depths
def self.get_allele_freq(vcf_obj)
  ref_count, alt_count = get_allele_depth(vcf_obj)
  total = ref_count + alt_count
  alt_count.to_f / total
end
39
44
 
40
45
  ##Input: vcf file
41
46
  ##Output: lists of hm and ht SNPS and hash of all fragments with variants
@@ -78,6 +83,25 @@ module Cheripic
78
83
  var_pos_mut
79
84
  end
80
85
 
86
# Builds a tab separated pileup-style line from a vcf record.
# Reference reads are written as '.', alternate reads as the alt allele, or as
# '-<n><seq>' / '+<n><seq>' when the record is a deletion / insertion.
# Every read gets the base quality character 'D'.
# The vcf record passed in is left untouched.
# @param v [Object] vcf record responding to chrom, pos, ref and alt (ref and alt as strings)
# @return [String] chrom, pos, reference base(s), depth, read bases and qualities joined by tabs
def self.to_pileup(v)
  # depths can arrive as floats; whole numbers keep the depth column free of a ".0" suffix
  ref, alt = Vcf.get_allele_depth(v).map(&:to_i)
  depth = ref + alt
  # locals hold the pileup representation so the caller's record is not modified
  ref_bases = v.ref
  alt_bases = v.alt
  ref_len = v.ref.length
  alt_len = v.alt.length
  if ref_len > alt_len
    # deletion: bases present in ref beyond the length of alt
    seq = v.ref[alt_len..-1]
    alt_bases = '-' + seq.length.to_s + seq
    ref_bases = v.ref[0]
  elsif ref_len < alt_len
    # insertion: bases present in alt beyond the length of ref
    seq = v.alt[ref_len..-1]
    alt_bases = '+' + seq.length.to_s + seq
  end
  bases = ('.' * ref) + (alt_bases * alt)
  quality = 'D' * depth
  [v.chrom, v.pos, ref_bases, depth, bases, quality].join("\t")
end
104
+
81
105
  end
82
106
 
83
107
  end
@@ -2,6 +2,6 @@ module Cheripic
2
2
 
3
3
  # Sets the semantic version number for this module.
4
4
  # Version number will be used in help messages and for generating gem.
5
- VERSION = '1.2.5'
5
+ VERSION = '1.2.6'
6
6
 
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cheripic
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.5
4
+ version: 1.2.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shyam Rallapalli
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-10-17 00:00:00.000000000 Z
11
+ date: 2016-10-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: yell