bio-sam-mutation 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 89bbe12a6c812fdc4726e74a758d3243575875ea
4
+ data.tar.gz: 651ffa24264f666bf86740d7f7043845e3eae5a7
5
+ SHA512:
6
+ metadata.gz: 3f27e36ba12e338179e310964e5867c3fdaedb8f799e50b49d06f65fe545d79223dc7f37ef6b88770d61d18b31b79c4edd4db58b5d77919f28c6587787db1a7b
7
+ data.tar.gz: 0904d6a55dd48f26ba20cbc9e11c65d00d1720e37b61cf9d15fe2f0b81afdf1c38b796ec265fc690f02b77abe6a39a6148da131dac9a42ed87e29424938875b1
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.0
4
+ - 2.1.0
5
+
6
+ # - rbx-19mode
7
+ # - 1.8.7
8
+ # - jruby-18mode # JRuby in 1.8 mode
9
+ # - rbx-18mode
10
+
11
+ # uncomment this line if your project needs to run something other than `rake`:
12
+ # script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,21 @@
# Runtime and development dependencies for bio-sam-mutation.
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

gem "bio", ">= 1.4.2"
# Using edge version due to a problem with ruby >2.1 in biogems version at time of writing
gem "bio-samtools", "~>2.3.4", git: "https://github.com/helios/bioruby-samtools.git", ref: "2e77274"
# JSON serialisation:
gem "oj", "~>2.14"
# At the time of writing, the released version 0.2.0 does not include the variation#vep_hgvs method
# so use this specific commit:
gem "bio-ensembl-rest", "0.2.0", git: "https://github.com/ALTree/bio-ensembl-rest.git", ref: "c934fa0"
gem "trollop"
gem "rake", "~>0.9"

group :development do
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "simplecov", ">= 0"
  gem "jeweler", "~> 2.0"
  gem "bundler"
  gem "test-unit", "~> 3.0"
end
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2015 stveep
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,88 @@
1
+ # bio-sam-mutation
2
+
3
+ [![Build Status](https://secure.travis-ci.org/stveep/bioruby-sam-mutation.png)](http://travis-ci.org/stveep/bioruby-sam-mutation)
4
+
5
+ * Methods for calling mutations from SAM alignments, including CIGAR and MD tag parsers.
6
+
7
+ * Annotates mutations in HGVS format: http://www.hgvs.org/mutnomen/recs.html.
8
+
9
+ * Incorporates Ensembl VEP lookup.
10
+
11
+ ## Installation
12
+
13
+ ```sh
14
+ gem install bio-sam-mutation
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ ```ruby
20
+ require 'bio-sam-mutation'
21
+
22
+ # NB must be tab-delimited
23
+ insertion_and_deletion = Bio::DB::Alignment.new("I2M5K:00253:00406\t0\t5\t112839854\t70\t63M2I138M1D27M7S\t*\t0\t0\tCAGTGATCTTCCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACACCTCCACCATACCTCCTCAAACAGCTCAAACCAAGCGAGAAGTACCTAAAAATAAAGCACCTACTGCTGAAAAGAGAGAGAGTGGACCTAAGCAAGCTGCAGTAAATGCTGCAGTTCAGAGGGTCCAGGTTCTTCCAGATGCTGATACTTATTACATTTTGCCACGGAAAGTACTGCTGAGG\t@CDDDCCCCACACCCCCCCC?CCACCCC>A6;;;;7;;6;6;BC;;6;;;;;.;;>ADDA??;;;;;?CCACCCD>C??@CCCC>C@C;>?CCCC@C=::@:::::+:::/:CCC?>>>>CCCCDDD9CCCC@AB????=AB>??;?BB>@@@AA???CC<@@?????BB>??;;;B<BC;??8;6:A=@=@BBB;;;?<77//*08*088888*8=9=?B7;;4;??????????<\tPG:Z:novoalign\tAS:i:183\tUQ:i:183\tNM:i:3\tMD:Z:201^T27")
24
+
25
+ insertion_and_deletion.mutations
26
+ #=> [#<Bio::Mutation:0x007fa20b5b4fc8 @position=112839916, @type=:insertion, @reference=nil, @mutant="AT", @seqname="5">, #<Bio::Mutation:0x007fa20b5b4960 @position=112840055, @type=:deletion, @reference="T", @mutant=nil, @seqname="5">]
27
+
28
+ insertion_and_deletion.mutations.first.to_hgvs("g")
29
+ #=> "5:g.112839916_112839917insAT"
30
+
31
+ puts YAML.dump(insertion_and_deletion.mutations.first.vep("human","g").first["transcript_consequences"].keep_if{|c| c["transcript_id"] == "ENST00000257430"})
32
+ #---
33
+ # - variant_allele: AT
34
+ # cdna_end: 4379
35
+ # codons: cca/ccATa
36
+ # protein_end: 1441
37
+ # strand: 1
38
+ # hgnc_id: HGNC:583
39
+ # amino_acids: P/PX
40
+ # gene_symbol: APC
41
+ # cdna_start: 4378
42
+ # transcript_id: ENST00000257430
43
+ # cds_start: 4322
44
+ # gene_id: ENSG00000134982
45
+ # protein_start: 1441
46
+ # biotype: protein_coding
47
+ # gene_symbol_source: HGNC
48
+ # cds_end: 4323
49
+ # consequence_terms:
50
+ # - frameshift_variant
51
+ # impact: HIGH
52
+ # => nil
53
+
54
+ # E.g. of full request return
55
+ # http://rest.ensembl.org/documentation/info/vep_hgvs_get
56
+ insertion_and_deletion.mutations(112839854).first.vep("human","g")
57
+ # => [{"assembly_name"=>"GRCh38", "end"=>112839917, "seq_region_name"=>"5", "transcript_consequences"=>[{"gene_id"=>"ENSG00000134982", "distance"=>46, "variant_allele"=>"AT", "biotype"=>"nonsense_mediated_decay", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000502371", "impact"=>"MODIFIER"}, {"variant_allele"=>"AT", "cdna_end"=>4380, "codons"=>"-/AT", "protein_end"=>1442, "strand"=>1, "hgnc_id"=>"HGNC:583", "amino_acids"=>"-/X", "gene_symbol"=>"APC", "cdna_start"=>4379, "transcript_id"=>"ENST00000257430", "cds_start"=>4323, "gene_id"=>"ENSG00000134982", "protein_start"=>1441, "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "cds_end"=>4324, "consequence_terms"=>["frameshift_variant"], "impact"=>"HIGH"}, {"gene_id"=>"ENSG00000134982", "distance"=>863, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000507379", "impact"=>"MODIFIER"}, {"variant_allele"=>"AT", "cdna_end"=>4481, "codons"=>"-/AT", "protein_end"=>1442, "strand"=>1, "hgnc_id"=>"HGNC:583", "amino_acids"=>"-/X", "gene_symbol"=>"APC", "cdna_start"=>4480, "transcript_id"=>"ENST00000508376", "cds_start"=>4323, "gene_id"=>"ENSG00000134982", "protein_start"=>1441, "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "cds_end"=>4324, "consequence_terms"=>["frameshift_variant"], "impact"=>"HIGH"}, {"gene_id"=>"ENSG00000134982", "distance"=>409, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000512211", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000134982", "variant_allele"=>"AT", "cdna_end"=>4569, "biotype"=>"nonsense_mediated_decay", 
"gene_symbol_source"=>"HGNC", "consequence_terms"=>["3_prime_UTR_variant", "NMD_transcript_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "cdna_start"=>4568, "transcript_id"=>"ENST00000508624", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000258864", "variant_allele"=>"AT", "biotype"=>"nonsense_mediated_decay", "gene_symbol_source"=>"Clone_based_vega_gene", "consequence_terms"=>["intron_variant", "NMD_transcript_variant"], "strand"=>1, "gene_symbol"=>"CTC-554D6.1", "transcript_id"=>"ENST00000520401", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000134982", "distance"=>2195, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000504915", "impact"=>"MODIFIER"}], "strand"=>1, "id"=>"5:g.112839917_112839918insAT", "allele_string"=>"-/AT", "most_severe_consequence"=>"frameshift_variant", "start"=>112839918}]
58
+
59
+
60
+
61
+ ```
62
+
63
+ The API doc is online. For more code examples see the test files in
64
+ the source tree.
65
+
66
+ ## Project home page
67
+
68
+ For information on the source tree, documentation, examples, issues and
69
+ how to contribute, see
70
+
71
+ http://github.com/stveep/bioruby-sam-mutation
72
+
73
+ The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
74
+
75
+ ## Cite
76
+
77
+ If you use this software, please cite one of
78
+
79
+ * [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
80
+ * [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
81
+
82
+ ## Biogems.info
83
+
84
+ This Biogem is published at [biogems.info](http://biogems.info/index.html#bio-sam)
85
+
86
+ ## Copyright
87
+
88
+ Copyright (c) 2015 stveep. See LICENSE.txt for further details.
@@ -0,0 +1,48 @@
1
+ = bio-sam
2
+
3
+ {<img
4
+ src="https://secure.travis-ci.org/stveep/bioruby-sam.png"
5
+ />}[http://travis-ci.org/#!/stveep/bioruby-sam]
6
+
7
+ Full description goes here
8
+
9
+ Note: this software is under active development!
10
+
11
+ == Installation
12
+
13
+ gem install bio-sam
14
+
15
+ == Usage
16
+
17
+ == Developers
18
+
19
+ To use the library
20
+
21
+ require 'bio-sam'
22
+
23
+ The API doc is online. For more code examples see also the test files in
24
+ the source tree.
25
+
26
+ == Project home page
27
+
28
+ For information on the source tree, documentation, issues and how to contribute, see
29
+
30
+ http://github.com/stveep/bioruby-sam
31
+
32
+ The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
33
+
34
+ == Cite
35
+
36
+ If you use this software, please cite one of
37
+
38
+ * {BioRuby: bioinformatics software for the Ruby programming language}[http://dx.doi.org/10.1093/bioinformatics/btq475]
39
+ * {Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics}[http://dx.doi.org/10.1093/bioinformatics/bts080]
40
+
41
+ == Biogems.info
42
+
43
+ This Biogem is published at http://biogems.info/index.html#bio-sam
44
+
45
+ == Copyright
46
+
47
+ Copyright (c) 2015 stveep. See LICENSE.txt for further details.
48
+
@@ -0,0 +1,54 @@
# encoding: utf-8

# Rake tasks for bio-sam-mutation: gem packaging (Jeweler), unit tests,
# coverage and RDoc generation.

require 'rubygems'
require 'bundler'

# Activate the Gemfile's default and development groups. If any gem is missing,
# report it and exit with the status code Bundler supplies.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |spec|
  # spec is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  spec.name        = "bio-sam-mutation"
  spec.version     = '0.4.1'
  spec.homepage    = "http://github.com/stveep/bioruby-sam-mutation"
  spec.license     = "MIT"
  spec.summary     = "Parsing and mutation calling from SAM, CIGAR and MD:Z."
  spec.description = "Simple classes for parsing SAM, CIGAR and MD:Z strings, including slices. Methods for calling mutations in HGVS format and looking up consequences using Ensembl VEP REST API. Developed for calling mutations at an expected position in an alignment - e.g. Amplicon sequencing of CRISPR-induced mutations."
  spec.email       = "spettitt@gmail.com"
  spec.authors     = ["Stephen Pettitt"]
  # dependencies defined in Gemfile
  # spec.required_ruby_version = '>= 1.9.3'
  spec.executables.push("mutations")
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
# `rake test` runs every test/**/test_*.rb with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs.push('lib', 'test')
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
end

desc "Code coverage detail"
task :simplecov do
  # Set COVERAGE in the environment, then run the test task's actions.
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task default: :test

require 'rdoc/task'
Rake::RDocTask.new do |doc|
  # The title carries the contents of a VERSION file when one exists.
  version =
    if File.exist?('VERSION')
      File.read('VERSION')
    else
      ""
    end

  doc.rdoc_dir = 'rdoc'
  doc.title = "bio-sam-mutation #{version}"
  doc.rdoc_files.include('README*', 'lib/**/*.rb')
end
@@ -0,0 +1,108 @@
#!/usr/bin/env ruby
# Command-line front end for bio-sam-mutation.
#
# Two mutually exclusive modes:
#   --tag     tag SAM records (from files or a piped stream) with mutation calls
#   --report  write "<product>-report.txt" for each product in the configuration,
#             listing read counts and annotation per allele for each input file
$LOAD_PATH.unshift File.join(__dir__, '..', 'lib')
require 'trollop'
require 'bio-sam-mutation'
begin
  # pry is not used by this script and is not declared in the Gemfile, so its
  # absence must not stop the executable from running.
  require 'pry'
rescue LoadError
end

opts = Trollop::options do
  opt :config, "Configuration file in YAML format. Defaults to ./config.yml. Run with --example-config for example.", type: :string, default: "config.yml"
  opt :example_config, "Show example configuration file."
  opt :output, "Output file, works with single product only. Better to define in config file.", type: :string, default: "output.sam"
  opt :tag, "Tag an input SAM file (or piped stream of SAM data) with mutation calls"
  opt :report, "Produce a report by amplicon for each file given, including read counts and annotation for each allele. (does not work on a stream)"
  opt :mincov, "Minimum fraction of reads to report a mutant allele.", default: 0.01
  opt :flag, "Minimum reads to flag a potential mutant (= no wild type).", default: 20
end

raise "Cannot use report and tag simultaneously." if opts[:report] && opts[:tag]

# Trollop removes and parses the options, leaving either input files or the incoming stream.
# A missing default config.yml is not an error (tag mode can run without one), but a
# configuration file named explicitly on the command line must exist.
config =
  if File.exist?(opts[:config])
    YAML.load_file(opts[:config]) || {}
  elsif opts[:config_given]
    Trollop::die :config, "file not found: #{opts[:config]}"
  else
    {}
  end

if opts[:tag]
  outfile = nil
  if config.key?(:products)
    config[:products].each do |product_name, config_hash|
      config[product_name] = MutationsCLI.set_defaults(config_hash)
    end
    config = MutationsCLI.construct_products(config)
  else
    config[:single_product] = true
    config[:output] ||= opts[:output]
    outfile = config[:outfile] = File.open(config[:output], 'w')
  end

  begin
    ARGF.each do |input|
      MutationsCLI.tag input, config
    end
  ensure
    # Only the handle opened here is closed here.
    outfile.close if outfile && !outfile.closed?
  end
end

if opts[:report]
  products = config[:products]
  raise "Report mode requires a :products section in the configuration file." unless products

  products.each do |product_name, config_hash|
    config_hash = MutationsCLI.set_defaults(config_hash)
    lookups = {}
    mutants = []
    # Interpolation accepts both String and Symbol product names.
    File.open("#{product_name}-report.txt", 'w') do |out|
      ARGV.each do |file|
        calls = Hash.new { |h, k| h[k] = MutantAllele.new }
        File.foreach(file) do |input|
          next if input.start_with?("@") # skip sam headers
          sam = Bio::DB::Alignment.new(input)
          # Check correct start in case of file with mixed amplicons
          if config_hash[:start]
            next unless sam.seq.match Regexp.new("^" + config_hash[:start])
          end
          # Must have sufficient mapped length to call mutations in the given interval:
          next if sam.query_unmapped
          next if config_hash[:length] > Bio::Alignment::CIGAR.new(sam.cigar).reference_length - config_hash[:offset]
          key = "w.t."
          muts = MutationsCLI.call_mutations_given_product sam, config_hash
          if muts
            key = muts.to_hgvs
            calls[key].mutations ||= muts
          end
          calls[key].seq ||= sam.query(config_hash[:offset], config_hash[:length])
          calls[key].example ||= sam
          calls[key].count += 1
        end

        # Start from 0 so a file with no usable reads gives 0 rather than nil.
        total_reads = calls.values.inject(0) { |sum, allele| sum + allele.count }
        out.puts file
        out.puts "Total reads: #{total_reads}"
        threshold = opts[:mincov] * total_reads
        calls.keep_if { |_, allele| allele.count > threshold }
        # Flag the file when no wild-type allele survived the coverage filter and
        # there are at least --flag reads. The method call is parenthesised so that
        # `||` applies to the whole condition rather than to include?'s argument.
        mutants << file unless calls.key?("w.t.") || total_reads < opts[:flag]

        # Most abundant allele first.
        calls.sort_by { |_, allele| allele.count }.reverse.each do |key, allele|
          formatted = nil
          if allele.mutations
            hgvs = allele.mutations.to_hgvs
            if allele.mutations.size == 1 # VEP lookup doesn't work for compound mutations
              result = nil
              vep =
                begin
                  allele.lookup
                rescue RuntimeError
                  nil
                end
              # TODO: sort out the caching - does it actually work?
              if vep
                result = lookups[hgvs] || VepHgvs.consequences_for_transcript(vep, config_hash[:transcript]).first
                if result
                  formatted = [
                    [result["CDS position"].to_s, result["Allele"]].join(" "),
                    [result["Protein start"].to_s, result["Mutation"]].join(" "),
                    result["Consequence"]
                  ].join("\t")
                end
              else
                formatted = "No VEP result"
              end
              lookups[hgvs] ||= result
            else
              formatted = "Compound mutant"
            end
          end
          formatted ||= "No mutation"
          out.puts [allele.seq, allele.count, key, formatted].join("\t")
        end
        out.puts "\n===================================="
      end
      out.puts "Files with wild type below threshold:"
      out.puts mutants.join("\n")
    end
  end
end
@@ -0,0 +1,20 @@
#!/usr/bin/env ruby
require 'bio-sam-mutation'
require 'thor'

# Thor command-line interface for annotating SAM records with HGVS mutation calls.
class SamMutationCLI < Thor
  desc "tag FILE [--config=config.yml]", "Tag a SAM file with HGVS annotations."
  option :config
  # Reads +file_name+ line by line, skipping SAM header lines (those starting
  # with "@"). For each alignment whose +mutations+ call is truthy, prints the
  # record with an added "YH" tag holding the HGVS string. Records without
  # mutations are not printed.
  def tag(file_name)
    # File.foreach closes the file when iteration finishes.
    File.foreach(file_name) do |line|
      next if line.match(/^@/)
      sam = Bio::DB::Alignment.new(line)
      # Call mutations once per record and reuse the result.
      mutations = sam.mutations
      next unless mutations
      new_tag = Bio::DB::Tag.new("YH:m:" + mutations.to_hgvs)
      puts sam.add_tag(new_tag)
    end
  end
end

SamMutationCLI.start ARGV
@@ -0,0 +1,26 @@
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio'
12
+ require 'bio-ensembl-rest'
13
+ require 'bio-samtools'
14
+ require 'oj'
15
+ require 'yaml'
16
+ require 'bio-sam-mutation/bio/db/alignment'
17
+ require 'bio-sam-mutation/bio/alignment/iterate_pairs'
18
+ require 'bio-sam-mutation/bio/alignment/cigar'
19
+ require 'bio-sam-mutation/bio/db/tag'
20
+ require 'bio-sam-mutation/bio/db/tag/md'
21
+ require 'bio-sam-mutation/bio/vephgvs'
22
+ require 'bio-sam-mutation/bio/mutation'
23
+ require 'bio-sam-mutation/bio/mutation_array'
24
+ require 'bio-sam-mutation/bio/mutantallele'
25
+
26
+ require 'bio-sam-mutation/mutationscli'
@@ -0,0 +1,239 @@
# Parse a CIGAR string
# An example from Exonerate output. Ideally will also allow SAM file input to be used.
# 1 : CGGCTATGGGGTCGTGGGTCCCGCGTTG-CTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 63
#     | | || | ||||||||||||||||||| |||||||||||||||||||||||||||||||||||
# 1 : CAG-TA-GTGGTCGTGGGTCCCGCGTTGTCTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 62
#
# ref: CAGTAGTGGTCGTGGGTCCCGCGTTGTCTCTGG...
# cigar: SP-A12_D02_2015-01-16.seq 0 611 + SP-A3_ref 0 621 + 2514 M 3 I 1 M 2 I 1 M 21 D 1 M 306 D 9 M 89 I 1 M 126 D 1 M 24 D 1 M 8 I 1 M 6 D 1 M 5 D 1 M 17
# I are not counted in reference
# Regexp (from SAM specification) - but in exonerate the operation letter comes first: ([0-9]+[MIDNSHP])+|\*

class Bio::Alignment::CIGAR
  include Bio::Alignment::IteratePairs

  class << self
    attr_accessor :reference_operations, :query_operations, :subexp, :regexps
  end

  # One pattern per supported source format; each captures (operation, length)
  # or (length, operation) depending on the format's ordering.
  self.regexps = {"exonerate" => /([MIDNSHP]{1})(\d+)/, "sam" => /(\d+)([MIDNSHP]{1})/}
  # Type of elements that count towards the reference length:
  # TODO: add full support for other elements S, H etc.
  self.reference_operations = /[MD]/
  self.query_operations = /[MIS]/
  self.subexp = /([atgcAGCT]+)>([atgcAGTC]+)/
  # pairs:     array of [operation, length] pairs, e.g. [["M", 3], ["I", 1]]
  # reference: reference sequence supplied at construction (may be nil)
  attr_accessor :pairs, :reference

  # string - CIGAR string; whitespace is ignored. The caller's string is not modified.
  # ref    - optional reference sequence covered by the alignment.
  # source - "exonerate" or "sam"; auto-detected from the start of the string otherwise.
  #
  # Raises RuntimeError when the source is neither given nor detectable.
  def initialize(string,ref=nil,source="")
    # Strip out whitespace on a copy, so the argument (which may be frozen or
    # still in use by the caller) is left untouched.
    string = string.gsub(/\s+/,"")
    regexps = Bio::Alignment::CIGAR.regexps

    # Auto-detect source if not supplied: the format whose pattern matches at
    # the very start of the string wins.
    unless regexps.key?(source)
      regexps.each do |name, pattern|
        source = name if string.index(pattern) == 0
      end
      unless regexps.key?(source)
        raise "Source (e.g. 'exonerate', 'sam') not given and failed to auto-detect."
      end
    end

    # Make an array of [operation, length] pairs; the capture order depends on the source.
    @pairs = string.scan(regexps[source]).map do |first, second|
      if source == "exonerate"
        [first.to_s, second.to_i]
      else
        # Number and identifier are the other way round
        [second.to_s, first.to_i]
      end
    end

    # Include reference sequence if provided
    @reference = ref
    # Check length of reference = sum(M+D)?
    #warn "Reference length is not equal to that implied by CIGAR string: #{@reference.length}, #{self.reference_length}." unless @reference.length == self.reference_length
  end

  # Given an offset in reference sequence and length, return a new CIGAR object
  # corresponding to that subregion of the alignment. regexp selects which
  # operations are passed to iterate_pairs for counting.
  def subalignment(offset,length,regexp=Bio::Alignment::CIGAR.reference_operations)
    new_array = iterate_pairs(@pairs,offset,length,regexp)
    new_string = new_array.join(" ")
    # -1 from offset as ruby string starts at zero. The reference is optional,
    # so only slice it when one was supplied.
    sub_reference = @reference && @reference[offset-1,length]
    # Return a CIGAR instance with just the new alignment
    Bio::Alignment::CIGAR.new(new_string,sub_reference).remove_empty!
  end
  alias_method :slice, :subalignment

  # Given a CIGAR-based [not reference - use subalignment] offset and length, return a subregion
  def subcigar(offset,length)
    # Empty regexp - matches every operation
    self.subalignment(offset,length,//)
  end

  # As subalignment, but counting M, D and I operations.
  def unmasked(offset,length)
    self.subalignment(offset,length,/[MDI]/)
  end

  # Removes events of length <= threshold in place: deletions are converted to
  # matches, any other non-match operation is dropped. Returns self.
  def remove_small!(threshold=1)
    deletions_to_matches(threshold)
    remove_small_nonmatches(threshold)
    self
  end

  # Drops zero-length pairs in place. Returns self.
  def remove_empty!
    self.pairs.keep_if{|pair| pair[1] != 0 }
    self
  end

  # Total length of M operations.
  def matched_length
    count_type("M")
  end

  # Total length of D operations.
  def deleted_length
    count_type("D")
  end

  # Total length of I operations.
  def inserted_length
    count_type("I")
  end

  # Total length of S and H operations.
  def masked_length
    count_type(/[SH]/)
  end

  # Total length of operations matching the class-level reference_operations.
  def reference_length
    count_type(Bio::Alignment::CIGAR.reference_operations)
  end

  # Total length of operations matching the class-level query_operations.
  def query_length
    count_type(Bio::Alignment::CIGAR.query_operations)
  end

  # Output a representation of the query: matched stretches are copied (upcased)
  # from the reference, deleted bases become "-", and each insertion becomes
  # either the next entry of +insertions+ or, when none were given, "[n]" where
  # n is the insertion length.
  #
  # insertions - nil, a single String, or an array of sequences/symbols in order
  #              of appearance. The caller's array is not modified.
  #
  # Requires a reference sequence to have been supplied.
  # TODO: Add support for substitution highlighting (e.g lowercasing)
  def query(insertions=nil)
    if insertions
      # Work on a private copy: entries are consumed with shift below.
      insertions = insertions.is_a?(String) ? [insertions] : insertions.dup
    end
    sequence = []
    total = 0 # zero-based position in @reference
    @pairs.each do |operation, length|
      case operation
      when "M"
        sequence << @reference[total..total+length-1].upcase
        total += length
      when "I"
        sequence << (insertions ? insertions.shift.to_s : '['+length.to_s+']')
      when "D"
        sequence << "-" * length
        total += length
      end
    end
    sequence.join("")
  end

  # Output HGVS-style variant notation given the reference position of the base
  # before the alignment start. Only deletions can be accurately annotated from
  # the cigar string; wild type sequences return nil, and insertions are
  # reported as "N" unless their sequences are supplied.
  # NB mutation calling and annotation now implemented as extension to Bio::DB::Alignment (SAM)
  #
  # reference_pos - coordinate offset added to positions within the alignment.
  # insertions    - nil, a String, or an array of inserted sequences in order
  #                 of appearance. The caller's array is not modified.
  # type          - HGVS prefix, e.g. "g".
  # subs          - optional array of extra annotation strings (e.g.
  #                 substitutions); only the first extra argument is used.
  #
  # Returns a String such as "g.6_7delAG" or "g.[6_7delAG;13delT]", or nil.
  def hgnc(reference_pos=0,insertions=[],type="g",*subs)
    # Normalise to a private array: nil means "no sequences known".
    insertions =
      case insertions
      when nil then []
      when String then [insertions]
      else insertions.dup
      end
    total = 0 # zero-based position in @reference
    hgnc_format = []
    @pairs.each do |operation, length|
      case operation
      when "M"
        reference_pos += length
        total += length
      when "D"
        deleted_bases = @reference[total,length].upcase
        string = (reference_pos + 1).to_s
        string += "_" + (reference_pos + length).to_s unless length == 1
        hgnc_format << string + "del" + deleted_bases
        total += length
        # Deleted bases occupy reference coordinates too; without this step every
        # later event would be reported too far upstream.
        reference_pos += length
      when "I"
        inserted_bases = insertions.empty? ? "N" : insertions.shift
        hgnc_format << reference_pos.to_s + "_" + (reference_pos + 1).to_s + "ins" + inserted_bases.upcase
      end
    end
    # Use for substitutions, but could also pass any other annotation to include in here, as an array of strings
    extra = subs.first # >1 arguments discarded
    hgnc_format += extra if extra.is_a?(Array) && !extra.empty?

    if hgnc_format.empty?
      nil
    elsif hgnc_format.length == 1
      type.to_s + "." + hgnc_format[0]
    else
      type.to_s + "." + "[" + hgnc_format.join(";") + "]"
    end
  end

  # TODO combine adjacent operations of the same type into a single pair
  # Currently a stub: does nothing and returns nil.
  def combine_adjacent

  end

  # Returns a hash (keyed by operation type) of three element arrays: the start positions on the reference of operations of the given type(s) and the length of the operation,
  # followed by query position (for e.g. retrieving inserted bases from SAM). A regexp can be used to specify multiple types e.g. /[ID]/.
  def positions(type)
    total = 0
    qtotal = 0
    hash = Hash.new{|h,k| h[k] = []}
    @pairs.each do |operation, length|
      matched = operation.match(type)
      # Key by the matched text rather than relying on the $& global.
      hash[matched[0]] << [total, length, qtotal] if matched
      total += length if operation.match Bio::Alignment::CIGAR.reference_operations
      qtotal += length if operation.match Bio::Alignment::CIGAR.query_operations
    end
    hash
  end


  private

  # Relabels deletions of length <= threshold as matches (in place).
  def deletions_to_matches(threshold)
    self.pairs = @pairs.each{|pair| pair[0].sub!("D","M") if pair[1] <= threshold}
  end

  # Keeps matches of any length and other operations longer than threshold.
  def remove_small_nonmatches(threshold)
    self.pairs = @pairs.keep_if{|pair| pair[0] == "M" || pair[1] > threshold}
  end

  # Sums the lengths of all pairs whose operation matches type (String or Regexp).
  def count_type(type)
    @pairs.inject(0) do |sum, (operation, length)|
      operation.match(type) ? sum + length : sum
    end
  end

end #class