bio-sam-mutation 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 89bbe12a6c812fdc4726e74a758d3243575875ea
4
+ data.tar.gz: 651ffa24264f666bf86740d7f7043845e3eae5a7
5
+ SHA512:
6
+ metadata.gz: 3f27e36ba12e338179e310964e5867c3fdaedb8f799e50b49d06f65fe545d79223dc7f37ef6b88770d61d18b31b79c4edd4db58b5d77919f28c6587787db1a7b
7
+ data.tar.gz: 0904d6a55dd48f26ba20cbc9e11c65d00d1720e37b61cf9d15fe2f0b81afdf1c38b796ec265fc690f02b77abe6a39a6148da131dac9a42ed87e29424938875b1
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.0
4
+ - 2.1.0
5
+
6
+ # - rbx-19mode
7
+ # - 1.8.7
8
+ # - jruby-18mode # JRuby in 1.8 mode
9
+ # - rbx-18mode
10
+
11
+ # uncomment this line if your project needs to run something other than `rake`:
12
+ # script: bundle exec rspec spec
data/Gemfile ADDED
@@ -0,0 +1,21 @@
1
# Dependencies for bio-sam-mutation (read by Bundler and by Jeweler when building the gemspec).
# Use the TLS endpoint: gems fetched over plain HTTP can be tampered with in transit.
source "https://rubygems.org"

gem "bio", ">= 1.4.2"
# Using edge version due to a problem with ruby >2.1 in biogems version at time of writing
gem "bio-samtools", "~>2.3.4", git: "https://github.com/helios/bioruby-samtools.git", ref: "2e77274"
# JSON serialisation:
gem "oj", "~>2.14"
# At the time of writing, the released version 0.2.0 does not include the variation#vep_hgvs method
# so use this specific commit:
gem "bio-ensembl-rest", "0.2.0", git: "https://github.com/ALTree/bio-ensembl-rest.git", ref: "c934fa0"
gem "trollop"
gem "rake", "~>0.9"

group :development do
  gem "shoulda", ">= 0"
  gem "rdoc", "~> 3.12"
  gem "simplecov", ">= 0"
  gem "jeweler", "~> 2.0"
  gem "bundler"
  gem "test-unit", "~> 3.0"
end
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2015 stveep
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,88 @@
1
+ # bio-sam-mutation
2
+
3
+ [![Build Status](https://secure.travis-ci.org/stveep/bioruby-sam-mutation.png)](http://travis-ci.org/stveep/bioruby-sam-mutation)
4
+
5
+ * Methods for calling mutations from SAM alignments, including CIGAR and MD tag parsers.
6
+
7
+ * Annotates mutations in HGVS format: http://www.hgvs.org/mutnomen/recs.html.
8
+
9
+ * Incorporates Ensembl VEP lookup.
10
+
11
+ ## Installation
12
+
13
+ ```sh
14
+ gem install bio-sam-mutation
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ ```ruby
20
+ require 'bio-sam-mutation'
21
+
22
+ # NB must be tab-delimited
23
+ insertion_and_deletion = Bio::DB::Alignment.new("I2M5K:00253:00406\t0\t5\t112839854\t70\t63M2I138M1D27M7S\t*\t0\t0\tCAGTGATCTTCCAGATAGCCCTGGACAAACCATGCCACCAAGCAGAAGTAAAACACCTCCACCATACCTCCTCAAACAGCTCAAACCAAGCGAGAAGTACCTAAAAATAAAGCACCTACTGCTGAAAAGAGAGAGAGTGGACCTAAGCAAGCTGCAGTAAATGCTGCAGTTCAGAGGGTCCAGGTTCTTCCAGATGCTGATACTTATTACATTTTGCCACGGAAAGTACTGCTGAGG\t@CDDDCCCCACACCCCCCCC?CCACCCC>A6;;;;7;;6;6;BC;;6;;;;;.;;>ADDA??;;;;;?CCACCCD>C??@CCCC>C@C;>?CCCC@C=::@:::::+:::/:CCC?>>>>CCCCDDD9CCCC@AB????=AB>??;?BB>@@@AA???CC<@@?????BB>??;;;B<BC;??8;6:A=@=@BBB;;;?<77//*08*088888*8=9=?B7;;4;??????????<\tPG:Z:novoalign\tAS:i:183\tUQ:i:183\tNM:i:3\tMD:Z:201^T27")
24
+
25
+ insertion_and_deletion.mutations
26
+ #=> [#<Bio::Mutation:0x007fa20b5b4fc8 @position=112839916, @type=:insertion, @reference=nil, @mutant="AT", @seqname="5">, #<Bio::Mutation:0x007fa20b5b4960 @position=112840055, @type=:deletion, @reference="T", @mutant=nil, @seqname="5">]
27
+
28
+ insertion_and_deletion.mutations.first.to_hgvs("g")
29
+ #=> "5:g.112839916_112839917insAT"
30
+
31
+ puts YAML.dump(insertion_and_deletion.mutations.first.vep("human","g").first["transcript_consequences"].keep_if{|c| c["transcript_id"] == "ENST00000257430"})
32
+ #---
33
+ # - variant_allele: AT
34
+ # cdna_end: 4379
35
+ # codons: cca/ccATa
36
+ # protein_end: 1441
37
+ # strand: 1
38
+ # hgnc_id: HGNC:583
39
+ # amino_acids: P/PX
40
+ # gene_symbol: APC
41
+ # cdna_start: 4378
42
+ # transcript_id: ENST00000257430
43
+ # cds_start: 4322
44
+ # gene_id: ENSG00000134982
45
+ # protein_start: 1441
46
+ # biotype: protein_coding
47
+ # gene_symbol_source: HGNC
48
+ # cds_end: 4323
49
+ # consequence_terms:
50
+ # - frameshift_variant
51
+ # impact: HIGH
52
+ # => nil
53
+
54
+ # E.g. of full request return
55
+ # http://rest.ensembl.org/documentation/info/vep_hgvs_get
56
+ insertion_and_deletion.mutations(112839854).first.vep("human","g")
57
+ # => [{"assembly_name"=>"GRCh38", "end"=>112839917, "seq_region_name"=>"5", "transcript_consequences"=>[{"gene_id"=>"ENSG00000134982", "distance"=>46, "variant_allele"=>"AT", "biotype"=>"nonsense_mediated_decay", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000502371", "impact"=>"MODIFIER"}, {"variant_allele"=>"AT", "cdna_end"=>4380, "codons"=>"-/AT", "protein_end"=>1442, "strand"=>1, "hgnc_id"=>"HGNC:583", "amino_acids"=>"-/X", "gene_symbol"=>"APC", "cdna_start"=>4379, "transcript_id"=>"ENST00000257430", "cds_start"=>4323, "gene_id"=>"ENSG00000134982", "protein_start"=>1441, "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "cds_end"=>4324, "consequence_terms"=>["frameshift_variant"], "impact"=>"HIGH"}, {"gene_id"=>"ENSG00000134982", "distance"=>863, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000507379", "impact"=>"MODIFIER"}, {"variant_allele"=>"AT", "cdna_end"=>4481, "codons"=>"-/AT", "protein_end"=>1442, "strand"=>1, "hgnc_id"=>"HGNC:583", "amino_acids"=>"-/X", "gene_symbol"=>"APC", "cdna_start"=>4480, "transcript_id"=>"ENST00000508376", "cds_start"=>4323, "gene_id"=>"ENSG00000134982", "protein_start"=>1441, "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "cds_end"=>4324, "consequence_terms"=>["frameshift_variant"], "impact"=>"HIGH"}, {"gene_id"=>"ENSG00000134982", "distance"=>409, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000512211", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000134982", "variant_allele"=>"AT", "cdna_end"=>4569, "biotype"=>"nonsense_mediated_decay", 
"gene_symbol_source"=>"HGNC", "consequence_terms"=>["3_prime_UTR_variant", "NMD_transcript_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "cdna_start"=>4568, "transcript_id"=>"ENST00000508624", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000258864", "variant_allele"=>"AT", "biotype"=>"nonsense_mediated_decay", "gene_symbol_source"=>"Clone_based_vega_gene", "consequence_terms"=>["intron_variant", "NMD_transcript_variant"], "strand"=>1, "gene_symbol"=>"CTC-554D6.1", "transcript_id"=>"ENST00000520401", "impact"=>"MODIFIER"}, {"gene_id"=>"ENSG00000134982", "distance"=>2195, "variant_allele"=>"AT", "biotype"=>"protein_coding", "gene_symbol_source"=>"HGNC", "consequence_terms"=>["downstream_gene_variant"], "strand"=>1, "hgnc_id"=>"HGNC:583", "gene_symbol"=>"APC", "transcript_id"=>"ENST00000504915", "impact"=>"MODIFIER"}], "strand"=>1, "id"=>"5:g.112839917_112839918insAT", "allele_string"=>"-/AT", "most_severe_consequence"=>"frameshift_variant", "start"=>112839918}]
58
+
59
+
60
+
61
+ ```
62
+
63
+ The API doc is online. For more code examples see the test files in
64
+ the source tree.
65
+
66
+ ## Project home page
67
+
68
+ For information on the source tree, documentation, examples, issues and
69
+ how to contribute, see
70
+
71
+ http://github.com/stveep/bioruby-sam-mutation
72
+
73
+ The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
74
+
75
+ ## Cite
76
+
77
+ If you use this software, please cite one of
78
+
79
+ * [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
80
+ * [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
81
+
82
+ ## Biogems.info
83
+
84
+ This Biogem is published at (http://biogems.info/index.html#bio-sam)
85
+
86
+ ## Copyright
87
+
88
+ Copyright (c) 2015 stveep. See LICENSE.txt for further details.
@@ -0,0 +1,48 @@
1
+ = bio-sam
2
+
3
+ {<img
4
+ src="https://secure.travis-ci.org/stveep/bioruby-sam.png"
5
+ />}[http://travis-ci.org/#!/stveep/bioruby-sam]
6
+
7
+ Full description goes here
8
+
9
+ Note: this software is under active development!
10
+
11
+ == Installation
12
+
13
+ gem install bio-sam
14
+
15
+ == Usage
16
+
17
+ == Developers
18
+
19
+ To use the library
20
+
21
+ require 'bio-sam'
22
+
23
+ The API doc is online. For more code examples see also the test files in
24
+ the source tree.
25
+
26
+ == Project home page
27
+
28
+ For information on the source tree, documentation, issues and how to contribute, see
29
+
30
+ http://github.com/stveep/bioruby-sam
31
+
32
+ The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
33
+
34
+ == Cite
35
+
36
+ If you use this software, please cite one of
37
+
38
+ * {BioRuby: bioinformatics software for the Ruby programming language}[http://dx.doi.org/10.1093/bioinformatics/btq475]
39
+ * {Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics}[http://dx.doi.org/10.1093/bioinformatics/bts080]
40
+
41
+ == Biogems.info
42
+
43
+ This Biogem is published at http://biogems.info/index.html#bio-sam
44
+
45
+ == Copyright
46
+
47
+ Copyright (c) 2015 stveep. See LICENSE.txt for further details.
48
+
@@ -0,0 +1,54 @@
1
# encoding: utf-8

# Build, test and documentation tasks for the bio-sam-mutation gem.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Single place for the release number so the gemspec and the rdoc title agree.
release_version = '0.4.1'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "bio-sam-mutation"
  gem.version = release_version
  gem.homepage = "http://github.com/stveep/bioruby-sam-mutation"
  gem.license = "MIT"
  gem.summary = %Q{Parsing and mutation calling from SAM, CIGAR and MD:Z.}
  gem.description = %Q{Simple classes for parsing SAM, CIGAR and MD:Z strings, including slices. Methods for calling mutations in HGVS format and looking up consequences using Ensembl VEP REST API. Developed for calling mutations at an expected position in an alignment - e.g. Amplicon sequencing of CRISPR-induced mutations.}
  gem.email = "spettitt@gmail.com"
  gem.authors = ["Stephen Pettitt"]
  # dependencies defined in Gemfile
  # gem.required_ruby_version = '>= 1.9.3'
  gem.executables << "mutations"
end
Jeweler::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc "Code coverage detail"
task :simplecov do
  # The test helper is expected to start SimpleCov when COVERAGE is set
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task :default => :test

require 'rdoc/task'
# RDoc::Task is the class provided by rdoc/task (Rake::RDocTask is only a legacy alias).
RDoc::Task.new do |rdoc|
  # A VERSION file, when present, still wins; otherwise fall back to the gemspec
  # version instead of producing a title with no version at all.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : release_version

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-sam-mutation #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
@@ -0,0 +1,108 @@
1
#!/usr/bin/env ruby
# Command-line entry point for bio-sam-mutation.
#   --tag    : tag SAM records (files or a piped stream) with mutation calls
#   --report : write "<product>-report.txt" per configured product, listing the
#              alleles seen in each input file with read counts and VEP annotation
$LOAD_PATH.unshift File.join(__dir__, '..', 'lib')
require 'trollop'
require 'bio-sam-mutation'
# (A leftover `require 'pry'` debugging hook was dropped: pry is not a declared
# dependency, so it made the executable fail with LoadError on a clean install.)

opts = Trollop::options do
  opt :config, "Configuration file in YAML format. Defaults to ./config.yml. Run with --example-config for example.", type: :string, default: "config.yml"
  opt :example_config, "Show example configuration file."
  opt :output, "Output file, works with single product only. Better to define in config file.", type: :string, default: "output.sam"
  opt :tag, "Tag an input SAM file (or piped stream of SAM data) with mutation calls"
  opt :report, "Produce a report by amplicon for each file given, including read counts and annotation for each allele. (does not work on a stream)"
  opt :mincov, "Minimum fraction of reads to report a mutant allele.", default: 0.01
  opt :flag, "Minimum reads to flag a potential mutant (= no wild type).", default: 20
end
# FIXME(review): --example-config is declared above but nothing in this script acts on it.

raise "Cannot use report and tag simultaneously." if opts[:report] && opts[:tag]

# Trollop removes and parses the options, leaving either input files or the incoming stream.
# opts[:config] always has a value (default "config.yml"), so test for the file itself:
# a missing default simply means "no configuration", a missing explicit file is an error.
config =
  if File.exist?(opts[:config])
    YAML.load_file(opts[:config]) || {} # an empty YAML file loads as false/nil
  elsif opts[:config_given]
    raise "Configuration file not found: #{opts[:config]}"
  else
    {}
  end

if opts[:tag]
  outfile = nil
  if config.key?(:products)
    config[:products].each do |product_name, config_hash|
      # NOTE(review): defaults are stored at the top level of config rather than
      # under :products - confirm this is what construct_products expects.
      config[product_name] = MutationsCLI.set_defaults(config_hash)
    end
    config = MutationsCLI.construct_products(config)
  else
    config[:single_product] = true
    config[:output] ||= opts[:output]
    outfile = config[:outfile] = File.open(config[:output],'w')
  end

  begin
    ARGF.each do |input|
      MutationsCLI.tag input, config
    end
  ensure
    # Flush and release the single-product output file
    outfile.close if outfile
  end
end

if opts[:report]
  products = config[:products]
  raise "No :products found in #{opts[:config]}; --report needs at least one product definition." unless products

  products.each do |product_name, config_hash|
    config_hash = MutationsCLI.set_defaults(config_hash)
    # Consequences already resolved for this product, keyed by HGVS string, so
    # the same allele seen in several files costs one VEP request only.
    lookups = {}
    mutants = []
    # Block form closes the report even if an input file fails part-way through
    File.open("#{product_name}-report.txt",'w') do |out|
      ARGV.each do |file|
        calls = Hash.new{|h,k| h[k] = MutantAllele.new}
        File.foreach(file) do |input|
          next if input.start_with?("@") # skip sam headers
          sam = Bio::DB::Alignment.new(input)
          # Check correct start in case of file with mixed amplicons
          if config_hash[:start]
            next unless sam.seq.match(Regexp.new("^"+config_hash[:start]))
          end
          # Must have sufficient mapped length to call mutations in the given interval:
          next if sam.query_unmapped
          next if config_hash[:length] > Bio::Alignment::CIGAR.new(sam.cigar).reference_length - config_hash[:offset]
          key = "w.t."
          muts = MutationsCLI.call_mutations_given_product(sam, config_hash)
          if muts
            key = muts.to_hgvs
            calls[key].mutations ||= muts
          end
          calls[key].seq ||= sam.query(config_hash[:offset], config_hash[:length])
          calls[key].example ||= sam
          calls[key].count += 1
        end

        # Seed with 0 so a file without usable reads yields 0 rather than nil
        total_reads = calls.values.map{|allele| allele.count}.reduce(0, :+)
        out.puts file
        out.puts "Total reads: #{total_reads}"
        threshold = opts[:mincov] * total_reads
        calls.keep_if{|k, v| v.count > threshold}
        # Parenthesised on purpose: `include? "w.t." || x` parses as
        # `include?("w.t." || x)`, which silently ignored the --flag minimum.
        mutants << file unless calls.key?("w.t.") || total_reads < opts[:flag]
        calls = calls.sort_by{|k,v| v.count}.reverse.to_h
        calls.each do |key, allele|
          formatted = nil
          if allele.mutations
            hgvs = allele.mutations.to_hgvs
            if allele.mutations.size == 1 # VEP lookup doesn't work for compound mutations
              result = lookups[hgvs]
              unless result
                # Best effort: a failed lookup is reported in the output, not fatal
                vep = begin
                  allele.lookup
                rescue RuntimeError
                  nil
                end
                if vep
                  result = VepHgvs.consequences_for_transcript(vep,config_hash[:transcript]).first
                  lookups[hgvs] = result if result
                else
                  formatted = "No VEP result"
                end
              end
              if result
                formatted = [[result["CDS position"].to_s,result["Allele"]].join(" "),[result["Protein start"].to_s,result["Mutation"]].join(" "),result["Consequence"]].join("\t")
              end
            else
              formatted = "Compound mutant"
            end
          end
          formatted ||= "No mutation"
          out.puts([allele.seq, allele.count, key, formatted].join("\t"))
        end
        out.puts "\n===================================="
      end
      out.puts "Files with wild type below threshold:"
      out.puts mutants.join("\n")
    end
  end
end
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+ require 'bio-sam-mutation'
3
+ require 'thor'
4
+
5
# Thor command set for annotating SAM files from the shell.
class SamMutationCLI < Thor
  desc "tag FILE_NAME [--config=config.yml]", "Tag a SAM file with HGVS annotations."
  option :config
  # Print every alignment of +file_name+ that carries mutations, with a YH tag
  # holding their HGVS description appended. Header lines (starting with "@")
  # and alignments without mutations are not printed.
  # NOTE(review): the --config option is accepted but not read here.
  def tag(file_name)
    # foreach closes the file when done; File.open(...).each left the handle open
    File.foreach(file_name) do |line|
      next if line.start_with?("@")
      sam = Bio::DB::Alignment.new(line)
      mutations = sam.mutations
      next unless mutations
      new_tag = Bio::DB::Tag.new("YH:m:"+mutations.to_hgvs)
      puts sam.add_tag(new_tag)
    end
  end
end
19
+
20
+ SamMutationCLI.start ARGV
@@ -0,0 +1,26 @@
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio'
12
+ require 'bio-ensembl-rest'
13
+ require 'bio-samtools'
14
+ require 'oj'
15
+ require 'yaml'
16
+ require 'bio-sam-mutation/bio/db/alignment'
17
+ require 'bio-sam-mutation/bio/alignment/iterate_pairs'
18
+ require 'bio-sam-mutation/bio/alignment/cigar'
19
+ require 'bio-sam-mutation/bio/db/tag'
20
+ require 'bio-sam-mutation/bio/db/tag/md'
21
+ require 'bio-sam-mutation/bio/vephgvs'
22
+ require 'bio-sam-mutation/bio/mutation'
23
+ require 'bio-sam-mutation/bio/mutation_array'
24
+ require 'bio-sam-mutation/bio/mutantallele'
25
+
26
+ require 'bio-sam-mutation/mutationscli'
@@ -0,0 +1,239 @@
1
+ # Parse a CIGAR string
2
+ # An example from Exonerate output. Ideally will also allow SAM file input to be used.
3
+ # 1 : CGGCTATGGGGTCGTGGGTCCCGCGTTG-CTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 63
4
+ # | | || | ||||||||||||||||||| |||||||||||||||||||||||||||||||||||
5
+ # 1 : CAG-TA-GTGGTCGTGGGTCCCGCGTTGTCTCTGGGGCTCGGCACCCTGGGGCGGCACGGCCGT : 62
6
+ #
7
+ # ref: CAGTAGTGGTCGTGGGTCCCGCGTTGTCTCTGG...
8
+ # cigar: SP-A12_D02_2015-01-16.seq 0 611 + SP-A3_ref 0 621 + 2514 M 3 I 1 M 2 I 1 M 21 D 1 M 306 D 9 M 89 I 1 M 126 D 1 M 24 D 1 M 8 I 1 M 6 D 1 M 5 D 1 M 17
9
+ # I are not counted in reference
10
+ # Regexp (from SAM specification) - but in exonerate the number comes first: ([0-9]+[MIDNSHP])+|\*
11
+
12
+ class Bio::Alignment::CIGAR
13
+ include Bio::Alignment::IteratePairs
14
+
15
+ class << self
16
+ attr_accessor :reference_operations, :query_operations, :subexp, :regexps
17
+ end
18
+
19
+ self.regexps = {"exonerate" => /([MIDNSHP]{1})(\d+)/, "sam" => /(\d+)([MIDNSHP]{1})/}
20
+ # Type of elements that count towards the reference length:
21
+ # TODO: add full support for other elements S, H etc.
22
+ self.reference_operations = /[MD]/
23
+ self.query_operations = /[MIS]/
24
+ self.subexp = /([atgcAGCT]+)>([atgcAGTC]+)/
25
+ attr_accessor :pairs, :reference
26
+
27
+ def initialize(string,ref=nil,source="")
28
+ # strip out whitespace
29
+ string.gsub!(/\s+/,"")
30
+
31
+ # Auto-detect source if not supplied
32
+ if !(Bio::Alignment::CIGAR.regexps.keys.include? source)
33
+ Bio::Alignment::CIGAR.regexps.each do |k,v|
34
+ # Look for match at start of string
35
+ if m = string.match(v)
36
+ source = k if m.offset(0)[0] == 0
37
+ end
38
+ end
39
+ if source == ""
40
+ raise "Source (e.g. 'exonerate', 'sam') not given and failed to auto-detect."
41
+ end
42
+ end
43
+ # Make an array of pairs of of cigar elements:
44
+ @pairs = string.scan(Bio::Alignment::CIGAR.regexps[source])
45
+ if source == "exonerate"
46
+ @pairs.map!{|pair| [pair[0].to_s, pair[1].to_i]}
47
+ else
48
+ # Provision to have number and identifier the other way round
49
+ @pairs.map!{|pair| [pair[1].to_s, pair[0].to_i]}
50
+ end
51
+
52
+ # Include reference sequence if provided
53
+ @reference = ref
54
+ # Check length of reference = sum(M+D)?
55
+ #warn "Reference length is not equal to that implied by CIGAR string: #{@reference.length}, #{self.reference_length}." unless @reference.length == self.reference_length
56
+
57
+ end
58
+
59
+ # Given an offset in reference sequence and length, return an object corresponding to that subregion of the alignment
60
+ def subalignment(offset,length,regexp=Bio::Alignment::CIGAR.reference_operations)
61
+ new_array = iterate_pairs(@pairs,offset,length,regexp)
62
+ # Return a CIGAR instance with just the new alignment
63
+ new_string = new_array.join(" ")
64
+ # -1 from offset as ruby string starts at zero
65
+ new_cigar = Bio::Alignment::CIGAR.new(new_string,@reference[offset-1,length])
66
+ new_cigar.remove_empty!
67
+ end
68
+ alias_method :slice, :subalignment
69
+
70
  # Given a CIGAR-based [not reference - use subalignment] offset and length, return a subregion.
  # Delegates to #subalignment with the empty regexp //, which matches every
  # operation letter.
  def subcigar(offset,length)
    # No regexp - includes everything
    self.subalignment(offset,length,//)
  end
75
+
76
  # Return a subregion via #subalignment, passing /[MDI]/ as the operation
  # filter (S and H are not in the set).
  # NOTE(review): presumably offset and length then count M, D and I operations
  # only - confirm against iterate_pairs.
  def unmasked(offset,length)
    self.subalignment(offset,length,/[MDI]/)
  end
79
+
80
  # Simplify the alignment in place: D operations of length <= threshold are
  # relabelled as M, then any remaining non-M operation of length <= threshold
  # is dropped. Returns self.
  def remove_small!(threshold=1)
    # Deletions convert to matches, insertions just remove
    deletions_to_matches(threshold)
    remove_small_nonmatches(threshold)
    self
  end
86
+
87
+ def remove_empty!
88
+ self.pairs.keep_if{|pair| pair[1] != 0 }
89
+ self
90
+ end
91
+
92
  # Summed length of all match (M) operations.
  def matched_length
    count_type("M")
  end
95
+
96
  # Summed length of all deletion (D) operations.
  def deleted_length
    count_type("D")
  end
99
+
100
  # Summed length of all insertion (I) operations.
  def inserted_length
    count_type("I")
  end
103
+
104
  # Summed length of all clipping operations (S and H).
  def masked_length
    count_type(/[SH]/)
  end
107
+
108
  # Summed length of the operations matched by the class-level
  # reference_operations pattern.
  def reference_length
    count_type(Bio::Alignment::CIGAR.reference_operations)
  end
111
+
112
  # Summed length of the operations matched by the class-level
  # query_operations pattern.
  def query_length
    count_type(Bio::Alignment::CIGAR.query_operations)
  end
115
+
116
+ # Output a representation of the query: replace deleted portions with "-", flag insertions with "*" or sim. Optionally provide the sequence (or symbols to use) of insertions, in order of appearence.
117
+ # Should be able to accept an array
118
+ # TODO: Add support for substitution highlighting (e.g lowercasing)
119
+ def query(insertions=nil)
120
+ if (insertions && (insertions.is_a? String))
121
+ insertions = [insertions]
122
+ end
123
+ sequence = []
124
+ total = 0
125
+ @pairs.each do |pair|
126
+ if pair[0].match("M")
127
+ sequence << @reference[total..total+pair[1]-1].upcase
128
+ total += pair[1]
129
+ end
130
+ if pair[0].match("I")
131
+ if (insertions)
132
+ insertion = insertions.shift.to_s
133
+ else
134
+ insertion = '['+pair[1].to_s+']'
135
+ end
136
+ sequence << insertion
137
+ end
138
+ if pair[0].match("D")
139
+ pair[1].times{ sequence << "-" }
140
+ total += pair[1]
141
+ end
142
+ end
143
+ sequence.join("")
144
+ end
145
+
146
+ # Output hgnc variant format given reference position. Only deletions can be accurately annotated from the cigar string; insertions or wild type seqeunces return nil
147
+ # NB mutation calling and annotation now implemented as extension to Bio::DB::Alignment (SAM)
148
+ def hgnc(reference_pos=0,insertions=[],type="g",*subs)
149
+ if insertions
150
+ if insertions.is_a? String
151
+ insertions = [insertions]
152
+ end
153
+ end
154
+ first_match = true
155
+ total = 0
156
+ hgnc_format = []
157
+ @pairs.each do |pair|
158
+ case pair[0]
159
+ when "M"
160
+ #break if first_match == false
161
+ reference_pos += pair[1]
162
+ total += pair[1]
163
+ first_match = false
164
+ when "D"
165
+ deleted_bases = @reference[total,pair[1]].upcase
166
+ if (pair[1] == 1)
167
+ string = (reference_pos + 1).to_s
168
+ else
169
+ string = (reference_pos + 1).to_s + "_" + (reference_pos + pair[1]).to_s
170
+ end
171
+ string = string + "del" + deleted_bases
172
+ hgnc_format << string
173
+ total += pair[1]
174
+ when "I"
175
+ inserted_bases = (insertions.length == 0) ? "N" : insertions.shift
176
+
177
+ hgnc_format << (reference_pos).to_s + "_" + (reference_pos + 1).to_s + "ins" + inserted_bases.upcase
178
+ end
179
+ end
180
+ # Use for substitutions, but could also pass any other annotation to include in here, as an array of strings
181
+ subs = subs.first # >1 arguments discarded
182
+ if subs
183
+ if (subs.length > 0 && (subs.is_a? Array))
184
+ hgnc_format = hgnc_format + subs
185
+ end
186
+ end
187
+ if hgnc_format.length == 0
188
+ nil
189
+ elsif hgnc_format.length == 1
190
+ type.to_s + "." + hgnc_format[0]
191
+ else
192
+ type.to_s + "." + "[" + hgnc_format.join(";") + "]"
193
+ end
194
+
195
+ end
196
+
197
  # TODO combine adjacent operations of the same type into a single pair
  # Not implemented yet: the body is empty, so a call returns nil and leaves
  # @pairs unchanged.
  def combine_adjacent

  end
201
+
202
+ # Returns a hash (keyed by operation type) of three element arrays: the start positions on the reference of operations of the given type(s) and the length of the operation,
203
+ # followed by query position (for e.g. retrieving inserted bases from SAM). A regexp can be used to specify multiple types e.g. /[ID]/.
204
+ def positions(type)
205
+ total = 0
206
+ qtotal = 0
207
+ hash = Hash.new{|h,k| h[k] = []}
208
+ @pairs.each do |pair|
209
+ if pair[0].match(type)
210
+ hash[$&] << [total, pair[1], qtotal]
211
+ end
212
+ total += pair[1] if pair[0].match Bio::Alignment::CIGAR.reference_operations
213
+ qtotal += pair[1] if pair[0].match Bio::Alignment::CIGAR.query_operations
214
+ end
215
+ hash
216
+ end
217
+
218
+
219
+ private
220
+
221
+ def deletions_to_matches(threshold)
222
+ self.pairs = @pairs.each{|pair| pair[0].sub!("D","M") if pair[1] <= threshold}
223
+ end
224
+
225
+ def remove_small_nonmatches(threshold)
226
+ self.pairs = @pairs.keep_if{|pair| pair[0] == "M" || pair[1] > threshold}
227
+ end
228
+
229
+ def count_type(type)
230
+ sum = 0
231
+ @pairs.each do |pair|
232
+ if pair[0].match(type)
233
+ sum += pair[1]
234
+ end
235
+ end
236
+ sum
237
+ end
238
+
239
+ end #class