bio-polyploid-tools 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +16 -0
  3. data/Gemfile.lock +67 -0
  4. data/README +21 -0
  5. data/Rakefile +61 -0
  6. data/VERSION +1 -0
  7. data/bin/bfr.rb +133 -0
  8. data/bin/count_variations.rb +36 -0
  9. data/bin/filter_blat_by_target_coverage.rb +15 -0
  10. data/bin/find_best_blat_hit.rb +32 -0
  11. data/bin/hexaploid_primers.rb +168 -0
  12. data/bin/homokaryot_primers.rb +155 -0
  13. data/bin/map_markers_to_contigs.rb +66 -0
  14. data/bin/markers_in_region.rb +42 -0
  15. data/bin/polymarker.rb +219 -0
  16. data/bin/snps_between_bams.rb +106 -0
  17. data/bio-polyploid-tools.gemspec +139 -0
  18. data/conf/defaults.rb +1 -0
  19. data/conf/primer3_config/dangle.dh +128 -0
  20. data/conf/primer3_config/dangle.ds +128 -0
  21. data/conf/primer3_config/interpretations/dangle_i.dh +131 -0
  22. data/conf/primer3_config/interpretations/dangle_i.ds +131 -0
  23. data/conf/primer3_config/interpretations/loops_i.dh +34 -0
  24. data/conf/primer3_config/interpretations/loops_i.ds +31 -0
  25. data/conf/primer3_config/interpretations/stack_i.dh +257 -0
  26. data/conf/primer3_config/interpretations/stack_i.ds +256 -0
  27. data/conf/primer3_config/interpretations/stackmm_i_mm.dh +257 -0
  28. data/conf/primer3_config/interpretations/stackmm_i_mm.ds +256 -0
  29. data/conf/primer3_config/interpretations/tetraloop_i.dh +79 -0
  30. data/conf/primer3_config/interpretations/tetraloop_i.ds +81 -0
  31. data/conf/primer3_config/interpretations/triloop_i.dh +21 -0
  32. data/conf/primer3_config/interpretations/triloop_i.ds +18 -0
  33. data/conf/primer3_config/interpretations/tstack2_i.dh +256 -0
  34. data/conf/primer3_config/interpretations/tstack2_i.ds +256 -0
  35. data/conf/primer3_config/interpretations/tstack_i.dh +256 -0
  36. data/conf/primer3_config/interpretations/tstack_i.ds +256 -0
  37. data/conf/primer3_config/interpretations/tstack_tm_inf_i.dh +256 -0
  38. data/conf/primer3_config/interpretations/tstack_tm_inf_i.ds +256 -0
  39. data/conf/primer3_config/loops.dh +30 -0
  40. data/conf/primer3_config/loops.ds +30 -0
  41. data/conf/primer3_config/stack.dh +256 -0
  42. data/conf/primer3_config/stack.ds +256 -0
  43. data/conf/primer3_config/stackmm.dh +256 -0
  44. data/conf/primer3_config/stackmm.ds +256 -0
  45. data/conf/primer3_config/tetraloop.dh +77 -0
  46. data/conf/primer3_config/tetraloop.ds +77 -0
  47. data/conf/primer3_config/triloop.dh +16 -0
  48. data/conf/primer3_config/triloop.ds +16 -0
  49. data/conf/primer3_config/tstack.dh +256 -0
  50. data/conf/primer3_config/tstack2.dh +256 -0
  51. data/conf/primer3_config/tstack2.ds +256 -0
  52. data/conf/primer3_config/tstack_tm_inf.ds +256 -0
  53. data/lib/bio/BFRTools.rb +698 -0
  54. data/lib/bio/BIOExtensions.rb +186 -0
  55. data/lib/bio/PolyploidTools/ChromosomeArm.rb +52 -0
  56. data/lib/bio/PolyploidTools/ExonContainer.rb +194 -0
  57. data/lib/bio/PolyploidTools/Marker.rb +175 -0
  58. data/lib/bio/PolyploidTools/PrimerRegion.rb +22 -0
  59. data/lib/bio/PolyploidTools/SNP.rb +681 -0
  60. data/lib/bio/PolyploidTools/SNPSequence.rb +56 -0
  61. data/lib/bio/SAMToolsExtensions.rb +284 -0
  62. data/lib/bio/db/exonerate.rb +272 -0
  63. data/lib/bio/db/fastadb.rb +164 -0
  64. data/lib/bio/db/primer3.rb +673 -0
  65. data/lib/bioruby-polyploid-tools.rb +25 -0
  66. data/test/data/BS00068396_51.fa +2 -0
  67. data/test/data/BS00068396_51_contigs.aln +1412 -0
  68. data/test/data/BS00068396_51_contigs.dnd +7 -0
  69. data/test/data/BS00068396_51_contigs.fa +8 -0
  70. data/test/data/BS00068396_51_exonerate.tab +6 -0
  71. data/test/data/BS00068396_51_genes.txt +14 -0
  72. data/test/data/LIB1716.bam +0 -0
  73. data/test/data/LIB1716.bam.bai +0 -0
  74. data/test/data/LIB1719.bam +0 -0
  75. data/test/data/LIB1719.bam.bai +0 -0
  76. data/test/data/LIB1721.bam +0 -0
  77. data/test/data/LIB1721.bam.bai +0 -0
  78. data/test/data/LIB1722.bam +0 -0
  79. data/test/data/LIB1722.bam.bai +0 -0
  80. data/test/data/S22380157.fa +16 -0
  81. data/test/data/S22380157.fa.fai +1 -0
  82. data/test/data/Test3Aspecific.csv +1 -0
  83. data/test/data/Test3Aspecific_contigs.fa +6 -0
  84. data/test/data/patological_cases5D.csv +1 -0
  85. data/test/data/short_primer_design_test.csv +10 -0
  86. data/test/data/test_primer3_error.csv +4 -0
  87. data/test/data/test_primer3_error_contigs.fa +10 -0
  88. data/test/test_bfr.rb +51 -0
  89. data/test/test_exon_container.rb +17 -0
  90. data/test/test_exonearate.rb +53 -0
  91. data/test/test_snp_parsing.rb +40 -0
  92. metadata +201 -0
@@ -0,0 +1,186 @@
1
+
2
# Adds an IUPAC nucleotide lookup table to BioRuby's nucleic-acid data module.
# Every key and value is a lowercase String.
module Bio::NucleicAcid::Data
  # Ambiguity code => the bases it stands for, in alphabetical order.
  ambiguity_to_bases = {
    'y' => 'ct',
    'r' => 'ag',
    'w' => 'at',
    's' => 'cg',
    'k' => 'gt',
    'm' => 'ac',
    'b' => 'cgt',
    'd' => 'agt',
    'h' => 'act',
    'v' => 'acg',
    'n' => 'acgt'
  }

  # Plain bases map to themselves.
  plain_bases = {
    'a' => 'a',
    't' => 't',
    'g' => 'g',
    'c' => 'c',
    'u' => 'u'
  }

  # Two-way table: ambiguity code => bases, base => itself, and
  # sorted base set => ambiguity code (the inverse of the first group).
  IUPAC_CODES = ambiguity_to_bases.merge(plain_bases).merge(ambiguity_to_bases.invert)
end
42
+
43
# Helpers added to BioRuby's Bio::NucleicAcid for working with IUPAC
# ambiguity codes.
class Bio::NucleicAcid

  # Two-way lookup table (all lowercase): ambiguity code => bases,
  # plain base => itself, and sorted base set => ambiguity code.
  IUPAC_CODES = {

    'y' => 'ct',
    'r' => 'ag',
    'w' => 'at',
    's' => 'cg',
    'k' => 'gt',
    'm' => 'ac',

    'b' => 'cgt',
    'd' => 'agt',
    'h' => 'act',
    'v' => 'acg',

    'n' => 'acgt',

    'a' => 'a',
    't' => 't',
    'g' => 'g',
    'c' => 'c',
    'u' => 'u',

    'ct' => 'y',
    'ag' => 'r',
    'at' => 'w',
    'cg' => 's',
    'gt' => 'k',
    'ac' => 'm',

    'cgt' => 'b',
    'agt' => 'd',
    'act' => 'h',
    'acg' => 'v',

    'acgt' => 'n'
  }

  # Tells whether +base+ occurs in "acgtACGT".
  #
  # base:: the text to look for (normally a single character).
  #
  # Returns a MatchData (truthy) when found, nil otherwise.
  def self.is_unambiguous(base)
    # String#match compiles a String argument as a regular expression, so
    # "." would match anything and "*" or "[" would raise RegexpError.
    # Escaping makes the lookup literal. Non-String arguments are passed
    # through untouched so they behave exactly as before.
    pattern = base.is_a?(String) ? Regexp.escape(base) : base
    "acgtACGT".match(pattern)
  end

  # Collapses a set of bases into a single uppercase IUPAC code.
  #
  # bases:: anything whose to_s yields the bases, e.g. "AG" or "ga".
  #         Case, order and duplicates are ignored.
  #
  # Returns the uppercase code; falls back to "N" (after printing a
  # message with Kernel#p) when the combination is not in IUPAC_CODES.
  def self.to_IUAPC(bases)
    base = IUPAC_CODES[bases.to_s.downcase.chars.sort.uniq.join]
    if base.nil?
      # Report the offending input; the lookup result is always nil here.
      p "Invalid base! #{bases}"
      base = 'n' #This is a patch... as one of the scripts failed here.
    end
    base.upcase
  end

  # Tells whether +base+ is one of the characters IUPAC_CODES lists for +code+.
  #
  # code:: key to look up in IUPAC_CODES (case-insensitive).
  # base:: single character to test (case-insensitive).
  #
  # Returns true/false; false when +code+ is not in the table.
  def self.is_valid(code, base)
    allowed = IUPAC_CODES[code.downcase]
    return false if allowed.nil?
    allowed.chars.include?(base.downcase)
  end

end
100
+
101
class Bio::Sequence
  # Counts the positions at which two sequences differ.
  #
  # Only indexes 0...seq1.size are compared; each element is fetched with
  # [] and compared with !=.
  #
  # Returns the number of differing positions as an Integer.
  def self.snps_between(seq1, seq2)
    (0...seq1.size).count { |pos| seq1[pos] != seq2[pos] }
  end
end
110
+
111
# Nucleotide-oriented helpers monkey-patched onto String.
class String
  # Number of characters for which Bio::NucleicAcid.is_unambiguous
  # returns a falsy value.
  def count_ambiguities
    each_char.count { |char| !Bio::NucleicAcid.is_unambiguous(char) }
  end

  # Length of the run of characters outside A-Z at the very start of the
  # string (0 when the first character is an uppercase ASCII letter).
  #
  # NOTE(review): the name suggests a count of uppercase letters, which is
  # not what the pattern computes - confirm which behaviour callers expect.
  def upper_case_count
    self[/[^A-Z]*/].size
  end
end
126
+
127
class Bio::Blat
  # Runs the external `blat` program and parses the hits it writes.
  #
  # database:: first argument given to blat.
  # query::    second argument given to blat.
  # output::   third argument given to blat; read back afterwards.
  #
  # With a block, yields every hit and returns nil. Without a block,
  # returns an Array with all the hits.
  #
  # Raises Exception when blat exits with a non-zero status.
  #
  # NOTE(review): the three arguments are interpolated into a shell command
  # unescaped - do not pass untrusted values.
  def self.align(database , query , output)
    cmdline = "blat #{database} #{query} #{output}"
    # Log the command on stderr only (no stray blank line on stdout).
    $stderr.puts cmdline
    status, stdout, stderr = systemu cmdline
    unless status.exitstatus == 0
      raise Exception, "Error running blat. Command line was '#{cmdline}'\nBlat STDERR was:\n#{stderr}"
    end

    hits = block_given? ? nil : Array.new
    blat_aln = Bio::Blat::Report.new(Bio::FlatFile.open(output).to_io)
    blat_aln.each_hit() do |hit|
      if block_given?
        yield hit
      else
        hits << hit
      end
    end
    hits
  end
end
149
+
150
# Accessors for hits whose target id follows the pattern
# IWGSC_CSS_1AL_scaff_110 (third '_'-separated field = chromosome arm).
class Bio::Blat::Report::Hit

  # Third '_'-separated field of target_id, e.g. "1AL" for
  # IWGSC_CSS_1AL_scaff_110. Cached after the first successful lookup;
  # nil when target_id has fewer than three fields.
  def wheat_chr_arm
    @wheat_chr_arm ||= target_id.split('_')[2]
  end

  # First two characters of the arm, e.g. "1A".
  def wheat_chr
    wheat_chr_arm[0,2]
  end

  # First character of the arm, e.g. "1".
  #
  # Raises Exception when no arm can be extracted from target_id.
  def wheat_chr_group
    # Test the arm itself: calling wheat_chr on a nil arm would fail with
    # NoMethodError before this message could ever be raised.
    raise Exception, "No wheat group for #{target_id} #{self.inspect}" unless wheat_chr_arm
    wheat_chr_arm[0]
  end

  # Second character of the arm, e.g. "A".
  def wheat_genome
    wheat_chr_arm[1]
  end

  # Third character of the arm, e.g. "L".
  def wheat_arm
    wheat_chr_arm[2]
  end

  # (match + mismatch) as a percentage of query_len (Float).
  def percentage_covered
    ( match + mismatch ) * 100.0 / query_len.to_f
  end

end
180
+
181
+
182
class Hash
  # Renders the hash as one String.
  #
  # keyvaldelim:: separator placed between a key and its value.
  # entrydelim::  separator placed between entries.
  #
  # Both default to the global output field separator ($,).
  def join(keyvaldelim=$,, entrydelim=$,)
    rendered_entries = each_pair.map { |key, value| [key, value].join(keyvaldelim) }
    rendered_entries.join(entrydelim)
  end
end
@@ -0,0 +1,52 @@
1
+ module Bio::PolyploidTools
2
+
3
# One chromosome arm, backed by a FASTA file handled through
# Bio::DB::Fasta::FastaFile.
class ChromosomeArm
  attr_accessor :name
  attr_reader :genes
  attr_reader :loaded_entries
  attr_reader :fasta_db

  # name::          identifier of the arm.
  # path_to_fasta:: path handed to Bio::DB::Fasta::FastaFile.new.
  def initialize(name, path_to_fasta)
    @name = name
    @fasta_db = Bio::DB::Fasta::FastaFile.new(path_to_fasta)
    @genes = Hash.new
  end

  # Fetches the full sequence of a contig.
  # The fai entries are loaded on the first call only.
  def fetch_contig(contig_id)
    unless @loaded_entries
      @fasta_db.load_fai_entries
      @loaded_entries = true
    end
    entry = fasta_db.index.region_for_entry(contig_id)
    @fasta_db.fetch_sequence(entry.get_full_region)
  end

  # Loads all the chromosome arms in a folder.
  # The current version requires that all the references end with .fa, and start with XXX_*.fa
  # Where XXX is the chromosome name
  #
  # Files matching *.fa whose name does not start with a digit followed by
  # word characters are skipped with a message on stderr.
  #
  # NOTE(review): \w also matches '_', so the captured name runs up to the
  # first non-word character (e.g. the '.' of the extension) - confirm this
  # is the intended arm name.
  #
  # Returns a Hash of arm name => ChromosomeArm.
  def self.load_from_folder(path_to_contigs)
    chromosome_arms = Hash.new

    Dir.foreach(path_to_contigs) do |filename|
      next unless File.fnmatch("*.fa", filename)

      parsed = /^(?<arm>\d\w+)/.match(filename)
      unless parsed
        $stderr.puts "Skipping #{filename}: cannot extract a chromosome arm name"
        next
      end

      target = "#{path_to_contigs}/#{filename}"
      arm = ChromosomeArm.new(parsed[:arm], target)
      chromosome_arms[arm.name] = arm
    end
    chromosome_arms
  end

end
51
+
52
+ end
@@ -0,0 +1,194 @@
1
+ #puts "Loading ExonCointainer..."
2
+ module Bio::PolyploidTools
3
# Groups the gene models, chromosome references, parental alignments and
# SNPs needed to print aligned sequences and Primer3 input.
class ExonContainer
  attr_reader :parental_1_sam, :parental_2_sam
  attr_reader :parental_1_name, :parental_2_name, :gene_models_db
  # No reader is generated for :chromosomes: the one-argument
  # #chromosomes method below has always replaced it.
  attr_reader :snp_map
  attr_reader :parents
  attr_accessor :flanking_size

  BASES = [:A, :C, :G, :T]

  def initialize
    @parents = Hash.new
    @snp_map = Hash.new
  end

  #Sets the reference file for the gene models
  def gene_models(path)
    @gene_models_db = Bio::DB::Fasta::FastaFile.new(path)
    @gene_models_path = path
  end

  #Returns the sequence for a region in the gene models (exon)
  def gene_model_sequence(region)
    @gene_models_db.fetch_sequence(region)
  end

  #Sets the reference file for the chromosomes
  def chromosomes(path)
    @chromosomes_db = Bio::DB::Fasta::FastaFile.new(path)
    @chromosomes_path = path
  end

  #Returns the sequence for a region in the chromosomes reference.
  #When the region starts below 0 its start is reset to 0 (the region
  #object is modified) and the result is left-padded with '-'.
  def chromosome_sequence(region)
    left_pad = 0
    #TODO: Padd if it goes to the right
    if region.start < 0
      left_pad = (region.start * -1) + 1
      region.start = 0
    end
    ("-" * left_pad) << @chromosomes_db.fetch_sequence(region)
  end

  # Registers the FASTA file of one chromosome arm.
  #
  # opts[:name]::           key under which the arm is stored.
  # opts[:alig_path]::      path to the FASTA file.
  # opts[:reference_path]:: used when :alig_path is not given.
  def add_chromosome_arm(opts)
    # Use the instance variable: #chromosomes requires a path argument and
    # cannot be used as a reader.
    @chromosomes ||= Hash.new
    name = opts[:name]
    path = opts[:alig_path] || opts[:reference_path]
    @chromosomes[name] = Bio::DB::Fasta::FastaFile.new(path)
  end

  # Stores +snp+ in snp_map under its gene.
  def add_snp(snp)
    (@snp_map[snp.gene] ||= Array.new) << snp
  end

  # Parses one SNP per line of +filename+ and stores those with a
  # position greater than 0, tagging each with the container,
  # +chromosome+, +snp_in+ and +original_name+.
  def add_snp_file(filename, chromosome, snp_in, original_name)
    File.open(filename) do |f|
      f.each_line do |line|
        snp = SNP.parse(line)
        snp.flanking_size = flanking_size
        next unless snp.position > 0

        snp.container = self
        snp.chromosome = chromosome
        snp.snp_in = snp_in
        snp.original_name = original_name
        (@snp_map[snp.gene] ||= Array.new) << snp
      end
    end
  end

  # Placeholder: queries the SNP and prints an empty line.
  def primer_3_input_for_snp(snp)
    gene_region = snp.covered_region
    local_pos_in_gene = snp.local_position
    puts ""
  end

  # Builds (and also prints to stdout) a FASTA string with one record per
  # parental consensus followed by one record per exon of the SNP. The
  # character at the local SNP position is upcased in every sequence.
  def fasta_string_for_snp(snp)
    gene_region = snp.covered_region
    local_pos_in_gene = snp.local_position
    ret_str = ""
    @parents.each do |name, bam|
      ret_str << ">#{gene_region.id}_SNP-#{snp.position}_#{name} Overlapping_exons:#{gene_region.to_s} localSNPpo:#{local_pos_in_gene+1}\n"
      to_print = bam.consensus_with_ambiguities({:region=>gene_region}).to_s
      to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
      ret_str << to_print << "\n"
    end

    snp.exon_list.each do |chromosome, exon|
      target_region = exon.target_region
      exon_start_offset = exon.query_region.start - gene_region.start
      chr_local_pos = local_pos_in_gene + target_region.start + 1
      ret_str << ">#{chromosome}_SNP-#{chr_local_pos} #{exon.to_s} #{target_region.orientation}\n"
      # Pad so the exon lines up with the gene region.
      to_print = "-" * exon_start_offset
      to_print << chromosome_sequence(exon.target_region).to_s
      to_print[local_pos_in_gene] = to_print[local_pos_in_gene].upcase
      # Terminate the record so the next header starts on its own line.
      ret_str << to_print << "\n"
    end
    puts ret_str
    ret_str
  end

  # Writes the aligned sequences of every SNP to +file+. SNPs that raise
  # are remembered in @missing_exons and otherwise ignored.
  def print_fasta_snp_exones (file)
    @missing_exons ||= Set.new
    @snp_map.each do |gene, snp_array|
      snp_array.each do |snp|
        begin
          file.puts snp.aligned_sequences_fasta
        rescue Exception
          # NOTE(review): this also swallows Interrupt/SystemExit; kept
          # because Exception is raised directly elsewhere in this code base.
          @missing_exons << snp.to_s
        end
      end
    end
  end

  # Writes the Primer3 input of every SNP to +file+ (empty strings are
  # skipped). SNPs that raise are remembered in @missing_exons.
  def print_primer_3_exons (file, target_chromosome , parental )
    # May be the first print method called, so make sure the set exists.
    @missing_exons ||= Set.new
    @snp_map.each do |gene, snp_array|
      snp_array.each do |snp|
        begin
          string = snp.primer_3_string( snp.chromosome, parental )
          file.puts string if string.size > 0
        rescue Exception
          # NOTE(review): see print_fasta_snp_exones about rescuing Exception.
          @missing_exons << snp.to_s
        end
      end
    end
  end

  # Reads exonerate alignments and attaches the matching exon to every
  # stored SNP whose position falls inside the aligned query range.
  #
  # opts[:exonerate_file]:: file with one alignment per line.
  # opts[:min_identity]::   minimum identity to accept a record (default 90).
  # opts[:arm_selection]::  callable mapping a target id to the arm name;
  #                         defaults to the first three characters.
  def add_alignments(opts=Hash.new)
    opts = { :min_identity=>90 }.merge!(opts)
    exonerate_filename = opts[:exonerate_file]
    arm_selection = opts[:arm_selection] || lambda { |contig_name| contig_name[0,3] }

    File.open(exonerate_filename) do |f|
      f.each_line do |line|
        record = Bio::DB::Exonerate::Alignment.parse_custom(line)
        next unless record && record.identity >= opts[:min_identity]

        snp_array = @snp_map[record.query_id]
        next if snp_array.nil?

        snp_array.each do |snp|
          next if snp.nil?
          next unless snp.position.between?( (record.query_start + 1) , record.query_end)
          begin
            exon = record.exon_on_gene_position(snp.position)
            snp.add_exon(exon, arm_selection.call(record.target_id))
          rescue Bio::DB::Exonerate::ExonerateException
            $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
          end
        end
      end
    end
  end

  # Registers a parental line.
  #
  # opts[:name]:: key for the parent. Defaults to the file name of
  #               opts[:path] without ".bam", or "Unknown" without a path.
  # opts[:path]:: BAM file; when given a Bio::DB::Sam is created for it,
  #               otherwise nil is stored.
  def add_parental(opts=Hash.new)
    sam = nil
    name = opts[:name] || "Unknown"
    if opts[:path]
      path = opts[:path]
      unless opts[:name]
        # A plain String has no #basename; keep using the object's own
        # method when it provides one.
        name = path.respond_to?(:basename) ? path.basename(".bam") : File.basename(path, ".bam")
      end
      sam = Bio::DB::Sam.new({:fasta=>@gene_models_path, :bam=>opts[:path]})
    end
    @parents[name] = sam
  end
end
193
+
194
+ end
@@ -0,0 +1,175 @@
1
+ module Bio::PolyploidTools
2
# One row of a marker map in the column order
# INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE[,CONTIG]
class Marker
  include Comparable
  #include Virgola
  attr_reader :template_sequence, :original, :snp
  attr_accessor :best_hit
  attr_accessor :index_90k
  attr_accessor :snp_id
  attr_accessor :snp_name
  attr_accessor :chr
  attr_accessor :coordinates_chr
  attr_accessor :map_order
  attr_accessor :chr_arm
  attr_accessor :distance_cm
  attr_accessor :sequence
  attr_writer :contig

  #after_map :parse_sequence_snp

  # FASTA record with the SNP name as header and the template sequence as body.
  def to_fasta
    ">#{self.snp_name}\n#{self.template_sequence}"
  end

  # Contig of the marker: the target of best_hit when there is one
  # (which also replaces the stored value), otherwise the stored value.
  def contig
    @contig = best_hit.target_id.chomp if best_hit
    @contig
  end

  # The marker as a CSV row, in the same column order it was parsed from.
  def to_csv
    "#{index_90k},#{snp_id},#{snp_name},#{chr},#{coordinates_chr},#{map_order},#{chr_arm},#{distance_cm},#{sequence},#{contig}"
  end

  # Markers with the same name are equal. Otherwise they are ordered by
  # chromosome arm, then by coordinate, with the name breaking ties.
  def <=>(anOter)
    return 0 if anOter.snp_name == @snp_name
    return @chr_arm <=> anOter.chr_arm if anOter.chr_arm != @chr_arm
    return @snp_name <=> anOter.snp_name if anOter.coordinates_chr == @coordinates_chr
    return @coordinates_chr <=> anOter.coordinates_chr
  end

  # line:: one comma-separated row; a trailing newline is ignored.
  def initialize(line)
    # Work on a copy so the caller's string is left untouched
    # (and frozen strings are accepted).
    line = line.chomp
    @template_sequence = nil
    #INDEX_90K,SNP_ID,SNP_NAME,CHR,COORDINATES_CHR,MAP_ORDER,CHR_ARM,DISTANCE_CM,SEQUENCE
    @index_90k, @snp_id, @snp_name, @chr, @coordinates_chr, @map_order, @chr_arm, @distance_cm, @sequence, @contig = line.split(',')
    parse_sequence_snp
  end

  # Yields a Marker for every line of +filename+ whose sequence contains a
  # SNP written as [X/Y]; other lines are skipped.
  def self.parse(filename)
    # File.read closes the file before the lines are iterated.
    File.read(filename).each_line do |line|
      m = Marker.new(line)
      yield m if m.template_sequence
    end
  end

  protected

  # Upcases the chromosome and, when the sequence holds a SNP written as
  # [X/Y] (X and Y in ACGT), records its 1-based position and alleles and
  # builds the template sequence with the SNP replaced by its IUPAC code.
  #
  # Returns the template sequence, or nil when no SNP is found.
  def parse_sequence_snp
    # Rows with fewer than four columns (e.g. blank lines) have no chromosome.
    @chr.upcase! if @chr
    match_data = /(?<pre>\w*)\[(?<org>[ACGT])\/(?<snp>[ACGT])\](?<pos>\w*)/.match(sequence.to_s)
    return nil unless match_data

    @position = match_data[:pre].size + 1
    @original = match_data[:org]
    @snp = match_data[:snp]
    amb_base = Bio::NucleicAcid.to_IUAPC("#{@original}#{@snp}")
    @template_sequence = "#{match_data[:pre]}#{amb_base}#{match_data[:pos]}"
  end
end
75
+
76
+
77
# Markers of one chromosome arm, keyed by marker name, together with the
# FASTA references they are aligned against.
# The map has to come sorted.
class ArmMap
  # No readers are generated for :global_reference and :reference: the
  # one-argument methods of the same name below have always replaced them.
  attr_reader :markers
  attr_accessor :chromosome

  def initialize
    @markers = Hash.new
  end

  # Aligns the marker FASTA written by print_fasta_markers against the
  # reference with blat and keeps the highest scoring hit of each marker.
  # Hits whose query id is not a known marker are ignored.
  #
  # output:: file where blat writes its result.
  def align_markers(output)
    Bio::Blat.align(@reference.fasta_path, @fasta_markers, output) do |hit|
      marker = markers[hit.query_id]
      next if marker.nil?

      marker.best_hit = hit if marker.best_hit.nil? || hit.score > marker.best_hit.score
    end
  end

  # Writes to +contigs_file+ one FASTA record per distinct contig hit by a
  # marker, fetching each full sequence from the reference.
  def print_fasta_contigs_for_markers(contigs_file)
    contigs = Set.new
    markers.each_value do |marker|
      contigs << marker.best_hit.target_id if marker.best_hit
    end

    # Block form so the file is closed even if fetching a sequence fails.
    File.open(contigs_file, "w") do |fasta|
      contigs.each do |contig_id|
        reg = @reference.index.region_for_entry(contig_id)
        fasta.puts ">#{contig_id}\n#{@reference.fetch_sequence(reg.get_full_region)}"
      end
    end
  end

  # Writes every marker as FASTA to +filename+ and remembers the path for
  # align_markers.
  def print_fasta_markers(filename)
    @fasta_markers = filename
    File.open(filename, "w") do |fasta|
      markers.each_value { |marker| fasta.puts marker.to_fasta }
    end
  end

  # Opens +reference+ as the global FASTA reference and loads its fai entries.
  def global_reference(reference)
    @global_reference = Bio::DB::Fasta::FastaFile.new(reference)
    @global_reference.load_fai_entries
  end

  # Opens +reference+ as the arm FASTA reference and loads its fai entries.
  def reference(reference)
    @reference = Bio::DB::Fasta::FastaFile.new(reference)
    @reference.load_fai_entries
  end

  # Makes +filename+ the arm reference. When the file already exists it is
  # loaded as is; otherwise it is first created with the entries of the
  # global reference whose name maps to this map's chromosome.
  def print_fasta_contigs_from_reference(filename)
    if File.exist?(filename)
      reference(filename)
      return
    end

    File.open(filename, "w") do |fasta|
      Bio::FlatFile.auto( @global_reference.fasta_path) do |ff|
        ff.each do |f|
          fasta.puts f.entry if arm_selection_embl(f.entry_id) == chromosome
        end
      end
    end
    reference(filename)
  end

  # Writes the markers as CSV to +filename+, ordered by map_order.
  #
  # NOTE(review): map_order is compared as stored; if it is still the text
  # read from the CSV, "10" sorts before "2" - confirm the intended order.
  def print_map_with_contigs(filename)
    File.open(filename, "w") do |file|
      markers.values.sort { |x,y| x.map_order <=> y.map_order }.each do |marker|
        file.puts marker.to_csv
      end
    end
  end

  protected

  # First two characters of the third '_'-separated field of a contig name.
  def arm_selection_embl(contig_name)
    contig_name.split('_')[2][0,2]
  end
end
175
+ end