bio-polyploid-tools 0.7.3 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (66) hide show
  1. checksums.yaml +5 -5
  2. data/.travis.yml +17 -0
  3. data/Gemfile +10 -7
  4. data/README.md +44 -0
  5. data/Rakefile +14 -14
  6. data/VERSION +1 -1
  7. data/bin/bfr.rb +2 -2
  8. data/bin/blast_triads.rb +166 -0
  9. data/bin/blast_triads_promoters.rb +192 -0
  10. data/bin/find_homoeologue_variations.rb +385 -0
  11. data/bin/get_longest_hsp_blastx_triads.rb +66 -0
  12. data/bin/hexaploid_primers.rb +2 -2
  13. data/bin/homokaryot_primers.rb +2 -2
  14. data/bin/mafft_triads.rb +120 -0
  15. data/bin/mafft_triads_promoters.rb +403 -0
  16. data/bin/polymarker.rb +73 -17
  17. data/bin/polymarker_capillary.rb +416 -0
  18. data/bin/snp_position_to_polymarker.rb +5 -3
  19. data/bin/snps_between_bams.rb +0 -29
  20. data/bin/vcfLineToTable.rb +56 -0
  21. data/bio-polyploid-tools.gemspec +74 -32
  22. data/lib/bio/BFRTools.rb +1 -0
  23. data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
  24. data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
  25. data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
  26. data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
  27. data/lib/bio/PolyploidTools/SNP.rb +58 -18
  28. data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
  29. data/lib/bio/db/blast.rb +112 -0
  30. data/lib/bio/db/exonerate.rb +4 -5
  31. data/lib/bio/db/primer3.rb +83 -14
  32. data/test/data/BS00068396_51_blast.tab +4 -0
  33. data/test/data/BS00068396_51_contigs.nhr +0 -0
  34. data/test/data/BS00068396_51_contigs.nin +0 -0
  35. data/test/data/BS00068396_51_contigs.nsq +0 -0
  36. data/test/data/BS00068396_51_for_polymarker.fa +1 -0
  37. data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
  38. data/test/data/S22380157.vcf +67 -0
  39. data/test/data/S58861868/LIB1716.bam +0 -0
  40. data/test/data/S58861868/LIB1716.sam +651 -0
  41. data/test/data/S58861868/LIB1719.bam +0 -0
  42. data/test/data/S58861868/LIB1719.sam +805 -0
  43. data/test/data/S58861868/LIB1721.bam +0 -0
  44. data/test/data/S58861868/LIB1721.sam +1790 -0
  45. data/test/data/S58861868/LIB1722.bam +0 -0
  46. data/test/data/S58861868/LIB1722.sam +1271 -0
  47. data/test/data/S58861868/S58861868.fa +16 -0
  48. data/test/data/S58861868/S58861868.fa.fai +1 -0
  49. data/test/data/S58861868/S58861868.vcf +76 -0
  50. data/test/data/S58861868/header.txt +9 -0
  51. data/test/data/S58861868/merged.bam +0 -0
  52. data/test/data/S58861868/merged_reheader.bam +0 -0
  53. data/test/data/S58861868/merged_reheader.bam.bai +0 -0
  54. data/test/data/bfr_out_test.csv +5 -5
  55. data/test/data/headerMergeed.txt +9 -0
  56. data/test/data/headerS2238015 +1 -0
  57. data/test/data/mergedLibs.bam +0 -0
  58. data/test/data/mergedLibsReheader.bam +0 -0
  59. data/test/data/mergedLibsSorted.bam +0 -0
  60. data/test/data/mergedLibsSorted.bam.bai +0 -0
  61. data/test/test_bfr.rb +26 -34
  62. data/test/test_blast.rb +47 -0
  63. data/test/test_exonearate.rb +4 -9
  64. data/test/test_snp_parsing.rb +42 -22
  65. metadata +81 -20
  66. data/Gemfile.lock +0 -67
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: adcaebc142757300631df98a5f672cae5dc76cb7
4
- data.tar.gz: 05e3acefabb42d5ea3c84236f4b79ebf338c6306
2
+ SHA256:
3
+ metadata.gz: '08be9c740b45561cf8de023e6ca63bb6be4ae63e6f89bd1eb4b149da9cf47334'
4
+ data.tar.gz: 94aa0d62f15ad380a35fe2c4bbcd870f2cb984f04c76aa825084b9ab97431d8b
5
5
  SHA512:
6
- metadata.gz: c1fa0e9e177bad1633fc5c4d3fd3f3fe2c2b4fc9915dfeeea943d7559d8850061f9437f703a7fa462163eb7f52431b747216d540ed73e64559ca20cfc7fe471b
7
- data.tar.gz: 4db9d4d8b404378b39af64978d82671d94b5babc42ea4542c9f76b880793c23a5283945dffc4d76d2ba68761c3d30e1b434f867513e8a71c3f60f61b3885cf58
6
+ metadata.gz: 6f15740cb929555b6627eac53dc12b28d75c10709e271a23aef06935c11fb83bf99479afe68d8db5e5bac8d9ecc06c62ac8f17fc4e3066e8ae6de1094b3fb042
7
+ data.tar.gz: 7a8cee46ca1ecf4a6ed71b497005f32f851067667c59e36a6b91bea3e8153c9beee4a765866f0849ae0fe83378cc241372fde6368f6fddc11e426a0a12415c36
@@ -0,0 +1,17 @@
1
+ language: ruby
2
+ sudo: false
3
+ addons:
4
+ apt:
5
+ packages:
6
+ - zlib1g-dev
7
+ - libncurses5-dev
8
+ - libtinfo-dev
9
+ - exonerate
10
+ rvm:
11
+ - 2.1.10
12
+ - 2.2.5
13
+ - 2.3.5
14
+ - 2.4.2
15
+
16
+ before_install:
17
+ - export RUBYOPT="-W1"
data/Gemfile CHANGED
@@ -3,15 +3,18 @@ source "http://rubygems.org"
3
3
  # Example:
4
4
  # gem "activesupport", ">= 2.3.5"
5
5
 
6
- gem "bio", ">= 1.4.3"
7
- gem "bio-samtools", ">= 2.0.4"
8
- gem "rake"
9
- gem "jeweler"
6
+ gem "bio", ">= 1.5.1"
7
+ gem "bio-samtools", ">= 2.6.2"
8
+ #gem "rake"
10
9
 
11
10
  gem "systemu", ">=2.5.2"
12
11
 
13
12
  group :development do
14
- # gem "shoulda", ">= 0"
15
- # gem "shoulda-context"
16
- # gem "shoulda-matchers"
13
+ gem "shoulda", ">= 2.10"
14
+ gem 'test-unit'
15
+ if RUBY_VERSION.start_with?("2.1") or RUBY_VERSION.start_with?("2.2") or RUBY_VERSION.start_with?("2.0")
16
+ gem "jeweler", "= 2.0.1"
17
+ else
18
+ gem "juwelier"
19
+ end
17
20
  end
data/README.md CHANGED
@@ -52,6 +52,43 @@ Usage: polymarker.rb [options]
52
52
  -P, --primers_to_order If present, saves a file named primers_to_order which contains the KASP tails
53
53
  ```
54
54
 
55
+ ## Input formats
56
+
57
+ The following formats are used to define the marker sequences:
58
+
59
+ ### Marker list
60
+
61
+ If the option ```--marker_list FILE``` is used, the SNP and the flanking sequence are included in the file. The format contains 3 columns (the order is important):
62
+
63
+ * **snp_name** The ID of the marker. Must be unique.
64
+ * **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
65
+ * **sequence** The sequence flanking the SNP with the SNP highlighted in square brackets (```[]```) and the two alleles separated by a forward slash (```/```).
66
+
67
+ #### Example:
68
+
69
+ ```
70
+ BS00068396_51,2A,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
71
+ ```
72
+
73
+ ### SNP list
74
+
75
+ If the flanking sequence is unknown, but the position on a reference is available, the option ```--snp_list``` can be used and the FASTA file with the reference sequence must be provided with the option ```--reference```. This is to allow the use of a different assembly or set of contigs used for the discovery of the SNPs that are different to the reference given in the option ```--contigs```. The format contains the following positional columns:
76
+
77
+ * **scaffold** The scaffold where the SNP is.
78
+ * **reference allele** The base in the reference (may or may not be the same as in the reference file).
79
+ * **position** Position of the SNP. The first base in the scaffold is base 1.
80
+ * **alternative allele** The base in the alternative allele.
81
+ * **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
82
+
83
+ #### Example
84
+
85
+ ```
86
+ IWGSC_CSS_1AL_scaff_110,C,519,A,2A
87
+ ```
88
+
89
+ This file format can be used with ```snp_position_to_polymarker.rb``` to produce the input for the option ```--marker_list```.
90
+
91
+
55
92
  ###Custom reference sequences.
56
93
  By default, the contigs and pseudomolecules from [ensembl](ftp://ftp.ensemblgenomes.org/pub/release-25/plants/fasta/triticum_aestivum/dna/Triticum_aestivum.IWGSC2.25.dna.genome.fa.gz
57
94
  ) are used. However, it is possible to use a custom reference. To define the chromosome where each contig belongs the argument ```arm_selection``` is used. The default uses ids like: ```IWGSC_CSS_1AL_scaff_110```, where the third field, separated by underscores is used. A simple way to add custom references is to rename the fasta file to follow that convention. Another way is to use the option ```--arm_selection arm_selection_first_two```, where only the first two characters in each contig are used as identifier, useful when pseudomolecules are named after the chromosomes (ie: ">1A" in the fasta file).
@@ -71,6 +108,13 @@ end
71
108
 
72
109
  The function should return a 2 character string, where the first character is the chromosome number and the second the chromosome group. The symbol in the hash is the name to be used in the argument ```--arm_selection```. If you want your parser to be added to the distribution, feel free to fork and make a pull request.
73
110
 
111
+ ## Using blast
112
+
113
+ To use blast instead of exonerate, use the following command:
114
+
115
+ ```
116
+ ./bin/polymarker.rb --contigs test/data/BS00068396_51_contigs.fa --marker_list test/data/BS00068396_51_for_polymarker.fa --aligner blast -a arm_selection_first_two
117
+ ```
74
118
 
75
119
 
76
120
  ##Release Notes
data/Rakefile CHANGED
@@ -12,16 +12,25 @@ begin
12
12
  end
13
13
  require 'rake'
14
14
 
15
- require 'jeweler'
16
15
 
17
- Jeweler::Tasks.new do |gem|
16
+ if RUBY_VERSION.start_with?("2.1") or RUBY_VERSION.start_with?("2.2") or RUBY_VERSION.start_with?("2.0")
17
+ require 'jeweler'
18
+ @taskClass = Jeweler
19
+ else
20
+ require 'juwelier'
21
+ @taskClass = Juwelier
22
+ end
23
+
24
+
25
+
26
+ @taskClass::Tasks.new do |gem|
18
27
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
19
28
  gem.name = "bio-polyploid-tools"
20
29
  gem.homepage = "http://github.com/tgac/bioruby-polyploid-tools"
21
30
  gem.license = "MIT"
22
31
  gem.summary = %Q{Tool to work with polyploids, NGS and molecular biology}
23
- gem.description = %Q{Repository of tools developed in TGAC and Crop Genetics in JIC to work with polyploid wheat}
24
- gem.email = "ricardo.ramirez-gonzalez@tgac.ac.uk"
32
+ gem.description = %Q{Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat}
33
+ gem.email = "ricardo.ramirez-gonzalez@jic.ac.uk"
25
34
  gem.authors = ["Ricardo H. Ramirez-Gonzalez"]
26
35
  # Include your dependencies below. Runtime dependencies are required when using your gem,
27
36
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
@@ -29,7 +38,7 @@ Jeweler::Tasks.new do |gem|
29
38
  # gem.add_development_dependency 'rspec', '> 1.2.3'
30
39
  # gem.extensions = "ext/mkrf_conf.rb"
31
40
  end
32
- Jeweler::RubygemsDotOrgTasks.new
41
+ @taskClass::RubygemsDotOrgTasks.new
33
42
 
34
43
  require 'rake/testtask'
35
44
  Rake::TestTask.new(:test) do |test|
@@ -50,12 +59,3 @@ end
50
59
 
51
60
  task :default => :test
52
61
 
53
- #require 'rdoc/task'
54
- ##RDoc::Task.new do |rdoc|
55
- # version = File.exist?('VERSION') ? File.read('VERSION') : ""
56
-
57
- # rdoc.rdoc_dir = 'rdoc'
58
- # rdoc.title = "bio-samtools #{version}"
59
- # rdoc.rdoc_files.include('README*')
60
- # rdoc.rdoc_files.include('lib/**/*.rb')
61
- #end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.7.3
1
+ 0.8.0
data/bin/bfr.rb CHANGED
@@ -50,11 +50,11 @@ OptionParser.new do |opts|
50
50
  options[:bulk_2] = o
51
51
  end
52
52
 
53
- opts.on("-m", "--chunk_size FILE", "Sorted BAM file with the alginments from bulk1 2 (corresponding to the phenotype of parental 2)") do |o|
53
+ opts.on("-m", "--chunk_size FILE", "Number of chunks to divde the SNP calling. Useful to run in a cluster.") do |o|
54
54
  options[:chunk_size] = o.to_i
55
55
  end
56
56
 
57
- opts.on("-n", "--chunk FILE", "Sorted BAM file with the alginments from bulk1 2 (corresponding to the phenotype of parental 2)") do |o|
57
+ opts.on("-n", "--chunk FILE", "Chunk number. Must be less than chunk_size. ") do |o|
58
58
  options[:chunk] = o.to_i
59
59
  end
60
60
 
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
+ options = {}
11
+ options[:identity] = 50
12
+ options[:min_bases] = 200
13
+ options[:split_token] = "-"
14
+ options[:tmp_folder] = Dir.mktmpdir
15
+ options[:program] = "blastn"
16
+ options[:random_sample] = 0
17
+
18
+ OptionParser.new do |opts|
19
+
20
+ opts.banner = "Usage: filter_blat.rb [options]"
21
+
22
+ opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
23
+ options[:identity] = o.to_f
24
+ end
25
+ opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
26
+ options[:min_bases] = o.to_i
27
+ end
28
+
29
+ opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
30
+ options[:triads] = o
31
+ end
32
+
33
+ opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
34
+ options[:fasta] = o
35
+ end
36
+
37
+ opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
38
+ options[:split_token] = o
39
+ end
40
+
41
+ opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
42
+ options[:program] = o
43
+ end
44
+
45
+ opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
46
+ options[:random_sample] = o.to_i
47
+ end
48
+
49
+
50
+ end.parse!
51
+
52
+
53
+ def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
54
+ cmd = "#{program} -query #{path_a} -subject #{path_b} -task #{program} -out #{out_path} -outfmt '5' "
55
+ #puts cmd
56
+ executed = system cmd
57
+ result = []
58
+ blast_version = nil
59
+ n = Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum
60
+ longest = nil
61
+ max_length = 0
62
+ max_pident = 0.0
63
+ max_similarity = 0.0
64
+ n.each do | iter |
65
+ iter.each do | hit |
66
+ align_len = 0
67
+ identity = 0.0
68
+ positives = 0.0
69
+ hit.each do | hsp |
70
+ align_len += hsp.align_len
71
+ identity += hsp.identity
72
+ positives += hsp.positive if program == "blastp"
73
+ end
74
+ if align_len > max_length
75
+ max_length = align_len
76
+ max_pident = 100 * identity / align_len
77
+ max_similarity = 100 * positives / align_len
78
+ end
79
+ end
80
+ end
81
+ [max_length, max_pident, max_similarity]
82
+ end
83
+
84
+ valid_pairs_A_B = Hash.new
85
+ valid_pairs_A_D = Hash.new
86
+ valid_pairs_B_D = Hash.new
87
+
88
+ split_token = options[:split_token]
89
+
90
+ sequences = Hash.new
91
+ sequence_count=0
92
+ Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
93
+ fasta_file.each do |entry|
94
+ gene_name = entry.entry_id.split(split_token)[0]
95
+ sequences[gene_name] = entry unless sequences[gene_name]
96
+ sequences[gene_name] = entry if entry.length > sequences[gene_name].length
97
+ sequence_count += 1
98
+ end
99
+ end
100
+
101
+ $stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
102
+ #FileUtils.mkdir_p(options[:tmp_folder])
103
+ $stderr.puts "TMP dir: #{options[:tmp_folder]}"
104
+
105
+ a_tmp = options[:tmp_folder] + "/A.fa"
106
+ b_tmp = options[:tmp_folder] + "/B.fa"
107
+ d_tmp = options[:tmp_folder] + "/D.fa"
108
+ out_tmp = options[:tmp_folder] + "/out.blast"
109
+
110
+
111
+ puts [
112
+ "group_id" , "query" , "subject" ,
113
+ "chr_query", "chr_subject", "aln_type",
114
+ "length" , "pident" , "psimilarity" ].join("\t")
115
+
116
+ count_lines = File.foreach(options[:triads]).inject(0) {|c, line| c+1}
117
+
118
+ probability = options[:random_sample] / count_lines.to_f
119
+ probability = 1 if options[:random_sample] == 0
120
+ prng = Random.new
121
+ #puts probability
122
+
123
+ CSV.foreach(options[:triads], headers:true ) do |row|
124
+ a = row['A']
125
+ b = row['B']
126
+ d = row['D']
127
+ triad = row['group_id']
128
+
129
+ save = probability > prng.rand && probability < 1
130
+ run = probability == 1 || save
131
+ next unless run
132
+
133
+ seq_a = sequences[a]
134
+ seq_b = sequences[b]
135
+ seq_d = sequences[d]
136
+ File.open(a_tmp, 'w') {|f| f.write(seq_a) } if seq_a
137
+ File.open(b_tmp, 'w') {|f| f.write(seq_b) } if seq_b
138
+ File.open(d_tmp, 'w') {|f| f.write(seq_d) } if seq_d
139
+ save_folder = "random_sample/#{triad}"
140
+
141
+ if save
142
+ FileUtils.mkdir_p save_folder
143
+ FileUtils.cp(a_tmp, save_folder) if seq_a
144
+ FileUtils.cp(b_tmp, save_folder) if seq_b
145
+ FileUtils.cp(d_tmp, save_folder) if seq_d
146
+ end
147
+
148
+ if seq_a and seq_b
149
+ to_print = [triad, a, b , "A","B","A->B"]
150
+ to_print << blast_pair_fast(a_tmp, b_tmp, out_tmp, program:options[:program])
151
+ FileUtils.cp(out_tmp, "#{save_folder}/A_B.xml") if save
152
+ puts to_print.join("\t")
153
+ end
154
+ if seq_a and seq_d
155
+ to_print = [triad, a, b , "A","D","A->D"]
156
+ to_print << blast_pair_fast(a_tmp, d_tmp, out_tmp, program:options[:program])
157
+ puts to_print.join("\t")
158
+ FileUtils.cp(out_tmp, "#{save_folder}/A_D.xml") if save
159
+ end
160
+ if seq_b and seq_d
161
+ to_print = [triad, a, b , "B","D","B->D"]
162
+ to_print << blast_pair_fast(b_tmp, d_tmp, out_tmp, program:options[:program])
163
+ FileUtils.cp(out_tmp, "#{save_folder}/B_D.xml") if save
164
+ puts to_print.join("\t")
165
+ end
166
+ end
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+ require 'bio'
4
+ require 'csv'
5
+ require 'bio-blastxmlparser'
6
+ require 'fileutils'
7
+ require 'tmpdir'
8
+
9
+
10
+ options = {}
11
+ options[:identity] = 50
12
+ options[:min_bases] = 200
13
+ options[:split_token] = "-"
14
+ options[:tmp_folder] = Dir.mktmpdir
15
+ options[:program] = "blastn"
16
+ options[:random_sample] = 0
17
+ options[:cut_promoter_length] = 0
18
+ options[:reverse] = true
19
+
20
+ OptionParser.new do |opts|
21
+
22
+ opts.banner = "Usage: filter_blat.rb [options]"
23
+
24
+ opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
25
+ options[:identity] = o.to_f
26
+ end
27
+ opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
28
+ options[:min_bases] = o.to_i
29
+ end
30
+
31
+ opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
32
+ options[:triads] = o
33
+ end
34
+
35
+ opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
36
+ options[:fasta] = o
37
+ end
38
+
39
+ opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
40
+ options[:split_token] = o
41
+ end
42
+
43
+ opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
44
+ options[:program] = o
45
+ end
46
+
47
+ opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
48
+ options[:random_sample] = o.to_i
49
+ end
50
+
51
+ opts.on("-l", "--cut_promoter_length INT", "Bases to consider") do |o|
52
+ options[:cut_promoter_length] = o.to_i
53
+ end
54
+
55
+ opts.on("-v", "--reverse T|F", "Reverse the input bases") do |o|
56
+ if o == 'T'
57
+ options[:reverse] = true
58
+ elsif o == 'F'
59
+ options[:reverse] = false
60
+ else
61
+ $stderr.puts "Invalid option for reverse (should be T or F)"
62
+ exit -1
63
+ end
64
+ end
65
+ end.parse!
66
+
67
+
68
+ def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
69
+ cmd = "#{program} -query #{path_a} -subject #{path_b} -task #{program} -out #{out_path} -outfmt '5' "
70
+ #puts cmd
71
+ executed = system cmd
72
+ result = []
73
+ blast_version = nil
74
+ n = Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum
75
+ longest = nil
76
+ max_length = 0
77
+ max_pident = 0.0
78
+ n.each do | iter |
79
+ iter.each do | hit |
80
+ hit.each do | hsp |
81
+ if hsp.align_len > max_length
82
+ max_length = hsp.align_len
83
+ max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
84
+ end
85
+ end
86
+ end
87
+ end
88
+ [max_length, max_pident]
89
+ end
90
+
91
+ valid_pairs_A_B = Hash.new
92
+ valid_pairs_A_D = Hash.new
93
+ valid_pairs_B_D = Hash.new
94
+
95
+ split_token = options[:split_token]
96
+
97
+ sequences = Hash.new
98
+ sequence_count=0
99
+ Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
100
+ fasta_file.each do |entry|
101
+ gene_name = entry.entry_id.split(split_token)[0]
102
+ seq = entry.naseq
103
+ seq.reverse_complement! if options[:reverse]
104
+ seq = seq[0,options[:cut_promoter_length]] if options[:cut_promoter_length] > 0
105
+ entry.data = seq
106
+ sequences[gene_name] = entry unless sequences[gene_name]
107
+ sequences[gene_name] = entry if entry.length > sequences[gene_name].length
108
+ sequence_count += 1
109
+ end
110
+ end
111
+
112
+ $stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
113
+ #FileUtils.mkdir_p(options[:tmp_folder])
114
+ $stderr.puts "TMP dir: #{options[:tmp_folder]}"
115
+
116
+ a_tmp = options[:tmp_folder] + "/A.fa"
117
+ b_tmp = options[:tmp_folder] + "/B.fa"
118
+ d_tmp = options[:tmp_folder] + "/D.fa"
119
+ out_tmp = options[:tmp_folder] + "/out.blast"
120
+
121
+
122
+ puts [
123
+ "group_id" , "query" , "subject" ,
124
+ "chr_query", "chr_subject", "aln_type",
125
+ "length" , "pident" , "Ns_query", "Ns_subject", "Ns_total" ].join("\t")
126
+
127
+ count_lines = File.foreach(options[:triads]).inject(0) {|c, line| c+1}
128
+
129
+ probability = options[:random_sample] / count_lines.to_f
130
+ probability = 1 if options[:random_sample] == 0
131
+ prng = Random.new
132
+ #puts probability
133
+ prom_len = options[:cut_promoter_length]
134
+ CSV.foreach(options[:triads], headers:true ) do |row|
135
+ a = row['A']
136
+ b = row['B']
137
+ d = row['D']
138
+ triad = row['group_id'].to_i
139
+ triad_folder = triad/100
140
+
141
+ save = probability > prng.rand && probability < 1
142
+ run = probability == 1 || save
143
+ next unless run
144
+
145
+ seq_a = sequences[a]
146
+ seq_b = sequences[b]
147
+ seq_d = sequences[d]
148
+ File.open(a_tmp, 'w') {|f| f.write(seq_a) } if seq_a
149
+ File.open(b_tmp, 'w') {|f| f.write(seq_b) } if seq_b
150
+ File.open(d_tmp, 'w') {|f| f.write(seq_d) } if seq_d
151
+
152
+ ns_a = seq_a.seq.count('Nn') if seq_a
153
+ ns_b = seq_b.seq.count('Nn') if seq_b
154
+ ns_d = seq_d.seq.count('Nn') if seq_d
155
+
156
+ save_folder = "blast_alignments_#{prom_len}/#{triad_folder}/#{triad}"
157
+
158
+ #if save
159
+ FileUtils.mkdir_p save_folder
160
+ FileUtils.cp(a_tmp, save_folder) if seq_a
161
+ FileUtils.cp(b_tmp, save_folder) if seq_b
162
+ FileUtils.cp(d_tmp, save_folder) if seq_d
163
+ #end
164
+
165
+ if seq_a and seq_b
166
+ to_print = [triad, a, b , "A","B","A->B"]
167
+ to_print << blast_pair_fast(a_tmp, b_tmp, out_tmp, program:options[:program])
168
+ to_print << ns_a
169
+ to_print << ns_b
170
+ to_print << ns_a + ns_b
171
+ FileUtils.cp(out_tmp, "#{save_folder}/A_B.xml") #if save
172
+ puts to_print.join("\t")
173
+ end
174
+ if seq_a and seq_d
175
+ to_print = [triad, a, b , "A","D","A->D"]
176
+ to_print << blast_pair_fast(a_tmp, d_tmp, out_tmp, program:options[:program])
177
+ to_print << ns_a
178
+ to_print << ns_d
179
+ to_print << ns_a + ns_d
180
+ FileUtils.cp(out_tmp, "#{save_folder}/A_D.xml") #if save
181
+ puts to_print.join("\t")
182
+ end
183
+ if seq_b and seq_d
184
+ to_print = [triad, a, b , "B","D","B->D"]
185
+ to_print << blast_pair_fast(b_tmp, d_tmp, out_tmp, program:options[:program])
186
+ to_print << ns_b
187
+ to_print << ns_d
188
+ to_print << ns_b + ns_d
189
+ FileUtils.cp(out_tmp, "#{save_folder}/B_D.xml") #if save
190
+ puts to_print.join("\t")
191
+ end
192
+ end