bio-polyploid-tools 0.7.3 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +17 -0
- data/Gemfile +10 -7
- data/README.md +44 -0
- data/Rakefile +14 -14
- data/VERSION +1 -1
- data/bin/bfr.rb +2 -2
- data/bin/blast_triads.rb +166 -0
- data/bin/blast_triads_promoters.rb +192 -0
- data/bin/find_homoeologue_variations.rb +385 -0
- data/bin/get_longest_hsp_blastx_triads.rb +66 -0
- data/bin/hexaploid_primers.rb +2 -2
- data/bin/homokaryot_primers.rb +2 -2
- data/bin/mafft_triads.rb +120 -0
- data/bin/mafft_triads_promoters.rb +403 -0
- data/bin/polymarker.rb +73 -17
- data/bin/polymarker_capillary.rb +416 -0
- data/bin/snp_position_to_polymarker.rb +5 -3
- data/bin/snps_between_bams.rb +0 -29
- data/bin/vcfLineToTable.rb +56 -0
- data/bio-polyploid-tools.gemspec +74 -32
- data/lib/bio/BFRTools.rb +1 -0
- data/lib/bio/PolyploidTools/ChromosomeArm.rb +2 -6
- data/lib/bio/PolyploidTools/ExonContainer.rb +31 -8
- data/lib/bio/PolyploidTools/NoSNPSequence.rb +286 -0
- data/lib/bio/PolyploidTools/PrimerRegion.rb +9 -1
- data/lib/bio/PolyploidTools/SNP.rb +58 -18
- data/lib/bio/PolyploidTools/SNPMutant.rb +5 -3
- data/lib/bio/db/blast.rb +112 -0
- data/lib/bio/db/exonerate.rb +4 -5
- data/lib/bio/db/primer3.rb +83 -14
- data/test/data/BS00068396_51_blast.tab +4 -0
- data/test/data/BS00068396_51_contigs.nhr +0 -0
- data/test/data/BS00068396_51_contigs.nin +0 -0
- data/test/data/BS00068396_51_contigs.nsq +0 -0
- data/test/data/BS00068396_51_for_polymarker.fa +1 -0
- data/test/data/IWGSC_CSS_1AL_scaff_1455974_aln_contigs.fa.fai +11 -0
- data/test/data/S22380157.vcf +67 -0
- data/test/data/S58861868/LIB1716.bam +0 -0
- data/test/data/S58861868/LIB1716.sam +651 -0
- data/test/data/S58861868/LIB1719.bam +0 -0
- data/test/data/S58861868/LIB1719.sam +805 -0
- data/test/data/S58861868/LIB1721.bam +0 -0
- data/test/data/S58861868/LIB1721.sam +1790 -0
- data/test/data/S58861868/LIB1722.bam +0 -0
- data/test/data/S58861868/LIB1722.sam +1271 -0
- data/test/data/S58861868/S58861868.fa +16 -0
- data/test/data/S58861868/S58861868.fa.fai +1 -0
- data/test/data/S58861868/S58861868.vcf +76 -0
- data/test/data/S58861868/header.txt +9 -0
- data/test/data/S58861868/merged.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam +0 -0
- data/test/data/S58861868/merged_reheader.bam.bai +0 -0
- data/test/data/bfr_out_test.csv +5 -5
- data/test/data/headerMergeed.txt +9 -0
- data/test/data/headerS2238015 +1 -0
- data/test/data/mergedLibs.bam +0 -0
- data/test/data/mergedLibsReheader.bam +0 -0
- data/test/data/mergedLibsSorted.bam +0 -0
- data/test/data/mergedLibsSorted.bam.bai +0 -0
- data/test/test_bfr.rb +26 -34
- data/test/test_blast.rb +47 -0
- data/test/test_exonearate.rb +4 -9
- data/test/test_snp_parsing.rb +42 -22
- metadata +81 -20
- data/Gemfile.lock +0 -67
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: '08be9c740b45561cf8de023e6ca63bb6be4ae63e6f89bd1eb4b149da9cf47334'
|
4
|
+
data.tar.gz: 94aa0d62f15ad380a35fe2c4bbcd870f2cb984f04c76aa825084b9ab97431d8b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f15740cb929555b6627eac53dc12b28d75c10709e271a23aef06935c11fb83bf99479afe68d8db5e5bac8d9ecc06c62ac8f17fc4e3066e8ae6de1094b3fb042
|
7
|
+
data.tar.gz: 7a8cee46ca1ecf4a6ed71b497005f32f851067667c59e36a6b91bea3e8153c9beee4a765866f0849ae0fe83378cc241372fde6368f6fddc11e426a0a12415c36
|
data/.travis.yml
ADDED
data/Gemfile
CHANGED
@@ -3,15 +3,18 @@ source "http://rubygems.org"
|
|
3
3
|
# Example:
|
4
4
|
# gem "activesupport", ">= 2.3.5"
|
5
5
|
|
6
|
-
gem "bio", ">= 1.
|
7
|
-
gem "bio-samtools", ">= 2.
|
8
|
-
gem "rake"
|
9
|
-
gem "jeweler"
|
6
|
+
gem "bio", ">= 1.5.1"
|
7
|
+
gem "bio-samtools", ">= 2.6.2"
|
8
|
+
#gem "rake"
|
10
9
|
|
11
10
|
gem "systemu", ">=2.5.2"
|
12
11
|
|
13
12
|
group :development do
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
gem "shoulda", ">= 2.10"
|
14
|
+
gem 'test-unit'
|
15
|
+
if RUBY_VERSION.start_with?("2.1") or RUBY_VERSION.start_with?("2.2") or RUBY_VERSION.start_with?("2.0")
|
16
|
+
gem "jeweler", "= 2.0.1"
|
17
|
+
else
|
18
|
+
gem "juwelier"
|
19
|
+
end
|
17
20
|
end
|
data/README.md
CHANGED
@@ -52,6 +52,43 @@ Usage: polymarker.rb [options]
|
|
52
52
|
-P, --primers_to_order If present, saves a file named primers_to_order which contains the KASP tails
|
53
53
|
```
|
54
54
|
|
55
|
+
## Input formats
|
56
|
+
|
57
|
+
The following formats are used to define the marker sequences:
|
58
|
+
|
59
|
+
### Marker list
|
60
|
+
|
61
|
+
If the option ```--marker_list FILE``` is used, the SNP and the flanking sequence are included in the file. The format contains 3 columns (the order is important):
|
62
|
+
|
63
|
+
* **snp_name** The ID of the marker. Must be unique.
|
64
|
+
* **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
|
65
|
+
* **sequence** The sequence flanking the SNP with the SNP highlighted in square brackets (```[]```) and the two alleles separated by a forward slash (```/```).
|
66
|
+
|
67
|
+
#### Example:
|
68
|
+
|
69
|
+
```
|
70
|
+
BS00068396_51,2A,CGAAGCGATCCTACTACATTGCGTTCCTTTCCCACTCCCAGGTCCCCCTA[T/C]ATGCAGGATCTTGATTAGTCGTGTGAACAACTGAAATTTGAGCGCCACAA
|
71
|
+
```
|
72
|
+
|
73
|
+
### SNP list
|
74
|
+
|
75
|
+
If the flanking sequence is unknown, but the position on a reference is available, the option ```--snp_list``` can be used and the FASTA file with the reference sequence must be provided with the option ```--reference```. This is to allow the use of a different assembly or set of contigs used for the discovery of the SNPs that are different to the reference given in the option ```--contigs```. The format contains the following positional columns:
|
76
|
+
|
77
|
+
* **scaffold** The scaffold where the SNP is.
|
78
|
+
* **reference allele** The base in the reference (may or may not be the same as in the reference file).
|
79
|
+
* **position** Position of the SNP. The first base in the scaffold is base 1.
|
80
|
+
* **alternative allele** The base in the alternative allele.
|
81
|
+
* **target chromosome** for the specific primers. Must be in line with the chromosome selection criteria.
|
82
|
+
|
83
|
+
#### Example
|
84
|
+
|
85
|
+
```
|
86
|
+
IWGSC_CSS_1AL_scaff_110,C,519,A,2A
|
87
|
+
```
|
88
|
+
|
89
|
+
This file format can be used with ```snp_position_to_polymarker.rb``` to produce the input for the option ```--marker_list```.
|
90
|
+
|
91
|
+
|
55
92
|
### Custom reference sequences.
|
56
93
|
By default, the contigs and pseudomolecules from [ensembl](ftp://ftp.ensemblgenomes.org/pub/release-25/plants/fasta/triticum_aestivum/dna/Triticum_aestivum.IWGSC2.25.dna.genome.fa.gz
|
57
94
|
) are used. However, it is possible to use a custom reference. To define the chromosome to which each contig belongs, the argument ```arm_selection``` is used. The default uses ids like: ```IWGSC_CSS_1AL_scaff_110```, where the third field, separated by underscores, is used. A simple way to add custom references is to rename the fasta file to follow that convention. Another way is to use the option ```--arm_selection arm_selection_first_two```, where only the first two characters in each contig are used as the identifier, useful when pseudomolecules are named after the chromosomes (i.e. ">1A" in the fasta file).
|
@@ -71,6 +108,13 @@ end
|
|
71
108
|
|
72
109
|
The function should return a 2 character string, where the first character is the chromosome number and the second is the chromosome group. The symbol in the hash is the name to be used in the argument ```--arm_selection```. If you want your parser to be added to the distribution, feel free to fork and make a pull request.
|
73
110
|
|
111
|
+
## Using blast
|
112
|
+
|
113
|
+
To use blast instead of exonerate, use the following command:
|
114
|
+
|
115
|
+
```
|
116
|
+
./bin/polymarker.rb --contigs test/data/BS00068396_51_contigs.fa --marker_list test/data/BS00068396_51_for_polymarker.fa --aligner blast -a arm_selection_first_two
|
117
|
+
```
|
74
118
|
|
75
119
|
|
76
120
|
## Release Notes
|
data/Rakefile
CHANGED
@@ -12,16 +12,25 @@ begin
|
|
12
12
|
end
|
13
13
|
require 'rake'
|
14
14
|
|
15
|
-
require 'jeweler'
|
16
15
|
|
17
|
-
|
16
|
+
if RUBY_VERSION.start_with?("2.1") or RUBY_VERSION.start_with?("2.2") or RUBY_VERSION.start_with?("2.0")
|
17
|
+
require 'jeweler'
|
18
|
+
@taskClass = Jeweler
|
19
|
+
else
|
20
|
+
require 'juwelier'
|
21
|
+
@taskClass = Juwelier
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
|
26
|
+
@taskClass::Tasks.new do |gem|
|
18
27
|
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
19
28
|
gem.name = "bio-polyploid-tools"
|
20
29
|
gem.homepage = "http://github.com/tgac/bioruby-polyploid-tools"
|
21
30
|
gem.license = "MIT"
|
22
31
|
gem.summary = %Q{Tool to work with polyploids, NGS and molecular biology}
|
23
|
-
gem.description = %Q{Repository of tools developed
|
24
|
-
gem.email = "ricardo.ramirez-gonzalez@
|
32
|
+
gem.description = %Q{Repository of tools developed at Crop Genetics in JIC to work with polyploid wheat}
|
33
|
+
gem.email = "ricardo.ramirez-gonzalez@jic.ac.uk"
|
25
34
|
gem.authors = ["Ricardo H. Ramirez-Gonzalez"]
|
26
35
|
# Include your dependencies below. Runtime dependencies are required when using your gem,
|
27
36
|
# and development dependencies are only needed for development (ie running rake tasks, tests, etc)
|
@@ -29,7 +38,7 @@ Jeweler::Tasks.new do |gem|
|
|
29
38
|
# gem.add_development_dependency 'rspec', '> 1.2.3'
|
30
39
|
# gem.extensions = "ext/mkrf_conf.rb"
|
31
40
|
end
|
32
|
-
|
41
|
+
@taskClass::RubygemsDotOrgTasks.new
|
33
42
|
|
34
43
|
require 'rake/testtask'
|
35
44
|
Rake::TestTask.new(:test) do |test|
|
@@ -50,12 +59,3 @@ end
|
|
50
59
|
|
51
60
|
task :default => :test
|
52
61
|
|
53
|
-
#require 'rdoc/task'
|
54
|
-
##RDoc::Task.new do |rdoc|
|
55
|
-
# version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
56
|
-
|
57
|
-
# rdoc.rdoc_dir = 'rdoc'
|
58
|
-
# rdoc.title = "bio-samtools #{version}"
|
59
|
-
# rdoc.rdoc_files.include('README*')
|
60
|
-
# rdoc.rdoc_files.include('lib/**/*.rb')
|
61
|
-
#end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.8.0
|
data/bin/bfr.rb
CHANGED
@@ -50,11 +50,11 @@ OptionParser.new do |opts|
|
|
50
50
|
options[:bulk_2] = o
|
51
51
|
end
|
52
52
|
|
53
|
-
opts.on("-m", "--chunk_size FILE", "
|
53
|
+
opts.on("-m", "--chunk_size FILE", "Number of chunks to divde the SNP calling. Useful to run in a cluster.") do |o|
|
54
54
|
options[:chunk_size] = o.to_i
|
55
55
|
end
|
56
56
|
|
57
|
-
opts.on("-n", "--chunk FILE", "
|
57
|
+
opts.on("-n", "--chunk FILE", "Chunk number. Must be less than chunk_size. ") do |o|
|
58
58
|
options[:chunk] = o.to_i
|
59
59
|
end
|
60
60
|
|
data/bin/blast_triads.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
options = {}
|
11
|
+
options[:identity] = 50
|
12
|
+
options[:min_bases] = 200
|
13
|
+
options[:split_token] = "-"
|
14
|
+
options[:tmp_folder] = Dir.mktmpdir
|
15
|
+
options[:program] = "blastn"
|
16
|
+
options[:random_sample] = 0
|
17
|
+
|
18
|
+
OptionParser.new do |opts|
|
19
|
+
|
20
|
+
opts.banner = "Usage: filter_blat.rb [options]"
|
21
|
+
|
22
|
+
opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
|
23
|
+
options[:identity] = o.to_f
|
24
|
+
end
|
25
|
+
opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
|
26
|
+
options[:min_bases] = o.to_i
|
27
|
+
end
|
28
|
+
|
29
|
+
opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
|
30
|
+
options[:triads] = o
|
31
|
+
end
|
32
|
+
|
33
|
+
opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
|
34
|
+
options[:fasta] = o
|
35
|
+
end
|
36
|
+
|
37
|
+
opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
|
38
|
+
options[:split_token] = o
|
39
|
+
end
|
40
|
+
|
41
|
+
opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
|
42
|
+
options[:program] = o
|
43
|
+
end
|
44
|
+
|
45
|
+
opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
|
46
|
+
options[:random_sample] = o.to_i
|
47
|
+
end
|
48
|
+
|
49
|
+
|
50
|
+
end.parse!
|
51
|
+
|
52
|
+
|
53
|
+
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
|
54
|
+
cmd = "#{program} -query #{path_a} -subject #{path_b} -task #{program} -out #{out_path} -outfmt '5' "
|
55
|
+
#puts cmd
|
56
|
+
executed = system cmd
|
57
|
+
result = []
|
58
|
+
blast_version = nil
|
59
|
+
n = Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum
|
60
|
+
longest = nil
|
61
|
+
max_length = 0
|
62
|
+
max_pident = 0.0
|
63
|
+
max_similarity = 0.0
|
64
|
+
n.each do | iter |
|
65
|
+
iter.each do | hit |
|
66
|
+
align_len = 0
|
67
|
+
identity = 0.0
|
68
|
+
positives = 0.0
|
69
|
+
hit.each do | hsp |
|
70
|
+
align_len += hsp.align_len
|
71
|
+
identity += hsp.identity
|
72
|
+
positives += hsp.positive if program == "blastp"
|
73
|
+
end
|
74
|
+
if align_len > max_length
|
75
|
+
max_length = align_len
|
76
|
+
max_pident = 100 * identity / align_len
|
77
|
+
max_similarity = 100 * positives / align_len
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
[max_length, max_pident, max_similarity]
|
82
|
+
end
|
83
|
+
|
84
|
+
valid_pairs_A_B = Hash.new
|
85
|
+
valid_pairs_A_D = Hash.new
|
86
|
+
valid_pairs_B_D = Hash.new
|
87
|
+
|
88
|
+
split_token = options[:split_token]
|
89
|
+
|
90
|
+
sequences = Hash.new
|
91
|
+
sequence_count=0
|
92
|
+
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
|
93
|
+
fasta_file.each do |entry|
|
94
|
+
gene_name = entry.entry_id.split(split_token)[0]
|
95
|
+
sequences[gene_name] = entry unless sequences[gene_name]
|
96
|
+
sequences[gene_name] = entry if entry.length > sequences[gene_name].length
|
97
|
+
sequence_count += 1
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
|
102
|
+
#FileUtils.mkdir_p(options[:tmp_folder])
|
103
|
+
$stderr.puts "TMP dir: #{options[:tmp_folder]}"
|
104
|
+
|
105
|
+
a_tmp = options[:tmp_folder] + "/A.fa"
|
106
|
+
b_tmp = options[:tmp_folder] + "/B.fa"
|
107
|
+
d_tmp = options[:tmp_folder] + "/D.fa"
|
108
|
+
out_tmp = options[:tmp_folder] + "/out.blast"
|
109
|
+
|
110
|
+
|
111
|
+
puts [
|
112
|
+
"group_id" , "query" , "subject" ,
|
113
|
+
"chr_query", "chr_subject", "aln_type",
|
114
|
+
"length" , "pident" , "psimilarity" ].join("\t")
|
115
|
+
|
116
|
+
count_lines = File.foreach(options[:triads]).inject(0) {|c, line| c+1}
|
117
|
+
|
118
|
+
probability = options[:random_sample] / count_lines.to_f
|
119
|
+
probability = 1 if options[:random_sample] == 0
|
120
|
+
prng = Random.new
|
121
|
+
#puts probability
|
122
|
+
|
123
|
+
CSV.foreach(options[:triads], headers:true ) do |row|
|
124
|
+
a = row['A']
|
125
|
+
b = row['B']
|
126
|
+
d = row['D']
|
127
|
+
triad = row['group_id']
|
128
|
+
|
129
|
+
save = probability > prng.rand && probability < 1
|
130
|
+
run = probability == 1 || save
|
131
|
+
next unless run
|
132
|
+
|
133
|
+
seq_a = sequences[a]
|
134
|
+
seq_b = sequences[b]
|
135
|
+
seq_d = sequences[d]
|
136
|
+
File.open(a_tmp, 'w') {|f| f.write(seq_a) } if seq_a
|
137
|
+
File.open(b_tmp, 'w') {|f| f.write(seq_b) } if seq_b
|
138
|
+
File.open(d_tmp, 'w') {|f| f.write(seq_d) } if seq_d
|
139
|
+
save_folder = "random_sample/#{triad}"
|
140
|
+
|
141
|
+
if save
|
142
|
+
FileUtils.mkdir_p save_folder
|
143
|
+
FileUtils.cp(a_tmp, save_folder) if seq_a
|
144
|
+
FileUtils.cp(b_tmp, save_folder) if seq_b
|
145
|
+
FileUtils.cp(d_tmp, save_folder) if seq_d
|
146
|
+
end
|
147
|
+
|
148
|
+
if seq_a and seq_b
|
149
|
+
to_print = [triad, a, b , "A","B","A->B"]
|
150
|
+
to_print << blast_pair_fast(a_tmp, b_tmp, out_tmp, program:options[:program])
|
151
|
+
FileUtils.cp(out_tmp, "#{save_folder}/A_B.xml") if save
|
152
|
+
puts to_print.join("\t")
|
153
|
+
end
|
154
|
+
if seq_a and seq_d
|
155
|
+
to_print = [triad, a, b , "A","D","A->D"]
|
156
|
+
to_print << blast_pair_fast(a_tmp, d_tmp, out_tmp, program:options[:program])
|
157
|
+
puts to_print.join("\t")
|
158
|
+
FileUtils.cp(out_tmp, "#{save_folder}/A_D.xml") if save
|
159
|
+
end
|
160
|
+
if seq_b and seq_d
|
161
|
+
to_print = [triad, a, b , "B","D","B->D"]
|
162
|
+
to_print << blast_pair_fast(b_tmp, d_tmp, out_tmp, program:options[:program])
|
163
|
+
FileUtils.cp(out_tmp, "#{save_folder}/B_D.xml") if save
|
164
|
+
puts to_print.join("\t")
|
165
|
+
end
|
166
|
+
end
|
@@ -0,0 +1,192 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'optparse'
|
3
|
+
require 'bio'
|
4
|
+
require 'csv'
|
5
|
+
require 'bio-blastxmlparser'
|
6
|
+
require 'fileutils'
|
7
|
+
require 'tmpdir'
|
8
|
+
|
9
|
+
|
10
|
+
options = {}
|
11
|
+
options[:identity] = 50
|
12
|
+
options[:min_bases] = 200
|
13
|
+
options[:split_token] = "-"
|
14
|
+
options[:tmp_folder] = Dir.mktmpdir
|
15
|
+
options[:program] = "blastn"
|
16
|
+
options[:random_sample] = 0
|
17
|
+
options[:cut_promoter_length] = 0
|
18
|
+
options[:reverse] = true
|
19
|
+
|
20
|
+
OptionParser.new do |opts|
|
21
|
+
|
22
|
+
opts.banner = "Usage: filter_blat.rb [options]"
|
23
|
+
|
24
|
+
opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
|
25
|
+
options[:identity] = o.to_f
|
26
|
+
end
|
27
|
+
opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
|
28
|
+
options[:min_bases] = o.to_i
|
29
|
+
end
|
30
|
+
|
31
|
+
opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
|
32
|
+
options[:triads] = o
|
33
|
+
end
|
34
|
+
|
35
|
+
opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
|
36
|
+
options[:fasta] = o
|
37
|
+
end
|
38
|
+
|
39
|
+
opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
|
40
|
+
options[:split_token] = o
|
41
|
+
end
|
42
|
+
|
43
|
+
opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
|
44
|
+
options[:program] = o
|
45
|
+
end
|
46
|
+
|
47
|
+
opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
|
48
|
+
options[:random_sample] = o.to_i
|
49
|
+
end
|
50
|
+
|
51
|
+
opts.on("-l", "--cut_promoter_length INT", "Bases to consider") do |o|
|
52
|
+
options[:cut_promoter_length] = o.to_i
|
53
|
+
end
|
54
|
+
|
55
|
+
opts.on("-v", "--reverse T|F", "Reverse the input bases") do |o|
|
56
|
+
if o == 'T'
|
57
|
+
options[:reverse] = true
|
58
|
+
elsif o == 'F'
|
59
|
+
options[:reverse] = false
|
60
|
+
else
|
61
|
+
$stderr.puts "Invalid option for reverse (should be T or F)"
|
62
|
+
exit -1
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end.parse!
|
66
|
+
|
67
|
+
|
68
|
+
def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
|
69
|
+
cmd = "#{program} -query #{path_a} -subject #{path_b} -task #{program} -out #{out_path} -outfmt '5' "
|
70
|
+
#puts cmd
|
71
|
+
executed = system cmd
|
72
|
+
result = []
|
73
|
+
blast_version = nil
|
74
|
+
n = Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum
|
75
|
+
longest = nil
|
76
|
+
max_length = 0
|
77
|
+
max_pident = 0.0
|
78
|
+
n.each do | iter |
|
79
|
+
iter.each do | hit |
|
80
|
+
hit.each do | hsp |
|
81
|
+
if hsp.align_len > max_length
|
82
|
+
max_length = hsp.align_len
|
83
|
+
max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
[max_length, max_pident]
|
89
|
+
end
|
90
|
+
|
91
|
+
valid_pairs_A_B = Hash.new
|
92
|
+
valid_pairs_A_D = Hash.new
|
93
|
+
valid_pairs_B_D = Hash.new
|
94
|
+
|
95
|
+
split_token = options[:split_token]
|
96
|
+
|
97
|
+
sequences = Hash.new
|
98
|
+
sequence_count=0
|
99
|
+
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
|
100
|
+
fasta_file.each do |entry|
|
101
|
+
gene_name = entry.entry_id.split(split_token)[0]
|
102
|
+
seq = entry.naseq
|
103
|
+
seq.reverse_complement! if options[:reverse]
|
104
|
+
seq = seq[0,options[:cut_promoter_length]] if options[:cut_promoter_length] > 0
|
105
|
+
entry.data = seq
|
106
|
+
sequences[gene_name] = entry unless sequences[gene_name]
|
107
|
+
sequences[gene_name] = entry if entry.length > sequences[gene_name].length
|
108
|
+
sequence_count += 1
|
109
|
+
end
|
110
|
+
end
|
111
|
+
|
112
|
+
$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
|
113
|
+
#FileUtils.mkdir_p(options[:tmp_folder])
|
114
|
+
$stderr.puts "TMP dir: #{options[:tmp_folder]}"
|
115
|
+
|
116
|
+
a_tmp = options[:tmp_folder] + "/A.fa"
|
117
|
+
b_tmp = options[:tmp_folder] + "/B.fa"
|
118
|
+
d_tmp = options[:tmp_folder] + "/D.fa"
|
119
|
+
out_tmp = options[:tmp_folder] + "/out.blast"
|
120
|
+
|
121
|
+
|
122
|
+
puts [
|
123
|
+
"group_id" , "query" , "subject" ,
|
124
|
+
"chr_query", "chr_subject", "aln_type",
|
125
|
+
"length" , "pident" , "Ns_query", "Ns_subject", "Ns_total" ].join("\t")
|
126
|
+
|
127
|
+
count_lines = File.foreach(options[:triads]).inject(0) {|c, line| c+1}
|
128
|
+
|
129
|
+
probability = options[:random_sample] / count_lines.to_f
|
130
|
+
probability = 1 if options[:random_sample] == 0
|
131
|
+
prng = Random.new
|
132
|
+
#puts probability
|
133
|
+
prom_len = options[:cut_promoter_length]
|
134
|
+
CSV.foreach(options[:triads], headers:true ) do |row|
|
135
|
+
a = row['A']
|
136
|
+
b = row['B']
|
137
|
+
d = row['D']
|
138
|
+
triad = row['group_id'].to_i
|
139
|
+
triad_folder = triad/100
|
140
|
+
|
141
|
+
save = probability > prng.rand && probability < 1
|
142
|
+
run = probability == 1 || save
|
143
|
+
next unless run
|
144
|
+
|
145
|
+
seq_a = sequences[a]
|
146
|
+
seq_b = sequences[b]
|
147
|
+
seq_d = sequences[d]
|
148
|
+
File.open(a_tmp, 'w') {|f| f.write(seq_a) } if seq_a
|
149
|
+
File.open(b_tmp, 'w') {|f| f.write(seq_b) } if seq_b
|
150
|
+
File.open(d_tmp, 'w') {|f| f.write(seq_d) } if seq_d
|
151
|
+
|
152
|
+
ns_a = seq_a.seq.count('Nn') if seq_a
|
153
|
+
ns_b = seq_b.seq.count('Nn') if seq_b
|
154
|
+
ns_d = seq_d.seq.count('Nn') if seq_d
|
155
|
+
|
156
|
+
save_folder = "blast_alignments_#{prom_len}/#{triad_folder}/#{triad}"
|
157
|
+
|
158
|
+
#if save
|
159
|
+
FileUtils.mkdir_p save_folder
|
160
|
+
FileUtils.cp(a_tmp, save_folder) if seq_a
|
161
|
+
FileUtils.cp(b_tmp, save_folder) if seq_b
|
162
|
+
FileUtils.cp(d_tmp, save_folder) if seq_d
|
163
|
+
#end
|
164
|
+
|
165
|
+
if seq_a and seq_b
|
166
|
+
to_print = [triad, a, b , "A","B","A->B"]
|
167
|
+
to_print << blast_pair_fast(a_tmp, b_tmp, out_tmp, program:options[:program])
|
168
|
+
to_print << ns_a
|
169
|
+
to_print << ns_b
|
170
|
+
to_print << ns_a + ns_b
|
171
|
+
FileUtils.cp(out_tmp, "#{save_folder}/A_B.xml") #if save
|
172
|
+
puts to_print.join("\t")
|
173
|
+
end
|
174
|
+
if seq_a and seq_d
|
175
|
+
to_print = [triad, a, b , "A","D","A->D"]
|
176
|
+
to_print << blast_pair_fast(a_tmp, d_tmp, out_tmp, program:options[:program])
|
177
|
+
to_print << ns_a
|
178
|
+
to_print << ns_d
|
179
|
+
to_print << ns_a + ns_d
|
180
|
+
FileUtils.cp(out_tmp, "#{save_folder}/A_D.xml") #if save
|
181
|
+
puts to_print.join("\t")
|
182
|
+
end
|
183
|
+
if seq_b and seq_d
|
184
|
+
to_print = [triad, a, b , "B","D","B->D"]
|
185
|
+
to_print << blast_pair_fast(b_tmp, d_tmp, out_tmp, program:options[:program])
|
186
|
+
to_print << ns_b
|
187
|
+
to_print << ns_d
|
188
|
+
to_print << ns_b + ns_d
|
189
|
+
FileUtils.cp(out_tmp, "#{save_folder}/B_D.xml") #if save
|
190
|
+
puts to_print.join("\t")
|
191
|
+
end
|
192
|
+
end
|