full_lengther_next 0.0.8 → 0.5.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
data/bin/full_lengther_next
CHANGED
@@ -2,11 +2,15 @@
|
|
2
2
|
|
3
3
|
# 12-2-2011 Noe Fernandez Pozo.
|
4
4
|
# Full-LengtherNEXT predicts if your sequences are complete, showing you the nucleotide sequences and the translated protein
|
5
|
+
ROOT_PATH=File.dirname(__FILE__)
|
6
|
+
$: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
|
5
7
|
|
6
|
-
#------------------------------------------------------------------ parameters entry
|
7
8
|
require 'optparse'
|
8
9
|
require 'socket'
|
9
10
|
|
11
|
+
###############################################################################################
|
12
|
+
# PARSE OPTIONS
|
13
|
+
###############################################################################################
|
10
14
|
options = {}
|
11
15
|
|
12
16
|
if !File.exists?('logs')
|
@@ -14,87 +18,147 @@ if !File.exists?('logs')
|
|
14
18
|
end
|
15
19
|
|
16
20
|
optparse = OptionParser.new do |opts|
|
17
|
-
|
21
|
+
|
22
|
+
options[:acess_db] = 'stnp'
|
23
|
+
opts.on( '-a', '--acess_db STRING', 'Select that databases is going to be used. s for Swissprot, t for trEMBL and n for ncrna, p for use Transdecoder and c for use testcode algothrim. By default is set to stnp' ) do |acess_db|
|
24
|
+
options[:acess_db] = acess_db
|
25
|
+
end
|
26
|
+
|
27
|
+
options[:blast] = ''
|
28
|
+
opts.on( '-b', '--blast STRING', 'Aditional options to blast execution' ) do |blast|
|
29
|
+
options[:blast] = blast
|
30
|
+
end
|
31
|
+
|
32
|
+
options[:chunk_size] = 200
|
33
|
+
opts.on( '-c', '--chunk_size SIZE', "Number of sequences processed in each block when parallelization is used. Default=200" ) do |s|
|
34
|
+
options[:chunk_size] = s.to_i
|
35
|
+
end
|
36
|
+
|
37
|
+
options[:est_db] = nil
|
38
|
+
opts.on( '-d', '--est_db FILE', "EST database for representative transcriptome\n" ) do |est_db|
|
39
|
+
options[:est_db] = est_db
|
40
|
+
if !File.exists?(options[:est_db])
|
41
|
+
puts "No valid path to EST database"
|
42
|
+
Process.exit(-1)
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
options[:exonerate] = TRUE
|
47
|
+
opts.on( '-e', '--exonerate', 'Disables exonerate analysis' ) do |exonerate|
|
48
|
+
options[:exonerate] = FALSE
|
49
|
+
end
|
50
|
+
|
18
51
|
options[:fasta] = nil
|
19
52
|
opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
|
20
53
|
options[:fasta] = file
|
21
54
|
end
|
22
|
-
|
55
|
+
|
23
56
|
options[:tax_group] = nil
|
24
|
-
opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n\t\
|
57
|
+
opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n#{"\t"*5}fungi\n#{"\t"*5}human\n#{"\t"*5}invertebrates\n#{"\t"*5}mammals\n#{"\t"*5}plants\n#{"\t"*5}rodents\n#{"\t"*5}vertebrates\n\n" ) do |tax_name|
|
25
58
|
options[:tax_group] = tax_name
|
26
59
|
end
|
27
|
-
|
28
|
-
options[:
|
29
|
-
opts.on( '-
|
30
|
-
options[:
|
60
|
+
|
61
|
+
options[:ident] = 45.00
|
62
|
+
opts.on( '-i', '--identity_percent IDENTITY', 'identity percent threshold to consider as reliable the sequence similarity. Default=45.00' ) do |ident|
|
63
|
+
options[:ident] = ident.to_f
|
31
64
|
end
|
32
|
-
|
33
|
-
# options[:verbose] = nil
|
34
|
-
# opts.on( '-v', '--verbose_mode', "verbose mode\n\n" ) do |verbose|
|
35
|
-
# options[:verbose] = verbose
|
36
|
-
# end
|
37
65
|
|
38
|
-
options[:
|
39
|
-
opts.on( '-
|
40
|
-
options[:
|
66
|
+
options[:high_clustering] = TRUE
|
67
|
+
opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default true' ) do
|
68
|
+
options[:high_clustering] = FALSE
|
41
69
|
end
|
42
70
|
|
43
|
-
options[:
|
44
|
-
opts.on( '-
|
45
|
-
options[:
|
71
|
+
options[:subject_coverage] = 0.25
|
72
|
+
opts.on( '-j', '--subject_coverage_percent FLOAT', "Subject coverage percentage threshold" ) do |j|
|
73
|
+
options[:subject_coverage] = j.to_f/100
|
74
|
+
end
|
75
|
+
|
76
|
+
options[:min_nucleotides] = 100
|
77
|
+
opts.on( '-n', '--min_nucleotides minLONG', "min nucleotides to consider a part of chimera like putative unigene. Default=100\n\n" ) do |min_nucleotides|
|
78
|
+
options[:min_nucleotides] = min_nucleotides.to_i
|
46
79
|
end
|
47
80
|
|
48
81
|
options[:distance] = 15
|
49
82
|
opts.on( '-m', '--max_distance maxDIST', "maximal distance between query and subject gene boundaries to be qualified as putative, the less distance the more strict. Default=15\n\n" ) do |distance|
|
50
83
|
options[:distance] = distance.to_i
|
51
84
|
end
|
52
|
-
|
53
|
-
options[:
|
54
|
-
opts.on( '-
|
85
|
+
|
86
|
+
options[:port] = 0 #50000
|
87
|
+
opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
|
88
|
+
options[:port] = port.to_i
|
89
|
+
end
|
90
|
+
|
91
|
+
options[:chimera] = 'rc'
|
92
|
+
opts.on( '-q', '--chimera_detection STRING', "d for deactivate chimera detection mode, s for search chimeras only, r for revise it and c for cut it. Default = rcs \n\n" ) do |chimera|
|
93
|
+
chimera.downcase!
|
55
94
|
options[:chimera] = chimera
|
56
95
|
end
|
57
96
|
|
97
|
+
options[:reptrans] = nil
|
98
|
+
opts.on( '-r', '--representative_transcriptome', "Generates a fasta file with the minime transcriptome\n" ) do |reptrans|
|
99
|
+
options[:reptrans] = reptrans
|
100
|
+
end
|
101
|
+
|
102
|
+
options[:server_ip] = '0.0.0.0'
|
103
|
+
opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
|
104
|
+
|
105
|
+
# get list of available ips
|
106
|
+
ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
|
107
|
+
ip=ip_list.select{|ip| ip.index(server_ip)==0}.first
|
108
|
+
|
109
|
+
if !ip
|
110
|
+
ip='0.0.0.0'
|
111
|
+
# $LOG.info("No available ip matching #{server_ip}")
|
112
|
+
end
|
113
|
+
# $ .info("Using ip #{ip}")
|
114
|
+
options[:server_ip] = ip
|
115
|
+
end
|
116
|
+
|
117
|
+
options[:ident_thresold] = 55.0
|
118
|
+
opts.on( '-t', '--identity_thresold FLOAT', "For chimeras only. Min identity to consider that two proteins are the same. Default=55.0\n\n" ) do |ident_thresold|
|
119
|
+
# Parse as Float: the option is declared FLOAT with a 55.0 default, and
# to_i would silently truncate a fractional threshold (e.g. 55.5 -> 55).
options[:ident_thresold] = ident_thresold.to_f
|
120
|
+
end
|
121
|
+
|
122
|
+
options[:user_db] = nil
|
123
|
+
opts.on( '-u', '--user_db UserDB', 'User blast+ database' ) do |db|
|
124
|
+
options[:user_db] = db
|
125
|
+
if !File.exists?(File.expand_path(options[:user_db])+'.psq')
|
126
|
+
puts "user database: #{options[:user_db]} was not found"
|
127
|
+
exit
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
options[:verbose] = 0
|
132
|
+
opts.on( '-v', '--verbose INTEGER', 'Show extra info' ) do |verbose|
|
133
|
+
options[:verbose] = verbose.to_i
|
134
|
+
end
|
135
|
+
|
58
136
|
options[:workers] = 2
|
59
137
|
opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
138
|
+
|
139
|
+
if File.exists?(workers)
|
140
|
+
# use workers file
|
141
|
+
options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
|
142
|
+
options[:workers].shift
|
143
|
+
elsif (workers.to_i > 0)
|
144
|
+
options[:workers] = workers.to_i
|
145
|
+
else
|
146
|
+
options[:workers] = 2
|
147
|
+
end
|
148
|
+
|
68
149
|
end
|
69
|
-
end
|
70
150
|
|
71
|
-
options[:
|
72
|
-
opts.on( '-
|
73
|
-
options[:
|
151
|
+
options[:training_ident] = 45.00
|
152
|
+
opts.on( '-x', '--training_identity_percent IDENTITY', 'identity percent threshold to use a complete sure sequence for Transdecoder training. Default=45.00' ) do |training_ident|
|
153
|
+
# Use this handler's own block parameter; `ident` belongs to the -i handler
# and is not in scope here, so the original raised NameError when -x was given.
options[:training_ident] = training_ident.to_f
|
74
154
|
end
|
75
|
-
|
76
|
-
options[:server_ip] = '0.0.0.0'
|
77
|
-
opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
|
78
155
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
if !ip
|
85
|
-
ip='0.0.0.0'
|
86
|
-
# $LOG.info("No available ip matching #{server_ip}")
|
87
|
-
end
|
88
|
-
# $ .info("Using ip #{ip}")
|
89
|
-
options[:server_ip] = ip
|
90
|
-
end
|
91
|
-
|
92
|
-
options[:port] = 0 #50000
|
93
|
-
opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
|
94
|
-
options[:port] = port.to_i
|
95
|
-
end
|
96
|
-
|
156
|
+
options[:hdd] = FALSE
|
157
|
+
opts.on( '-z', '--hdd', 'Write/use blast report on HDD' ) do |hdd|
|
158
|
+
options[:hdd] = TRUE
|
159
|
+
end
|
97
160
|
|
161
|
+
|
98
162
|
# Set a banner, displayed at the top of the help screen.
|
99
163
|
opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
|
100
164
|
|
@@ -108,35 +172,25 @@ end
|
|
108
172
|
|
109
173
|
# parse options and remove from ARGV
|
110
174
|
optparse.parse!
|
111
|
-
# @verbose = options[:verbose]
|
112
|
-
|
113
|
-
# if (!@verbose.nil?)
|
114
|
-
# puts "You have chosen the verbose mode:\n\nInput File:\t#{options[:fasta]}\nTaxon Group:\t#{options[:tax_group]}\nOwn Database:\t#{options[:user_db]}\nCPU Number:\t#{options[:workers]}"
|
115
|
-
# end
|
116
175
|
|
117
|
-
#----------------------------------------------------------------------- testing errors in parameters entry
|
118
176
|
if (options[:fasta].nil?) || (options[:tax_group].nil?)
|
119
|
-
|
120
|
-
|
121
|
-
|
177
|
+
puts "incorrect number of arguments, you need a fasta file and a taxonomical group:\n\n\t"
|
178
|
+
puts optparse.help
|
179
|
+
exit
|
122
180
|
end
|
123
|
-
#----------------------------------------------------------------------- loading classes and gems
|
124
|
-
ROOT_PATH=File.dirname(__FILE__)
|
125
|
-
|
126
|
-
# $: << File.expand_path(File.join(ROOT_PATH, "classes"))
|
127
|
-
|
128
|
-
# load gem path, only to test locally
|
129
|
-
# $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')
|
130
|
-
|
131
|
-
require 'full_lengther_next'
|
132
|
-
|
133
181
|
|
182
|
+
###################################################################################################
|
183
|
+
# PREPARE ENVIRONMENT
|
184
|
+
###################################################################################################
|
134
185
|
if ENV['FULL_LENGTHER_NEXT_INIT'] && File.exists?(ENV['FULL_LENGTHER_NEXT_INIT'])
|
135
186
|
FULL_LENGTHER_NEXT_INIT=File.expand_path(ENV['FULL_LENGTHER_NEXT_INIT'])
|
136
187
|
else
|
137
188
|
FULL_LENGTHER_NEXT_INIT=File.join(ROOT_PATH,'init_env')
|
138
189
|
end
|
139
190
|
|
191
|
+
if !File.exists?('temp')
|
192
|
+
Dir.mkdir('temp')
|
193
|
+
end
|
140
194
|
|
141
195
|
if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
|
142
196
|
formatted_db_path = ENV['BLASTDB']
|
@@ -147,50 +201,53 @@ end
|
|
147
201
|
ENV['BLASTDB']=formatted_db_path
|
148
202
|
puts "Using databases at: #{ENV['BLASTDB']}"
|
149
203
|
|
150
|
-
ncrna_path = File.join(ENV['BLASTDB'],'nc_rna_db','
|
151
|
-
if !File.exists?(ncrna_path)
|
152
|
-
|
204
|
+
ncrna_path = File.join(ENV['BLASTDB'],'nc_rna_db','ncrna.nhr')
|
205
|
+
# Stop early when the ncRNA BLAST database is requested but missing.
# The --acess_db help text defines 'n' as the ncRNA database and 'c' as the
# testcode algorithm, so the check is keyed on 'n' (the original tested 'c').
# NOTE(review): confirm against the worker code that 'n' is what triggers the ncRNA search.
# File.exist? is used because File.exists? is no longer available on current Ruby.
if !File.exist?(ncrna_path) && options[:acess_db].include?('n')
  puts "DB File #{ncrna_path} doesn't exists"
  puts optparse.help
  exit
end
|
156
210
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
211
|
+
if options[:acess_db].include?('s') || options[:acess_db].include?('t')
|
212
|
+
sp_path=File.join(ENV['BLASTDB'],"sp_#{options[:tax_group]}","sp_#{options[:tax_group]}.psq")
|
213
|
+
if !File.exists?(sp_path)
|
214
|
+
puts "DB File #{sp_path} doesn't exists, or"
|
215
|
+
puts "incorrect taxon group name: #{options[:tax_group]} choose:"
|
216
|
+
puts optparse.help
|
217
|
+
exit
|
218
|
+
end
|
163
219
|
end
|
164
220
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
require 'fl_string_utils'
|
169
|
-
require "une_los_hit"
|
170
|
-
require "lcs" # like the class simliar of seqtrim, return the longest common sequence
|
171
|
-
require "test_code"
|
172
|
-
|
173
|
-
########################################################## MAIN #################################################################
|
221
|
+
##################################################################################################
|
222
|
+
# MAIN
|
223
|
+
###################################################################################################
|
174
224
|
|
175
|
-
require '
|
225
|
+
require 'scbi_mapreduce'
|
226
|
+
require 'my_worker_manager_fln' #First server
|
227
|
+
require 'reptrans'
|
176
228
|
|
177
229
|
$LOG = Logger.new(STDOUT)
|
178
230
|
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
|
179
231
|
|
180
|
-
|
181
|
-
|
182
|
-
custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
|
232
|
+
main_path = File.dirname(ROOT_PATH)
|
233
|
+
custom_worker_file = File.join(main_path, 'lib','full_lengther_next','classes','my_worker.rb')
|
183
234
|
|
184
|
-
|
235
|
+
$LOG.info 'Starting server'
|
185
236
|
# initialize work manager (open files, etc)
|
186
|
-
|
237
|
+
MyWorkerManagerFln.init_work_manager(options)
|
187
238
|
|
188
239
|
# Create server
|
189
|
-
server = ScbiMapreduce::Manager.new(options[:server_ip],options[:port], options[:workers],
|
190
|
-
server.chunk_size=options[:chunk_size]
|
240
|
+
server = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerFln, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
|
241
|
+
server.chunk_size = options[:chunk_size]
|
242
|
+
|
191
243
|
# launch server
|
192
244
|
server.start_server
|
193
|
-
|
194
|
-
|
245
|
+
$LOG.info 'Closing server'
|
246
|
+
|
247
|
+
if !options[:reptrans].nil?
|
248
|
+
seqs_annotation_prot, seqs_some_coding ,seqs_unknown= MyWorkerManagerFln.get_annotations()
|
249
|
+
reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
|
250
|
+
end
|
251
|
+
puts "\nGracias por utilizar Full-LengtherNEXT"
|
252
|
+
|
195
253
|
|
196
|
-
puts "\nGracias por utilizar Full-LengtherNEXT"
|
@@ -0,0 +1,236 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scbi_fasta'
|
4
|
+
require 'optparse'
|
5
|
+
|
6
|
+
##########################################################################################
|
7
|
+
## FUNCTIONS
|
8
|
+
##########################################################################################
|
9
|
+
# Read every record of a FASTA file into memory.
#
# fasta:: path handed to FastaQualFile.new
# Returns an Array with one [name, sequence] pair per record, in the order
# the reader yields them. The reader is closed before returning.
def load_fasta(fasta)
  reader = FastaQualFile.new(fasta)
  records = []
  reader.each { |name, seq| records << [name, seq] }
  reader.close
  records
end
|
18
|
+
|
19
|
+
# Build an independent copy of a list of [name, sequence] pairs.
#
# Both the first and the last element of every entry are duplicated, so
# in-place edits on the copies leave the originals untouched.
#
# seqs:: Array of [name, sequence] pairs
# Returns a new Array of new [name, sequence] pairs.
def copy_seqs(seqs)
  seqs.map { |entry| [entry.first.dup, entry.last.dup] }
end
|
26
|
+
|
27
|
+
# Open the output files selected by the option flags.
#
# Files are created for writing in the current working directory and are
# named after the basename of options[:file] plus a fixed suffix.
#
# options:: Hash read for :file and the flags :indel, :trim, :pair and :chim
# Returns a Hash mapping an output key (e.g. :insertions, :paired) to its
# open File handle; keys of disabled flags are absent.
def output_files(options)
  prefix = File.basename(options[:file])
  # Each flag enables one group of outputs; the order matches the key order
  # of the returned hash.
  groups = [
    [:indel, [[:insertions, '_insertions'], [:delections, '_deletions'], [:mix, '_mix']]],
    [:trim,  [[:cut_100_pb, '_trimmed']]],
    [:pair,  [[:paired, '_paired']]],
    [:chim,  [[:fullChim, '_fullChim'], [:fusionChim, '_fusionChim'], [:fusionChimTruncate, '_truncateChim']]]
  ]
  handles = {}
  groups.each do |flag, targets|
    next unless options[flag]
    targets.each { |key, suffix| handles[key] = File.open(prefix + suffix, 'w') }
  end
  handles
end
|
40
|
+
|
41
|
+
# Pick one nucleotide letter ('a', 'c', 'g' or 't') using Kernel#rand.
def random_nt
  %w[a c g t][rand(4)]
end
|
45
|
+
|
46
|
+
# Insert one or two random nucleotides into seq, modifying it in place.
#
# The insertion sites are one third and two thirds of the original length
# (integer division). position % 3 selects the variant:
#   0 -> first site only   ('_I__')
#   1 -> second site only  ('__I_')
#   2 -> both sites        ('_I_I_')
#
# seq::      sequence String (mutated)
# position:: Integer used to choose the variant
# Returns [seq, tag], where tag is the name suffix describing the variant.
def insertions(seq, position)
  third = seq.length / 3
  sites, tag =
    case position % 3
    when 0 then [[third], '_I__']
    when 1 then [[third * 2], '__I_']
    when 2 then [[third, third * 2], '_I_I_']
    end
  # Sites are applied left to right, so the second index refers to the
  # string as it is after the first insertion.
  Array(sites).each { |site| seq.insert(site, random_nt) }
  [seq, tag]
end
|
63
|
+
|
64
|
+
# Delete one or two characters from seq, modifying it in place.
#
# The deletion sites are one third and two thirds of the original length
# (integer division). position % 3 selects the variant:
#   0 -> first site only   ('_D__')
#   1 -> second site only  ('__D_')
#   2 -> both sites        ('_D_D_')
#
# seq::      sequence String (mutated)
# position:: Integer used to choose the variant
# Returns [seq, tag], where tag is the name suffix describing the variant.
def delections(seq, position)
  third = seq.length / 3
  sites, tag =
    case position % 3
    when 0 then [[third], '_D__']
    when 1 then [[third * 2], '__D_']
    when 2 then [[third, third * 2], '_D_D_']
    end
  # Sites are applied left to right, so the second index refers to the
  # string as it is after the first deletion.
  Array(sites).each { |site| seq.slice!(site) }
  [seq, tag]
end
|
81
|
+
|
82
|
+
# Apply one insertion and one deletion to seq, modifying it in place.
#
# The two sites are one third and two thirds of the original length
# (integer division). position % 2 selects the order:
#   0 -> insert at the first site, then delete at the second ('_I_D_')
#   1 -> delete at the first site, then insert at the second ('_D_I_')
#
# seq::      sequence String (mutated)
# position:: Integer used to choose the variant
# Returns [seq, tag], where tag is the name suffix describing the variant.
def mix(seq, position)
  near = seq.length / 3
  far = near * 2
  parity = position % 2
  if parity == 0
    seq.insert(near, random_nt)
    seq.slice!(far)
    tag = '_I_D_'
  elsif parity == 1
    seq.slice!(near)
    seq.insert(far, random_nt)
    tag = '_D_I_'
  end
  [seq, tag]
end
|
97
|
+
|
98
|
+
# Load UTR coordinates from a tab-separated file.
#
# Each line holds a sequence name followed by its coordinates; the
# coordinates are converted with to_i. Blank lines are skipped so they do
# not create a nil-keyed entry (which would make the result look non-empty).
# File.foreach is used so the file handle is closed when reading finishes.
#
# utr_file:: path of the tab-separated coordinates file
# Returns a Hash mapping sequence name => Array of Integer coordinates.
def load_utrs(utr_file)
  utrs = {}
  File.foreach(utr_file) do |line|
    seq_name, *coords = line.chomp.split("\t")
    next if seq_name.nil? # blank line
    utrs[seq_name] = coords.map { |coord| coord.to_i }
  end
  utrs
end
|
108
|
+
##########################################################################################
|
109
|
+
## OPTIONS
|
110
|
+
##########################################################################################
|
111
|
+
# Command-line options with their defaults. Boolean values use the
# true/false literals; the TRUE/FALSE constants are deprecated and are not
# defined on current Ruby releases.
options = {}
optparse = OptionParser.new do |opts|
  options[:file] = 'samples'
  opts.on('-f', '--file FILE', 'FASTA file') do |file|
    options[:file] = file
  end

  options[:duplicate] = 1
  opts.on('-d', '--duplicate INTEGER', 'Duplicate sequences to dataset') do |duplicate|
    options[:duplicate] = duplicate.to_i
  end

  options[:split] = false
  # NOTE(review): this switch sets :duplicate to 3 and leaves :split untouched;
  # kept as in the original, confirm that this is the intended behaviour.
  opts.on('-s', '--split', 'Split sequences in each case') do
    options[:duplicate] = 3
  end

  # The four dataset kinds below are enabled by default; giving the
  # corresponding switch turns that dataset off.
  options[:chim] = true
  opts.on('-c', '--chim', 'Make sequence set of chimeras') do
    options[:chim] = false
  end

  options[:indel] = true
  opts.on('-i', '--indel', 'Make sequence set of indels') do
    options[:indel] = false
  end

  options[:pair] = true
  opts.on('-p', '--pair', 'Make sequence set of paired') do
    options[:pair] = false
  end

  options[:trim] = true
  opts.on('-t', '--trim', 'Make sequence set of trimmed') do
    options[:trim] = false
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: #{File.basename($0)} -f FILE \n\n"

  # This displays the help screen
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end # End opts

# parse options and remove from ARGV
optparse.parse!
|
160
|
+
|
161
|
+
##########################################################################################
|
162
|
+
## MAIN
|
163
|
+
##########################################################################################
|
164
|
+
# Stop when the input FASTA does not exist.
# File.exist? is used because File.exists? is no longer available on current Ruby.
unless File.exist?(options[:file])
  puts 'File not exists'
  Process.exit
end

seqs = load_fasta(options[:file])
outputs = output_files(options)

begin
  # Optional UTR coordinates live next to the input, in a file with the same
  # name but a '.utr' extension. Only the trailing extension is replaced
  # (the original gsub removed every occurrence of it in the path).
  utrs = {}
  if options[:trim] || options[:chim]
    file_ext = File.extname(options[:file])
    utr_file = options[:file].sub(/#{Regexp.escape(file_ext)}\z/, '') + '.utr'
    utrs = load_utrs(utr_file) if File.exist?(utr_file)
  end

  seqs.each_with_index do |(name, seq), index|
    # Chimeras: every even-indexed sequence is joined with the one after it.
    partner = seqs[index + 1]
    if index.even? && partner && options[:chim]
      second_seq = partner.first
      # Sequence of the partner only. The original stored seq + partner here,
      # which wrote the first sequence twice into the full chimera and applied
      # the partner's UTR coordinates to the wrong string.
      second_seq_fasta = partner.last
      header = ">#{name + '_' + second_seq}"
      outputs[:fullChim].puts "#{header}\n#{seq + second_seq_fasta}"

      # Fusion chimeras need coordinates for both members; pairs without
      # them are skipped instead of failing on nil.
      utr_coord = utrs[name]
      utr_coord_second = utrs[second_seq]
      if utr_coord && !utr_coord.empty? && utr_coord_second && !utr_coord_second.empty?
        chim5 = seq[0..utr_coord.last]
        chim3 = second_seq_fasta[utr_coord_second.first..second_seq_fasta.length - 1]
        if chim5 && chim3
          outputs[:fusionChim].puts "#{header}\n#{chim5 + chim3}"
          chim5_trunc = chim5[0..chim5.length - 100]
          chim3_trunc = chim3[100..chim3.length]
          if chim5_trunc && chim3_trunc
            outputs[:fusionChimTruncate].puts "#{header}\n#{chim5_trunc + chim3_trunc}"
          end
        end
      end
    end

    # Trimmed set: cut 100 positions from each end, relative to the UTR
    # coordinates when a .utr file was loaded, to the sequence ends otherwise.
    if options[:trim]
      if utrs.empty?
        outputs[:cut_100_pb].puts ">#{name}\n#{seq[99..seq.length - 101]}"
      else
        utr_coord = utrs[name]
        # Sequences missing from the .utr file are skipped.
        if utr_coord && !utr_coord.empty?
          trim_seq = seq[utr_coord.first + 100..utr_coord.last - 100]
          outputs[:cut_100_pb].puts ">#{name}\n#{trim_seq}" if trim_seq && !trim_seq.empty?
        end
      end
    end

    # Paired set: a run of 5 to 50 'N' characters is inserted near the middle.
    if options[:pair]
      n_number = rand(5..50)
      position = seq.length / 2 - n_number / 2
      outputs[:paired].puts ">#{name}\n#{seq[0..position] + 'N' * n_number + seq[position + 1..seq.length - 1]}"
    end
  end

  # Indel sets: the (optionally duplicated) sequences are split by index into
  # thirds; the first third gets insertions, the second deletions and the
  # rest a mix of both.
  if options[:indel]
    all_seqs = []
    options[:duplicate].times do
      all_seqs.concat(copy_seqs(seqs))
    end

    length = all_seqs.length
    all_seqs.each_with_index do |s, i|
      case i
      when 0..length / 3 - 1
        seq, type = insertions(s.last, i)
        file = :insertions
      when length / 3..2 * length / 3 - 1
        seq, type = delections(s.last, i)
        file = :delections
      else
        seq, type = mix(s.last, i)
        file = :mix
      end
      outputs[file].puts ">#{s.first}#{type}\n#{seq}"
    end
  end
ensure
  # Close every output file, also when an error interrupts the run.
  outputs.each_value { |file| file.close }
end
|