full_lengther_next 0.0.8 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/History.txt +2 -2
- data/Manifest.txt +33 -18
- data/Rakefile +4 -2
- data/bin/download_fln_dbs.rb +310 -158
- data/bin/full_lengther_next +160 -103
- data/bin/make_test_dataset.rb +236 -0
- data/bin/make_user_db.rb +101 -117
- data/bin/plot_fln.rb +270 -0
- data/bin/plot_taxonomy.rb +70 -0
- data/lib/expresscanvas.zip +0 -0
- data/lib/full_lengther_next.rb +3 -3
- data/lib/full_lengther_next/classes/artifacts.rb +66 -0
- data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
- data/lib/full_lengther_next/classes/cdhit.rb +154 -0
- data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
- data/lib/full_lengther_next/classes/common_functions.rb +105 -63
- data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
- data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
- data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
- data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
- data/lib/full_lengther_next/classes/handle_db.rb +30 -0
- data/lib/full_lengther_next/classes/my_worker.rb +308 -138
- data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
- data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
- data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
- data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
- data/lib/full_lengther_next/classes/reptrans.rb +210 -0
- data/lib/full_lengther_next/classes/sequence.rb +439 -80
- data/lib/full_lengther_next/classes/test_code.rb +15 -16
- data/lib/full_lengther_next/classes/types.rb +12 -0
- data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
- data/lib/full_lengther_next/classes/warnings.rb +40 -0
- metadata +207 -93
- data/lib/full_lengther_next/classes/lcs.rb +0 -33
- data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
data/bin/full_lengther_next
CHANGED
@@ -2,11 +2,15 @@
|
|
2
2
|
|
3
3
|
# 12-2-2011 Noe Fernandez Pozo.
|
4
4
|
# Full-LengtherNEXT predicts if your sequences are complete, showing you the nucleotide sequences and the translated protein
|
5
|
+
ROOT_PATH=File.dirname(__FILE__)
|
6
|
+
$: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
|
5
7
|
|
6
|
-
#------------------------------------------------------------------ parameters entry
|
7
8
|
require 'optparse'
|
8
9
|
require 'socket'
|
9
10
|
|
11
|
+
###############################################################################################
|
12
|
+
# PARSE OPTIONS
|
13
|
+
###############################################################################################
|
10
14
|
options = {}
|
11
15
|
|
12
16
|
if !File.exists?('logs')
|
@@ -14,87 +18,147 @@ if !File.exists?('logs')
|
|
14
18
|
end
|
15
19
|
|
16
20
|
optparse = OptionParser.new do |opts|
|
17
|
-
|
21
|
+
|
22
|
+
options[:acess_db] = 'stnp'
|
23
|
+
opts.on( '-a', '--acess_db STRING', 'Select that databases is going to be used. s for Swissprot, t for trEMBL and n for ncrna, p for use Transdecoder and c for use testcode algothrim. By default is set to stnp' ) do |acess_db|
|
24
|
+
options[:acess_db] = acess_db
|
25
|
+
end
|
26
|
+
|
27
|
+
options[:blast] = ''
|
28
|
+
opts.on( '-b', '--blast STRING', 'Aditional options to blast execution' ) do |blast|
|
29
|
+
options[:blast] = blast
|
30
|
+
end
|
31
|
+
|
32
|
+
options[:chunk_size] = 200
|
33
|
+
opts.on( '-c', '--chunk_size SIZE', "Number of sequences processed in each block when parallelization is used. Default=200" ) do |s|
|
34
|
+
options[:chunk_size] = s.to_i
|
35
|
+
end
|
36
|
+
|
37
|
+
# -d/--est_db: optional EST database path; defaults to nil (no EST database).
options[:est_db] = nil
opts.on( '-d', '--est_db FILE', "EST database for representative transcriptome\n" ) do |est_db|
  options[:est_db] = est_db
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if !File.exist?(options[:est_db])
    puts "No valid path to EST database"
    Process.exit(-1)
  end
end
|
45
|
+
|
46
|
+
# -e/--exonerate: exonerate analysis is on by default; passing the flag turns it off.
# The literals true/false replace the TRUE/FALSE constants removed in Ruby 3.2.
options[:exonerate] = true
opts.on( '-e', '--exonerate', 'Disables exonerate analysis' ) do |exonerate|
  options[:exonerate] = false
end
|
50
|
+
|
18
51
|
options[:fasta] = nil
|
19
52
|
opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
|
20
53
|
options[:fasta] = file
|
21
54
|
end
|
22
|
-
|
55
|
+
|
23
56
|
options[:tax_group] = nil
|
24
|
-
opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n\t\
|
57
|
+
opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n#{"\t"*5}fungi\n#{"\t"*5}human\n#{"\t"*5}invertebrates\n#{"\t"*5}mammals\n#{"\t"*5}plants\n#{"\t"*5}rodents\n#{"\t"*5}vertebrates\n\n" ) do |tax_name|
|
25
58
|
options[:tax_group] = tax_name
|
26
59
|
end
|
27
|
-
|
28
|
-
options[:
|
29
|
-
opts.on( '-
|
30
|
-
options[:
|
60
|
+
|
61
|
+
options[:ident] = 45.00
|
62
|
+
opts.on( '-i', '--identity_percent IDENTITY', 'identity percent threshold to consider as reliable the sequence similarity. Default=45.00' ) do |ident|
|
63
|
+
options[:ident] = ident.to_f
|
31
64
|
end
|
32
|
-
|
33
|
-
# options[:verbose] = nil
|
34
|
-
# opts.on( '-v', '--verbose_mode', "verbose mode\n\n" ) do |verbose|
|
35
|
-
# options[:verbose] = verbose
|
36
|
-
# end
|
37
65
|
|
38
|
-
options[:
|
39
|
-
opts.on( '-
|
40
|
-
options[:
|
66
|
+
# -k/--high_clustering: enabled by default; passing the flag switches it off.
# The literals true/false replace the TRUE/FALSE constants removed in Ruby 3.2.
options[:high_clustering] = true
opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default true' ) do
  options[:high_clustering] = false
end
|
42
70
|
|
43
|
-
options[:
|
44
|
-
opts.on( '-
|
45
|
-
options[:
|
71
|
+
options[:subject_coverage] = 0.25
|
72
|
+
opts.on( '-j', '--subject_coverage_percent FLOAT', "Subject coverage percentage threshold" ) do |j|
|
73
|
+
options[:subject_coverage] = j.to_f/100
|
74
|
+
end
|
75
|
+
|
76
|
+
options[:min_nucleotides] = 100
|
77
|
+
opts.on( '-n', '--min_nucleotides minLONG', "min nucleotides to consider a part of chimera like putative unigene. Default=100\n\n" ) do |min_nucleotides|
|
78
|
+
options[:min_nucleotides] = min_nucleotides.to_i
|
46
79
|
end
|
47
80
|
|
48
81
|
options[:distance] = 15
|
49
82
|
opts.on( '-m', '--max_distance maxDIST', "maximal distance between query and subject gene boundaries to be qualified as putative, the less distance the more strict. Default=15\n\n" ) do |distance|
|
50
83
|
options[:distance] = distance.to_i
|
51
84
|
end
|
52
|
-
|
53
|
-
options[:
|
54
|
-
opts.on( '-
|
85
|
+
|
86
|
+
options[:port] = 0 #50000
|
87
|
+
opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
|
88
|
+
options[:port] = port.to_i
|
89
|
+
end
|
90
|
+
|
91
|
+
options[:chimera] = 'rc'
|
92
|
+
opts.on( '-q', '--chimera_detection STRING', "d for deactivate chimera detection mode, s for search chimeras only, r for revise it and c for cut it. Default = rcs \n\n" ) do |chimera|
|
93
|
+
chimera.downcase!
|
55
94
|
options[:chimera] = chimera
|
56
95
|
end
|
57
96
|
|
97
|
+
options[:reptrans] = nil
|
98
|
+
opts.on( '-r', '--representative_transcriptome', "Generates a fasta file with the minime transcriptome\n" ) do |reptrans|
|
99
|
+
options[:reptrans] = reptrans
|
100
|
+
end
|
101
|
+
|
102
|
+
options[:server_ip] = '0.0.0.0'
|
103
|
+
opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
|
104
|
+
|
105
|
+
# get list of available ips
|
106
|
+
ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
|
107
|
+
ip=ip_list.select{|ip| ip.index(server_ip)==0}.first
|
108
|
+
|
109
|
+
if !ip
|
110
|
+
ip='0.0.0.0'
|
111
|
+
# $LOG.info("No available ip matching #{server_ip}")
|
112
|
+
end
|
113
|
+
# $ .info("Using ip #{ip}")
|
114
|
+
options[:server_ip] = ip
|
115
|
+
end
|
116
|
+
|
117
|
+
# -t/--identity_thresold: minimum identity used by the chimera analysis (default 55.0).
options[:ident_thresold] = 55.0
opts.on( '-t', '--identity_thresold FLOAT', "For chimeras only. Min identity to consider that two proteins are the same. Default=55.0\n\n" ) do |ident_thresold|
  # Parsed as a float: the option is documented as FLOAT and its default is a
  # float, whereas to_i silently truncated values such as 55.5 to 55.
  options[:ident_thresold] = ident_thresold.to_f
end
|
121
|
+
|
122
|
+
# -u/--user_db: optional user supplied blast+ database; defaults to nil.
options[:user_db] = nil
opts.on( '-u', '--user_db UserDB', 'User blast+ database' ) do |db|
  options[:user_db] = db
  # The database is accepted only when its '.psq' file is present.
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if !File.exist?(File.expand_path(options[:user_db])+'.psq')
    puts "user database: #{options[:user_db]} was not found"
    exit
  end
end
|
130
|
+
|
131
|
+
options[:verbose] = 0
|
132
|
+
opts.on( '-v', '--verbose INTEGER', 'Show extra info' ) do |verbose|
|
133
|
+
options[:verbose] = verbose.to_i
|
134
|
+
end
|
135
|
+
|
58
136
|
# -w/--workers: either a CPU count or the path of a file listing machine names.
# Falls back to 2 workers when the argument is neither an existing file nor a
# positive integer.
options[:workers] = 2
opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|

  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if File.exist?(workers)
    # use workers file
    options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
    # The first entry of the list is discarded.
    # NOTE(review): presumably it names the machine running the manager - confirm.
    options[:workers].shift
  elsif (workers.to_i > 0)
    options[:workers] = workers.to_i
  else
    options[:workers] = 2
  end

end
|
69
|
-
end
|
70
150
|
|
71
|
-
options[:
|
72
|
-
opts.on( '-
|
73
|
-
options[:
|
151
|
+
# -x/--training_identity_percent: identity threshold for the Transdecoder
# training set (default 45.00).
options[:training_ident] = 45.00
opts.on( '-x', '--training_identity_percent IDENTITY', 'identity percent threshold to use a complete sure sequence for Transdecoder training. Default=45.00' ) do |training_ident|
  # Use this block's own parameter: the previous code read `ident`, which is
  # not defined in this scope, so passing -x raised NameError.
  options[:training_ident] = training_ident.to_f
end
|
75
|
-
|
76
|
-
options[:server_ip] = '0.0.0.0'
|
77
|
-
opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
|
78
155
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
if !ip
|
85
|
-
ip='0.0.0.0'
|
86
|
-
# $LOG.info("No available ip matching #{server_ip}")
|
87
|
-
end
|
88
|
-
# $ .info("Using ip #{ip}")
|
89
|
-
options[:server_ip] = ip
|
90
|
-
end
|
91
|
-
|
92
|
-
options[:port] = 0 #50000
|
93
|
-
opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
|
94
|
-
options[:port] = port.to_i
|
95
|
-
end
|
96
|
-
|
156
|
+
# -z/--hdd: off by default; passing the flag turns it on.
# The literals true/false replace the TRUE/FALSE constants removed in Ruby 3.2.
options[:hdd] = false
opts.on( '-z', '--hdd', 'Write/use blast report on HDD' ) do |hdd|
  options[:hdd] = true
end
|
97
160
|
|
161
|
+
|
98
162
|
# Set a banner, displayed at the top of the help screen.
|
99
163
|
opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
|
100
164
|
|
@@ -108,35 +172,25 @@ end
|
|
108
172
|
|
109
173
|
# parse options and remove from ARGV
|
110
174
|
optparse.parse!
|
111
|
-
# @verbose = options[:verbose]
|
112
|
-
|
113
|
-
# if (!@verbose.nil?)
|
114
|
-
# puts "You have chosen the verbose mode:\n\nInput File:\t#{options[:fasta]}\nTaxon Group:\t#{options[:tax_group]}\nOwn Database:\t#{options[:user_db]}\nCPU Number:\t#{options[:workers]}"
|
115
|
-
# end
|
116
175
|
|
117
|
-
#----------------------------------------------------------------------- testing errors in parameters entry
|
118
176
|
if (options[:fasta].nil?) || (options[:tax_group].nil?)
|
119
|
-
|
120
|
-
|
121
|
-
|
177
|
+
puts "incorrect number of arguments, you need a fasta file and a taxonomical group:\n\n\t"
|
178
|
+
puts optparse.help
|
179
|
+
exit
|
122
180
|
end
|
123
|
-
#----------------------------------------------------------------------- loading classes and gems
|
124
|
-
ROOT_PATH=File.dirname(__FILE__)
|
125
|
-
|
126
|
-
# $: << File.expand_path(File.join(ROOT_PATH, "classes"))
|
127
|
-
|
128
|
-
# load gem path, only to test locally
|
129
|
-
# $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')
|
130
|
-
|
131
|
-
require 'full_lengther_next'
|
132
|
-
|
133
181
|
|
182
|
+
###################################################################################################
|
183
|
+
# PREPARE ENVIROMENT
|
184
|
+
###################################################################################################
|
134
185
|
# Pick the init file: the FULL_LENGTHER_NEXT_INIT environment variable wins
# when it names an existing path, otherwise 'init_env' under ROOT_PATH is used.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
if ENV['FULL_LENGTHER_NEXT_INIT'] && File.exist?(ENV['FULL_LENGTHER_NEXT_INIT'])
  FULL_LENGTHER_NEXT_INIT=File.expand_path(ENV['FULL_LENGTHER_NEXT_INIT'])
else
  FULL_LENGTHER_NEXT_INIT=File.join(ROOT_PATH,'init_env')
end
|
139
190
|
|
191
|
+
# Create the 'temp' directory in the current working directory on first use.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
Dir.mkdir('temp') unless File.exist?('temp')
|
140
194
|
|
141
195
|
if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
|
142
196
|
formatted_db_path = ENV['BLASTDB']
|
@@ -147,50 +201,53 @@ end
|
|
147
201
|
ENV['BLASTDB']=formatted_db_path
|
148
202
|
puts "Using databases at: #{ENV['BLASTDB']}"
|
149
203
|
|
150
|
-
ncrna_path = File.join(ENV['BLASTDB'],'nc_rna_db','
|
151
|
-
if !File.exists?(ncrna_path)
|
152
|
-
|
204
|
+
# The ncRNA database is only required when it was selected with 'n' in
# --acess_db. The previous test looked for 'c', which the option help assigns
# to the testcode algorithm, so a missing ncRNA database went unnoticed with
# the default 'stnp' selection.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
ncrna_path = File.join(ENV['BLASTDB'],'nc_rna_db','ncrna.nhr')
if options[:acess_db].include?('n') && !File.exist?(ncrna_path)
  puts "DB File #{ncrna_path} doesn't exists"
  puts optparse.help
  exit
end
|
156
210
|
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
211
|
+
# When Swissprot ('s') or trEMBL ('t') is selected, the Swissprot database of
# the chosen taxon group must exist under BLASTDB; otherwise print help and stop.
if options[:acess_db].include?('s') || options[:acess_db].include?('t')
  sp_path=File.join(ENV['BLASTDB'],"sp_#{options[:tax_group]}","sp_#{options[:tax_group]}.psq")
  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if !File.exist?(sp_path)
    puts "DB File #{sp_path} doesn't exists, or"
    puts "incorrect taxon group name: #{options[:tax_group]} choose:"
    puts optparse.help
    exit
  end
end
|
164
220
|
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
require 'fl_string_utils'
|
169
|
-
require "une_los_hit"
|
170
|
-
require "lcs" # like the class simliar of seqtrim, return the longest common sequence
|
171
|
-
require "test_code"
|
172
|
-
|
173
|
-
########################################################## MAIN #################################################################
|
221
|
+
##################################################################################################
|
222
|
+
# MAIN
|
223
|
+
###################################################################################################
|
174
224
|
|
175
|
-
require '
|
225
|
+
require 'scbi_mapreduce'
|
226
|
+
require 'my_worker_manager_fln' #First server
|
227
|
+
require 'reptrans'
|
176
228
|
|
177
229
|
$LOG = Logger.new(STDOUT)
|
178
230
|
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
|
179
231
|
|
180
|
-
|
181
|
-
|
182
|
-
custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
|
232
|
+
main_path = File.dirname(ROOT_PATH)
|
233
|
+
custom_worker_file = File.join(main_path, 'lib','full_lengther_next','classes','my_worker.rb')
|
183
234
|
|
184
|
-
|
235
|
+
$LOG.info 'Starting server'
|
185
236
|
# initialize work manager (open files, etc)
|
186
|
-
|
237
|
+
MyWorkerManagerFln.init_work_manager(options)
|
187
238
|
|
188
239
|
# Create server
|
189
|
-
server = ScbiMapreduce::Manager.new(options[:server_ip],options[:port], options[:workers],
|
190
|
-
server.chunk_size=options[:chunk_size]
|
240
|
+
server = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerFln, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
|
241
|
+
server.chunk_size = options[:chunk_size]
|
242
|
+
|
191
243
|
# launch server
|
192
244
|
server.start_server
|
193
|
-
|
194
|
-
|
245
|
+
$LOG.info 'Closing server'
|
246
|
+
|
247
|
+
if !options[:reptrans].nil?
|
248
|
+
seqs_annotation_prot, seqs_some_coding ,seqs_unknown= MyWorkerManagerFln.get_annotations()
|
249
|
+
reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
|
250
|
+
end
|
251
|
+
puts "\nGracias por utilizar Full-LengtherNEXT"
|
252
|
+
|
195
253
|
|
196
|
-
puts "\nGracias por utilizar Full-LengtherNEXT"
|
@@ -0,0 +1,236 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scbi_fasta'
|
4
|
+
require 'optparse'
|
5
|
+
|
6
|
+
##########################################################################################
|
7
|
+
## FUNCTIONS
|
8
|
+
##########################################################################################
|
9
|
+
# Reads every record of a FASTA file into memory.
#
# @param fasta [String] path handed to FastaQualFile.new
# @return [Array<Array>] one [name, sequence] pair per record, in file order
def load_fasta(fasta)
  seqs = []
  reader = FastaQualFile.new(fasta)
  begin
    reader.each do |name, seq|
      seqs << [name, seq]
    end
  ensure
    # Close the reader even when iteration raises, so the handle is not leaked.
    reader.close
  end
  seqs
end
|
18
|
+
|
19
|
+
# Builds an independent copy of a list of [name, sequence] pairs.
# Both elements of every pair are duplicated, so in-place edits of the copies
# (insert / slice!) never touch the originals.
#
# @param seqs [Array<Array>] pairs whose first and last elements respond to dup
# @return [Array<Array>] new array of new [name, sequence] pairs
def copy_seqs(seqs)
  seqs.map { |seq| [seq.first.dup, seq.last.dup] }
end
|
26
|
+
|
27
|
+
# Opens, for writing, one output file per enabled sequence set.
# File names are the basename of options[:file] plus a suffix and are created
# in the current working directory.
#
# @param options [Hash] reads :file and the :indel, :trim, :pair, :chim switches
# @return [Hash{Symbol=>File}] open handles keyed by set name
def output_files(options)
  prefix = File.basename(options[:file])
  # [hash key, file name suffix, option that enables the set]
  layout = [
    [:insertions,         '_insertions',   :indel],
    [:delections,         '_deletions',    :indel],
    [:mix,                '_mix',          :indel],
    [:cut_100_pb,         '_trimmed',      :trim],
    [:paired,             '_paired',       :pair],
    [:fullChim,           '_fullChim',     :chim],
    [:fusionChim,         '_fusionChim',   :chim],
    [:fusionChimTruncate, '_truncateChim', :chim]
  ]
  handles = {}
  layout.each do |key, suffix, switch|
    next unless options[switch]
    handles[key] = File.open(prefix + suffix, 'w')
  end
  return handles
end
|
40
|
+
|
41
|
+
# Picks one lowercase nucleotide letter ('a', 'c', 'g' or 't') with Kernel#rand.
#
# @return [String] a single-character string
def random_nt
  alphabet = %w[a c g t]
  pick = rand(4).truncate
  return alphabet[pick]
end
|
45
|
+
|
46
|
+
# Inserts one or two random nucleotides into +seq+, modifying it in place.
# position % 3 selects the variant:
#   0 -> one insertion at the first third      (tag '_I__')
#   1 -> one insertion at the second third     (tag '__I_')
#   2 -> insertions at both points, in order   (tag '_I_I_')
#
# @param seq [String] sequence to alter (mutated)
# @param position [Integer] index used to choose the variant
# @return [Array] the same string object and the tag describing the change
def insertions(seq, position)
  one_third = (seq.length/3).truncate
  two_thirds = one_third * 2
  sites = []
  name = nil
  case position % 3
  when 0
    sites = [one_third]
    name = '_I__'
  when 1
    sites = [two_thirds]
    name = '__I_'
  when 2
    sites = [one_third, two_thirds]
    name = '_I_I_'
  end
  sites.each { |site| seq.insert(site, random_nt) }
  return seq, name
end
|
63
|
+
|
64
|
+
# Removes one or two characters from +seq+, modifying it in place.
# position % 3 selects the variant:
#   0 -> delete at the first third                (tag '_D__')
#   1 -> delete at the second third               (tag '__D_')
#   2 -> delete at both points, first one first   (tag '_D_D_')
#
# @param seq [String] sequence to alter (mutated)
# @param position [Integer] index used to choose the variant
# @return [Array] the same string object and the tag describing the change
def delections(seq, position)
  one_third = (seq.length/3).truncate
  two_thirds = one_third * 2
  sites = []
  name = nil
  case position % 3
  when 0
    sites = [one_third]
    name = '_D__'
  when 1
    sites = [two_thirds]
    name = '__D_'
  when 2
    sites = [one_third, two_thirds]
    name = '_D_D_'
  end
  sites.each { |site| seq.slice!(site) }
  return seq, name
end
|
81
|
+
|
82
|
+
# Applies one insertion and one deletion to +seq+, modifying it in place.
# position % 2 selects the order:
#   0 -> insert at the first third, then delete at the second third (tag '_I_D_')
#   1 -> delete at the first third, then insert at the second third (tag '_D_I_')
#
# @param seq [String] sequence to alter (mutated)
# @param position [Integer] index used to choose the variant
# @return [Array] the same string object and the tag describing the change
def mix(seq, position)
  one_third = (seq.length/3).truncate
  two_thirds = one_third * 2
  steps = []
  name = nil
  case position % 2
  when 0
    steps = [[:insert, one_third], [:delete, two_thirds]]
    name = '_I_D_'
  when 1
    steps = [[:delete, one_third], [:insert, two_thirds]]
    name = '_D_I_'
  end
  steps.each do |action, site|
    if action == :insert
      seq.insert(site, random_nt)
    else
      seq.slice!(site)
    end
  end
  return seq, name
end
|
97
|
+
|
98
|
+
# Loads a tab-separated coordinate file.
# The first column of each line is a sequence name; every remaining column is
# converted with to_i and stored, in order, under that name.
#
# @param utr_file [String] path of the file to read
# @return [Hash{String=>Array<Integer>}] coordinates keyed by sequence name
def load_utrs(utr_file)
  utrs = {}
  # File.foreach closes the file when done; the previous File.open(...).each
  # left the handle open.
  File.foreach(utr_file) do |line|
    fields = line.chomp.split("\t")
    seq_name = fields.shift
    # A blank line has no name column; skip it instead of storing a nil key.
    next if seq_name.nil?
    utrs[seq_name] = fields.map { |coord| coord.to_i }
  end
  utrs
end
|
108
|
+
##########################################################################################
|
109
|
+
## OPTIONS
|
110
|
+
##########################################################################################
|
111
|
+
# Command line options. Every sequence set (:chim, :indel, :pair, :trim) is
# generated by default. The literals true/false replace the TRUE/FALSE
# constants removed in Ruby 3.2.
options = {}
optparse = OptionParser.new do |opts|
  options[:file]='samples'
  opts.on( '-f', '--file FILE', 'FASTA file') do |file|
    options[:file]=file
  end

  # Number of copies of the input taken before the indel sets are built.
  options[:duplicate]= 1
  opts.on( '-d', '--duplicate INTEGER', 'Duplicate sequences to dataset') do |duplicate|
    options[:duplicate] = duplicate.to_i
  end

  # --split sets the number of copies to 3 and leaves options[:split] untouched.
  # NOTE(review): presumably three copies are meant to feed the three indel
  # sets - confirm that this is the intent.
  options[:split]= false
  opts.on( '-s', '--split', 'Split sequences in each case') do
    options[:duplicate] = 3
  end

  # NOTE(review): -c, -i, -p and -t switch their set OFF, although the help
  # texts read as if they switched it on.
  options[:chim]= true
  opts.on( '-c', '--chim', 'Make sequence set of chimeras') do
    options[:chim] = false
  end

  options[:indel]= true
  opts.on( '-i', '--indel', 'Make sequence set of indels') do
    options[:indel] = false
  end

  options[:pair]= true
  opts.on( '-p', '--pair', 'Make sequence set of paired') do
    options[:pair] = false
  end

  options[:trim]= true
  opts.on( '-t', '--trim', 'Make sequence set of trimmed') do
    options[:trim] = false
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: #{File.basename($0)} -f FILE \n\n"

  # This displays the help screen
  opts.on( '-h', '--help', 'Display this screen' ) do
    puts opts
    exit
  end
end # End opts

# parse options and remove from ARGV
optparse.parse!
|
160
|
+
|
161
|
+
##########################################################################################
|
162
|
+
## MAIN
|
163
|
+
##########################################################################################
|
164
|
+
# Stop when the input FASTA is missing.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
if !File.exist?(options[:file])
  puts 'File not exists'
  Process.exit
end
seqs = load_fasta(options[:file])
output_files = output_files(options)
if options[:trim] || options[:chim]
  # The coordinate file sits next to the FASTA file, with its extension
  # replaced by '.utr'. Only the trailing extension is stripped; gsub used to
  # remove every occurrence of that text anywhere in the path.
  file_ext = File.extname(options[:file])
  utr_file = options[:file].sub(/#{Regexp.escape(file_ext)}\z/, '')+'.utr'
  # Without a coordinate file utrs stays empty.
  utrs = {}
  utrs = load_utrs(utr_file) if File.exist?(utr_file)
end
|
176
|
+
|
177
|
+
# Walk the sequences once and write the chimera, trimmed and paired variants.
# A chimera joins each even-indexed sequence with the sequence that follows it.
seqs.each_with_index do |(name, seq), index|
  partner = seqs[index+1]
  if index % 2 == 0 && !partner.nil? && options[:chim]
    second_seq = partner.first
    # Only the partner's own residues. The previous code prepended +seq+ here
    # and again when printing, so the first sequence was written twice and the
    # partner's coordinates were applied to the concatenated string.
    second_seq_fasta = partner.last
    chimera_name = name+'_'+second_seq
    output_files[:fullChim].puts ">#{chimera_name}\n#{seq+second_seq_fasta}"
    if !utrs.empty?
      utr_coord = utrs[name]
      utr_coord_second = utrs[second_seq]
      if utr_coord.nil? || utr_coord.empty? || utr_coord_second.nil? || utr_coord_second.empty?
        # Coordinates are looked up by name; a sequence missing from the .utr
        # file used to abort the run with NoMethodError on nil.
        warn "No coordinates for #{name} and/or #{second_seq}: fusion chimeras skipped"
      else
        # First sequence up to its last coordinate, partner from its first one.
        chim5 = seq[0..utr_coord.last]
        chim3 = second_seq_fasta[utr_coord_second.first..second_seq_fasta.length-1]
        if !chim5.nil? && !chim3.nil?
          output_files[:fusionChim].puts ">#{chimera_name}\n#{chim5+chim3}"
          # NOTE(review): this drops 99 characters from the end of chim5 but
          # 100 from the start of chim3 - confirm whether that is intended.
          chim5_trunc = chim5[0..chim5.length-100]
          chim3_trunc = chim3[100..chim3.length]
          output_files[:fusionChimTruncate].puts ">#{chimera_name}\n#{chim5_trunc+chim3_trunc}" if !chim5_trunc.nil? && !chim3_trunc.nil?
        end
      end
    end
  end

  if options[:trim]
    if utrs.empty?
      # NOTE(review): removes 99 characters at the start and 100 at the end -
      # confirm whether a symmetric cut was intended.
      output_files[:cut_100_pb].puts ">#{name}\n#{seq[99..seq.length-101]}"
    else
      utr_coord = utrs[name]
      if utr_coord.nil? || utr_coord.empty?
        warn "No coordinates for #{name}: trimmed sequence skipped"
      else
        trim_seq = seq[utr_coord.first+100..utr_coord.last-100]
        output_files[:cut_100_pb].puts ">#{name}\n#{trim_seq}" if !trim_seq.nil? && !trim_seq.empty?
      end
    end
  end

  if options[:pair]
    n_number = rand(5..50)
    # Clamp the split point at 0. A negative value used to be read as an offset
    # from the end of the string and, for very short sequences, produced a nil
    # slice that raised TypeError in the concatenation below.
    position = [seq.length/2 - n_number/2, 0].max
    output_files[:paired].puts ">#{name}\n#{seq[0..position].to_s + 'N'*n_number + seq[position+1..seq.length-1].to_s}"
  end
end
|
212
|
+
|
213
|
+
# Build the indel sets from options[:duplicate] copies of the input: the first
# third of the copies gets insertions, the second third deletions and the
# remainder a mix of both.
if options[:indel]
  pool = options[:duplicate].times.flat_map { copy_seqs(seqs) }
  total = pool.length
  insertion_limit = total/3
  deletion_limit = 2*total/3

  pool.each_with_index do |entry, i|
    if i < insertion_limit
      altered, tag = insertions(entry.last, i)
      target = :insertions
    elsif i < deletion_limit
      altered, tag = delections(entry.last, i)
      target = :delections
    else
      altered, tag = mix(entry.last, i)
      target = :mix
    end
    output_files[target].puts ">#{entry.first}#{tag}\n#{altered}"
  end
end

# Close every output file that was opened.
output_files.each_value { |handle| handle.close }
|