full_lengther_next 0.0.8 → 0.5.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -2,11 +2,15 @@
2
2
 
3
3
  # 12-2-2011 Noe Fernandez Pozo.
4
4
  # Full-LengtherNEXT predicts if your sequences are complete, showing you the nucleotide sequences and the translated protein
5
+ ROOT_PATH=File.dirname(__FILE__)
6
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
5
7
 
6
- #------------------------------------------------------------------ parameters entry
7
8
  require 'optparse'
8
9
  require 'socket'
9
10
 
11
+ ###############################################################################################
12
+ # PARSE OPTIONS
13
+ ###############################################################################################
10
14
  options = {}
11
15
 
12
16
  if !File.exists?('logs')
@@ -14,87 +18,147 @@ if !File.exists?('logs')
14
18
  end
15
19
 
16
20
  optparse = OptionParser.new do |opts|
17
-
21
+
22
+ options[:acess_db] = 'stnp'
23
+ opts.on( '-a', '--acess_db STRING', 'Select that databases is going to be used. s for Swissprot, t for trEMBL and n for ncrna, p for use Transdecoder and c for use testcode algothrim. By default is set to stnp' ) do |acess_db|
24
+ options[:acess_db] = acess_db
25
+ end
26
+
27
+ options[:blast] = ''
28
+ opts.on( '-b', '--blast STRING', 'Aditional options to blast execution' ) do |blast|
29
+ options[:blast] = blast
30
+ end
31
+
32
+ options[:chunk_size] = 200
33
+ opts.on( '-c', '--chunk_size SIZE', "Number of sequences processed in each block when parallelization is used. Default=200" ) do |s|
34
+ options[:chunk_size] = s.to_i
35
+ end
36
+
37
# -d / --est_db: path to an EST database used for the representative
# transcriptome. The path is validated as soon as the option is parsed and the
# program stops with status -1 when it does not exist.
options[:est_db] = nil
opts.on('-d', '--est_db FILE', "EST database for representative transcriptome\n") do |est_db|
  options[:est_db] = est_db
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  unless File.exist?(options[:est_db])
    puts "No valid path to EST database"
    Process.exit(-1)
  end
end
45
+
46
# -e / --exonerate: exonerate analysis is enabled by default; passing the flag
# turns it off. The literals true/false are used because the TRUE/FALSE
# constants were removed in Ruby 3.0.
options[:exonerate] = true
opts.on('-e', '--exonerate', 'Disables exonerate analysis') do |_exonerate|
  options[:exonerate] = false
end
50
+
18
51
  options[:fasta] = nil
19
52
  opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
20
53
  options[:fasta] = file
21
54
  end
22
-
55
+
23
56
  options[:tax_group] = nil
24
- opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n\t\t\t\t\tfungi\n\t\t\t\t\thuman\n\t\t\t\t\tinvertebrates\n\t\t\t\t\tmammals\n\t\t\t\t\tplants\n\t\t\t\t\trodents\n\t\t\t\t\tvertebrates\n\n" ) do |tax_name|
57
+ opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n#{"\t"*5}fungi\n#{"\t"*5}human\n#{"\t"*5}invertebrates\n#{"\t"*5}mammals\n#{"\t"*5}plants\n#{"\t"*5}rodents\n#{"\t"*5}vertebrates\n\n" ) do |tax_name|
25
58
  options[:tax_group] = tax_name
26
59
  end
27
-
28
- options[:user_db] = nil
29
- opts.on( '-u', '--user_db UserDB', 'User blast+ database' ) do |db|
30
- options[:user_db] = db
60
+
61
+ options[:ident] = 45.00
62
+ opts.on( '-i', '--identity_percent IDENTITY', 'identity percent threshold to consider as reliable the sequence similarity. Default=45.00' ) do |ident|
63
+ options[:ident] = ident.to_f
31
64
  end
32
-
33
- # options[:verbose] = nil
34
- # opts.on( '-v', '--verbose_mode', "verbose mode\n\n" ) do |verbose|
35
- # options[:verbose] = verbose
36
- # end
37
65
 
38
- options[:evalue] = 1.0e-25
39
- opts.on( '-e', '--evalue EVALUE', 'e value threshold to consider as reliable the orthologue sequence. Default=1.0e-25' ) do |evalue|
40
- options[:evalue] = evalue.to_f
66
# -k / --high_clustering: the extra clustering step is on by default; passing
# the flag switches it off (the handler stores false).
# The literals true/false are used because TRUE/FALSE were removed in Ruby 3.0.
options[:high_clustering] = true
opts.on('-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default true') do
  options[:high_clustering] = false
end
42
70
 
43
- options[:ident] = 45.00
44
- opts.on( '-i', '--identity_percent IDENTITY', 'identity percent threshold to consider as reliable the sequence similarity. Default=45.00' ) do |ident|
45
- options[:ident] = ident.to_f
71
+ options[:subject_coverage] = 0.25
72
+ opts.on( '-j', '--subject_coverage_percent FLOAT', "Subject coverage percentage threshold" ) do |j|
73
+ options[:subject_coverage] = j.to_f/100
74
+ end
75
+
76
+ options[:min_nucleotides] = 100
77
+ opts.on( '-n', '--min_nucleotides minLONG', "min nucleotides to consider a part of chimera like putative unigene. Default=100\n\n" ) do |min_nucleotides|
78
+ options[:min_nucleotides] = min_nucleotides.to_i
46
79
  end
47
80
 
48
81
  options[:distance] = 15
49
82
  opts.on( '-m', '--max_distance maxDIST', "maximal distance between query and subject gene boundaries to be qualified as putative, the less distance the more strict. Default=15\n\n" ) do |distance|
50
83
  options[:distance] = distance.to_i
51
84
  end
52
-
53
- options[:chimera] = nil
54
- opts.on( '-q', '--chimera_detection', "apply chimera detection mode\n\n" ) do |chimera|
85
+
86
+ options[:port] = 0 #50000
87
+ opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
88
+ options[:port] = port.to_i
89
+ end
90
+
91
# -q / --chimera_detection: string of mode letters (d, s, r, c), stored in
# lower case. The help text now states the default that is actually set here
# ('rc'); it previously advertised 'rcs'.
options[:chimera] = 'rc'
opts.on('-q', '--chimera_detection STRING', "d for deactivate chimera detection mode, s for search chimeras only, r for revise it and c for cut it. Default = rc \n\n") do |chimera|
  # Non-destructive downcase: do not mutate the string handed over by the parser.
  options[:chimera] = chimera.downcase
end
57
96
 
97
+ options[:reptrans] = nil
98
+ opts.on( '-r', '--representative_transcriptome', "Generates a fasta file with the minime transcriptome\n" ) do |reptrans|
99
+ options[:reptrans] = reptrans
100
+ end
101
+
102
+ options[:server_ip] = '0.0.0.0'
103
+ opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
104
+
105
+ # get list of available ips
106
+ ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
107
+ ip=ip_list.select{|ip| ip.index(server_ip)==0}.first
108
+
109
+ if !ip
110
+ ip='0.0.0.0'
111
+ # $LOG.info("No available ip matching #{server_ip}")
112
+ end
113
+ # $ .info("Using ip #{ip}")
114
+ options[:server_ip] = ip
115
+ end
116
+
117
# -t / --identity_thresold: minimum identity (chimeras only) to consider two
# proteins the same. The option is declared as FLOAT with a float default, so
# the argument is parsed with to_f; to_i silently dropped the decimals.
options[:ident_thresold] = 55.0
opts.on('-t', '--identity_thresold FLOAT', "For chimeras only. Min identity to consider that two proteins are the same. Default=55.0\n\n") do |ident_thresold|
  options[:ident_thresold] = ident_thresold.to_f
end
121
+
122
# -u / --user_db: user supplied blast+ protein database. The database is
# considered present when "<expanded path>.psq" exists; otherwise the program
# stops.
options[:user_db] = nil
opts.on('-u', '--user_db UserDB', 'User blast+ database') do |db|
  options[:user_db] = db
  # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
  unless File.exist?(File.expand_path(options[:user_db]) + '.psq')
    puts "user database: #{options[:user_db]} was not found"
    # Non-zero status, matching the --est_db validation, so that callers can
    # detect the failure.
    Process.exit(-1)
  end
end
130
+
131
+ options[:verbose] = 0
132
+ opts.on( '-v', '--verbose INTEGER', 'Show extra info' ) do |verbose|
133
+ options[:verbose] = verbose.to_i
134
+ end
135
+
58
136
  options[:workers] = 2
59
137
  opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
60
- if File.exists?(workers)
61
- # use workers file
62
- options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
63
- options[:workers].shift
64
- elsif (workers.to_i > 0)
65
- options[:workers] = workers.to_i
66
- else
67
- options[:workers] = 2
138
+
139
+ if File.exists?(workers)
140
+ # use workers file
141
+ options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
142
+ options[:workers].shift
143
+ elsif (workers.to_i > 0)
144
+ options[:workers] = workers.to_i
145
+ else
146
+ options[:workers] = 2
147
+ end
148
+
68
149
  end
69
- end
70
150
 
71
- options[:chunk_size] = 200
72
- opts.on( '-c', '--chunk_size SIZE', "Number of sequences processed in each block when parallelization is used. Default=200" ) do |s|
73
- options[:chunk_size] = s.to_i
151
# -x / --training_identity_percent: identity threshold used to pick sequences
# for Transdecoder training.
options[:training_ident] = 45.00
opts.on('-x', '--training_identity_percent IDENTITY', 'identity percent threshold to use a complete sure sequence for Transdecoder training. Default=45.00') do |training_ident|
  # Use this handler's own argument; the previous code read `ident`, which is
  # not defined inside this block.
  options[:training_ident] = training_ident.to_f
end
75
-
76
- options[:server_ip] = '0.0.0.0'
77
- opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
78
155
 
79
- # get list of available ips
80
- ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
81
-
82
- ip=ip_list.select{|ip| ip.index(server_ip)==0}.first
83
-
84
- if !ip
85
- ip='0.0.0.0'
86
- # $LOG.info("No available ip matching #{server_ip}")
87
- end
88
- # $ .info("Using ip #{ip}")
89
- options[:server_ip] = ip
90
- end
91
-
92
- options[:port] = 0 #50000
93
- opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
94
- options[:port] = port.to_i
95
- end
96
-
156
# -z / --hdd: off by default; passing the flag enables writing/using the blast
# report on disk. The literals true/false are used because TRUE/FALSE were
# removed in Ruby 3.0.
options[:hdd] = false
opts.on('-z', '--hdd', 'Write/use blast report on HDD') do |_hdd|
  options[:hdd] = true
end
97
160
 
161
+
98
162
  # Set a banner, displayed at the top of the help screen.
99
163
  opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
100
164
 
@@ -108,35 +172,25 @@ end
108
172
 
109
173
  # parse options and remove from ARGV
110
174
  optparse.parse!
111
- # @verbose = options[:verbose]
112
-
113
- # if (!@verbose.nil?)
114
- # puts "You have chosen the verbose mode:\n\nInput File:\t#{options[:fasta]}\nTaxon Group:\t#{options[:tax_group]}\nOwn Database:\t#{options[:user_db]}\nCPU Number:\t#{options[:workers]}"
115
- # end
116
175
 
117
- #----------------------------------------------------------------------- testing errors in parameters entry
118
176
  if (options[:fasta].nil?) || (options[:tax_group].nil?)
119
- puts "incorrect number of arguments, you need a fasta file and a taxonomical group:\n\n\t"
120
- puts optparse.help
121
- exit
177
+ puts "incorrect number of arguments, you need a fasta file and a taxonomical group:\n\n\t"
178
+ puts optparse.help
179
+ exit
122
180
  end
123
- #----------------------------------------------------------------------- loading classes and gems
124
- ROOT_PATH=File.dirname(__FILE__)
125
-
126
- # $: << File.expand_path(File.join(ROOT_PATH, "classes"))
127
-
128
- # load gem path, only to test locally
129
- # $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')
130
-
131
- require 'full_lengther_next'
132
-
133
181
 
182
+ ###################################################################################################
183
+ # PREPARE ENVIRONMENT
184
+ ###################################################################################################
134
185
  if ENV['FULL_LENGTHER_NEXT_INIT'] && File.exists?(ENV['FULL_LENGTHER_NEXT_INIT'])
135
186
  FULL_LENGTHER_NEXT_INIT=File.expand_path(ENV['FULL_LENGTHER_NEXT_INIT'])
136
187
  else
137
188
  FULL_LENGTHER_NEXT_INIT=File.join(ROOT_PATH,'init_env')
138
189
  end
139
190
 
191
+ if !File.exists?('temp')
192
+ Dir.mkdir('temp')
193
+ end
140
194
 
141
195
  if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
142
196
  formatted_db_path = ENV['BLASTDB']
@@ -147,50 +201,53 @@ end
147
201
  ENV['BLASTDB']=formatted_db_path
148
202
  puts "Using databases at: #{ENV['BLASTDB']}"
149
203
 
150
- ncrna_path = File.join(ENV['BLASTDB'],'nc_rna_db','ncrna_fln_100.fasta.nhr')
151
- if !File.exists?(ncrna_path)
152
- puts "DB File #{ncrna_path} doesn't exists"
204
# Check that the formatted ncRNA database is available before starting.
# The --acess_db help text assigns 'n' to ncRNA and 'c' to the testcode
# algorithm, so the check is tied to 'n'.
# NOTE(review): the previous code tested for 'c' - confirm that 'n' is the
# intended selector.
ncrna_path = File.join(ENV['BLASTDB'], 'nc_rna_db', 'ncrna.nhr')
# File.exist? replaces File.exists?, which was removed in Ruby 3.2.
if options[:acess_db].include?('n') && !File.exist?(ncrna_path)
  puts "DB File #{ncrna_path} doesn't exists"
  puts optparse.help
  exit
end
156
210
 
157
- sp_path=File.join(ENV['BLASTDB'],"sp_#{options[:tax_group]}","sp_#{options[:tax_group]}.fasta.psq")
158
- if !File.exists?(sp_path)
159
- puts "DB File #{sp_path} doesn't exists, or"
160
- puts "incorrect taxon group name: #{options[:tax_group]} choose:"
161
- puts optparse.help
162
- exit
211
+ if options[:acess_db].include?('s') || options[:acess_db].include?('t')
212
+ sp_path=File.join(ENV['BLASTDB'],"sp_#{options[:tax_group]}","sp_#{options[:tax_group]}.psq")
213
+ if !File.exists?(sp_path)
214
+ puts "DB File #{sp_path} doesn't exists, or"
215
+ puts "incorrect taxon group name: #{options[:tax_group]} choose:"
216
+ puts optparse.help
217
+ exit
218
+ end
163
219
  end
164
220
 
165
- require 'scbi_blast' # is a gem
166
- require 'scbi_mapreduce'
167
- # puts $:
168
- require 'fl_string_utils'
169
- require "une_los_hit"
170
- require "lcs" # like the class simliar of seqtrim, return the longest common sequence
171
- require "test_code"
172
-
173
- ########################################################## MAIN #################################################################
221
+ ##################################################################################################
222
+ # MAIN
223
+ ###################################################################################################
174
224
 
175
- require 'my_worker_manager'
225
+ require 'scbi_mapreduce'
226
+ require 'my_worker_manager_fln' #First server
227
+ require 'reptrans'
176
228
 
177
229
  $LOG = Logger.new(STDOUT)
178
230
  $LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
179
231
 
180
- # puts "ROOT_PATH: #{ROOT_PATH}"
181
-
182
- custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
232
+ main_path = File.dirname(ROOT_PATH)
233
+ custom_worker_file = File.join(main_path, 'lib','full_lengther_next','classes','my_worker.rb')
183
234
 
184
- $LOG.info 'Starting server'
235
+ $LOG.info 'Starting server'
185
236
  # initialize work manager (open files, etc)
186
- MyWorkerManager.init_work_manager(options, options[:chunk_size])
237
+ MyWorkerManagerFln.init_work_manager(options)
187
238
 
188
239
  # Create server
189
- server = ScbiMapreduce::Manager.new(options[:server_ip],options[:port], options[:workers], MyWorkerManager,custom_worker_file, STDOUT,FULL_LENGTHER_NEXT_INIT)
190
- server.chunk_size=options[:chunk_size]
240
+ server = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerFln, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
241
+ server.chunk_size = options[:chunk_size]
242
+
191
243
  # launch server
192
244
  server.start_server
193
-
194
- $LOG.info 'Closing server'
245
+ $LOG.info 'Closing server'
246
+
247
+ if !options[:reptrans].nil?
248
+ seqs_annotation_prot, seqs_some_coding ,seqs_unknown= MyWorkerManagerFln.get_annotations()
249
+ reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
250
+ end
251
+ puts "\nGracias por utilizar Full-LengtherNEXT"
252
+
195
253
 
196
- puts "\nGracias por utilizar Full-LengtherNEXT"
@@ -0,0 +1,236 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scbi_fasta'
4
+ require 'optparse'
5
+
6
+ ##########################################################################################
7
+ ## FUNCTIONS
8
+ ##########################################################################################
9
# Reads every record of a FASTA file into memory.
#
# fasta - path handed to FastaQualFile.new.
#
# Returns an array of [name, sequence] pairs in file order.
def load_fasta(fasta)
  seqs = []
  # Separate local for the reader so the path parameter is not rebound.
  reader = FastaQualFile.new(fasta)
  begin
    reader.each do |name, seq|
      seqs << [name, seq]
    end
  ensure
    # Close the reader even when iteration raises.
    reader.close
  end
  seqs
end
18
+
19
# Builds an independent copy of a list of sequence records.
#
# seqs - array of records; the first and last element of each record are
#        duplicated.
#
# Returns a new array of [first.dup, last.dup] pairs, so that later in-place
# edits of the copies do not touch the originals.
def copy_seqs(seqs)
  seqs.map { |record| [record.first.dup, record.last.dup] }
end
26
+
27
# Opens the output files requested by the options.
#
# options - hash with :file (input path; only its basename is used as output
#           prefix, so files are created in the current directory) and the
#           switches :indel, :trim, :pair and :chim.
#
# Returns a hash mapping each output key to a File opened for writing. Only
# the keys whose switch is truthy are present.
def output_files(options)
  prefix = File.basename(options[:file])
  # [output key, file name suffix, option that enables it]
  targets = [
    [:insertions,         '_insertions',   :indel],
    [:delections,         '_deletions',    :indel],
    [:mix,                '_mix',          :indel],
    [:cut_100_pb,         '_trimmed',      :trim],
    [:paired,             '_paired',       :pair],
    [:fullChim,           '_fullChim',     :chim],
    [:fusionChim,         '_fusionChim',   :chim],
    [:fusionChimTruncate, '_truncateChim', :chim]
  ]
  targets.each_with_object({}) do |(key, suffix, switch), handles|
    handles[key] = File.open(prefix + suffix, 'w') if options[switch]
  end
end
40
+
41
# Returns one lower-case nucleotide ('a', 'c', 'g' or 't') picked with rand(4).
def random_nt
  %w[a c g t][rand(4)]
end
45
+
46
# Inserts random nucleotides into seq. The string is modified in place.
#
# seq      - nucleotide string.
# position - integer; position % 3 selects the variant:
#            0 -> one insertion at one third of the length      ('_I__')
#            1 -> one insertion at two thirds of the length     ('__I_')
#            2 -> both insertions, first third then two thirds  ('_I_I_')
#
# The cut points are computed once from the original length; in the double
# case the second index is applied to the already lengthened string.
#
# Returns [seq, name], where name is the tag of the variant.
def insertions(seq, position)
  third = seq.length / 3
  two_thirds = third * 2
  cuts, name = case position % 3
               when 0 then [[third], '_I__']
               when 1 then [[two_thirds], '__I_']
               when 2 then [[third, two_thirds], '_I_I_']
               else [[], nil]
               end
  cuts.each { |cut| seq.insert(cut, random_nt) }
  [seq, name]
end
63
+
64
# Deletes single nucleotides from seq. The string is modified in place.
#
# seq      - nucleotide string.
# position - integer; position % 3 selects the variant:
#            0 -> one deletion at one third of the length      ('_D__')
#            1 -> one deletion at two thirds of the length     ('__D_')
#            2 -> both deletions, first third then two thirds  ('_D_D_')
#
# The cut points are computed once from the original length; in the double
# case the second index is applied to the already shortened string.
#
# Returns [seq, name], where name is the tag of the variant.
def delections(seq, position)
  third = seq.length / 3
  two_thirds = third * 2
  variants = {
    0 => [[third], '_D__'],
    1 => [[two_thirds], '__D_'],
    2 => [[third, two_thirds], '_D_D_']
  }
  cuts, name = variants.fetch(position % 3, [[], nil])
  cuts.each { |cut| seq.slice!(cut) }
  [seq, name]
end
81
+
82
# Applies one insertion and one deletion to seq. The string is modified in
# place.
#
# seq      - nucleotide string.
# position - integer; position % 2 selects the order:
#            0 -> insert at one third, then delete at two thirds ('_I_D_')
#            1 -> delete at one third, then insert at two thirds ('_D_I_')
#
# Both indexes come from the original length; the second operation works on
# the string as left by the first one.
#
# Returns [seq, name], where name is the tag of the variant.
def mix(seq, position)
  third = seq.length / 3
  two_thirds = third * 2
  name = nil
  parity = position % 2
  if parity == 0
    seq.insert(third, random_nt)
    seq.slice!(two_thirds)
    name = '_I_D_'
  elsif parity == 1
    seq.slice!(third)
    seq.insert(two_thirds, random_nt)
    name = '_D_I_'
  end
  [seq, name]
end
97
+
98
# Loads a tab separated coordinate table.
#
# utr_file - path to a text file with one record per line: a sequence name
#            followed by tab separated integer coordinates.
#
# Returns a hash mapping each sequence name to its array of integer
# coordinates. Blank lines are skipped so they do not create a nil key.
def load_utrs(utr_file)
  utrs = {}
  # File.foreach closes the file when the iteration ends; the previous
  # File.open(...).each left the handle open.
  File.foreach(utr_file) do |line|
    fields = line.chomp.split("\t")
    next if fields.empty?
    seq_name = fields.shift
    utrs[seq_name] = fields.map { |coord| coord.to_i }
  end
  utrs
end
108
+ ##########################################################################################
109
+ ## OPTIONS
110
+ ##########################################################################################
111
# Command line options of the test dataset generator.
# Every sequence set (chimeras, indels, paired, trimmed) is produced by
# default; the corresponding flag switches that set off.
# The literals true/false are used because the TRUE/FALSE constants were
# removed in Ruby 3.0.
options = {}
optparse = OptionParser.new do |opts|
  options[:file] = 'samples'
  opts.on('-f', '--file FILE', 'FASTA file') do |file|
    options[:file] = file
  end

  # Number of copies of the input sequences used for the indel sets.
  options[:duplicate] = 1
  opts.on('-d', '--duplicate INTEGER', 'Duplicate sequences to dataset') do |duplicate|
    options[:duplicate] = duplicate.to_i
  end

  # NOTE(review): --split only forces three copies of the input; the :split
  # entry itself is never changed by the handler. Confirm this is intended.
  options[:split] = false
  opts.on('-s', '--split', 'Split sequences in each case') do
    options[:duplicate] = 3
  end

  # The help texts below say what the flags do: each one disables a set that
  # is generated by default.
  options[:chim] = true
  opts.on('-c', '--chim', 'Do not make the sequence set of chimeras') do
    options[:chim] = false
  end

  options[:indel] = true
  opts.on('-i', '--indel', 'Do not make the sequence set of indels') do
    options[:indel] = false
  end

  options[:pair] = true
  opts.on('-p', '--pair', 'Do not make the sequence set of paired') do
    options[:pair] = false
  end

  options[:trim] = true
  opts.on('-t', '--trim', 'Do not make the sequence set of trimmed') do
    options[:trim] = false
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: #{File.basename($0)} -f FILE \n\n"

  # This displays the help screen
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end # End opts

# parse options and remove from ARGV
optparse.parse!
160
+
161
+ ##########################################################################################
162
+ ## MAIN
163
+ ##########################################################################################
164
# Builds the requested test sequence sets from the input FASTA file.
#
# Fixes applied to this script:
# * File.exist? replaces File.exists?, which was removed in Ruby 3.2.
# * Chimeras are built from the first sequence plus the *second* sequence; the
#   first sequence used to be written twice, and the second sequence's
#   coordinates were applied to the concatenated string.
# * The .utr path is derived by stripping only the trailing extension.
# * Sequences missing from the UTR table are skipped instead of raising.
# * The 100 nt truncations remove 100 nt on both sides (one side removed 99).
# * Output files are closed even when an error interrupts the run.
unless File.exist?(options[:file])
  puts 'File not exists'
  Process.exit
end
seqs = load_fasta(options[:file])
output_files = output_files(options)

begin
  # Optional coordinate table "<input without extension>.utr"; only the first
  # and last coordinate of each entry are used below.
  utrs = {}
  if options[:trim] || options[:chim]
    file_ext = File.extname(options[:file])
    utr_file = options[:file].chomp(file_ext) + '.utr'
    utrs = load_utrs(utr_file) if File.exist?(utr_file)
  end

  seqs.each_with_index do |(name, seq), index|
    # Chimeras pair every even-indexed sequence with the one that follows it.
    partner = seqs[index + 1]
    if options[:chim] && index.even? && partner
      second_name = partner.first
      second_seq = partner.last
      chimera_name = name + '_' + second_name
      output_files[:fullChim].puts ">#{chimera_name}\n#{seq + second_seq}"

      utr_coord = utrs[name]
      utr_coord_second = utrs[second_name]
      if utr_coord && utr_coord_second
        # Head of the first sequence up to its last coordinate, joined to the
        # second sequence from its first coordinate onwards.
        chim5 = seq[0..utr_coord.last]
        chim3 = second_seq[utr_coord_second.first..-1]
        if chim5 && chim3
          output_files[:fusionChim].puts ">#{chimera_name}\n#{chim5 + chim3}"
          # Same fusion with 100 nt removed on each side of the junction.
          chim5_trunc = chim5[0...-100]
          chim3_trunc = chim3[100..-1]
          if chim5_trunc && chim3_trunc
            output_files[:fusionChimTruncate].puts ">#{chimera_name}\n#{chim5_trunc + chim3_trunc}"
          end
        end
      end
    end

    if options[:trim]
      utr_coord = utrs[name]
      trim_seq = if utrs.empty?
                   # No coordinate table: drop 100 nt from each end.
                   seq[100..-101]
                 elsif utr_coord
                   seq[utr_coord.first + 100..utr_coord.last - 100]
                 end
      # Sequences too short to be trimmed are left out in both branches.
      output_files[:cut_100_pb].puts ">#{name}\n#{trim_seq}" unless trim_seq.nil? || trim_seq.empty?
    end

    if options[:pair]
      # Insert a run of 5 to 50 'N' around the middle of the sequence.
      n_number = rand(5..50)
      position = seq.length / 2 - n_number / 2
      output_files[:paired].puts ">#{name}\n#{seq[0..position] + 'N' * n_number + seq[position + 1..seq.length - 1]}"
    end
  end

  if options[:indel]
    # Work on copies: insertions/delections/mix edit the strings in place.
    all_seqs = []
    options[:duplicate].times do
      all_seqs.concat(copy_seqs(seqs))
    end

    # First third of the records gets insertions, second third deletions and
    # the rest a mix of both.
    length = all_seqs.length
    all_seqs.each_with_index do |s, i|
      case i
      when 0..length / 3 - 1
        seq, type = insertions(s.last, i)
        file = :insertions
      when length / 3..2 * length / 3 - 1
        seq, type = delections(s.last, i)
        file = :delections
      else
        seq, type = mix(s.last, i)
        file = :mix
      end
      output_files[file].puts ">#{s.first}#{type}\n#{seq}"
    end
  end
ensure
  output_files.each_value { |file| file.close }
end