full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -2,11 +2,15 @@
2
2
 
3
3
  # 12-2-2011 Noe Fernandez Pozo.
4
4
  # Full-LengtherNEXT predicts if your sequences are complete, showing you the nucleotide sequences and the translated protein
5
+ ROOT_PATH=File.dirname(__FILE__)
6
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
5
7
 
6
- #------------------------------------------------------------------ parameters entry
7
8
  require 'optparse'
8
9
  require 'socket'
9
10
 
11
+ ###############################################################################################
12
+ # PARSE OPTIONS
13
+ ###############################################################################################
10
14
  options = {}
11
15
 
12
16
  if !File.exists?('logs')
@@ -14,87 +18,147 @@ if !File.exists?('logs')
14
18
  end
15
19
 
16
20
  optparse = OptionParser.new do |opts|
17
-
21
+
22
+ options[:acess_db] = 'stnp'
23
+ opts.on( '-a', '--acess_db STRING', 'Select that databases is going to be used. s for Swissprot, t for trEMBL and n for ncrna, p for use Transdecoder and c for use testcode algothrim. By default is set to stnp' ) do |acess_db|
24
+ options[:acess_db] = acess_db
25
+ end
26
+
27
+ options[:blast] = ''
28
+ opts.on( '-b', '--blast STRING', 'Aditional options to blast execution' ) do |blast|
29
+ options[:blast] = blast
30
+ end
31
+
32
+ options[:chunk_size] = 200
33
+ opts.on( '-c', '--chunk_size SIZE', "Number of sequences processed in each block when parallelization is used. Default=200" ) do |s|
34
+ options[:chunk_size] = s.to_i
35
+ end
36
+
37
+ options[:est_db] = nil
38
+ opts.on( '-d', '--est_db FILE', "EST database for representative transcriptome\n" ) do |est_db|
39
+ options[:est_db] = est_db
40
+ if !File.exists?(options[:est_db])
41
+ puts "No valid path to EST database"
42
+ Process.exit(-1)
43
+ end
44
+ end
45
+
46
+ options[:exonerate] = TRUE
47
+ opts.on( '-e', '--exonerate', 'Disables exonerate analysis' ) do |exonerate|
48
+ options[:exonerate] = FALSE
49
+ end
50
+
18
51
  options[:fasta] = nil
19
52
  opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
20
53
  options[:fasta] = file
21
54
  end
22
-
55
+
23
56
  options[:tax_group] = nil
24
- opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n\t\t\t\t\tfungi\n\t\t\t\t\thuman\n\t\t\t\t\tinvertebrates\n\t\t\t\t\tmammals\n\t\t\t\t\tplants\n\t\t\t\t\trodents\n\t\t\t\t\tvertebrates\n\n" ) do |tax_name|
57
+ opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n#{"\t"*5}fungi\n#{"\t"*5}human\n#{"\t"*5}invertebrates\n#{"\t"*5}mammals\n#{"\t"*5}plants\n#{"\t"*5}rodents\n#{"\t"*5}vertebrates\n\n" ) do |tax_name|
25
58
  options[:tax_group] = tax_name
26
59
  end
27
-
28
- options[:user_db] = nil
29
- opts.on( '-u', '--user_db UserDB', 'User blast+ database' ) do |db|
30
- options[:user_db] = db
60
+
61
+ options[:ident] = 45.00
62
+ opts.on( '-i', '--identity_percent IDENTITY', 'identity percent threshold to consider as reliable the sequence similarity. Default=45.00' ) do |ident|
63
+ options[:ident] = ident.to_f
31
64
  end
32
-
33
- # options[:verbose] = nil
34
- # opts.on( '-v', '--verbose_mode', "verbose mode\n\n" ) do |verbose|
35
- # options[:verbose] = verbose
36
- # end
37
65
 
38
- options[:evalue] = 1.0e-25
39
- opts.on( '-e', '--evalue EVALUE', 'e value threshold to consider as reliable the orthologue sequence. Default=1.0e-25' ) do |evalue|
40
- options[:evalue] = evalue.to_f
66
+ options[:high_clustering] = TRUE
67
+ opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default true' ) do
68
+ options[:high_clustering] = FALSE
41
69
  end
42
70
 
43
- options[:ident] = 45.00
44
- opts.on( '-i', '--identity_percent IDENTITY', 'identity percent threshold to consider as reliable the sequence similarity. Default=45.00' ) do |ident|
45
- options[:ident] = ident.to_f
71
+ options[:subject_coverage] = 0.25
72
+ opts.on( '-j', '--subject_coverage_percent FLOAT', "Subject coverage percentage threshold" ) do |j|
73
+ options[:subject_coverage] = j.to_f/100
74
+ end
75
+
76
+ options[:min_nucleotides] = 100
77
+ opts.on( '-n', '--min_nucleotides minLONG', "min nucleotides to consider a part of chimera like putative unigene. Default=100\n\n" ) do |min_nucleotides|
78
+ options[:min_nucleotides] = min_nucleotides.to_i
46
79
  end
47
80
 
48
81
  options[:distance] = 15
49
82
  opts.on( '-m', '--max_distance maxDIST', "maximal distance between query and subject gene boundaries to be qualified as putative, the less distance the more strict. Default=15\n\n" ) do |distance|
50
83
  options[:distance] = distance.to_i
51
84
  end
52
-
53
- options[:chimera] = nil
54
- opts.on( '-q', '--chimera_detection', "apply chimera detection mode\n\n" ) do |chimera|
85
+
86
+ options[:port] = 0 #50000
87
+ opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
88
+ options[:port] = port.to_i
89
+ end
90
+
91
+ options[:chimera] = 'rc'
92
+ opts.on( '-q', '--chimera_detection STRING', "d for deactivate chimera detection mode, s for search chimeras only, r for revise it and c for cut it. Default = rcs \n\n" ) do |chimera|
93
+ chimera.downcase!
55
94
  options[:chimera] = chimera
56
95
  end
57
96
 
97
+ options[:reptrans] = nil
98
+ opts.on( '-r', '--representative_transcriptome', "Generates a fasta file with the minime transcriptome\n" ) do |reptrans|
99
+ options[:reptrans] = reptrans
100
+ end
101
+
102
+ options[:server_ip] = '0.0.0.0'
103
+ opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
104
+
105
+ # get list of available ips
106
+ ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
107
+ ip=ip_list.select{|ip| ip.index(server_ip)==0}.first
108
+
109
+ if !ip
110
+ ip='0.0.0.0'
111
+ # $LOG.info("No available ip matching #{server_ip}")
112
+ end
113
+ # $ .info("Using ip #{ip}")
114
+ options[:server_ip] = ip
115
+ end
116
+
117
# Chimera-only option: minimum identity for two proteins to be considered the
# same one. The value is documented as FLOAT and defaults to the Float 55.0, so
# it is parsed with to_f; to_i would silently truncate a value such as 55.5.
options[:ident_thresold] = 55.0
opts.on( '-t', '--identity_thresold FLOAT', "For chimeras only. Min identity to consider that two proteins are the same. Default=55.0\n\n" ) do |ident_thresold|
  options[:ident_thresold] = ident_thresold.to_f
end
121
+
122
+ options[:user_db] = nil
123
+ opts.on( '-u', '--user_db UserDB', 'User blast+ database' ) do |db|
124
+ options[:user_db] = db
125
+ if !File.exists?(File.expand_path(options[:user_db])+'.psq')
126
+ puts "user database: #{options[:user_db]} was not found"
127
+ exit
128
+ end
129
+ end
130
+
131
+ options[:verbose] = 0
132
+ opts.on( '-v', '--verbose INTEGER', 'Show extra info' ) do |verbose|
133
+ options[:verbose] = verbose.to_i
134
+ end
135
+
58
136
  options[:workers] = 2
59
137
  opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
60
- if File.exists?(workers)
61
- # use workers file
62
- options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
63
- options[:workers].shift
64
- elsif (workers.to_i > 0)
65
- options[:workers] = workers.to_i
66
- else
67
- options[:workers] = 2
138
+
139
+ if File.exists?(workers)
140
+ # use workers file
141
+ options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
142
+ options[:workers].shift
143
+ elsif (workers.to_i > 0)
144
+ options[:workers] = workers.to_i
145
+ else
146
+ options[:workers] = 2
147
+ end
148
+
68
149
  end
69
- end
70
150
 
71
- options[:chunk_size] = 200
72
- opts.on( '-c', '--chunk_size SIZE', "Number of sequences processed in each block when parallelization is used. Default=200" ) do |s|
73
- options[:chunk_size] = s.to_i
151
# Identity percent threshold stored under options[:training_ident]
# (described by the help text as the threshold for Transdecoder training).
# The handler must convert its own block argument: `ident` is not defined
# inside this block, so referencing it raised NameError whenever -x was given.
options[:training_ident] = 45.00
opts.on( '-x', '--training_identity_percent IDENTITY', 'identity percent threshold to use a complete sure sequence for Transdecoder training. Default=45.00' ) do |training_ident|
  options[:training_ident] = training_ident.to_f
end
75
-
76
- options[:server_ip] = '0.0.0.0'
77
- opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
78
155
 
79
- # get list of available ips
80
- ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
81
-
82
- ip=ip_list.select{|ip| ip.index(server_ip)==0}.first
83
-
84
- if !ip
85
- ip='0.0.0.0'
86
- # $LOG.info("No available ip matching #{server_ip}")
87
- end
88
- # $ .info("Using ip #{ip}")
89
- options[:server_ip] = ip
90
- end
91
-
92
- options[:port] = 0 #50000
93
- opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
94
- options[:port] = port.to_i
95
- end
96
-
156
+ options[:hdd] = FALSE
157
+ opts.on( '-z', '--hdd', 'Write/use blast report on HDD' ) do |hdd|
158
+ options[:hdd] = TRUE
159
+ end
97
160
 
161
+
98
162
  # Set a banner, displayed at the top of the help screen.
99
163
  opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"
100
164
 
@@ -108,35 +172,25 @@ end
108
172
 
109
173
  # parse options and remove from ARGV
110
174
  optparse.parse!
111
- # @verbose = options[:verbose]
112
-
113
- # if (!@verbose.nil?)
114
- # puts "You have chosen the verbose mode:\n\nInput File:\t#{options[:fasta]}\nTaxon Group:\t#{options[:tax_group]}\nOwn Database:\t#{options[:user_db]}\nCPU Number:\t#{options[:workers]}"
115
- # end
116
175
 
117
- #----------------------------------------------------------------------- testing errors in parameters entry
118
176
  if (options[:fasta].nil?) || (options[:tax_group].nil?)
119
- puts "incorrect number of arguments, you need a fasta file and a taxonomical group:\n\n\t"
120
- puts optparse.help
121
- exit
177
+ puts "incorrect number of arguments, you need a fasta file and a taxonomical group:\n\n\t"
178
+ puts optparse.help
179
+ exit
122
180
  end
123
- #----------------------------------------------------------------------- loading classes and gems
124
- ROOT_PATH=File.dirname(__FILE__)
125
-
126
- # $: << File.expand_path(File.join(ROOT_PATH, "classes"))
127
-
128
- # load gem path, only to test locally
129
- # $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')
130
-
131
- require 'full_lengther_next'
132
-
133
181
 
182
+ ###################################################################################################
183
+ # PREPARE ENVIRONMENT
184
+ ###################################################################################################
134
185
  if ENV['FULL_LENGTHER_NEXT_INIT'] && File.exists?(ENV['FULL_LENGTHER_NEXT_INIT'])
135
186
  FULL_LENGTHER_NEXT_INIT=File.expand_path(ENV['FULL_LENGTHER_NEXT_INIT'])
136
187
  else
137
188
  FULL_LENGTHER_NEXT_INIT=File.join(ROOT_PATH,'init_env')
138
189
  end
139
190
 
191
+ if !File.exists?('temp')
192
+ Dir.mkdir('temp')
193
+ end
140
194
 
141
195
  if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
142
196
  formatted_db_path = ENV['BLASTDB']
@@ -147,50 +201,53 @@ end
147
201
  ENV['BLASTDB']=formatted_db_path
148
202
  puts "Using databases at: #{ENV['BLASTDB']}"
149
203
 
150
- ncrna_path = File.join(ENV['BLASTDB'],'nc_rna_db','ncrna_fln_100.fasta.nhr')
151
- if !File.exists?(ncrna_path)
152
- puts "DB File #{ncrna_path} doesn't exists"
204
# Make sure the formatted ncRNA BLAST database exists, but only when the user
# asked for it. According to the --acess_db help text the ncRNA database is
# selected with 'n' ('c' selects the testcode algorithm), so the guard tests 'n'.
# File.exist? is used because File.exists? is no longer available in current Ruby.
ncrna_path = File.join(ENV['BLASTDB'],'nc_rna_db','ncrna.nhr')
if !File.exist?(ncrna_path) && options[:acess_db].include?('n')
  puts "DB File #{ncrna_path} doesn't exists"
  puts optparse.help
  exit
end
156
210
 
157
- sp_path=File.join(ENV['BLASTDB'],"sp_#{options[:tax_group]}","sp_#{options[:tax_group]}.fasta.psq")
158
- if !File.exists?(sp_path)
159
- puts "DB File #{sp_path} doesn't exists, or"
160
- puts "incorrect taxon group name: #{options[:tax_group]} choose:"
161
- puts optparse.help
162
- exit
211
+ if options[:acess_db].include?('s') || options[:acess_db].include?('t')
212
+ sp_path=File.join(ENV['BLASTDB'],"sp_#{options[:tax_group]}","sp_#{options[:tax_group]}.psq")
213
+ if !File.exists?(sp_path)
214
+ puts "DB File #{sp_path} doesn't exists, or"
215
+ puts "incorrect taxon group name: #{options[:tax_group]} choose:"
216
+ puts optparse.help
217
+ exit
218
+ end
163
219
  end
164
220
 
165
- require 'scbi_blast' # is a gem
166
- require 'scbi_mapreduce'
167
- # puts $:
168
- require 'fl_string_utils'
169
- require "une_los_hit"
170
- require "lcs" # like the class simliar of seqtrim, return the longest common sequence
171
- require "test_code"
172
-
173
- ########################################################## MAIN #################################################################
221
+ ##################################################################################################
222
+ # MAIN
223
+ ###################################################################################################
174
224
 
175
- require 'my_worker_manager'
225
+ require 'scbi_mapreduce'
226
+ require 'my_worker_manager_fln' #First server
227
+ require 'reptrans'
176
228
 
177
229
  $LOG = Logger.new(STDOUT)
178
230
  $LOG.datetime_format = "%Y-%m-%d %H:%M:%S"
179
231
 
180
- # puts "ROOT_PATH: #{ROOT_PATH}"
181
-
182
- custom_worker_file = File.join(File.dirname(ROOT_PATH),'lib','full_lengther_next','classes','my_worker.rb')
232
+ main_path = File.dirname(ROOT_PATH)
233
+ custom_worker_file = File.join(main_path, 'lib','full_lengther_next','classes','my_worker.rb')
183
234
 
184
- $LOG.info 'Starting server'
235
+ $LOG.info 'Starting server'
185
236
  # initialize work manager (open files, etc)
186
- MyWorkerManager.init_work_manager(options, options[:chunk_size])
237
+ MyWorkerManagerFln.init_work_manager(options)
187
238
 
188
239
  # Create server
189
- server = ScbiMapreduce::Manager.new(options[:server_ip],options[:port], options[:workers], MyWorkerManager,custom_worker_file, STDOUT,FULL_LENGTHER_NEXT_INIT)
190
- server.chunk_size=options[:chunk_size]
240
+ server = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerFln, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
241
+ server.chunk_size = options[:chunk_size]
242
+
191
243
  # launch server
192
244
  server.start_server
193
-
194
- $LOG.info 'Closing server'
245
+ $LOG.info 'Closing server'
246
+
247
+ if !options[:reptrans].nil?
248
+ seqs_annotation_prot, seqs_some_coding ,seqs_unknown= MyWorkerManagerFln.get_annotations()
249
+ reptrans(seqs_annotation_prot, seqs_some_coding ,seqs_unknown, options)
250
+ end
251
+ puts "\nGracias por utilizar Full-LengtherNEXT"
252
+
195
253
 
196
- puts "\nGracias por utilizar Full-LengtherNEXT"
@@ -0,0 +1,236 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scbi_fasta'
4
+ require 'optparse'
5
+
6
+ ##########################################################################################
7
+ ## FUNCTIONS
8
+ ##########################################################################################
9
+ def load_fasta(fasta)
10
+ seqs = []
11
+ fasta = FastaQualFile.new(fasta)
12
+ fasta.each do |name, seq|
13
+ seqs << [name, seq]
14
+ end
15
+ fasta.close
16
+ return seqs
17
+ end
18
+
19
+ def copy_seqs(seqs)
20
+ all = []
21
+ seqs.each do |seq|
22
+ all << [seq.first.dup, seq.last.dup]
23
+ end
24
+ return all
25
+ end
26
+
27
+ def output_files(options)
28
+ file = File.basename(options[:file])
29
+ output_files = {}
30
+ output_files[:insertions] = File.open(file+'_insertions', 'w') if options[:indel]
31
+ output_files[:delections] = File.open(file+'_deletions', 'w') if options[:indel]
32
+ output_files[:mix] = File.open(file+'_mix', 'w') if options[:indel]
33
+ output_files[:cut_100_pb] = File.open(file+'_trimmed', 'w') if options[:trim]
34
+ output_files[:paired] = File.open(file+'_paired', 'w') if options[:pair]
35
+ output_files[:fullChim] = File.open(file+'_fullChim', 'w') if options[:chim]
36
+ output_files[:fusionChim] = File.open(file+'_fusionChim', 'w') if options[:chim]
37
+ output_files[:fusionChimTruncate] = File.open(file+'_truncateChim', 'w') if options[:chim]
38
+ return output_files
39
+ end
40
+
41
+ def random_nt
42
+ nts =['a','c','g','t']
43
+ return nts[rand(4).truncate]
44
+ end
45
+
46
+ def insertions(seq, position)
47
+ first_cut = (seq.length/3).truncate
48
+ second_cut = first_cut *2
49
+ case position%3
50
+ when 0
51
+ seq = seq.insert(first_cut, random_nt)
52
+ name = '_I__'
53
+ when 1
54
+ seq = seq.insert(second_cut, random_nt)
55
+ name = '__I_'
56
+ when 2
57
+ seq = seq.insert(first_cut, random_nt)
58
+ seq = seq.insert(second_cut, random_nt)
59
+ name = '_I_I_'
60
+ end
61
+ return seq, name
62
+ end
63
+
64
+ def delections(seq, position)
65
+ first_cut = (seq.length/3).truncate
66
+ second_cut = first_cut *2
67
+ case position%3
68
+ when 0
69
+ seq.slice!(first_cut)
70
+ name='_D__'
71
+ when 1
72
+ seq.slice!(second_cut)
73
+ name='__D_'
74
+ when 2
75
+ seq.slice!(first_cut)
76
+ seq.slice!(second_cut)
77
+ name='_D_D_'
78
+ end
79
+ return seq, name
80
+ end
81
+
82
+ def mix(seq, position)
83
+ first_cut = (seq.length/3).truncate
84
+ second_cut = first_cut *2
85
+ case position%2
86
+ when 0
87
+ seq = seq.insert(first_cut, random_nt)
88
+ seq.slice!(second_cut)
89
+ name='_I_D_'
90
+ when 1
91
+ seq.slice!(first_cut)
92
+ seq = seq.insert(second_cut, random_nt)
93
+ name='_D_I_'
94
+ end
95
+ return seq, name
96
+ end
97
+
98
+ def load_utrs(utr_file)
99
+ utrs = {}
100
+ File.open(utr_file).each do |line|
101
+ line.chomp!
102
+ fields = line.split("\t")
103
+ seq_name = fields.shift
104
+ utrs[seq_name] = fields.map{|coord| coord.to_i}
105
+ end
106
+ return utrs
107
+ end
108
+ ##########################################################################################
109
+ ## OPTIONS
110
+ ##########################################################################################
111
+ options = {}
112
+ optparse = OptionParser.new do |opts|
113
+ options[:file]='samples'
114
+ opts.on( '-f', '--file FILE', 'FASTA file') do |file|
115
+ options[:file]=file
116
+ end
117
+
118
+ options[:duplicate]= 1
119
+ opts.on( '-d', '--duplicate INTEGER', 'Duplicate sequences to dataset') do |duplicate|
120
+ options[:duplicate] = duplicate.to_i
121
+ end
122
+
123
+ options[:split]= FALSE
124
+ opts.on( '-s', '--split', 'Split sequences in each case') do
125
+ options[:duplicate] = 3
126
+ end
127
+
128
+ options[:chim]= TRUE
129
+ opts.on( '-c', '--chim', 'Make sequence set of chimeras') do
130
+ options[:chim] = FALSE
131
+ end
132
+
133
+ options[:indel]= TRUE
134
+ opts.on( '-i', '--indel', 'Make sequence set of indels') do
135
+ options[:indel] = FALSE
136
+ end
137
+
138
+ options[:pair]= TRUE
139
+ opts.on( '-p', '--pair', 'Make sequence set of paired') do
140
+ options[:pair] = FALSE
141
+ end
142
+
143
+ options[:trim]= TRUE
144
+ opts.on( '-t', '--trim', 'Make sequence set of trimmed') do
145
+ options[:trim] = FALSE
146
+ end
147
+
148
+ # Set a banner, displayed at the top of the help screen.
149
+ opts.banner = "Usage: #{File.basename($0)} -f FILE \n\n"
150
+
151
+ # This displays the help screen
152
+ opts.on( '-h', '--help', 'Display this screen' ) do
153
+ puts opts
154
+ exit
155
+ end
156
+ end # End opts
157
+
158
+ # parse options and remove from ARGV
159
+ optparse.parse!
160
+
161
+ ##########################################################################################
162
+ ## MAIN
163
+ ##########################################################################################
164
# Abort early, with a failing exit status, when the input FASTA is missing.
# File.exist? is used because File.exists? is no longer available in current Ruby.
unless File.exist?(options[:file])
  puts 'File not exists'
  Process.exit(1)
end

seqs = load_fasta(options[:file])
# Named +outputs+ so the local variable does not shadow the output_files method.
outputs = output_files(options)

# Optional coordinates file: the FASTA path with its extension replaced by '.utr'.
# Only the trailing extension is removed (gsub stripped every occurrence of it
# anywhere in the path). +utrs+ stays empty when the file does not exist.
utrs = {}
if options[:trim] || options[:chim]
  file_ext = File.extname(options[:file])
  utr_file = options[:file].sub(/#{Regexp.escape(file_ext)}\z/, '') + '.utr'
  utrs = load_utrs(utr_file) if File.exist?(utr_file)
end

begin
  seqs.each_with_index do |(name, seq), index|
    coords = utrs[name] # nil when this sequence has no entry in the .utr file

    # Chimeras are built from consecutive pairs: (0,1), (2,3), ...
    partner = seqs[index + 1]
    if index.even? && partner && options[:chim]
      second_name = partner.first
      second_seq = partner.last # the partner alone; the original prepended +seq+ here
      header = ">#{name}_#{second_name}"

      # Full chimera: first sequence followed by the second one, each exactly once.
      outputs[:fullChim].puts "#{header}\n#{seq + second_seq}"

      # Fusion chimeras need coordinates for both members; skip the pair otherwise.
      second_coords = utrs[second_name]
      if coords && second_coords && !coords.empty? && !second_coords.empty?
        chim5 = seq[0..coords.last]
        chim3 = second_seq[second_coords.first..second_seq.length - 1]
        if chim5 && chim3
          outputs[:fusionChim].puts "#{header}\n#{chim5 + chim3}"
          # NOTE(review): 0..length-100 removes 99 trailing characters, while
          # 100.. removes 100 leading ones - confirm the intended amounts.
          chim5_trunc = chim5[0..chim5.length - 100]
          chim3_trunc = chim3[100..chim3.length]
          if chim5_trunc && chim3_trunc
            outputs[:fusionChimTruncate].puts "#{header}\n#{chim5_trunc + chim3_trunc}"
          end
        end
      end
    end

    if options[:trim]
      if utrs.empty?
        # NOTE(review): drops 99 leading and 100 trailing characters - confirm
        # whether 100/100 was intended.
        outputs[:cut_100_pb].puts ">#{name}\n#{seq[99..seq.length - 101]}"
      elsif coords && !coords.empty?
        trim_seq = seq[coords.first + 100..coords.last - 100]
        outputs[:cut_100_pb].puts ">#{name}\n#{trim_seq}" if trim_seq && !trim_seq.empty?
      end
    end

    if options[:pair]
      # Insert a run of 5..50 'N' around the middle of the sequence. The split
      # point is clamped at 0 so sequences shorter than the gap do not produce a
      # negative index (which made the original slices nil and raised).
      n_number = rand(5..50)
      position = [seq.length / 2 - n_number / 2, 0].max
      outputs[:paired].puts ">#{name}\n#{seq[0..position]}#{'N' * n_number}#{seq[position + 1..seq.length - 1]}"
    end
  end

  if options[:indel]
    # Work on copies because insertions/delections/mix edit strings in place.
    all_seqs = []
    options[:duplicate].times do
      all_seqs.concat(copy_seqs(seqs))
    end

    # First third of the records gets insertions, second third deletions,
    # the remainder a mix of both.
    length = all_seqs.length
    all_seqs.each_with_index do |s, i|
      case i
      when 0..length / 3 - 1
        seq, type = insertions(s.last, i)
        file = :insertions
      when length / 3..2 * length / 3 - 1
        seq, type = delections(s.last, i)
        file = :delections
      else
        seq, type = mix(s.last, i)
        file = :mix
      end
      outputs[file].puts ">#{s.first}#{type}\n#{seq}"
    end
  end
ensure
  # Always release the output handles, even if generation failed midway.
  outputs.each_value { |file| file.close }
end