full_lengther_next 0.0.8 → 0.5.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -3,142 +3,126 @@
3
3
  # 15-2-2011 Noe Fernandez-Pozo
4
4
  # Script to create your own Full-LengtherNext User database.
5
5
 
6
- require 'net/ftp'
7
-
8
- #receive one argument or fail
9
- if (ARGV.size != 2)
10
-
11
- puts "incorrect number of arguments, you need a taxonomic group like 'Coniferopsida', you can search it in 'http://www.ncbi.nlm.nih.gov/Taxonomy/'
12
- and a UniProt taxonomic group from this list:
13
-
14
- fungi
15
- human
16
- invertebrates
17
- mammals
18
- plants
19
- rodents
20
- vertebrates
21
-
22
- mode of use: ruby make_user_db.rb coniferopsida plants\n\n"
23
-
24
- Process.exit(-1);
25
- end
26
-
27
- (my_group,uniprot_group)=ARGV
28
-
29
- ################################################### Functions
30
-
31
- def filter_incomplete_seqs(output_file,file_name, my_group)
32
-
33
- puts " filtering sequences"
34
-
35
- # UniProtKB fragments with FT NON_CONS and FT NON_TER features.
36
- #
37
- # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines:
38
- # FT NON_TER 1 1
39
- # FT NON_TER 29 29
40
- # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line:
41
- # FT NON_CONS 1683 1684
42
- #
43
- # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'.
44
-
45
- newseq=false
46
- print_seq=false
47
- incomplete=false
48
- id=''
49
- description = ''
50
- organism_name = ''
51
- seq = ''
52
- organelle = ''
53
-
54
- File.open(file_name).each_line do |line|
55
- if (newseq == false)
56
- if (line =~ /^AC\s+(\w+);/)
57
- id=$1
58
- newseq = true
59
- description = ''
60
- organism_name = ''
61
- seq = ''
62
- print_seq = false
63
- incomplete = false
6
+ ROOT_PATH=File.dirname(__FILE__)
7
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
8
+
9
+ require 'cdhit'
10
+ require 'handle_db'
11
+ require 'optparse'
12
+
13
+ ##############################################################################################
14
+ ## METHODS
15
+ #############################################################################################
16
# Builds FASTA text for every entry of a tab-separated index file whose
# taxonomy column contains +taxon+.
#
# Columns read from each index line (0-based): 0, 1 and 3 are written to the
# FASTA header, 2 is a ';'-separated taxonomy list, 4 is the sequence and 5 is
# the organelle ('-' meaning none; any '-' inside the value is stripped).
#
# index        - path of the index file to scan.
# taxon        - taxon that must match one element of column 2 exactly.
# isoform_hash - Hash from accession to an already formatted isoform record,
#                or nil to skip the isoform lookup.
#
# Returns a String with the concatenated FASTA records.
def get_seqs(index, taxon, isoform_hash)
  seqs = ''
  # File.foreach closes the file when the iteration ends; the former
  # File.open(index).each left the handle open.
  File.foreach(index) do |line|
    fields = line.chomp.split("\t")
    taxonomy = fields[2]
    # Blank or truncated lines have no taxonomy column and are ignored.
    next if taxonomy.nil? || !taxonomy.split(';').include?(taxon)

    # split drops trailing empty columns, so the organelle may be missing.
    raw_organelle = fields[5]
    organelle = (raw_organelle.nil? || raw_organelle == '-') ? '' : raw_organelle.gsub('-', '')
    seqs << ">#{[fields[0], fields[1], fields[3], organelle].join(' ')}\n#{fields[4]}\n"
    next if isoform_hash.nil?

    # The accession is the first word of column 1 without its '-N' suffix.
    accid = fields[1].split(' ').first.to_s.split('-').first
    var_splice = isoform_hash[accid]
    seqs << var_splice + "\n" unless var_splice.nil?
  end
  seqs
end
106
37
 
107
- ########################################################
108
- ## MAIN
109
- ########################################################
38
+ ##########################################################################################
39
+ ## OPTIONS
40
+ ##########################################################################################
110
41
 
111
- ROOT_PATH=File.dirname(__FILE__)
42
# Command-line options of make_user_db.rb, filled in by the parser below.
options = {}

# UniProt divisions accepted by -u.
divs = %w{human fungi invertebrates mammals plants rodents vertebrates}

optparse = OptionParser.new do |opts|
  options[:uniprot_div] = nil
  # NOTE(review): the long switch is spelled '--file' although it selects a
  # UniProt division; kept as is so existing invocations keep working.
  opts.on( '-u', '--file String', 'Uniprot DBs to taxon search. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates.') do |uniprot_div|
    if !divs.include?(uniprot_div)
      puts 'This uniprot division not exists:', uniprot_div
      # Was `process.exit` (lowercase), which raised NameError instead of
      # terminating; exit with the same failure status the main section uses.
      Process.exit(-1)
    end
    options[:uniprot_div] = uniprot_div
  end

  options[:taxon] = nil
  opts.on( '-t', '--taxon STRING', 'Specific taxon to search in uniprot division. Write taxo between \'\'') do |taxon|
    options[:taxon] = taxon
  end

  # true/false literals replace the deprecated TRUE/FALSE constants.
  options[:local] = false
  opts.on( '-l', '--local', 'Only parse downloaded files without download them again') do
    options[:local] = true
  end

  options[:cdhit] = 0
  opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
    options[:cdhit] = cdhit.to_f
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: #{File.basename(__FILE__)} [options] \n\n"

  # This displays the help screen
  opts.on( '-h', '--help', 'Display this screen' ) do
    puts opts
    exit
  end

end # End opts

# parse options and remove from ARGV
optparse.parse!
134
84
 
135
- output_file = File.new(output_file_path, "w")
85
+ ########################################################
86
+ ## MAIN
87
+ ########################################################
136
88
 
137
- filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_sprot_#{uniprot_group}.dat"), my_group)
138
- filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_trembl_#{uniprot_group}.dat"), my_group)
89
# Both the taxon and the UniProt division are required to select sequences.
if options[:taxon].nil? || options[:uniprot_div].nil?
  puts 'Taxon or uniprot division was not specified'
  Process.exit(-1)
end

# File.exist? is used because File.exists? was removed in Ruby 3.2.
if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
  formatted_db_path = ENV['BLASTDB']
else # otherwise use ROOTPATH + DB
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
end

# Only the taxon part of the path has its spaces replaced; rewriting the whole
# path (as before) also altered a base directory that contains spaces.
taxon_name = options[:taxon].gsub(' ', '_')
base_folder = options[:local] ? Dir.pwd : formatted_db_path
user_db_folder = File.join(base_folder, taxon_name)
output_file_path = File.join(user_db_folder, taxon_name + ".fasta")
Dir.mkdir(user_db_folder) if !File.exist?(user_db_folder)

# Isoforms are only looked up for the sp_ index; the tr_ index is read with a
# nil isoform hash.
isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta'))
seqs = get_seqs(File.join(formatted_db_path, 'sp_' + options[:uniprot_div],"sp_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
isoform_hash = nil
seqs << get_seqs(File.join(formatted_db_path, 'tr_' + options[:uniprot_div],"tr_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)

if options[:cdhit] > 0
  # Block form closes the file even if the write fails.
  File.open(output_file_path, 'w') { |output_file| output_file.puts seqs }
  # Arguments are passed as a list so the path is never interpreted by a shell.
  # -M 0: cd-hit uses all the memory it needs. The -d option (description
  # length in the .clstr file) is avoided because it is reported as bugged.
  # NOTE(review): the identity threshold is fixed at 1 and does not use the
  # value given with --cdhit — confirm whether that is intended.
  system('cd-hit', '-i', output_file_path, '-o', "#{output_file_path}_cln", '-c', '1', '-s', '0.95', '-M', '0')
  cdhit = Cdhit.new(output_file_path, output_file_path+'_cln.clstr')
  cdhit.master_to_sp_seq
  seqs = cdhit.get_all_master
  seqs.map!{|s| s.to_s}
  seqs = seqs.join("\n")
end
do_makeblastdb(seqs, output_file_path, 'prot')

puts "make_user_db.rb has finished"
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+
4
+ #############################################
5
+ ### FUNCTIONS
6
+ #############################################
7
# Loads a summary_stats file into a Hash and adds the derived counters used by
# the plots.
#
# Every line is split on whitespace: the first field is stored as an Integer
# under the second field. Lines with fewer than two fields are skipped.
# Counters missing from the file count as 0 when the derived keys are computed
# (they previously raised NoMethodError on nil).
#
# path - path of the summary_stats file.
#
# Returns the Hash of counters, including the derived keys '<=200seqs',
# '>200seqs', '<=200unk', '>200unk', '<=200cod', '>200cod' and 'no_match_db'.
def create_fln_hash(path)
  fln_hash = {}
  # File.foreach closes the file once it has been read.
  File.foreach(path) do |line|
    fields = line.chomp.split
    next if fields.length < 2
    fln_hash[fields[1]] = fields[0].to_i
  end

  count = lambda { |key| fln_hash.fetch(key, 0) }
  fln_hash['<=200seqs'] = count['good_seqs'] - count['sequences_>200']
  fln_hash['>200seqs'] = count['sequences_>200'] - count['sequences_>500']
  fln_hash['<=200unk'] = count['unknown'] - count['unknown_>200']
  fln_hash['>200unk'] = count['unknown_>200'] - count['unknown_>500']
  fln_hash['<=200cod'] = count['coding'] - count['coding_>200']
  fln_hash['>200cod'] = count['coding_>200'] - count['coding_>500']
  fln_hash['no_match_db'] = count['coding'] + count['unknown']

  fln_hash
end
23
+
24
# Writes the data table and the gnuplot script for one histogram and runs
# gnuplot on it.
#
# fln_hash         - Hash of counters for a single sample, or an Array of such
#                    Hashes (one histogram group per element).
# output           - name of the table file; the image is '<output>.png'.
# graph_type       - gnuplot histogram style; any value containing 'clustered'
#                    is laid out with one column per sample.
# header_titles    - Array of column titles repeated once per sample.
# categories_names - space-separated String with one row label per category.
# keywords         - keys of the counters to read, in table order.
# stacked_cols     - number of columns that make up one group.
# titles           - Array with one title per sample, or nil for no titles.
#
# The script is written to 'cmd.dem' in the current directory.
def graph_table(fln_hash, output, graph_type, header_titles, categories_names, keywords, stacked_cols, titles = nil)
  table = []
  cmd = basic_plot_command(graph_type)
  cmd << "set output '#{output}.png'\n"

  if fln_hash.is_a?(Array)
    table << header(fln_hash.length, header_titles)
    table.concat(categories(categories_names))
    cmd << 'plot '
    count = 0
    fln_hash.each_with_index do |hash, i|
      table = fill_table(hash, table, keywords, stacked_cols, graph_type)
      # A missing titles list yields untitled groups instead of a nil error.
      title = titles.nil? ? '' : titles[i]
      cmd << histogram(stacked_cols, output, title, count, graph_type, i == 0)
      # gnuplot line continuation between consecutive histogram groups.
      cmd << "\\\n" if i < fln_hash.length - 1
      count += graph_type.include?('clustered') ? 1 : stacked_cols
    end
  else
    table << header(1, header_titles)
    table.concat(categories(categories_names))
    table = fill_table(fln_hash, table, keywords, stacked_cols, graph_type)
    cmd << 'plot ' + histogram(stacked_cols, output, '', 0, graph_type, true)
  end

  if table.length == 2 # Dummy row for a rowstacked graph with a single category
    table << table[1].each_index.map { |i| i == 0 ? '&' : 0 }
  end

  cmd.chop! # drop the trailing comma left by the last histogram clause
  write_table(table, output)
  write_cmd(cmd)
  system('gnuplot cmd.dem')
end
72
+
73
# Builds the gnuplot "newhistogram" clause for one group of columns.
#
# columns    - number of columns in the group.
# file       - table file name, written only in the first clause.
# name       - title of the group.
# add        - column offset of this group inside the table.
# graph_type - histogram style; values containing 'clustered' plot a single
#              column per group.
# first      - true when this is the first clause of the plot command.
#
# Returns the clause as a String ending in a comma.
def histogram(columns, file, name, add, graph_type, first)
  source, column = first ? [file, 2] : ['', 2 + add]
  clauses = ["newhistogram \"#{name}\", '#{source}' using #{column}:xtic(1) t col,"]
  unless graph_type.include?('clustered')
    (columns - 1).times do |offset|
      clauses << " '' u #{3 + offset + add} t col,"
    end
  end
  clauses.join
end
87
+
88
# Writes the gnuplot script to 'cmd.dem' in the current directory, replacing
# any previous content.
#
# cmd - String with the gnuplot commands.
def write_cmd(cmd)
  # Mode 'w' truncates an existing file, so the former delete-then-append
  # sequence (which relied on the removed File.exists?) is not needed; the
  # block form closes the file even if the write fails.
  File.open('cmd.dem', 'w') { |file| file.puts cmd }
end
96
+
97
# Builds the header row of the data table.
#
# iterations    - how many times the titles are repeated.
# header_titles - Array of column titles.
#
# Returns an Array starting with 'Clasification' followed by the titles
# repeated +iterations+ times.
def header(iterations, header_titles)
  iterations.times.each_with_object(['Clasification']) do |_, row|
    header_titles.each { |title| row << title }
  end
end
106
+
107
# Turns a space-separated list of category names into table rows.
#
# cat - String with the category names.
#
# Returns an Array with one single-element Array per name.
def categories(cat)
  cat.split(' ').each_with_object([]) { |label, rows| rows << [label] }
end
111
+
112
# Appends the counters of one sample to the rows of the data table.
#
# fln_hash     - Hash of counters; missing keys count as 0.
# table        - Array of rows (row 0 is the header); modified in place.
# keywords     - keys to read, in table order.
# stacked_cols - number of columns per group.
# graph_type   - when it contains 'clustered' the keywords go to consecutive
#                rows, otherwise each run of +stacked_cols+ keywords shares a
#                row.
#
# Returns the same table object.
def fill_table(fln_hash, table, keywords, stacked_cols, graph_type)
  finished_groups = 0
  keywords.each_with_index do |keyword, position|
    if graph_type.include?('clustered')
      target = position + 1 - stacked_cols * finished_groups
      finished_groups += 1 if ((position + 1) % stacked_cols) == 0
    else
      target = position / stacked_cols + 1
    end

    amount = fln_hash[keyword]
    amount = 0 if amount.nil?

    # A table with a single category keeps every value in its only data row.
    target = 1 if table.length == 2
    table[target] << amount
  end
  table
end
135
+
136
# Writes the data table to a file, one row per line with cells separated by a
# single space.
#
# table     - Array of rows, each an Array of cells.
# file_name - path of the file to create or overwrite.
def write_table(table, file_name)
  # Block form closes the file even if writing a row raises.
  File.open(file_name, 'w') do |file_table|
    table.each { |line| file_table.puts line.join(' ') }
  end
end
143
+
144
# Returns the gnuplot preamble shared by every plot.
#
# graph_type - histogram style written into "set style histogram"; when it
#              contains 'clustered' the key is unset, otherwise it is placed
#              under the plot.
#
# Returns a new String with one directive per line.
def basic_plot_command(graph_type)
  key_directive = graph_type.include?('clustered') ? 'unset key' : 'set key under nobox'
  directives = [
    key_directive,
    'set style data histogram',
    "set style histogram #{graph_type} title offset 2,0.25",
    'set style fill solid noborder',
    'set boxwidth 0.95',
    'unset xtics',
    'set xtics nomirror rotate by -45 scale 0',
    'set xlabel " " offset 0,-2',
    'set ylabel "Num sequences"',
    'set ytics',
    'set grid y',
    'set auto y',
    'set terminal png nocrop enhanced font arial 15 size 1000,600'
  ]
  directives.map { |directive| "#{directive}\n" }.join
end
165
+
166
+
167
+
168
# Reads a tab-separated samples file.
#
# The first column of each line is collected as a title and the second as a
# path; a missing column is simply not collected.
# NOTE(review): a line with a title but no path makes the two lists differ in
# length — confirm callers never rely on index alignment in that case.
#
# file - path of the samples file.
#
# Returns a two-element Array: [titles, paths].
def parse_file(file)
  titles = []
  paths = []
  # File.foreach closes the file when done; File.open(...).each did not.
  File.foreach(file) do |line|
    title, path = line.chomp.split("\t")
    titles << title unless title.nil?
    paths << path unless path.nil?
  end

  [titles, paths]
end
183
+
184
+
185
+ ##########################################################################################
186
+ ## OPTIONS
187
+ ##########################################################################################
188
+
189
# Command-line options of plot_fln.rb, filled in by the parser below.
options = {}

optparse = OptionParser.new do |opts|
  # -f names the samples list handed to parse_file; -p names one FLN
  # execution whose fln_results/summary_stats.txt is plotted. The two help
  # texts were previously attached to the wrong switches.
  options[:file]='samples'
  opts.on( '-f', '--file FILE', 'File listing the FLN executions to compare (one "title<TAB>path" per line)') do |file|
    options[:file]=file
  end

  options[:path] = File.join('fln_results','summary_stats.txt')
  opts.on( '-p', '--path PATH', 'Path to a single FLN execution' ) do |path|
    options[:path] = File.join(path,'fln_results','summary_stats.txt')
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: plot_fln.rb [-p PATH || -f FILE] \n\n"

  # This displays the help screen
  opts.on( '-h', '--help', 'Display this screen' ) do
    puts opts
    exit
  end

end # End opts

# parse options and remove from ARGV
optparse.parse!
215
+
216
+ ##########################################################################################
217
+ ## MAIN
218
+ ##########################################################################################
219
+
220
# Single execution: plot the summary found under --path.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
if File.exist?(options[:path])
  fln_hash = create_fln_hash(options[:path])
end

# Several executions: when the samples file exists it takes precedence and one
# set of counters is loaded per listed path.
if File.exist?(options[:file])
  titles, paths = parse_file(options[:file])
  fln_hash = paths.map do |path|
    create_fln_hash(File.join(path,'fln_results','summary_stats.txt'))
  end
end

# Without any input the plots below would fail on a nil hash; stop with a
# clear message instead.
if fln_hash.nil?
  abort("No input found: neither '#{options[:path]}' nor '#{options[:file]}' exists")
end

graph_table(
  fln_hash,
  'status_report_table',
  'rowstacked',
  %w{Sure Putative},
  'Complete N-terminal C-terminal Internal NcRNA Coding Unknown',
  %w{complete_sure complete_putative n_terminal_sure n_terminal_putative c_terminal_sure c_terminal_putative internal internal_putative ncrna ncrna_putative coding_sure coding_putative unknown unknown_putative},
  2,
  titles)

graph_table(
  fln_hash,
  'assembly_table',
  'rowstacked',
  %w{<=200nt >200nt >500nt},
  'Unigenes Coding Unknown',
  %w{<=200seqs >200seqs sequences_>500 <=200cod >200cod coding_>500 <=200unk >200unk unknown_>500},
  3,
  titles)

graph_table(
  fln_hash,
  'database_usage',
  'clustered',
  %w{seqs},
  'UserDB SwissProt TrEMBL ncRNA None Diff-orthologues Complete Diff-complete',
  %w{userdb swissprot trembl ncrna no_match_db different_orthologues complete different_completes},
  8,
  titles)

graph_table(
  fln_hash,
  'artifacts',
  'clustered',
  %w{seqs},
  'Misassembled Chimeras Other',
  %w{misassembled chimeras other_artifacts},
  3,
  titles)