full_lengther_next 0.0.8 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. data/.gemtest +0 -0
  2. data/History.txt +2 -2
  3. data/Manifest.txt +33 -18
  4. data/Rakefile +4 -2
  5. data/bin/download_fln_dbs.rb +310 -158
  6. data/bin/full_lengther_next +160 -103
  7. data/bin/make_test_dataset.rb +236 -0
  8. data/bin/make_user_db.rb +101 -117
  9. data/bin/plot_fln.rb +270 -0
  10. data/bin/plot_taxonomy.rb +70 -0
  11. data/lib/expresscanvas.zip +0 -0
  12. data/lib/full_lengther_next.rb +3 -3
  13. data/lib/full_lengther_next/classes/artifacts.rb +66 -0
  14. data/lib/full_lengther_next/classes/blast_functions.rb +326 -0
  15. data/lib/full_lengther_next/classes/cdhit.rb +154 -0
  16. data/lib/full_lengther_next/classes/chimeric_seqs.rb +315 -57
  17. data/lib/full_lengther_next/classes/common_functions.rb +105 -63
  18. data/lib/full_lengther_next/classes/exonerate_result.rb +258 -0
  19. data/lib/full_lengther_next/classes/fl_analysis.rb +226 -617
  20. data/lib/full_lengther_next/classes/fl_string_utils.rb +4 -2
  21. data/lib/full_lengther_next/classes/fln_stats.rb +598 -557
  22. data/lib/full_lengther_next/classes/handle_db.rb +30 -0
  23. data/lib/full_lengther_next/classes/my_worker.rb +308 -138
  24. data/lib/full_lengther_next/classes/my_worker_EST.rb +54 -0
  25. data/lib/full_lengther_next/classes/my_worker_manager_EST.rb +69 -0
  26. data/lib/full_lengther_next/classes/my_worker_manager_fln.rb +389 -0
  27. data/lib/full_lengther_next/classes/nc_rna.rb +5 -7
  28. data/lib/full_lengther_next/classes/reptrans.rb +210 -0
  29. data/lib/full_lengther_next/classes/sequence.rb +439 -80
  30. data/lib/full_lengther_next/classes/test_code.rb +15 -16
  31. data/lib/full_lengther_next/classes/types.rb +12 -0
  32. data/lib/full_lengther_next/classes/une_los_hit.rb +148 -230
  33. data/lib/full_lengther_next/classes/warnings.rb +40 -0
  34. metadata +207 -93
  35. data/lib/full_lengther_next/classes/lcs.rb +0 -33
  36. data/lib/full_lengther_next/classes/my_worker_manager.rb +0 -240
@@ -3,142 +3,126 @@
3
3
  # 15-2-2011 Noe Fernandez-Pozo
4
4
  # Script to create your own Full-LengtherNext User database.
5
5
 
6
- require 'net/ftp'
7
-
8
- #receive one argument or fail
9
- if (ARGV.size != 2)
10
-
11
- puts "incorrect number of arguments, you need a taxonomic group like 'Coniferopsida', you can search it in 'http://www.ncbi.nlm.nih.gov/Taxonomy/'
12
- and a UniProt taxonomic group from this list:
13
-
14
- fungi
15
- human
16
- invertebrates
17
- mammals
18
- plants
19
- rodents
20
- vertebrates
21
-
22
- mode of use: ruby make_user_db.rb coniferopsida plants\n\n"
23
-
24
- Process.exit(-1);
25
- end
26
-
27
- (my_group,uniprot_group)=ARGV
28
-
29
- ################################################### Functions
30
-
31
- def filter_incomplete_seqs(output_file,file_name, my_group)
32
-
33
- puts " filtering sequences"
34
-
35
- # UniProtKB fragments with FT NON_CONS and FT NON_TER features.
36
- #
37
- # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines:
38
- # FT NON_TER 1 1
39
- # FT NON_TER 29 29
40
- # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line:
41
- # FT NON_CONS 1683 1684
42
- #
43
- # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'.
44
-
45
- newseq=false
46
- print_seq=false
47
- incomplete=false
48
- id=''
49
- description = ''
50
- organism_name = ''
51
- seq = ''
52
- organelle = ''
53
-
54
- File.open(file_name).each_line do |line|
55
- if (newseq == false)
56
- if (line =~ /^AC\s+(\w+);/)
57
- id=$1
58
- newseq = true
59
- description = ''
60
- organism_name = ''
61
- seq = ''
62
- print_seq = false
63
- incomplete = false
6
+ ROOT_PATH=File.dirname(__FILE__)
7
+ $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))
8
+
9
+ require 'cdhit'
10
+ require 'handle_db'
11
+ require 'optparse'
12
+
13
+ ##############################################################################################
14
+ ## METHODS
15
+ #############################################################################################
16
+ def get_seqs(index, taxon, isoform_hash)
17
+ seqs = ''
18
+ File.open(index).each do |line|
19
+ line.chomp!
20
+ fields = line.split("\t")
21
+ if fields[2].split(';').include?(taxon)
22
+ if fields[5] == '-'
64
23
  organelle = ''
24
+ else
25
+ organelle = fields[5].gsub('-','')
65
26
  end
66
- else
67
- if (line =~ /^DE\s+(.+)\;*/)
68
- if (description == '')
69
- description = $1
70
- description.sub!(/RecName: Full=/,'sp=')
71
- description.sub!(/SubName: Full=/,'tr=')
72
- end
73
- if (line =~ /Flags: Fragment/)
74
- # puts "#{id} #{line}"
75
- incomplete = true
76
- end
77
- elsif (line =~ /^OS\s+(.+)/)
78
- organism_name = $1
79
- elsif (line =~ /^OG\s+(.+)/)
80
- organelle = $1
81
- elsif (line =~ /^OC\s+[\w\s\;]*#{my_group}/i) && (!incomplete)
82
- print_seq=true
83
- # puts "#{id} #{organism_name} print_seq?: #{print_seq}"
84
- elsif (line =~ /^FT\s+NON_TER\s+/)
85
- print_seq=false
86
- # puts "#{id} NON_TER"
87
- elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/)
88
- print_seq=false
89
- # puts "#{id} NON_CONS"
90
- elsif (line =~ /^\s+([\w\s]+)/)
91
- seq += $1
92
- elsif (line =~ /^\/\//)
93
- seq.gsub!(/\s*/,'')
94
- if (seq !~ /^M/i)
95
- print_seq=false
96
- end
97
- newseq = false
98
-
99
- if (print_seq)
100
- output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}"
101
- end
27
+ seqs << ">#{[fields[0], fields[1], fields[3], organelle].join(' ')}\n#{fields[4]}\n"
28
+ if !isoform_hash.nil?
29
+ accid = fields[1].split(' ').first.split('-').first
30
+ var_splice = isoform_hash[accid]
31
+ seqs << var_splice + "\n" if !var_splice.nil?
102
32
  end
103
33
  end
104
34
  end
35
+ return seqs
105
36
  end
106
37
 
107
- ########################################################
108
- ## MAIN
109
- ########################################################
38
+ ##########################################################################################
39
+ ## OPTIONS
40
+ ##########################################################################################
110
41
 
111
- ROOT_PATH=File.dirname(__FILE__)
42
+ options = {}
112
43
 
113
- # $: << File.expand_path(File.join(ROOT_PATH, "classes"))
44
+ divs = %w{human fungi invertebrates mammals plants rodents vertebrates}
114
45
 
115
- # load gem path, only to test locally
116
- # $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib')
46
+ optparse = OptionParser.new do |opts|
47
+ options[:uniprot_div] = nil
48
+ opts.on( '-u', '--file String', 'Uniprot DBs to taxon search. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates.') do |uniprot_div|
49
+ if !divs.include?(uniprot_div)
50
+ puts 'This uniprot division not exists:', uniprot_div
51
+ process.exit
52
+ end
53
+ options[:uniprot_div] = uniprot_div
54
+ end
117
55
 
118
- require 'full_lengther_next'
56
+ options[:taxon] = nil
57
+ opts.on( '-t', '--taxon STRING', 'Specific taxon to search in uniprot division. Write taxo between \'\'') do |taxon|
58
+ options[:taxon] = taxon
59
+ end
119
60
 
120
- if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
121
- formatted_db_path = ENV['BLASTDB']
122
- else # otherwise use ROOTPATH + DB
123
- formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
124
- end
61
+ options[:local] = FALSE
62
+ opts.on( '-l', '--local', 'Only parse downloaded files without download them again') do
63
+ options[:local] = TRUE
64
+ end
125
65
 
126
- ENV['BLASTDB']=formatted_db_path
66
+ options[:cdhit] = 0
67
+ opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
68
+ options[:cdhit] = cdhit.to_f
69
+ end
127
70
 
71
+ # Set a banner, displayed at the top of the help screen.
72
+ opts.banner = "Usage: #{File.basename(__FILE__)} [options] \n\n"
128
73
 
129
- if !File.exists?(File.join(formatted_db_path, my_group))
130
- Dir.mkdir(File.join(formatted_db_path,my_group))
131
- end
74
+ # This displays the help screen
75
+ opts.on( '-h', '--help', 'Display this screen' ) do
76
+ puts opts
77
+ exit
78
+ end
79
+
80
+ end # End opts
132
81
 
133
- output_file_path=File.join(formatted_db_path,my_group,my_group+".fasta")
82
+ # parse options and remove from ARGV
83
+ optparse.parse!
134
84
 
135
- output_file = File.new(output_file_path, "w")
85
+ ########################################################
86
+ ## MAIN
87
+ ########################################################
136
88
 
137
- filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_sprot_#{uniprot_group}.dat"), my_group)
138
- filter_incomplete_seqs(output_file, File.join(formatted_db_path, "uniprot_trembl_#{uniprot_group}.dat"), my_group)
89
+ if options[:taxon].nil? || options[:uniprot_div].nil?
90
+ puts 'Taxon or uniprot division was not specified'
91
+ Process.exit(-1)
92
+ end
139
93
 
140
- output_file.close
94
+ if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB'])
95
+ formatted_db_path = ENV['BLASTDB']
96
+ else # otherwise use ROOTPATH + DB
97
+ formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
98
+ end
141
99
 
142
- `makeblastdb -in #{output_file_path} -dbtype 'prot' -parse_seqids`
100
+ if !options[:local]
101
+ user_db_folder = File.join(formatted_db_path, options[:taxon])
102
+ else
103
+ user_db_folder = File.join(Dir.pwd, options[:taxon])
104
+ end
105
+ output_file_path = File.join(user_db_folder, options[:taxon]+".fasta")
106
+ user_db_folder.gsub!(' ', '_')
107
+ output_file_path.gsub!(' ', '_')
108
+ Dir.mkdir(user_db_folder) if !File.exists?(user_db_folder)
109
+
110
+ isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta'))
111
+ seqs = get_seqs(File.join(formatted_db_path, 'sp_' + options[:uniprot_div],"sp_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
112
+ isoform_hash = nil
113
+ seqs << get_seqs(File.join(formatted_db_path, 'tr_' + options[:uniprot_div],"tr_#{options[:uniprot_div]}.index"), options[:taxon], isoform_hash)
114
+
115
+ if options[:cdhit] > 0
116
+ output_file = File.open(output_file_path, 'w')
117
+ output_file.puts seqs
118
+ output_file.close
119
+ system("cd-hit -i #{output_file_path} -o #{output_file_path}_cln -c 1 -s 0.95 -M 0") #-d length of description in .clstr file, default 20 if set to 0, it takes the fasta defline and stops at first space (BUGGED OPTION) -M 0 cd-hit uses all memory that it needs
120
+ cdhit = Cdhit.new(output_file_path, output_file_path+'_cln.clstr')
121
+ cdhit.master_to_sp_seq
122
+ seqs = cdhit.get_all_master
123
+ seqs.map!{|s| s.to_s}
124
+ seqs = seqs.join("\n")
125
+ end
126
+ do_makeblastdb(seqs, output_file_path, 'prot')
143
127
 
144
128
  puts "make_user_db.rb has finished"
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env ruby
2
+ require 'optparse'
3
+
4
+ #############################################
5
+ ### FUNCTIONS
6
+ #############################################
7
# Loads an FLN summary stats file into a Hash of counters.
#
# Every line is split on whitespace: the first field is stored as an Integer
# under the second field (the counter name). Lines with fewer than two fields
# are skipped. Seven derived counters are then added:
# '<=200seqs', '>200seqs', '<=200unk', '>200unk', '<=200cod', '>200cod'
# and 'no_match_db'. A counter that is absent from the file counts as 0 when
# the derived values are computed.
#
# path - path of the stats file to read.
#
# Returns a Hash mapping counter name (String) to Integer.
def create_fln_hash(path)
  fln_hash = {}
  # File.foreach closes the file once the block has finished.
  File.foreach(path) do |line|
    count, name = line.split
    next if name.nil? # blank or single-column line

    fln_hash[name] = count.to_i
  end

  # Missing counters are treated as 0 instead of raising on nil arithmetic.
  stat = ->(key) { fln_hash.fetch(key, 0) }

  fln_hash['<=200seqs'] = stat.call('good_seqs') - stat.call('sequences_>200')
  fln_hash['>200seqs'] = stat.call('sequences_>200') - stat.call('sequences_>500')
  fln_hash['<=200unk'] = stat.call('unknown') - stat.call('unknown_>200')
  fln_hash['>200unk'] = stat.call('unknown_>200') - stat.call('unknown_>500')
  fln_hash['<=200cod'] = stat.call('coding') - stat.call('coding_>200')
  fln_hash['>200cod'] = stat.call('coding_>200') - stat.call('coding_>500')
  fln_hash['no_match_db'] = stat.call('coding') + stat.call('unknown')

  fln_hash
end
23
+
24
# Builds a gnuplot data table and command script for one histogram and runs
# gnuplot on it.
#
# The data table is written to the file named +output+, the plot commands to
# 'cmd.dem', and the script asks gnuplot to render "<output>.png".
#
# fln_hash         - one stats Hash, or an Array of stats Hashes (one per sample).
# output           - name of the data file; also the PNG base name.
# graph_type       - gnuplot histogram style; a value containing 'clustered'
#                    uses one column per sample, anything else uses
#                    +stacked_cols+ columns per sample.
# header_titles    - Array of column titles, repeated once per sample.
# categories_names - space-separated String of row labels.
# keywords         - Array of stats keys used to fill the table.
# stacked_cols     - number of keywords per category row.
# titles           - optional Array of sample titles, used only when +fln_hash+
#                    is an Array; missing titles are rendered as empty.
#
# Returns the result of the `system('gnuplot cmd.dem')` call.
def graph_table(fln_hash, output, graph_type, header_titles, categories_names, keywords, stacked_cols, titles = nil)
  multi = fln_hash.is_a?(Array)
  samples = multi ? fln_hash : [fln_hash]
  # Column offset between consecutive samples in the data table.
  column_step = graph_type.include?('clustered') ? 1 : stacked_cols

  cmd = basic_plot_command(graph_type)
  cmd << "set output '#{output}.png'\n"
  cmd << 'plot '

  table = [header(samples.length, header_titles)]
  table.concat(categories(categories_names))

  samples.each_with_index do |sample, i|
    table = fill_table(sample, table, keywords, stacked_cols, graph_type)
    # A single (non-Array) sample is always plotted without a title.
    name = multi ? Array(titles)[i] : ''
    cmd << histogram(stacked_cols, output, name, i * column_step, graph_type, i.zero?)
    # Backslash-newline continues the gnuplot plot command on the next line.
    cmd << "\\\n" if i < samples.length - 1
  end

  # Dummy all-zero row so a rowstacked graph with a single category still plots.
  if table.length == 2
    table << table[1].each_index.map { |i| i.zero? ? '&' : 0 }
  end

  # Drop the trailing comma left by the last histogram clause.
  cmd.chop!
  write_table(table, output)
  write_cmd(cmd)
  system('gnuplot cmd.dem')
end
72
+
73
# Builds the gnuplot plot clause(s) for one sample of a histogram.
#
# columns    - number of stacked columns belonging to the sample.
# file       - data file name; only written into the clause when +first+ is true
#              (later clauses use '' to reuse the previous file).
# name       - title given to `newhistogram`.
# add        - column offset of this sample inside the data table.
# graph_type - histogram style; when it contains 'clustered' only the first
#              column clause is emitted.
# first      - true for the first sample of the plot command.
#
# Returns a String in which every clause ends with a comma.
def histogram(columns, file, name, add, graph_type, first)
  source = first ? file : ''
  first_column = first ? 2 : 2 + add
  plot = "newhistogram \"#{name}\", '#{source}' using #{first_column}:xtic(1) t col,"
  return plot if graph_type.include?('clustered')

  (columns - 1).times { |offset| plot << " '' u #{3 + offset + add} t col," }
  plot
end
87
+
88
# Writes the gnuplot command script to 'cmd.dem' in the current directory,
# replacing any previous content.
#
# cmd - String with the gnuplot commands.
#
# Returns nil.
def write_cmd(cmd)
  # Mode 'w' truncates an existing file, so no separate delete is needed, and
  # the block form closes the handle even if the write raises.
  File.open('cmd.dem', 'w') { |file| file.puts cmd }
  nil
end
96
+
97
# Builds the header row of the data table.
#
# iterations    - how many times the titles are repeated (one per sample).
# header_titles - Array of column titles.
#
# Returns an Array starting with 'Clasification' followed by the titles
# repeated +iterations+ times.
def header(iterations, header_titles)
  ['Clasification'] + header_titles.to_a * iterations
end
106
+
107
# Turns a space-separated list of category names into table rows.
#
# cat - String of names separated by spaces.
#
# Returns an Array of one-element Arrays, one per name.
def categories(cat)
  # each_slice(1) wraps every name in its own single-element row.
  cat.split(' ').each_slice(1).to_a
end
111
+
112
# Appends the values of one sample to the category rows of the table.
#
# Row 0 of +table+ is the header; the following rows are categories. Each
# keyword is looked up in +fln_hash+ (a missing value counts as 0) and
# appended to a row chosen as follows:
# - table with exactly two rows (header + one category): always row 1;
# - graph_type containing 'clustered': row (i % stacked_cols) + 1;
# - otherwise: row (i / stacked_cols) + 1,
# where i is the keyword's index.
#
# fln_hash     - stats Hash for one sample.
# table        - Array of rows; modified in place.
# keywords     - Array of keys to read from +fln_hash+.
# stacked_cols - number of keywords per group.
# graph_type   - histogram style String.
#
# Returns the same +table+ object.
def fill_table(fln_hash, table, keywords, stacked_cols, graph_type)
  # Both conditions are constant during the loop: rows grow, the table does not.
  clustered = graph_type.include?('clustered')
  single_row = table.length == 2

  keywords.each_with_index do |key, i|
    value = fln_hash[key]
    value = 0 if value.nil?

    row = if single_row
            1
          elsif clustered
            i % stacked_cols + 1
          else
            i / stacked_cols + 1
          end
    table[row] << value
  end
  table
end
135
+
136
# Writes the data table to a file, one row per line with cells separated by
# a single space. Any existing file is overwritten.
#
# table     - Array of rows (each row an Array of cells).
# file_name - path of the file to write.
#
# Returns nil.
def write_table(table, file_name)
  # Block form guarantees the handle is closed even if a row fails to join.
  File.open(file_name, 'w') do |file_table|
    table.each { |row| file_table.puts row.join(' ') }
  end
  nil
end
143
+
144
# Builds the gnuplot preamble shared by every histogram.
#
# graph_type - histogram style String; it is inserted into the
#              `set style histogram` line, and a value containing 'clustered'
#              disables the key instead of placing it under the plot.
#
# Returns a new String of newline-terminated gnuplot commands.
def basic_plot_command(graph_type)
  key_setting = graph_type.include?('clustered') ? 'unset key' : 'set key under nobox'
  settings = [
    key_setting,
    'set style data histogram',
    "set style histogram #{graph_type} title offset 2,0.25",
    'set style fill solid noborder',
    'set boxwidth 0.95',
    'unset xtics',
    'set xtics nomirror rotate by -45 scale 0',
    'set xlabel " " offset 0,-2',
    'set ylabel "Num sequences"',
    'set ytics',
    'set grid y',
    'set auto y',
    'set terminal png nocrop enhanced font arial 15 size 1000,600'
  ]
  settings.map { |setting| "#{setting}\n" }.join
end
165
+
166
+
167
+
168
# Reads a tab-separated sample list.
#
# Each line is expected to hold a title and a path separated by a tab. Lines
# that lack either field (including blank lines) are skipped, so the two
# returned arrays always have the same length and matching positions.
#
# file - path of the sample list to read.
#
# Returns a two-element Array: [titles, paths].
def parse_file(file)
  titles = []
  paths = []
  # File.foreach closes the file once the block has finished.
  File.foreach(file) do |line|
    title, path = line.chomp.split("\t")
    # Keep titles and paths index-aligned: a title without a path is dropped.
    next if title.nil? || path.nil?

    titles << title
    paths << path
  end

  [titles, paths]
end
183
+
184
+
185
+ ##########################################################################################
186
+ ## OPTIONS
187
+ ##########################################################################################
188
+
189
options = {}

optparse = OptionParser.new do |opts|
  # Default sample list: a file named 'samples' in the working directory.
  options[:file] = 'samples'
  opts.on('-f', '--file FILE', 'Tab-separated file with one "title<TAB>path to FLN execution" per line') do |file|
    options[:file] = file
  end

  # Default stats file: fln_results/summary_stats.txt under the working directory.
  options[:path] = File.join('fln_results', 'summary_stats.txt')
  opts.on('-p', '--path PATH', 'Path to a single FLN execution (the folder that contains fln_results)') do |path|
    options[:path] = File.join(path, 'fln_results', 'summary_stats.txt')
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: plot_fln.rb [-p PATH || -f FILE] \n\n"

  # This displays the help screen
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end # End opts

# parse options and remove from ARGV
optparse.parse!
215
+
216
+ ##########################################################################################
217
+ ## MAIN
218
+ ##########################################################################################
219
+
220
# Single-execution mode: stats of one FLN run.
titles = nil
fln_hash = create_fln_hash(options[:path]) if File.exist?(options[:path])

# Multi-sample mode: when the sample list exists it takes precedence and
# fln_hash becomes an Array with one stats Hash per listed path.
if File.exist?(options[:file])
  titles, paths = parse_file(options[:file])
  fln_hash = paths.map do |path|
    create_fln_hash(File.join(path, 'fln_results', 'summary_stats.txt'))
  end
end

# Without any input there is nothing to plot; stop with a clear message
# instead of failing later inside graph_table.
abort "No FLN results found: neither '#{options[:path]}' nor '#{options[:file]}' exists" if fln_hash.nil?

graph_table(
  fln_hash,
  'status_report_table',
  'rowstacked',
  %w{Sure Putative},
  'Complete N-terminal C-terminal Internal NcRNA Coding Unknown',
  %w{complete_sure complete_putative n_terminal_sure n_terminal_putative c_terminal_sure c_terminal_putative internal internal_putative ncrna ncrna_putative coding_sure coding_putative unknown unknown_putative},
  2,
  titles)

graph_table(
  fln_hash,
  'assembly_table',
  'rowstacked',
  %w{<=200nt >200nt >500nt},
  'Unigenes Coding Unknown',
  %w{<=200seqs >200seqs sequences_>500 <=200cod >200cod coding_>500 <=200unk >200unk unknown_>500},
  3,
  titles)

graph_table(
  fln_hash,
  'database_usage',
  'clustered',
  %w{seqs},
  'UserDB SwissProt TrEMBL ncRNA None Diff-orthologues Complete Diff-complete',
  %w{userdb swissprot trembl ncrna no_match_db different_orthologues complete different_completes},
  8,
  titles)

graph_table(
  fln_hash,
  'artifacts',
  'clustered',
  %w{seqs},
  'Misassembled Chimeras Other',
  %w{misassembled chimeras other_artifacts},
  3,
  titles)