full_lengther_next 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/bin/download_fln_dbs.rb +23 -23
- data/bin/full_lengther_next +11 -11
- data/bin/make_test_dataset.rb +9 -9
- data/bin/make_user_db.rb +2 -2
- data/full_lengther_next.gemspec +2 -1
- data/lib/full_lengther_next/artifacts.rb +7 -7
- data/lib/full_lengther_next/bio_patch.rb +93 -0
- data/lib/full_lengther_next/blast_functions.rb +17 -17
- data/lib/full_lengther_next/cdhit.rb +5 -5
- data/lib/full_lengther_next/chimeric_seqs.rb +5 -5
- data/lib/full_lengther_next/common_functions.rb +1 -1
- data/lib/full_lengther_next/exonerate_result.rb +5 -5
- data/lib/full_lengther_next/fl_analysis.rb +6 -6
- data/lib/full_lengther_next/fln_stats.rb +2 -2
- data/lib/full_lengther_next/handle_db.rb +1 -1
- data/lib/full_lengther_next/my_worker.rb +6 -6
- data/lib/full_lengther_next/my_worker_EST.rb +1 -1
- data/lib/full_lengther_next/my_worker_manager_fln.rb +1 -1
- data/lib/full_lengther_next/sequence.rb +9 -9
- data/lib/full_lengther_next/une_los_hit.rb +5 -5
- data/lib/full_lengther_next/version.rb +1 -1
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e6d28d54912b46305ba0047c8458469afffe6660189cb1950c1d30290982e2c4
|
4
|
+
data.tar.gz: fca1a71701c8b1c763102623b6fc60d7699b697ebd636ce53f14886d07fb35f4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c1398e4d8448f10550769e4f6cb7baa2492337f377fab2f33b3de7067213f1d45ceae1a717de69c59c113c80d585d85bd7d4f1b0d6ed236fb0a5c8d7ac244f3e
|
7
|
+
data.tar.gz: 85c5238d3569e17509135e97383e748183fadc00c2c9eca9f984492b12ba4b4bd22e908eaae755162e4070c5eda8df1fcdffdbdb794c216537a0304b7bbe45bd
|
data/bin/download_fln_dbs.rb
CHANGED
@@ -10,6 +10,7 @@ require 'scbi_zcat'
|
|
10
10
|
require 'optparse'
|
11
11
|
require 'cdhit'
|
12
12
|
require 'handle_db'
|
13
|
+
require 'bio_patch'
|
13
14
|
|
14
15
|
##############################################################################################
|
15
16
|
## METHODS
|
@@ -69,10 +70,10 @@ def filtering_seqs(fasta_file, max_length, black_list)
|
|
69
70
|
end
|
70
71
|
|
71
72
|
def compare_list(string, list)
|
72
|
-
res =
|
73
|
+
res = false
|
73
74
|
list.each do |word|
|
74
75
|
if string.include?(word)
|
75
|
-
res =
|
76
|
+
res = true
|
76
77
|
break
|
77
78
|
end
|
78
79
|
end
|
@@ -139,13 +140,13 @@ def filter_and_makeDB(formatted_db_path, dbtype, db_group, isoform_hash, prefix,
|
|
139
140
|
end
|
140
141
|
|
141
142
|
def complete?(uniprot_record)
|
142
|
-
complete =
|
143
|
+
complete = true
|
143
144
|
if uniprot_record.description.include?('Flags: Fragment') || #Discard non full length records
|
144
145
|
uniprot_record.seq[0] != 'M' ||
|
145
146
|
uniprot_record.seq.include?('XX') ||
|
146
147
|
uniprot_record.ft.keys.include?('NON_TER') ||# The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key
|
147
148
|
uniprot_record.ft.keys.include?('NON_CONS') # Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them
|
148
|
-
complete =
|
149
|
+
complete = false
|
149
150
|
end
|
150
151
|
return complete
|
151
152
|
end
|
@@ -305,29 +306,29 @@ optparse = OptionParser.new do |opts|
|
|
305
306
|
end
|
306
307
|
end
|
307
308
|
|
308
|
-
options[:no_download] =
|
309
|
+
options[:no_download] = false
|
309
310
|
opts.on( '-d', '--no_download', 'Only parse downloaded files without download them again') do
|
310
|
-
options[:no_download] =
|
311
|
+
options[:no_download] = true
|
311
312
|
end
|
312
313
|
|
313
|
-
options[:no_ncrna] =
|
314
|
+
options[:no_ncrna] = false
|
314
315
|
opts.on( '-n', '--no_ncrna', 'No use ncrna sequences') do
|
315
|
-
options[:no_ncrna] =
|
316
|
+
options[:no_ncrna] = true
|
316
317
|
end
|
317
318
|
|
318
|
-
options[:only_index] =
|
319
|
+
options[:only_index] = false
|
319
320
|
opts.on( '-i', '--only_index', 'Build annotation index only without do blast DB') do
|
320
|
-
options[:only_index] =
|
321
|
+
options[:only_index] = true
|
321
322
|
end
|
322
323
|
|
323
|
-
options[:no_trembl] =
|
324
|
+
options[:no_trembl] = false
|
324
325
|
opts.on( '-t', '--no_trembl', 'No use trembl sequences') do
|
325
|
-
options[:no_trembl] =
|
326
|
+
options[:no_trembl] = true
|
326
327
|
end
|
327
328
|
|
328
|
-
options[:all] =
|
329
|
+
options[:all] = false
|
329
330
|
opts.on( '-a', '--all_sequences', 'Generate databases with all sequences') do
|
330
|
-
options[:all] =
|
331
|
+
options[:all] = true
|
331
332
|
end
|
332
333
|
|
333
334
|
options[:cdhit] = 0
|
@@ -335,14 +336,14 @@ optparse = OptionParser.new do |opts|
|
|
335
336
|
options[:cdhit] = cdhit.to_f
|
336
337
|
end
|
337
338
|
|
338
|
-
options[:no_uniprot] =
|
339
|
+
options[:no_uniprot] = false
|
339
340
|
opts.on( '-p', '--no_uniprot', 'No use uniprot sequences') do
|
340
|
-
options[:no_uniprot] =
|
341
|
+
options[:no_uniprot] = true
|
341
342
|
end
|
342
343
|
|
343
|
-
options[:passive_ftp] =
|
344
|
+
options[:passive_ftp] = false
|
344
345
|
opts.on( '-P', '--passive_ftp', 'Use pasive ftp') do
|
345
|
-
options[:passive_ftp] =
|
346
|
+
options[:passive_ftp] = true
|
346
347
|
end
|
347
348
|
|
348
349
|
# Set a banner, displayed at the top of the help screen.
|
@@ -364,19 +365,18 @@ optparse.parse!
|
|
364
365
|
## MAIN
|
365
366
|
##############################################################################################
|
366
367
|
|
367
|
-
|
368
|
-
|
369
|
-
formatted_db_path = ENV['BLASTDB']
|
368
|
+
if !ENV['BLASTDB'].nil?
|
369
|
+
formatted_db_path = File.expand_path(ENV['BLASTDB'])
|
370
370
|
else # otherwise use ROOTPATH + DB
|
371
371
|
formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
|
372
|
-
Dir.mkdir(formatted_db_path)
|
373
372
|
end
|
373
|
+
Dir.mkdir(formatted_db_path) if !File.exists?(formatted_db_path)
|
374
374
|
|
375
375
|
|
376
376
|
ENV['BLASTDB'] = formatted_db_path
|
377
377
|
puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
|
378
378
|
puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n"
|
379
|
-
|
379
|
+
puts "Patched? #{Bio::UniProtKB.patched?}"
|
380
380
|
download_ncrna(formatted_db_path, options[:no_download]) if !options[:no_ncrna]
|
381
381
|
|
382
382
|
if !options[:no_download]
|
data/bin/full_lengther_next
CHANGED
@@ -86,9 +86,9 @@ optparse = OptionParser.new do |opts|
|
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
89
|
-
options[:exonerate] =
|
89
|
+
options[:exonerate] = true
|
90
90
|
opts.on( '-e', '--exonerate', 'Disables exonerate analysis' ) do |exonerate|
|
91
|
-
options[:exonerate] =
|
91
|
+
options[:exonerate] = false
|
92
92
|
end
|
93
93
|
|
94
94
|
options[:fasta] = nil
|
@@ -106,9 +106,9 @@ optparse = OptionParser.new do |opts|
|
|
106
106
|
options[:ident] = ident.to_f
|
107
107
|
end
|
108
108
|
|
109
|
-
options[:high_clustering] =
|
109
|
+
options[:high_clustering] = false
|
110
110
|
opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default false' ) do
|
111
|
-
options[:high_clustering] =
|
111
|
+
options[:high_clustering] = true
|
112
112
|
end
|
113
113
|
|
114
114
|
options[:subject_coverage] = 0.25
|
@@ -165,7 +165,7 @@ optparse = OptionParser.new do |opts|
|
|
165
165
|
options[:user_db] = nil
|
166
166
|
opts.on( '-u', '--user_db UserDB', 'User blast+ database' ) do |db|
|
167
167
|
options[:user_db] = db
|
168
|
-
if
|
168
|
+
if Dir.glob(File.expand_path(db+'*.psq')).empty?
|
169
169
|
puts "user database: #{options[:user_db]} was not found"
|
170
170
|
exit
|
171
171
|
end
|
@@ -196,9 +196,9 @@ optparse = OptionParser.new do |opts|
|
|
196
196
|
options[:training_ident] = ident.to_f
|
197
197
|
end
|
198
198
|
|
199
|
-
options[:hdd] =
|
199
|
+
options[:hdd] = false
|
200
200
|
opts.on( '-z', '--hdd', 'Write/use blast report on HDD' ) do |hdd|
|
201
|
-
options[:hdd] =
|
201
|
+
options[:hdd] = true
|
202
202
|
end
|
203
203
|
|
204
204
|
|
@@ -207,9 +207,9 @@ optparse = OptionParser.new do |opts|
|
|
207
207
|
options[:files2map] = files2map.split(';').map{|map_files| map_files.split(',')}
|
208
208
|
end
|
209
209
|
|
210
|
-
options[:remove_unmapped] =
|
210
|
+
options[:remove_unmapped] = true
|
211
211
|
opts.on('-R', '--remove_unmapped', 'When fastq files are provided, all sequences without at least a read pair are removed. When this option is enabled this filtering is disabled' ) do
|
212
|
-
options[:remove_unmapped] =
|
212
|
+
options[:remove_unmapped] = false
|
213
213
|
end
|
214
214
|
|
215
215
|
# Set a banner, displayed at the top of the help screen.
|
@@ -268,8 +268,8 @@ if !File.exists?(ncrna_path) && options[:acess_db].include?('c')
|
|
268
268
|
end
|
269
269
|
|
270
270
|
if options[:acess_db].include?('s') || options[:acess_db].include?('t')
|
271
|
-
sp_path=File.join(ENV['BLASTDB'],"sp_#{options[:tax_group]}","sp_#{options[:tax_group]}
|
272
|
-
if
|
271
|
+
sp_path=File.join(ENV['BLASTDB'],"sp_#{options[:tax_group]}","sp_#{options[:tax_group]}*.psq")
|
272
|
+
if Dir.glob(sp_path).empty?
|
273
273
|
puts "DB File #{sp_path} doesn't exists, or"
|
274
274
|
puts "incorrect taxon group name: #{options[:tax_group]} choose:"
|
275
275
|
puts optparse.help
|
data/bin/make_test_dataset.rb
CHANGED
@@ -120,29 +120,29 @@ optparse = OptionParser.new do |opts|
|
|
120
120
|
options[:duplicate] = duplicate.to_i
|
121
121
|
end
|
122
122
|
|
123
|
-
options[:split]=
|
123
|
+
options[:split]= false
|
124
124
|
opts.on( '-s', '--split', 'Split sequences in each case') do
|
125
125
|
options[:duplicate] = 3
|
126
126
|
end
|
127
127
|
|
128
|
-
options[:chim]=
|
128
|
+
options[:chim]= true
|
129
129
|
opts.on( '-c', '--chim', 'Make sequence set of chimeras') do
|
130
|
-
options[:chim] =
|
130
|
+
options[:chim] = false
|
131
131
|
end
|
132
132
|
|
133
|
-
options[:indel]=
|
133
|
+
options[:indel]= true
|
134
134
|
opts.on( '-i', '--indel', 'Make sequence set of indels') do
|
135
|
-
options[:indel] =
|
135
|
+
options[:indel] = false
|
136
136
|
end
|
137
137
|
|
138
|
-
options[:pair]=
|
138
|
+
options[:pair]= true
|
139
139
|
opts.on( '-p', '--pair', 'Make sequence set of paired') do
|
140
|
-
options[:pair] =
|
140
|
+
options[:pair] = false
|
141
141
|
end
|
142
142
|
|
143
|
-
options[:trim]=
|
143
|
+
options[:trim]= true
|
144
144
|
opts.on( '-t', '--trim', 'Make sequence set of trimmed') do
|
145
|
-
options[:trim] =
|
145
|
+
options[:trim] = false
|
146
146
|
end
|
147
147
|
|
148
148
|
# Set a banner, displayed at the top of the help screen.
|
data/bin/make_user_db.rb
CHANGED
@@ -63,9 +63,9 @@ optparse = OptionParser.new do |opts|
|
|
63
63
|
options[:name] = name
|
64
64
|
end
|
65
65
|
|
66
|
-
options[:local] =
|
66
|
+
options[:local] = false
|
67
67
|
opts.on( '-l', '--local', 'Only parse downloaded files without download them again') do
|
68
|
-
options[:local] =
|
68
|
+
options[:local] = true
|
69
69
|
end
|
70
70
|
|
71
71
|
options[:user_fasta] = nil
|
data/full_lengther_next.gemspec
CHANGED
@@ -32,7 +32,8 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_runtime_dependency 'scbi_blast'
|
33
33
|
spec.add_runtime_dependency 'scbi_mapreduce'
|
34
34
|
spec.add_runtime_dependency 'scbi_zcat'
|
35
|
-
spec.add_runtime_dependency 'bio
|
35
|
+
spec.add_runtime_dependency 'bio'
|
36
|
+
#spec.add_runtime_dependency 'bio-cd-hit-report' # Removed due to conflicts with bio-ruby2. This gem depends on bio ruby 1.4.3. cdhit options disabled
|
36
37
|
spec.add_runtime_dependency 'report_html'
|
37
38
|
|
38
39
|
|
@@ -7,11 +7,11 @@ include ChimericSeqs
|
|
7
7
|
## MAIN FUNCTION
|
8
8
|
#####################################################################
|
9
9
|
def artifact?(seq, query, db_name, db_path, options, new_seqs)
|
10
|
-
artifact =
|
10
|
+
artifact = false
|
11
11
|
# UNMAPPED CONTIG DETECTION
|
12
12
|
if query.nil? && seq.unmapped? #If seq is misassembled stop chimera analisys
|
13
13
|
seq.hit = nil
|
14
|
-
artifact =
|
14
|
+
artifact = true
|
15
15
|
seq.type = UNMAPPED
|
16
16
|
end
|
17
17
|
|
@@ -19,7 +19,7 @@ def artifact?(seq, query, db_name, db_path, options, new_seqs)
|
|
19
19
|
# MISASSEMBLED DETECTION
|
20
20
|
if !artifact && misassembled_detection(query) #If seq is misassembled stop chimera analisys
|
21
21
|
seq.hit = query.hits.first
|
22
|
-
artifact =
|
22
|
+
artifact = true
|
23
23
|
seq.type = MISASSEMBLED
|
24
24
|
seq.warnings('ERROR#1')
|
25
25
|
end
|
@@ -35,7 +35,7 @@ def artifact?(seq, query, db_name, db_path, options, new_seqs)
|
|
35
35
|
else
|
36
36
|
seq.hit = query.hits.first
|
37
37
|
end
|
38
|
-
artifact =
|
38
|
+
artifact = true
|
39
39
|
seq.type = OTHER
|
40
40
|
seq.warnings('ERROR#2')
|
41
41
|
end
|
@@ -55,7 +55,7 @@ def artifact?(seq, query, db_name, db_path, options, new_seqs)
|
|
55
55
|
new_seqs.concat(chimera)
|
56
56
|
seq.db_name = db_name
|
57
57
|
seq.type = CHIMERA
|
58
|
-
artifact =
|
58
|
+
artifact = true
|
59
59
|
end
|
60
60
|
end
|
61
61
|
end
|
@@ -64,8 +64,8 @@ def artifact?(seq, query, db_name, db_path, options, new_seqs)
|
|
64
64
|
puts seq.prot_annot_calification
|
65
65
|
end
|
66
66
|
seq.db_name = db_name
|
67
|
-
seq.save_fasta =
|
68
|
-
seq.ignore =
|
67
|
+
seq.save_fasta = false
|
68
|
+
seq.ignore = true
|
69
69
|
end
|
70
70
|
return artifact
|
71
71
|
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
module Bio
|
2
|
+
class UniProtKB
|
3
|
+
def self.patched?
|
4
|
+
return true
|
5
|
+
end
|
6
|
+
|
7
|
+
def ft(feature_key = nil)
|
8
|
+
return ft[feature_key] if feature_key
|
9
|
+
return @data['FT'] if @data['FT']
|
10
|
+
|
11
|
+
table = []
|
12
|
+
begin
|
13
|
+
get('FT').split("\n").each do |line|
|
14
|
+
if line =~ /^FT \w/
|
15
|
+
feature = line.chomp.ljust(74)
|
16
|
+
table << [feature[ 5..12].strip, # Feature Name
|
17
|
+
feature[14..19].strip, # From
|
18
|
+
feature[21..26].strip, # To
|
19
|
+
feature[34..74].strip ] # Description
|
20
|
+
else
|
21
|
+
table.last << line.chomp.sub!(/^FT +/, '')
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# Joining Description lines
|
26
|
+
table = table.map { |feature|
|
27
|
+
ftid = feature.pop if feature.last =~ /FTId=/
|
28
|
+
if feature.size > 4
|
29
|
+
feature = [feature[0],
|
30
|
+
feature[1],
|
31
|
+
feature[2],
|
32
|
+
feature[3, feature.size - 3].join(" ")]
|
33
|
+
end
|
34
|
+
feature << if ftid then ftid else '' end
|
35
|
+
}
|
36
|
+
|
37
|
+
###### PATCH TO RECOVER PARSER
|
38
|
+
to_delete = []
|
39
|
+
table.each_with_index do |feature, i|
|
40
|
+
name, from, to, descrition = feature
|
41
|
+
if from.empty?
|
42
|
+
coors = to.split("..")
|
43
|
+
if coors.length == 2
|
44
|
+
feature[1] = coors[0]
|
45
|
+
feature[2] = coors[1]
|
46
|
+
elsif /[^\d]/ =~ to
|
47
|
+
to_delete << i
|
48
|
+
else
|
49
|
+
feature[1] = to
|
50
|
+
feature[2] = to
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
to_delete.reverse_each{|i| table.delete_at(i)}
|
55
|
+
#####
|
56
|
+
|
57
|
+
hash = {}
|
58
|
+
table.each do |feature|
|
59
|
+
hash[feature[0]] = [] unless hash[feature[0]]
|
60
|
+
hash[feature[0]] << {
|
61
|
+
# Removing '<', '>' or '?' in FROM/TO endopoint.
|
62
|
+
'From' => feature[1].sub(/\D/, '').to_i,
|
63
|
+
'To' => feature[2].sub(/\D/, '').to_i,
|
64
|
+
'Description' => feature[3],
|
65
|
+
'FTId' => feature[4].to_s.sub(/\/FTId=/, '').sub(/\.$/, ''),
|
66
|
+
'diff' => [],
|
67
|
+
'original' => feature
|
68
|
+
}
|
69
|
+
|
70
|
+
case feature[0]
|
71
|
+
when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
|
72
|
+
case hash[feature[0]].last['Description']
|
73
|
+
when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
|
74
|
+
original_res = $1
|
75
|
+
changed_res = $2
|
76
|
+
original_res = original_res.gsub(/ /,'').strip
|
77
|
+
chenged_res = changed_res.gsub(/ /,'').strip
|
78
|
+
when /Missing/i
|
79
|
+
original_res = seq.subseq(hash[feature[0]].last['From'],
|
80
|
+
hash[feature[0]].last['To'])
|
81
|
+
changed_res = ''
|
82
|
+
end
|
83
|
+
hash[feature[0]].last['diff'] = [original_res, chenged_res]
|
84
|
+
end
|
85
|
+
end
|
86
|
+
rescue
|
87
|
+
raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
|
88
|
+
end
|
89
|
+
|
90
|
+
@data['FT'] = hash
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
@@ -105,31 +105,31 @@ def set_thresold_evalue(hits)
|
|
105
105
|
end
|
106
106
|
|
107
107
|
def same_subject_hsp(hit, second_hit)
|
108
|
-
same =
|
108
|
+
same = false
|
109
109
|
if hit.acc == second_hit.acc
|
110
110
|
if hit.s_beg <= second_hit.s_beg && hit.s_end >= hit.s_end && (second_hit.s_beg - hit.s_end).abs > 1
|
111
|
-
same =
|
111
|
+
same = true
|
112
112
|
end
|
113
113
|
end
|
114
114
|
return same
|
115
115
|
end
|
116
116
|
|
117
117
|
def same_query_hsp(hit, second_hit)
|
118
|
-
same =
|
118
|
+
same = false
|
119
119
|
if hit.acc == second_hit.acc
|
120
120
|
if hit.q_beg <= second_hit.q_beg && hit.q_end >= hit.q_end && (second_hit.q_beg - hit.q_end).abs > 1
|
121
|
-
same =
|
121
|
+
same = true
|
122
122
|
end
|
123
123
|
end
|
124
124
|
return same
|
125
125
|
end
|
126
126
|
|
127
127
|
def same_sense?(hit, second_hit)
|
128
|
-
same=
|
128
|
+
same= false
|
129
129
|
hit_sense = hit.q_frame <=> 0
|
130
130
|
second_hit_sense = second_hit.q_frame <=> 0
|
131
131
|
if hit_sense == second_hit_sense
|
132
|
-
same =
|
132
|
+
same = true
|
133
133
|
end
|
134
134
|
return same
|
135
135
|
end
|
@@ -158,7 +158,7 @@ def clean_by_query_length_match(blast_result, min_len_nt)
|
|
158
158
|
end
|
159
159
|
|
160
160
|
|
161
|
-
def clean_overlapping_hsps(blast_result, keep_if_diff_sense =
|
161
|
+
def clean_overlapping_hsps(blast_result, keep_if_diff_sense = false)
|
162
162
|
blast_result.querys.each do |query|
|
163
163
|
if query.hits.length > 1
|
164
164
|
query.hits.each_with_index do |hit, j|
|
@@ -190,7 +190,7 @@ end
|
|
190
190
|
#####################################################################
|
191
191
|
|
192
192
|
def misassembled_detection(query)
|
193
|
-
miss=
|
193
|
+
miss=false
|
194
194
|
hits = cluster_hsps(query.hits)
|
195
195
|
misassembled_hits = []
|
196
196
|
hits.each do |hit|
|
@@ -202,7 +202,7 @@ def misassembled_detection(query)
|
|
202
202
|
end
|
203
203
|
end
|
204
204
|
if misassembled_hits.length*1.0/ hits.length > 0.5
|
205
|
-
miss =
|
205
|
+
miss = true
|
206
206
|
else #Remove missassembled hits to avoid broken analysis
|
207
207
|
query.hits.reverse_each do |hsp|
|
208
208
|
if misassembled_hits.include?(hsp.acc)
|
@@ -214,16 +214,16 @@ def misassembled_detection(query)
|
|
214
214
|
end
|
215
215
|
|
216
216
|
def multiple_hsps(query, num)
|
217
|
-
multiple =
|
217
|
+
multiple = false
|
218
218
|
hsps = query.hits.select{|h| h.acc == query.hits.first.acc}
|
219
219
|
if hsps.length >= num
|
220
|
-
multiple =
|
220
|
+
multiple = true
|
221
221
|
end
|
222
222
|
return multiple
|
223
223
|
end
|
224
224
|
|
225
225
|
def overlapping_hsps_on_subject(query)
|
226
|
-
overlapping =
|
226
|
+
overlapping = false
|
227
227
|
current_hit = query.hits.first.acc
|
228
228
|
complete_hit = []
|
229
229
|
cleaned_hits = []
|
@@ -252,16 +252,16 @@ def clean_subject_overlapping_hsps(complete_hit, cleaned_hits)
|
|
252
252
|
end
|
253
253
|
|
254
254
|
def subject_overlapping_hsps(hit)
|
255
|
-
overlapping =
|
255
|
+
overlapping = false
|
256
256
|
hsp_table = hsps_relationship_subject(hit)
|
257
257
|
if !hsp_table.empty?
|
258
258
|
hit = clean_hsp_by_identity(hit, 55)
|
259
259
|
if hit.empty?
|
260
|
-
overlapping =
|
260
|
+
overlapping = true
|
261
261
|
else
|
262
262
|
hsp_table = hsps_relationship_subject(hit)
|
263
263
|
if !hsp_table.empty?
|
264
|
-
overlapping =
|
264
|
+
overlapping = true
|
265
265
|
end
|
266
266
|
end
|
267
267
|
end
|
@@ -286,10 +286,10 @@ def hsps_relationship_subject(hit)
|
|
286
286
|
end
|
287
287
|
|
288
288
|
def same_subject_hsp(hit, second_hit)
|
289
|
-
same =
|
289
|
+
same = false
|
290
290
|
if hit.acc == second_hit.acc
|
291
291
|
if hit.s_beg <= second_hit.s_beg && hit.s_end >= hit.s_end && (second_hit.s_beg - hit.s_end).abs > 1
|
292
|
-
same =
|
292
|
+
same = true
|
293
293
|
end
|
294
294
|
end
|
295
295
|
return same
|
@@ -59,8 +59,8 @@ class Cdhit
|
|
59
59
|
if master_seq.db != 'sp'
|
60
60
|
sp_seq=get_sp(cluster)
|
61
61
|
if !sp_seq.nil?
|
62
|
-
cluster.map{|seq| seq.master=
|
63
|
-
sp_seq.master=
|
62
|
+
cluster.map{|seq| seq.master=false}
|
63
|
+
sp_seq.master= true
|
64
64
|
end
|
65
65
|
end
|
66
66
|
}
|
@@ -109,7 +109,7 @@ class Cdhit
|
|
109
109
|
|
110
110
|
|
111
111
|
def cd_hit_clusters(clust_file)
|
112
|
-
require 'bio-cd-hit-report'
|
112
|
+
#require 'bio-cd-hit-report'
|
113
113
|
report = Bio::CdHitReport.new(clust_file)
|
114
114
|
report.each_cluster do |cluster|
|
115
115
|
clust=[]
|
@@ -128,9 +128,9 @@ class Cdhit
|
|
128
128
|
member.gsub!('>','')
|
129
129
|
fields = member.split(',')
|
130
130
|
data = fields[1].split(' ',2)
|
131
|
-
master =
|
131
|
+
master = false
|
132
132
|
if data[1] == '*'
|
133
|
-
master =
|
133
|
+
master = true
|
134
134
|
end
|
135
135
|
return data[0],master
|
136
136
|
end
|
@@ -110,8 +110,8 @@ module ChimericSeqs
|
|
110
110
|
seq_bak.clean_warnings
|
111
111
|
seq_bak.seq_name += "_split_#{hit_position}"
|
112
112
|
seq_bak.clean_orfs
|
113
|
-
seq_bak.save_fasta =
|
114
|
-
seq_bak.ignore =
|
113
|
+
seq_bak.save_fasta = true
|
114
|
+
seq_bak.ignore = false
|
115
115
|
|
116
116
|
# Cut sequence and move hit/hsps limits
|
117
117
|
#----------------------------------------
|
@@ -244,10 +244,10 @@ module ChimericSeqs
|
|
244
244
|
end
|
245
245
|
|
246
246
|
def hit_is_in?(h_beg, h_end, hit)
|
247
|
-
is=
|
247
|
+
is=false
|
248
248
|
# CONTIENE #OVERLAP
|
249
249
|
if h_beg <= hit[BEG] && h_end > hit[BEG] || hit[BEG] <= h_beg && hit[STOP] > h_beg
|
250
|
-
is=
|
250
|
+
is=true
|
251
251
|
end
|
252
252
|
return is
|
253
253
|
end
|
@@ -324,7 +324,7 @@ module ChimericSeqs
|
|
324
324
|
cmd='clustalo -i - -o /dev/null --percent-id --full --distmat-out=/dev/stdout --force'
|
325
325
|
clustal_matrix = nil
|
326
326
|
IO.popen(cmd,'w+') {|clustal|
|
327
|
-
clustal.sync =
|
327
|
+
clustal.sync = true
|
328
328
|
clustal.write(seq_fasta)
|
329
329
|
clustal.close_write
|
330
330
|
clustal_matrix = clustal.readlines
|
@@ -101,7 +101,7 @@ module CommonFunctions
|
|
101
101
|
hit.q_frame = -hit.q_frame
|
102
102
|
hit.q_end = query_fasta.length - 1 - hit.q_end
|
103
103
|
hit.q_beg = query_fasta.length - 1 - hit.q_beg
|
104
|
-
hit.reversed =
|
104
|
+
hit.reversed = true
|
105
105
|
query_fasta = query_fasta.complementary_dna # ESTO REALMENTE HACE LA REVERSO COMPLEMENTARIA.
|
106
106
|
if hit.class.to_s == 'ExoBlastHit'
|
107
107
|
hit.q_frameshift.map!{|position, num_nts|
|
@@ -39,7 +39,7 @@ end
|
|
39
39
|
class ExonerateResult
|
40
40
|
|
41
41
|
# Parser initialization
|
42
|
-
def initialize(input, seqs= nil, query_seqs = nil, all =
|
42
|
+
def initialize(input, seqs= nil, query_seqs = nil, all = true)
|
43
43
|
@querys = []
|
44
44
|
@seqs = seqs #unigenes
|
45
45
|
@prot_seqs = query_seqs#prot
|
@@ -106,8 +106,8 @@ class ExonerateResult
|
|
106
106
|
|
107
107
|
#this method only works fine with --model protein2dna parameter of exonerate
|
108
108
|
def hiting(features, tags, query) #Convierte las coordenadas relativas del exonerate a absolutas tipo blast, definiendo solo los hits
|
109
|
-
do_align =
|
110
|
-
do_align =
|
109
|
+
do_align = false
|
110
|
+
do_align = true if !@prot_seqs.nil? && !@seqs.nil?
|
111
111
|
start_target = features['target_start_align']#Unigen
|
112
112
|
start_query = features['query_start_align'] #proteina
|
113
113
|
ends_target = features['target_end_align']
|
@@ -143,7 +143,7 @@ class ExonerateResult
|
|
143
143
|
target_alignment << target_seq[counter_target, tag[TARGET]].translate
|
144
144
|
end
|
145
145
|
if tag[OPERATION] == 'F'
|
146
|
-
if tag[TARGET] > 0 && tag[TARGET] < 3 #
|
146
|
+
if tag[TARGET] > 0 && tag[TARGET] < 3 #true FRAMESHIFT
|
147
147
|
gap_shift += 1
|
148
148
|
if tags[n_operation+1][OPERATION] != 'G' #there are frameshift that not insert a gap, we do it
|
149
149
|
query_alignment << '-' if do_align
|
@@ -203,7 +203,7 @@ class ExonerateResult
|
|
203
203
|
def define_hit_parameters(hit, features, tags)
|
204
204
|
hit.gaps = 0
|
205
205
|
tags.map{|aln| hit.gaps += 1 if aln[0] == 'G'}
|
206
|
-
hit.reversed =
|
206
|
+
hit.reversed = false
|
207
207
|
hit.align_len =(features['query_end_align'] - features['query_start_align']).abs+1
|
208
208
|
hit.mismatches=0
|
209
209
|
hit.e_val=0
|
@@ -171,9 +171,9 @@ module FlAnalysis
|
|
171
171
|
end
|
172
172
|
|
173
173
|
if atg_status == 'putative' || end_status == 'putative'
|
174
|
-
status =
|
174
|
+
status = false # Putative
|
175
175
|
else
|
176
|
-
status =
|
176
|
+
status = true # Sure
|
177
177
|
end
|
178
178
|
|
179
179
|
return type, status
|
@@ -187,7 +187,7 @@ module FlAnalysis
|
|
187
187
|
$global_warnings << ['SeqShorter', final_prot.length, final_hit.s_len]
|
188
188
|
if final_prot.length + 100 < final_hit.s_len || final_prot.length*2 < final_hit.s_len
|
189
189
|
if type == COMPLETE
|
190
|
-
status =
|
190
|
+
status = false
|
191
191
|
$global_warnings << 'VeryShorter'
|
192
192
|
end
|
193
193
|
end
|
@@ -209,7 +209,7 @@ module FlAnalysis
|
|
209
209
|
$global_warnings = [] # Clean all warnings for current sequence
|
210
210
|
seq.seq_nt = mark_nt_seqs(final_hit, query_fasta)
|
211
211
|
if type == COMPLETE
|
212
|
-
seq.ignore =
|
212
|
+
seq.ignore = true
|
213
213
|
end
|
214
214
|
end
|
215
215
|
if $verbose > 2
|
@@ -265,8 +265,8 @@ module FlAnalysis
|
|
265
265
|
|
266
266
|
## VERBOSE METHODS
|
267
267
|
def show_nts
|
268
|
-
show =
|
269
|
-
show =
|
268
|
+
show = false
|
269
|
+
show = true if $verbose && $verbose > 3
|
270
270
|
return show
|
271
271
|
end
|
272
272
|
|
@@ -74,7 +74,7 @@ module FlnStats
|
|
74
74
|
if !$1.nil?
|
75
75
|
organism = $1
|
76
76
|
else
|
77
|
-
name =~ /(\w+ \w+) \(([\w ]+)\)/
|
77
|
+
name =~ /(\w+ \w+) \(([\w \/]+)\)/
|
78
78
|
if !$1.nil?
|
79
79
|
organism = $1
|
80
80
|
end
|
@@ -610,4 +610,4 @@ module FlnStats
|
|
610
610
|
html = '<div style="font-size:25px; margin: 10"><b>'+title+'</b></div>'
|
611
611
|
return html
|
612
612
|
end
|
613
|
-
end
|
613
|
+
end
|
@@ -35,7 +35,7 @@ end
|
|
35
35
|
def do_makeblastdb(seqs, output, dbtype)
|
36
36
|
cmd="makeblastdb -in - -out #{output} -title #{File.basename(output)} -dbtype #{dbtype} -parse_seqids"
|
37
37
|
IO.popen(cmd,'w+') {|makedb|
|
38
|
-
makedb.sync =
|
38
|
+
makedb.sync = true
|
39
39
|
makedb.write(seqs)
|
40
40
|
makedb.close_write
|
41
41
|
puts makedb.readlines
|
@@ -193,7 +193,7 @@ class MyWorker < ScbiMapreduce::Worker
|
|
193
193
|
|
194
194
|
|
195
195
|
# ejecuta blast utilizando los parametros fichero de entrada, base de datos, tipo de blast y evalue
|
196
|
-
def run_blast(input, database, blast_type, evalue, additional_blast_options, do_exonerate, filter =
|
196
|
+
def run_blast(input, database, blast_type, evalue, additional_blast_options, do_exonerate, filter = true)
|
197
197
|
if !input.empty? && !input.nil?
|
198
198
|
$WORKER_LOG.info "DB: #{File.basename(database)} #{input.length}"
|
199
199
|
blast = BatchBlast.new("-db #{database}", blast_type, "-evalue #{evalue} #{additional_blast_options}")
|
@@ -202,7 +202,7 @@ class MyWorker < ScbiMapreduce::Worker
|
|
202
202
|
if @options[:hdd] #Write/parse blast on Disk
|
203
203
|
file_name = file_path+'.blast' #Each blast is identified with database_name and first sequence's name on chunk
|
204
204
|
if !File.exists?(file_name)
|
205
|
-
blast_result = blast.do_blast_seqs(input, :table,
|
205
|
+
blast_result = blast.do_blast_seqs(input, :table, true, file_name)
|
206
206
|
else
|
207
207
|
blast = nil
|
208
208
|
blast_result=BlastTableResult.new(file_name)
|
@@ -223,8 +223,8 @@ class MyWorker < ScbiMapreduce::Worker
|
|
223
223
|
end
|
224
224
|
|
225
225
|
def rescue_sequence(e, seq, status)
|
226
|
-
seq.save_fasta =
|
227
|
-
seq.ignore =
|
226
|
+
seq.save_fasta = false
|
227
|
+
seq.ignore = true
|
228
228
|
seq.type = FAILED
|
229
229
|
puts '-- '+seq.seq_name+' FAILED ANALYSIS -- '+status,
|
230
230
|
e.message,
|
@@ -232,7 +232,7 @@ class MyWorker < ScbiMapreduce::Worker
|
|
232
232
|
end
|
233
233
|
|
234
234
|
def check_ncRNA(check_seqs, ncrna_path, blast_type, evalue)
|
235
|
-
my_blast = run_blast(check_seqs, ncrna_path, blast_type, evalue, '',
|
235
|
+
my_blast = run_blast(check_seqs, ncrna_path, blast_type, evalue, '', false, nil)
|
236
236
|
if !my_blast.nil?
|
237
237
|
check_seqs.each_with_index do |seq,i|
|
238
238
|
find_nc_rna(seq, my_blast.querys[i])
|
@@ -280,7 +280,7 @@ class MyWorker < ScbiMapreduce::Worker
|
|
280
280
|
|
281
281
|
if seq.type == FAILED
|
282
282
|
seq.type = UNKNOWN
|
283
|
-
seq.ignore =
|
283
|
+
seq.ignore = false
|
284
284
|
else
|
285
285
|
best_option.warnings(warning) if !warning.nil?
|
286
286
|
end
|
@@ -28,7 +28,7 @@ class MyWorkerEst < MyWorker
|
|
28
28
|
#####################################################################################
|
29
29
|
|
30
30
|
def blastEST(array_seqs)
|
31
|
-
blast = run_blast(array_seqs, @blast_path, 'blastn', 1e-6, nil,
|
31
|
+
blast = run_blast(array_seqs, @blast_path, 'blastn', 1e-6, nil, false)
|
32
32
|
if blast.nil?
|
33
33
|
$LOG.info 'BLAST FAILED'
|
34
34
|
Process.exit(-1)
|
@@ -444,7 +444,7 @@ class MyWorkerManagerFln < ScbiMapreduce::WorkManager
|
|
444
444
|
@@stats_hash['coding'] += 1
|
445
445
|
coding = select_orf(coding)
|
446
446
|
if coding[1] == 'complete'
|
447
|
-
seq.status =
|
447
|
+
seq.status = true
|
448
448
|
@@stats_hash['coding_sure'] += 1
|
449
449
|
else
|
450
450
|
@@stats_hash['coding_putative'] += 1
|
@@ -16,7 +16,7 @@ class Sequence
|
|
16
16
|
@seq_aa = nil # Protein sequence generated over unigen
|
17
17
|
@db =nil
|
18
18
|
@type = UNKNOWN # See types.rb
|
19
|
-
@status =
|
19
|
+
@status = false # true => Sure, false => Putative
|
20
20
|
@id = nil #Prot or EST id, can be several => array
|
21
21
|
@warnings = []
|
22
22
|
@annotations=[]
|
@@ -27,9 +27,9 @@ class Sequence
|
|
27
27
|
@fpkm = []
|
28
28
|
@coverage_analysis = []
|
29
29
|
|
30
|
-
@area_without_annotation=
|
31
|
-
@save_fasta=
|
32
|
-
@ignore=
|
30
|
+
@area_without_annotation=false
|
31
|
+
@save_fasta=true
|
32
|
+
@ignore = false
|
33
33
|
@hit=nil
|
34
34
|
@t_code=0
|
35
35
|
end
|
@@ -86,7 +86,7 @@ class Sequence
|
|
86
86
|
|
87
87
|
def reset_classification
|
88
88
|
@type = UNKNOWN
|
89
|
-
@status =
|
89
|
+
@status = false
|
90
90
|
end
|
91
91
|
|
92
92
|
def clean_warnings
|
@@ -150,7 +150,7 @@ class Sequence
|
|
150
150
|
def test_code(test_code)
|
151
151
|
@t_code = test_code
|
152
152
|
if @t_code >= 0.95
|
153
|
-
@status =
|
153
|
+
@status = true
|
154
154
|
end
|
155
155
|
end
|
156
156
|
|
@@ -470,7 +470,7 @@ class Sequence
|
|
470
470
|
upstream_annotation_space = hit.q_beg
|
471
471
|
downstream_annotation_space = @fasta_length - hit.q_end
|
472
472
|
if upstream_annotation_space >= 150 || downstream_annotation_space >= 150
|
473
|
-
@area_without_annotation =
|
473
|
+
@area_without_annotation = true
|
474
474
|
end
|
475
475
|
return @area_without_annotation
|
476
476
|
end
|
@@ -490,8 +490,8 @@ class Sequence
|
|
490
490
|
end
|
491
491
|
|
492
492
|
def unmapped?
|
493
|
-
res =
|
494
|
-
res =
|
493
|
+
res = false
|
494
|
+
res = true if !@coverage_analysis.empty? && @coverage_analysis[3] == 0 #3 => percentage of sequence covered by reads
|
495
495
|
return res
|
496
496
|
end
|
497
497
|
end
|
@@ -165,7 +165,7 @@ class UneLosHit
|
|
165
165
|
#if frame_ori < 0 && h.q_frame > 0 || frame_ori > 0 && h.q_frame < 0
|
166
166
|
if h.q_frame < 0 # si la secuencia esta al reves le damos la vuelta
|
167
167
|
query_fasta = reverse_seq(query_fasta_ori, h)
|
168
|
-
h.reversed =
|
168
|
+
h.reversed = true
|
169
169
|
end
|
170
170
|
misma_id << h
|
171
171
|
#end
|
@@ -176,17 +176,17 @@ class UneLosHit
|
|
176
176
|
end
|
177
177
|
|
178
178
|
def overlapping_hits?(hit)
|
179
|
-
overlap =
|
179
|
+
overlap = false
|
180
180
|
if @final_hit.q_end >= hit.q_beg && @final_hit.q_end < hit.q_end && @final_hit.q_end < hit.q_end
|
181
|
-
overlap =
|
181
|
+
overlap = true
|
182
182
|
end
|
183
183
|
return overlap
|
184
184
|
end
|
185
185
|
|
186
186
|
def separated_hits?(hit)
|
187
|
-
separated=
|
187
|
+
separated=false
|
188
188
|
if @final_hit.q_end < hit.q_beg && hit.q_end > @final_hit.q_end
|
189
|
-
separated =
|
189
|
+
separated = true
|
190
190
|
end
|
191
191
|
return separated
|
192
192
|
end
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: full_lengther_next
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Pedro Seoane
|
8
8
|
- Noe Fernandez
|
9
9
|
- Dario Guerrero
|
10
|
-
autorequire:
|
10
|
+
autorequire:
|
11
11
|
bindir: bin
|
12
12
|
cert_chain: []
|
13
|
-
date:
|
13
|
+
date: 2022-09-05 00:00:00.000000000 Z
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: xml-simple
|
@@ -83,7 +83,7 @@ dependencies:
|
|
83
83
|
- !ruby/object:Gem::Version
|
84
84
|
version: '0'
|
85
85
|
- !ruby/object:Gem::Dependency
|
86
|
-
name: bio
|
86
|
+
name: bio
|
87
87
|
requirement: !ruby/object:Gem::Requirement
|
88
88
|
requirements:
|
89
89
|
- - ">="
|
@@ -189,6 +189,7 @@ files:
|
|
189
189
|
- full_lengther_next.gemspec
|
190
190
|
- lib/full_lengther_next.rb
|
191
191
|
- lib/full_lengther_next/artifacts.rb
|
192
|
+
- lib/full_lengther_next/bio_patch.rb
|
192
193
|
- lib/full_lengther_next/blast_functions.rb
|
193
194
|
- lib/full_lengther_next/cdhit.rb
|
194
195
|
- lib/full_lengther_next/chimeric_seqs.rb
|
@@ -220,7 +221,7 @@ homepage: https://github.com/seoanezonjic
|
|
220
221
|
licenses:
|
221
222
|
- MIT
|
222
223
|
metadata: {}
|
223
|
-
post_install_message:
|
224
|
+
post_install_message:
|
224
225
|
rdoc_options: []
|
225
226
|
require_paths:
|
226
227
|
- lib
|
@@ -235,9 +236,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
235
236
|
- !ruby/object:Gem::Version
|
236
237
|
version: '0'
|
237
238
|
requirements: []
|
238
|
-
|
239
|
-
|
240
|
-
signing_key:
|
239
|
+
rubygems_version: 3.3.7
|
240
|
+
signing_key:
|
241
241
|
specification_version: 4
|
242
242
|
summary: Tool to annotate transcriptomes and it is able to stablish the integrity
|
243
243
|
of each transcript. Also, FLN can detect novel genes on a target organism.
|