bio 2.0.1 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ChangeLog +313 -0
- data/KNOWN_ISSUES.rdoc +1 -1
- data/LEGAL +9 -0
- data/README.rdoc +3 -3
- data/RELEASE_NOTES.rdoc +59 -0
- data/bioruby.gemspec +4 -2
- data/bioruby.gemspec.erb +0 -1
- data/lib/bio/appl/iprscan/report.rb +3 -3
- data/lib/bio/appl/sosui/report.rb +1 -1
- data/lib/bio/db/embl/uniprotkb.rb +137 -12
- data/lib/bio/db/go.rb +2 -2
- data/lib/bio/sequence/common.rb +112 -0
- data/lib/bio/sequence/format.rb +1 -0
- data/lib/bio/tree.rb +1 -1
- data/lib/bio/version.rb +1 -1
- data/test/data/uniprot/P28907.uniprot +551 -0
- data/test/unit/bio/db/embl/test_uniprotkb_P28907.rb +325 -0
- data/test/unit/bio/sequence/test_ruby3.rb +462 -0
- metadata +6 -4
@@ -174,7 +174,7 @@ class UniProtKB < EMBLDB
|
|
174
174
|
#
|
175
175
|
# http://www.uniprot.org/docs/sp_news.htm
|
176
176
|
def parse_DE_line_rel14(str)
|
177
|
-
#
|
177
|
+
# Returns if it is not the new format since Rel.14
|
178
178
|
return nil unless /^DE (RecName|AltName|SubName)\: / =~ str
|
179
179
|
ret = []
|
180
180
|
cur = nil
|
@@ -248,9 +248,8 @@ class UniProtKB < EMBLDB
|
|
248
248
|
# SYNONYM >=0
|
249
249
|
# CONTAINS >=0
|
250
250
|
def protein_name
|
251
|
-
|
252
|
-
parsed_de_line
|
253
|
-
if parsed_de_line then
|
251
|
+
parsed_de_line = self.de
|
252
|
+
if parsed_de_line.kind_of?(Array) then
|
254
253
|
# since UniProtKB release 14.0 of 22-Jul-2008
|
255
254
|
name = nil
|
256
255
|
parsed_de_line.each do |a|
|
@@ -275,7 +274,6 @@ class UniProtKB < EMBLDB
|
|
275
274
|
return name
|
276
275
|
end
|
277
276
|
|
278
|
-
|
279
277
|
# returns synonyms (unofficial and/or alternative names).
|
280
278
|
# Returns an Array containing String objects.
|
281
279
|
#
|
@@ -292,9 +290,8 @@ class UniProtKB < EMBLDB
|
|
292
290
|
# synonyms are each placed in () following the official name on the DE line.
|
293
291
|
def synonyms
|
294
292
|
ary = Array.new
|
295
|
-
|
296
|
-
parsed_de_line
|
297
|
-
if parsed_de_line then
|
293
|
+
parsed_de_line = self.de
|
294
|
+
if parsed_de_line.kind_of?(Array) then
|
298
295
|
# since UniProtKB release 14.0 of 22-Jul-2008
|
299
296
|
parsed_de_line.each do |a|
|
300
297
|
case a[0]
|
@@ -330,6 +327,20 @@ class UniProtKB < EMBLDB
|
|
330
327
|
return ary
|
331
328
|
end
|
332
329
|
|
330
|
+
# Returns the content of the DE line: an Array when the line is in the
# new format (since rel 14), or whatever the superclass implementation
# provides (a String for the old format before rel 14).
#
# The value is looked up in @data['DE'] first, so parsing happens only
# when nothing is stored there yet.
def de
  cached = @data['DE']
  return cached if cached

  parsed = parse_DE_line_rel14(get('DE'))
  if parsed.kind_of?(Array)
    # new format since rel14
    @data['DE'] ||= parsed
  else
    # Old format: delegate to the superclass.
    # NOTE(review): presumably the superclass fills @data['DE'] -- confirm.
    super
  end

  @data['DE']
end
|
333
344
|
|
334
345
|
# returns gene names in the GN line.
|
335
346
|
#
|
@@ -1197,9 +1208,124 @@ class UniProtKB < EMBLDB
|
|
1197
1208
|
return ft[feature_key] if feature_key
|
1198
1209
|
return @data['FT'] if @data['FT']
|
1199
1210
|
|
1211
|
+
ftstr = get('FT')
|
1212
|
+
ftlines = ftstr.split("\n")
|
1213
|
+
for i in 0..10 do
|
1214
|
+
if /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/ =~ ftlines[i] &&
|
1215
|
+
/^FT +\/([^\s\=]+)(?:\=(\")?(.+)(\")?)?\s*$/ =~ ftlines[i+1] then
|
1216
|
+
fmt_2019_11 = true
|
1217
|
+
break #for i
|
1218
|
+
end
|
1219
|
+
end #for i
|
1220
|
+
|
1221
|
+
hash = if fmt_2019_11 then
|
1222
|
+
ft_2019_11_parser(ftlines)
|
1223
|
+
else
|
1224
|
+
ft_legacy_parser(ftlines)
|
1225
|
+
end
|
1226
|
+
@data['FT'] = hash
|
1227
|
+
end
|
1228
|
+
|
1229
|
+
# FT parser since UniProt release 2019_11
# https://www.uniprot.org/release-notes/2019-12-18-release#text%5Fft
#
# ---
# *Arguments*:
# * _ftlines_: Array of Strings, one element per "FT" line
# *Returns*:: Hash of feature name => Array of Hashes. Each Hash has the keys
#             'From' and 'To' (Integer), 'diff' (Array), 'original'
#             ([name, from, to, qualifiers]) and one key per qualifier
#             ('FTId' mirrors the 'id' qualifier when it is present).
#
# Any error while parsing is re-raised as a RuntimeError that carries the
# entry ID and the raw FT lines.
def ft_2019_11_parser(ftlines)
  table = []
  cur_ft = nil
  # true while a quoted qualifier value has not yet been closed by '"'
  cont = false
  begin
    ftlines.each do |line|
      if /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/ =~ line
        # Feature header line: key and location
        cur_ft = [$1.to_s,     # Feature Name
                  "#{$2}#{$4}", # From (keeps the optional "XXX:" prefix)
                  $5.to_s,     # To
                  []           # Qualifiers
                 ]
        table.push cur_ft
        cont = false
      elsif cont && /^FT {19}/ =~ line
        # Continuation of a quoted qualifier value
        str = $'
        str.rstrip!
        orig = cur_ft[3][-1][1].to_s
        # Join the two fragments with exactly one space unless one of
        # them already supplies it.
        if orig.size > 0 && orig[-1] != ' ' &&
           str.length > 0 && str[0] != ' ' then
          orig.concat ' '
        end
        orig.concat str
        cur_ft[3][-1][1] = orig
        if orig[-1] == "\""
          # Closing quote reached
          orig.chop!
          cont = false
        end
      elsif /^FT +\/([^\s\=]+)(?:\=(\")?(.+))?\s*$/ =~ line
        # Qualifier line: /key, /key=value or /key="value..."
        key = $1
        quoted = ($2 == "\"")
        val = $3
        # The regexp allows a qualifier without "=value"; val is nil then.
        val.rstrip! if val
        cur_ft[3].push [ key, val ]
        cont = false
        if quoted
          if val.to_s[-1] == "\""
            val.chop!
          else
            # The value goes on in the following line(s)
            cont = true
          end
        end
      else
        raise "FT parse error: #{line.inspect}"
      end
    end

    hash = {}
    table.each do |feature|
      cur_h = {
        # Dropping the optional "XXX:" prefix, then removing
        # '<', '>' or '?' in FROM/TO endpoint.
        'From'     => feature[1].sub(/\A.*:/, '').sub(/\D/, '').to_i,
        'To'       => feature[2].sub(/\D/, '').to_i,
        'diff'     => [],
        'original' => feature
      }
      hash[feature[0]] ||= []
      hash[feature[0]].push cur_h
      feature[3].each do |a|
        case a[0]
        when 'From', 'To', 'Description', 'FTId', 'diff', 'original'
          ; # do nothing (these keys are reserved for the parser itself)
        else
          cur_h[a[0]] = a[1]
        end
      end
      if cur_h["id"] then
        cur_h['FTId'] = cur_h['id']
      end

      case feature[0]
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = nil
        changed_res = nil
        case cur_h['note'].to_s
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # Copy both captures before gsub replaces the last match data.
          original_res, changed_res = $1, $2
          original_res = original_res.gsub(/ /,'').strip
          changed_res = changed_res.gsub(/ /,'').strip
        when /Missing/i
          original_res = seq.subseq(cur_h['From'],
                                    cur_h['To'])
          changed_res = ''
        end
        cur_h['diff'] = [original_res, changed_res]
      end
    end
  rescue
    raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
  end

  hash
end
private :ft_2019_11_parser
|
1323
|
+
|
1324
|
+
# FT parser for the format before Uniprot release 2019_11
|
1325
|
+
def ft_legacy_parser(ftlines)
|
1200
1326
|
table = []
|
1201
1327
|
begin
|
1202
|
-
|
1328
|
+
ftlines.each do |line|
|
1203
1329
|
if line =~ /^FT \w/
|
1204
1330
|
feature = line.chomp.ljust(74)
|
1205
1331
|
table << [feature[ 5..12].strip, # Feature Name
|
@@ -1256,10 +1382,9 @@ class UniProtKB < EMBLDB
|
|
1256
1382
|
raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
|
1257
1383
|
end
|
1258
1384
|
|
1259
|
-
|
1385
|
+
hash
|
1260
1386
|
end
|
1261
|
-
|
1262
|
-
|
1387
|
+
private :ft_legacy_parser
|
1263
1388
|
|
1264
1389
|
# returns a Hash of contents in the SQ lines.
|
1265
1390
|
# * Bio::UniProtKBL#sq -> hsh
|
data/lib/bio/db/go.rb
CHANGED
@@ -193,7 +193,7 @@ class GO
|
|
193
193
|
# Delimiter
|
194
194
|
RS = DELIMITER
|
195
195
|
|
196
|
-
#
|
196
|
+
# Returns an Array of parsed gene_association flatfile.
|
197
197
|
# Block is acceptable.
|
198
198
|
def self.parser(str)
|
199
199
|
if block_given?
|
@@ -226,7 +226,7 @@ class GO
|
|
226
226
|
# Returns Db_Reference variable.
|
227
227
|
attr_reader :db_reference # -> []
|
228
228
|
|
229
|
-
#
|
229
|
+
# Returns Evidence code variable.
|
230
230
|
attr_reader :evidence
|
231
231
|
|
232
232
|
# Returns the entry is associated with this value.
|
data/lib/bio/sequence/common.rb
CHANGED
@@ -303,6 +303,118 @@ module Common
|
|
303
303
|
end
|
304
304
|
alias splicing splice
|
305
305
|
|
306
|
+
#--
|
307
|
+
# Workaround for Ruby 3.0.0 incompatible changes
|
308
|
+
# The major version is compared numerically, because a plain String
# comparison would put "10.0.0" before "3".
if ::RUBY_VERSION.to_i >= 3

  # Acts almost the same as String#split.
  # Without a block, every element of the returned Array is converted
  # to an instance of the receiver's class. With a block, the call is
  # passed to the original method as is.
  def split(*arg)
    if block_given?
      super
    else
      ret = super(*arg)
      ret.collect! { |x| self.class.new('').replace(x) }
      ret
    end
  end

  # Methods whose result only needs to be wrapped into a new instance
  # of the receiver's class.
  %w( * ljust rjust center ).each do |w|
    module_eval %Q{
      def #{w}(*arg)
        self.class.new('').replace(super)
      end
    }
  end

  # Non-destructive methods implemented by duplicating the receiver
  # and applying the destructive ("!") counterpart to the copy.
  %w( chomp chop
      delete delete_prefix delete_suffix
      lstrip rstrip strip
      reverse
      squeeze
      succ next
      tr tr_s
      capitalize upcase downcase swapcase
    ).each do |w|
    module_eval %Q{
      def #{w}(*arg)
        s = self.dup
        s.#{w}!(*arg)
        s
      end
    }
  end

  # The same approach for sub and gsub, which can also take a block.
  %w( sub gsub ).each do |w|
    module_eval %Q{
      def #{w}(*arg, &block)
        s = self.dup
        s.#{w}!(*arg, &block)
        s
      end
    }
  end

  #Reference: https://nacl-ltd.github.io/2018/11/08/gsub-wrapper.html
  #(Title: Is it possible to implement gsub wrapper?)
  %w( sub! gsub! ).each do |w|
    module_eval %Q{
      def #{w}(*arg, &block)
        if block_given? then
          super(*arg) do |m|
            # Hand the current match data over to the caller's block
            # (so that $~ is set there), going through a thread-local
            # slot whose previous value is restored afterwards.
            b = Thread.current[:_backref]
            Thread.current[:_backref] = ::Regexp.last_match
            block.binding.eval("$~ = Thread.current[:_backref]")
            Thread.current[:_backref] = b
            block.call(self.class.new('').replace(m))
          end
        else
          super
        end
      end
    }
  end

  # Iterators yielding pieces of the receiver: each piece is converted
  # to the receiver's class. Arguments (e.g. the separator and chomp:
  # of each_line) are passed through to the original method.
  %w( each_char each_grapheme_cluster each_line ).each do |w|
    module_eval %Q{
      def #{w}(*arg, **kw)
        if block_given?
          super(*arg, **kw) { |c| yield(self.class.new('').replace(c)) }
        else
          enum_for(:#{w}, *arg, **kw)
        end
      end
    }
  end

  # Slicing methods: a nil (or false) result is returned untouched.
  %w( slice [] slice! ).each do |w|
    module_eval %Q{
      def #{w}(*arg)
        r = super
        r ? self.class.new('').replace(r) : r
      end
    }
  end

  # partition and rpartition: when the middle element equals the given
  # separator it is kept as it is; otherwise all elements are converted.
  %w( partition rpartition ).each do |w|
    module_eval %Q{
      def #{w}(sep)
        r = super
        if r.kind_of?(Array)
          r[1] == sep ?
            [ self.class.new('').replace(r[0]),
              r[1],
              self.class.new('').replace(r[2]) ] :
            r.collect { |x| self.class.new('').replace(x) }
        else
          r
        end
      end
    }
  end
  #++

end # if ::RUBY_VERSION.to_i >= 3
|
417
|
+
|
306
418
|
end # Common
|
307
419
|
|
308
420
|
end # Sequence
|
data/lib/bio/sequence/format.rb
CHANGED
data/lib/bio/tree.rb
CHANGED
@@ -605,7 +605,7 @@ module Bio
|
|
605
605
|
end
|
606
606
|
|
607
607
|
# Gets path from node1 to node2.
|
608
|
-
#
|
608
|
+
# Returns an array of nodes, including node1 and node2.
|
609
609
|
# If node1 and/or node2 do not exist, IndexError is raised.
|
610
610
|
# If node1 and node2 are not connected, NoPathError is raised.
|
611
611
|
# The result is unspecified for cyclic trees.
|
data/lib/bio/version.rb
CHANGED
@@ -10,7 +10,7 @@
|
|
10
10
|
module Bio
|
11
11
|
|
12
12
|
# BioRuby version (Array containing Integer)
|
13
|
-
BIORUBY_VERSION = [2, 0,
|
13
|
+
BIORUBY_VERSION = [2, 0, 4].extend(Comparable).freeze
|
14
14
|
|
15
15
|
# Extra version specifier (String or nil).
|
16
16
|
# Existence of the value indicates development version.
|