bio 2.0.3 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -248,9 +248,8 @@ class UniProtKB < EMBLDB
248
248
  # SYNONYM >=0
249
249
  # CONTAINS >=0
250
250
  def protein_name
251
- @data['DE'] ||= parse_DE_line_rel14(get('DE'))
252
- parsed_de_line = @data['DE']
253
- if parsed_de_line then
251
+ parsed_de_line = self.de
252
+ if parsed_de_line.kind_of?(Array) then
254
253
  # since UniProtKB release 14.0 of 22-Jul-2008
255
254
  name = nil
256
255
  parsed_de_line.each do |a|
@@ -275,7 +274,6 @@ class UniProtKB < EMBLDB
275
274
  return name
276
275
  end
277
276
 
278
-
279
277
  # returns synonyms (unofficial and/or alternative names).
280
278
  # Returns an Array containing String objects.
281
279
  #
@@ -292,9 +290,8 @@ class UniProtKB < EMBLDB
292
290
  # synonyms are each placed in () following the official name on the DE line.
293
291
  def synonyms
294
292
  ary = Array.new
295
- @data['DE'] ||= parse_DE_line_rel14(get('DE'))
296
- parsed_de_line = @data['DE']
297
- if parsed_de_line then
293
+ parsed_de_line = self.de
294
+ if parsed_de_line.kind_of?(Array) then
298
295
  # since UniProtKB release 14.0 of 22-Jul-2008
299
296
  parsed_de_line.each do |a|
300
297
  case a[0]
@@ -330,6 +327,20 @@ class UniProtKB < EMBLDB
330
327
  return ary
331
328
  end
332
329
 
330
+ # Returns an Array (for new format since rel 14)
331
+ # or a String (for old format before rel 14) for the DE line.
332
+ # The value is memoized in @data['DE']: an Array result of parse_DE_line_rel14 is stored there directly; for any other result the superclass implementation is invoked (presumably it fills @data['DE'] itself, since that slot is what gets returned -- confirm against EMBLDB#de).
333
+ def de
334
+ return @data['DE'] if @data['DE'] # already parsed and cached
335
+ parsed_de_line = parse_DE_line_rel14(get('DE'))
336
+ case parsed_de_line
337
+ when Array # new format since rel14
338
+ @data['DE'] ||= parsed_de_line
339
+ else
340
+ super # not an Array: fall back to the superclass parser; its return value is not used here
341
+ end
342
+ @data['DE']
343
+ end
333
344
 
334
345
  # returns gene names in the GN line.
335
346
  #
@@ -519,22 +530,43 @@ class UniProtKB < EMBLDB
519
530
  # http://br.expasy.org/sprot/userman.html#OH_line
520
531
  def oh
521
532
  unless @data['OH']
522
- @data['OH'] = fetch('OH').split("\. ").map {|x|
523
- if x =~ /NCBI_TaxID=(\d+);/
524
- taxid = $1
525
- else
526
- raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
527
- $!, "\n", get('OH'), "\n"].join
528
-
529
- end
530
- if x =~ /NCBI_TaxID=\d+; (.+)/
531
- host_name = $1
532
- host_name.sub!(/\.$/, '')
533
- else
534
- host_name = nil
533
+ oh = []
534
+ a = fetch('OH').split(/(NCBI\_TaxID\=)(\d+)(\;)/)
535
+ t = catch :error do
536
+ taxid = nil
537
+ host_name = nil
538
+ while x = a.shift
539
+ x = x.to_s.strip
540
+ case x
541
+ when ''
542
+ next
543
+ when 'NCBI_TaxID='
544
+ if taxid then
545
+ oh.push({'NCBI_TaxID' => taxid, 'HostName' => host_name})
546
+ taxid = nil
547
+ host_name = nil
548
+ end
549
+ taxid = a.shift
550
+ throw :error, :missing_semicolon if a.shift != ';'
551
+ else
552
+ throw :error, :missing_taxid if host_name
553
+ host_name = x
554
+ host_name.sub!(/\.\z/, '')
555
+ end
556
+ end #while x...
557
+ if taxid then
558
+ oh.push({'NCBI_TaxID' => taxid, 'HostName' => host_name})
559
+ elsif host_name then
560
+ throw :error, :missing_taxid_last
535
561
  end
536
- {'NCBI_TaxID' => taxid, 'HostName' => host_name}
537
- }
562
+ nil
563
+ end #t = catch...
564
+ if t then
565
+ raise ArgumentError,
566
+ ["Error: Invalid OH line format (#{self.entry_id}):",
567
+ $!, "\n", get('OH'), "\n"].join
568
+ end
569
+ @data['OH'] = oh
538
570
  end
539
571
  @data['OH']
540
572
  end
@@ -911,6 +943,7 @@ class UniProtKB < EMBLDB
911
943
 
912
944
 
913
945
  def cc_alternative_products(data)
946
+ return nil unless data
914
947
  ap = data.join('')
915
948
  return ap unless ap
916
949
 
@@ -949,6 +982,7 @@ class UniProtKB < EMBLDB
949
982
 
950
983
 
951
984
  def cc_biophysiochemical_properties(data)
985
+ return nil unless data
952
986
  data = data[0]
953
987
 
954
988
  hash = {'Absorption' => {},
@@ -984,6 +1018,7 @@ class UniProtKB < EMBLDB
984
1018
 
985
1019
 
986
1020
  # Concatenates the elements of +data+ into one String (no separator).
  # Returns nil when +data+ is nil or false.
  def cc_caution(data)
1021
+ return nil unless data
987
1022
  data.join('')
988
1023
  end
989
1024
  private :cc_caution
@@ -993,6 +1028,7 @@ class UniProtKB < EMBLDB
993
1028
  #
994
1029
  # CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
995
1030
  def cc_interaction(data)
1031
+ return nil unless data
996
1032
  str = data.join('')
997
1033
  it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
998
1034
  it.map {|ent|
@@ -1048,6 +1084,7 @@ class UniProtKB < EMBLDB
1048
1084
 
1049
1085
 
1050
1086
  def cc_pathway(data)
1087
+ return nil unless data
1051
1088
  data.map {|x| x.sub(/\.$/, '') }.map {|x|
1052
1089
  x.split(/; | and |: /)
1053
1090
  }[0]
@@ -1056,6 +1093,7 @@ class UniProtKB < EMBLDB
1056
1093
 
1057
1094
 
1058
1095
  def cc_rna_editing(data)
1096
+ return nil unless data
1059
1097
  data = data.join('')
1060
1098
  entry = {'Modified_positions' => [], 'Note' => ""}
1061
1099
  if data =~ /Modified_positions=(.+?)(\.|;)/
@@ -1072,6 +1110,7 @@ class UniProtKB < EMBLDB
1072
1110
 
1073
1111
 
1074
1112
  def cc_subcellular_location(data)
1113
+ return nil unless data
1075
1114
  data.map {|x|
1076
1115
  x.split('. ').map {|y|
1077
1116
  y.split('; ').map {|z|
@@ -1090,6 +1129,7 @@ class UniProtKB < EMBLDB
1090
1129
  #++
1091
1130
 
1092
1131
  def cc_web_resource(data)
1132
+ return nil unless data
1093
1133
  data.map {|x|
1094
1134
  entry = {'Name' => nil, 'Note' => nil, 'URL' => nil}
1095
1135
  x.split(';').each do |y|
@@ -1197,9 +1237,128 @@ class UniProtKB < EMBLDB
1197
1237
  return ft[feature_key] if feature_key
1198
1238
  return @data['FT'] if @data['FT']
1199
1239
 
1240
+ ftstr = get('FT')
1241
+ ftlines = ftstr.split("\n")
1242
+ for i in 0..10 do
1243
+ if /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/ =~ ftlines[i] &&
1244
+ /^FT +\/([^\s\=]+)(?:\=(\")?(.+)(\")?)?\s*$/ =~ ftlines[i+1] then
1245
+ fmt_2019_11 = true
1246
+ break #for i
1247
+ end
1248
+ end #for i
1249
+
1250
+ hash = if fmt_2019_11 then
1251
+ ft_2019_11_parser(ftlines)
1252
+ else
1253
+ ft_legacy_parser(ftlines)
1254
+ end
1255
+ @data['FT'] = hash
1256
+ end
1257
+
1258
+ # FT parser since UniProt release 2019_11
1259
+ # https://www.uniprot.org/release-notes/2019-12-18-release#text%5Fft
1260
+ def ft_2019_11_parser(ftlines)
1261
+ table = []
1262
+ cur_ft = nil
1263
+ cont = false
1264
+ begin
1265
+ ftlines.each do |line|
1266
+ if /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/ =~ line
1267
+ f_name = $1.to_s
1268
+ f_from = "#{$2}#{$4}"
1269
+ f_to = $5.to_s
1270
+ f_to = f_from if f_to.empty?
1271
+ cur_ft = [f_name, # Feature Name
1272
+ f_from, # From
1273
+ f_to, # To
1274
+ [] # Qualifiers
1275
+ ]
1276
+ table.push cur_ft
1277
+ cont = false
1278
+ elsif cont && /^FT {19}/ =~ line
1279
+ str = $'
1280
+ str.rstrip!
1281
+ orig = cur_ft[3][-1][1].to_s
1282
+ if orig.size > 0 && orig[-1] != ' ' &&
1283
+ str.length > 0 && str[0] != ' ' then
1284
+ orig.concat ' '
1285
+ end
1286
+ orig.concat str
1287
+ cur_ft[3][-1][1] = orig
1288
+ if cont && orig[-1] == "\""
1289
+ orig.chop!
1290
+ cont = false
1291
+ end
1292
+ elsif /^FT +\/([^\s\=]+)(?:\=(\")?(.+))?\s*$/ =~ line
1293
+ key = $1
1294
+ val = $3
1295
+ val.rstrip!
1296
+ cur_ft[3].push [ key, val ]
1297
+ cont = false
1298
+ if $2 == "\""
1299
+ if val.to_s[-1] == "\""
1300
+ val.chop!
1301
+ else
1302
+ cont = true
1303
+ end
1304
+ end
1305
+ else
1306
+ raise "FT parse error: #{line.inspect}"
1307
+ end
1308
+ end
1309
+
1310
+ hash = {}
1311
+ table.each do |feature|
1312
+ cur_h = {
1313
+ # Removing '<', '>' or '?' in FROM/TO endopoint.
1314
+ 'From' => feature[1].sub(/\D/, '').to_i,
1315
+ 'To' => feature[2].sub(/\D/, '').to_i,
1316
+ 'diff' => [],
1317
+ 'original' => feature
1318
+ }
1319
+ hash[feature[0]] ||= []
1320
+ hash[feature[0]].push cur_h
1321
+ feature[3].each do |a|
1322
+ case a[0]
1323
+ when 'From', 'To', 'Description', 'FTId', 'diff', 'original'
1324
+ ; # do nothing
1325
+ else
1326
+ cur_h[a[0]] = a[1]
1327
+ end
1328
+ end
1329
+ if cur_h["id"] then
1330
+ cur_h['FTId'] = cur_h['id']
1331
+ end
1332
+
1333
+ case feature[0]
1334
+ when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
1335
+ case cur_h['note'].to_s
1336
+ when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
1337
+ original_res = $1
1338
+ changed_res = $2
1339
+ original_res = original_res.gsub(/ /,'').strip
1340
+ chenged_res = changed_res.gsub(/ /,'').strip
1341
+ when /Missing/i
1342
+ original_res = seq.subseq(cur_h['From'],
1343
+ cur_h['To'])
1344
+ changed_res = ''
1345
+ end
1346
+ cur_h['diff'] = [original_res, chenged_res]
1347
+ end
1348
+ end
1349
+ rescue
1350
+ raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
1351
+ end
1352
+
1353
+ hash
1354
+ end
1355
+ private :ft_2019_11_parser
1356
+
1357
+ # FT parser for the format before Uniprot release 2019_11
1358
+ def ft_legacy_parser(ftlines)
1200
1359
  table = []
1201
1360
  begin
1202
- get('FT').split("\n").each do |line|
1361
+ ftlines.each do |line|
1203
1362
  if line =~ /^FT \w/
1204
1363
  feature = line.chomp.ljust(74)
1205
1364
  table << [feature[ 5..12].strip, # Feature Name
@@ -1256,10 +1415,9 @@ class UniProtKB < EMBLDB
1256
1415
  raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
1257
1416
  end
1258
1417
 
1259
- @data['FT'] = hash
1418
+ hash
1260
1419
  end
1261
-
1262
-
1420
+ private :ft_legacy_parser
1263
1421
 
1264
1422
  # returns a Hash of contents in the SQ lines.
1265
1423
  # * Bio::UniProtKB#sq -> hsh
data/lib/bio/version.rb CHANGED
@@ -10,7 +10,7 @@
10
10
  module Bio
11
11
 
12
12
  # BioRuby version (Array containing Integer)
13
- BIORUBY_VERSION = [2, 0, 3].extend(Comparable).freeze
13
+ BIORUBY_VERSION = [2, 0, 5].extend(Comparable).freeze
14
14
 
15
15
  # Extra version specifier (String or nil).
16
16
  # Existence of the value indicates development version.
@@ -0,0 +1,127 @@
1
+ ID 1A_AMVLE Reviewed; 1126 AA.
2
+ AC P03589;
3
+ DT 21-JUL-1986, integrated into UniProtKB/Swiss-Prot.
4
+ DT 21-JUL-1986, sequence version 1.
5
+ DT 22-FEB-2023, entry version 78.
6
+ DE RecName: Full=Replication protein 1a;
7
+ DE Includes:
8
+ DE RecName: Full=ATP-dependent helicase;
9
+ DE EC=3.6.4.-;
10
+ DE Includes:
11
+ DE RecName: Full=Methyltransferase;
12
+ DE EC=2.1.1.-;
13
+ GN ORFNames=ORF1a;
14
+ OS Alfalfa mosaic virus (strain 425 / isolate Leiden).
15
+ OC Viruses; Riboviria; Orthornavirae; Kitrinoviricota; Alsuviricetes;
16
+ OC Martellivirales; Bromoviridae; Alfamovirus.
17
+ OX NCBI_TaxID=12322;
18
+ OH NCBI_TaxID=4045; Apium graveolens (Celery).
19
+ OH NCBI_TaxID=83862; Astragalus glycyphyllos (Wild liquorice).
20
+ OH NCBI_TaxID=4072; Capsicum annuum (Capsicum pepper).
21
+ OH NCBI_TaxID=41386; Caryopteris incana.
22
+ OH NCBI_TaxID=3827; Cicer arietinum (Chickpea) (Garbanzo).
23
+ OH NCBI_TaxID=3847; Glycine max (Soybean) (Glycine hispida).
24
+ OH NCBI_TaxID=35936; Lablab purpureus (Hyacinth bean) (Dolichos lablab).
25
+ OH NCBI_TaxID=4236; Lactuca sativa (Garden lettuce).
26
+ OH NCBI_TaxID=3864; Lens culinaris (Lentil) (Cicer lens).
27
+ OH NCBI_TaxID=3869; Lupinus.
28
+ OH NCBI_TaxID=145753; Malva parviflora (Little mallow) (Cheeseweed mallow).
29
+ OH NCBI_TaxID=3879; Medicago sativa (Alfalfa).
30
+ OH NCBI_TaxID=4097; Nicotiana tabacum (Common tobacco).
31
+ OH NCBI_TaxID=3885; Phaseolus vulgaris (Kidney bean) (French bean).
32
+ OH NCBI_TaxID=23113; Philadelphus.
33
+ OH NCBI_TaxID=3888; Pisum sativum (Garden pea).
34
+ OH NCBI_TaxID=4081; Solanum lycopersicum (Tomato) (Lycopersicon esculentum).
35
+ OH NCBI_TaxID=4113; Solanum tuberosum (Potato).
36
+ OH NCBI_TaxID=157662; Teramnus repens.
37
+ OH NCBI_TaxID=60916; Trifolium incarnatum (Crimson clover).
38
+ OH NCBI_TaxID=85293; Viburnum opulus (High-bush cranberry).
39
+ OH NCBI_TaxID=3916; Vigna radiata var. radiata (Mung bean) (Phaseolus aureus).
40
+ OH NCBI_TaxID=3917; Vigna unguiculata (Cowpea).
41
+ RN [1]
42
+ RP NUCLEOTIDE SEQUENCE [GENOMIC RNA].
43
+ RX PubMed=6298738; DOI=10.1093/nar/11.5.1253;
44
+ RA Cornelissen B.J.C., Brederode F.T., Moormann R.J.M., Bol J.F.;
45
+ RT "Complete nucleotide sequence of alfalfa mosaic virus RNA 1.";
46
+ RL Nucleic Acids Res. 11:1253-1265(1983).
47
+ CC -!- FUNCTION: Involved in the virus replication. Contains a helicase domain
48
+ CC and a methyltransferase domain. The methyltransferase domain is
49
+ CC probably involved in viral RNA capping. Involved in the formation of ER
50
+ CC membrane spherular invaginations in which RNA replication complexes
51
+ CC form (By similarity). {ECO:0000250}.
52
+ CC -!- SUBUNIT: Interacts with RNA-directed RNA polymerase 2a. {ECO:0000250}.
53
+ CC -!- SUBCELLULAR LOCATION: Host endoplasmic reticulum membrane
54
+ CC {ECO:0000250}; Peripheral membrane protein {ECO:0000250}.
55
+ CC -!- SIMILARITY: Belongs to the bromoviridae replication protein 1a family.
56
+ CC {ECO:0000305}.
57
+ CC ---------------------------------------------------------------------------
58
+ CC Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms
59
+ CC Distributed under the Creative Commons Attribution (CC BY 4.0) License
60
+ CC ---------------------------------------------------------------------------
61
+ DR EMBL; L00163; AAA46289.1; -; Genomic_RNA.
62
+ DR PIR; A04197; WMFM12.
63
+ DR RefSeq; NP_041192.1; NC_001495.1.
64
+ DR SMR; P03589; -.
65
+ DR GeneID; 962667; -.
66
+ DR KEGG; vg:962667; -.
67
+ DR Proteomes; UP000000358; Genome.
68
+ DR GO; GO:0044167; C:host cell endoplasmic reticulum membrane; IEA:UniProtKB-SubCell.
69
+ DR GO; GO:0016020; C:membrane; IEA:UniProtKB-KW.
70
+ DR GO; GO:0005524; F:ATP binding; IEA:UniProtKB-KW.
71
+ DR GO; GO:0004386; F:helicase activity; IEA:UniProtKB-KW.
72
+ DR GO; GO:0016787; F:hydrolase activity; IEA:UniProtKB-KW.
73
+ DR GO; GO:0008174; F:mRNA methyltransferase activity; IEA:InterPro.
74
+ DR GO; GO:0003723; F:RNA binding; IEA:InterPro.
75
+ DR GO; GO:0006396; P:RNA processing; IEA:InterPro.
76
+ DR Gene3D; 3.40.50.300; P-loop containing nucleotide triphosphate hydrolases; 2.
77
+ DR InterPro; IPR027351; (+)RNA_virus_helicase_core_dom.
78
+ DR InterPro; IPR002588; Alphavirus-like_MT_dom.
79
+ DR InterPro; IPR027417; P-loop_NTPase.
80
+ DR Pfam; PF01443; Viral_helicase1; 1.
81
+ DR Pfam; PF01660; Vmethyltransf; 1.
82
+ DR SUPFAM; SSF52540; P-loop containing nucleoside triphosphate hydrolases; 1.
83
+ DR PROSITE; PS51743; ALPHAVIRUS_MT; 1.
84
+ DR PROSITE; PS51657; PSRV_HELICASE; 1.
85
+ PE 3: Inferred from homology;
86
+ KW ATP-binding; Helicase; Host endoplasmic reticulum; Host membrane;
87
+ KW Hydrolase; Membrane; Methyltransferase; Nucleotide-binding;
88
+ KW Reference proteome; Transferase.
89
+ FT CHAIN 1..1126
90
+ FT /note="Replication protein 1a"
91
+ FT /id="PRO_0000083254"
92
+ FT DOMAIN 90..278
93
+ FT /note="Alphavirus-like MT"
94
+ FT /evidence="ECO:0000255|PROSITE-ProRule:PRU01079"
95
+ FT DOMAIN 806..963
96
+ FT /note="(+)RNA virus helicase ATP-binding"
97
+ FT DOMAIN 964..1125
98
+ FT /note="(+)RNA virus helicase C-terminal"
99
+ FT REGION 69..406
100
+ FT /note="Methyltransferase"
101
+ FT REGION 834..1094
102
+ FT /note="ATP-dependent helicase"
103
+ FT BINDING 838..845
104
+ FT /ligand="ATP"
105
+ FT /ligand_id="ChEBI:CHEBI:30616"
106
+ FT /evidence="ECO:0000255"
107
+ SQ SEQUENCE 1126 AA; 125828 MW; BF5A8019B47D4CBF CRC64;
108
+ MNADAQSTDA SLSMREPLSH ASIQEMLRRV VEKQAADDTT AIGKVFSEAG RAYAQDALPS
109
+ DKGEVLKISF SLDATQQNIL RANFPGRRTV FSNSSSSSHC FAAAHRLLET DFVYRCFGNT
110
+ VDSIIDLGGN FVSHMKVKRH NVHCCCPILD ARDGARLTER ILSLKSYVRK HPEIVGEADY
111
+ CMDTFQKCSR RADYAFAIHS TSDLDVGELA CSLDQKGVMK FICTMMVDAD MLIHNEGEIP
112
+ NFNVRWEIDR KKDLIHFDFI DEPNLGYSHR FSLLKHYLTY NAVDLGHAAY RIERKQDFGG
113
+ VMVIDLTYSL GFVPKMPHSN GRSCAWYNRV KGQMVVHTVN EGYYHHSYQT AVRRKVLVDK
114
+ KVLTRVTEVA FRQFRPNADA HSAIQSIATM LSSSTNHTII GGVTLISGKP LSPDDYIPVA
115
+ TTIYYRVKKL YNAIPEMLSL LDKGERLSTD AVLKGSEGPM WYSGPTFLSA LDKVNVPGDF
116
+ VAKALLSLPK RDLKSLFSRS ATSHSERTPV RDESPIRCTD GVFYPIRMLL KCLGSDKFES
117
+ VTITDPRSNT ETTVDLYQSF QKKIETVFSF ILGKIDGPSP LISDPVYFQS LEDVYYAEWH
118
+ QGNAIDASNY ARTLLDDIRK QKEESLKAKA KEVEDAQKLN RAILQVHAYL EAHPDGGKIE
119
+ GLGLSSQFIA KIPELAIPTP KPLPEFEKNA ETGEILRINP HSDAILEAID YLKSTSANSI
120
+ ITLNKLGDHC QWTTKGLDVV WAGDDKRRAF IPKKNTWVGP TARSYPLAKY ERAMSKDGYV
121
+ TLRWDGEVLD ANCVRSLSQY EIVFVDQSCV FASAEAIIPS LEKALGLEAH FSVTIVDGVA
122
+ GCGKTTNIKQ IARSSGRDVD LILTSNRSSA DELKETIDCS PLTKLHYIRT CDSYLMSASA
123
+ VKAQRLIFDE CFLQHAGLVY AAATLAGCSE VIGFGDTEQI PFVSRNPSFV FRHHKLTGKV
124
+ ERKLITWRSP ADATYCLEKY FYKNKKPVKT NSRVLRSIEV VPINSPVSVE RNTNALYLCH
125
+ TQAEKAVLKA QTHLKGCDNI FTTHEAQGKT FDNVYFCRLT RTSTSLATGR DPINGPCNGL
126
+ VALSRHKKTF KYFTIAHDSD DVIYNACRDA GNTDDSILAR SYNHNF
127
+ //