bio 2.0.3 → 2.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -248,9 +248,8 @@ class UniProtKB < EMBLDB
248
248
  # SYNONYM >=0
249
249
  # CONTAINS >=0
250
250
  def protein_name
251
- @data['DE'] ||= parse_DE_line_rel14(get('DE'))
252
- parsed_de_line = @data['DE']
253
- if parsed_de_line then
251
+ parsed_de_line = self.de
252
+ if parsed_de_line.kind_of?(Array) then
254
253
  # since UniProtKB release 14.0 of 22-Jul-2008
255
254
  name = nil
256
255
  parsed_de_line.each do |a|
@@ -275,7 +274,6 @@ class UniProtKB < EMBLDB
275
274
  return name
276
275
  end
277
276
 
278
-
279
277
  # returns synonyms (unofficial and/or alternative names).
280
278
  # Returns an Array containing String objects.
281
279
  #
@@ -292,9 +290,8 @@ class UniProtKB < EMBLDB
292
290
  # synonyms are each placed in () following the official name on the DE line.
293
291
  def synonyms
294
292
  ary = Array.new
295
- @data['DE'] ||= parse_DE_line_rel14(get('DE'))
296
- parsed_de_line = @data['DE']
297
- if parsed_de_line then
293
+ parsed_de_line = self.de
294
+ if parsed_de_line.kind_of?(Array) then
298
295
  # since UniProtKB release 14.0 of 22-Jul-2008
299
296
  parsed_de_line.each do |a|
300
297
  case a[0]
@@ -330,6 +327,20 @@ class UniProtKB < EMBLDB
330
327
  return ary
331
328
  end
332
329
 
330
# Returns the parsed DE line, memoized in @data['DE'].
#
# * An Array when parse_DE_line_rel14 recognizes the new format
#   (UniProtKB release 14 and later).
# * Otherwise whatever the superclass implementation of #de provides
#   (presumably the old-format String -- confirm against the superclass).
#
def de
  return @data['DE'] if @data['DE']

  parsed_de_line = parse_DE_line_rel14(get('DE'))
  if parsed_de_line.kind_of?(Array)
    # new format since rel14
    @data['DE'] = parsed_de_line
  else
    # Old format: delegate to the superclass.  Call it first, then cache
    # its return value only if it did not already populate @data['DE']
    # itself, so the result is correct either way.
    inherited = super
    @data['DE'] ||= inherited
  end
  @data['DE']
end
333
344
 
334
345
  # returns gene names in the GN line.
335
346
  #
@@ -519,22 +530,43 @@ class UniProtKB < EMBLDB
519
530
  # http://br.expasy.org/sprot/userman.html#OH_line
520
531
  def oh
521
532
  unless @data['OH']
522
- @data['OH'] = fetch('OH').split("\. ").map {|x|
523
- if x =~ /NCBI_TaxID=(\d+);/
524
- taxid = $1
525
- else
526
- raise ArgumentError, ["Error: Invalid OH line format (#{self.entry_id}):",
527
- $!, "\n", get('OH'), "\n"].join
528
-
529
- end
530
- if x =~ /NCBI_TaxID=\d+; (.+)/
531
- host_name = $1
532
- host_name.sub!(/\.$/, '')
533
- else
534
- host_name = nil
533
+ oh = []
534
+ a = fetch('OH').split(/(NCBI\_TaxID\=)(\d+)(\;)/)
535
+ t = catch :error do
536
+ taxid = nil
537
+ host_name = nil
538
+ while x = a.shift
539
+ x = x.to_s.strip
540
+ case x
541
+ when ''
542
+ next
543
+ when 'NCBI_TaxID='
544
+ if taxid then
545
+ oh.push({'NCBI_TaxID' => taxid, 'HostName' => host_name})
546
+ taxid = nil
547
+ host_name = nil
548
+ end
549
+ taxid = a.shift
550
+ throw :error, :missing_semicolon if a.shift != ';'
551
+ else
552
+ throw :error, :missing_taxid if host_name
553
+ host_name = x
554
+ host_name.sub!(/\.\z/, '')
555
+ end
556
+ end #while x...
557
+ if taxid then
558
+ oh.push({'NCBI_TaxID' => taxid, 'HostName' => host_name})
559
+ elsif host_name then
560
+ throw :error, :missing_taxid_last
535
561
  end
536
- {'NCBI_TaxID' => taxid, 'HostName' => host_name}
537
- }
562
+ nil
563
+ end #t = catch...
564
+ if t then
565
+ raise ArgumentError,
566
+ ["Error: Invalid OH line format (#{self.entry_id}):",
567
+ $!, "\n", get('OH'), "\n"].join
568
+ end
569
+ @data['OH'] = oh
538
570
  end
539
571
  @data['OH']
540
572
  end
@@ -911,6 +943,7 @@ class UniProtKB < EMBLDB
911
943
 
912
944
 
913
945
  def cc_alternative_products(data)
946
+ return nil unless data
914
947
  ap = data.join('')
915
948
  return ap unless ap
916
949
 
@@ -949,6 +982,7 @@ class UniProtKB < EMBLDB
949
982
 
950
983
 
951
984
  def cc_biophysiochemical_properties(data)
985
+ return nil unless data
952
986
  data = data[0]
953
987
 
954
988
  hash = {'Absorption' => {},
@@ -984,6 +1018,7 @@ class UniProtKB < EMBLDB
984
1018
 
985
1019
 
986
1020
  def cc_caution(data)
1021
+ return nil unless data
987
1022
  data.join('')
988
1023
  end
989
1024
  private :cc_caution
@@ -993,6 +1028,7 @@ class UniProtKB < EMBLDB
993
1028
  #
994
1029
  # CC P46527:CDKN1B; NbExp=1; IntAct=EBI-359815, EBI-519280;
995
1030
  def cc_interaction(data)
1031
+ return nil unless data
996
1032
  str = data.join('')
997
1033
  it = str.scan(/(.+?); NbExp=(.+?); IntAct=(.+?);/)
998
1034
  it.map {|ent|
@@ -1048,6 +1084,7 @@ class UniProtKB < EMBLDB
1048
1084
 
1049
1085
 
1050
1086
  def cc_pathway(data)
1087
+ return nil unless data
1051
1088
  data.map {|x| x.sub(/\.$/, '') }.map {|x|
1052
1089
  x.split(/; | and |: /)
1053
1090
  }[0]
@@ -1056,6 +1093,7 @@ class UniProtKB < EMBLDB
1056
1093
 
1057
1094
 
1058
1095
  def cc_rna_editing(data)
1096
+ return nil unless data
1059
1097
  data = data.join('')
1060
1098
  entry = {'Modified_positions' => [], 'Note' => ""}
1061
1099
  if data =~ /Modified_positions=(.+?)(\.|;)/
@@ -1072,6 +1110,7 @@ class UniProtKB < EMBLDB
1072
1110
 
1073
1111
 
1074
1112
  def cc_subcellular_location(data)
1113
+ return nil unless data
1075
1114
  data.map {|x|
1076
1115
  x.split('. ').map {|y|
1077
1116
  y.split('; ').map {|z|
@@ -1090,6 +1129,7 @@ class UniProtKB < EMBLDB
1090
1129
  #++
1091
1130
 
1092
1131
  def cc_web_resource(data)
1132
+ return nil unless data
1093
1133
  data.map {|x|
1094
1134
  entry = {'Name' => nil, 'Note' => nil, 'URL' => nil}
1095
1135
  x.split(';').each do |y|
@@ -1197,9 +1237,128 @@ class UniProtKB < EMBLDB
1197
1237
  return ft[feature_key] if feature_key
1198
1238
  return @data['FT'] if @data['FT']
1199
1239
 
1240
+ ftstr = get('FT')
1241
+ ftlines = ftstr.split("\n")
1242
+ for i in 0..10 do
1243
+ if /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/ =~ ftlines[i] &&
1244
+ /^FT +\/([^\s\=]+)(?:\=(\")?(.+)(\")?)?\s*$/ =~ ftlines[i+1] then
1245
+ fmt_2019_11 = true
1246
+ break #for i
1247
+ end
1248
+ end #for i
1249
+
1250
+ hash = if fmt_2019_11 then
1251
+ ft_2019_11_parser(ftlines)
1252
+ else
1253
+ ft_legacy_parser(ftlines)
1254
+ end
1255
+ @data['FT'] = hash
1256
+ end
1257
+
1258
# FT parser since UniProt release 2019_11
# https://www.uniprot.org/release-notes/2019-12-18-release#text%5Fft
#
# ftlines:: Array of raw FT lines (Strings), one line per element.
#
# Returns a Hash keyed by feature name.  Each value is an Array of Hashes
# with the keys:
# 'From', 'To'::  Integer positions ('<', '>', '?' and any
#                 "sequence-id:" prefix removed; 0 when the position
#                 is just '?').
# 'diff'::        [original, changed] for VARSPLIC, VARIANT, VAR_SEQ and
#                 CONFLICT features, otherwise [].
# 'original'::    [name, from, to, [[qualifier, value], ...]] as read.
# 'FTId'::        copy of the 'id' qualifier, when present.
# other keys::    one per qualifier (e.g. 'note', 'evidence'); a
#                 qualifier written without "=value" is stored as nil.
#
# Raises RuntimeError ("Invalid FT Lines...") on any parse failure.
def ft_2019_11_parser(ftlines)
  table = []
  cur_ft = nil
  cont = false # true while a quoted qualifier value continues on later lines
  hash = {}
  begin
    ftlines.each do |line|
      if cont && /^FT {19}/ =~ line
        # Continuation lines are tested first: wrapped text such as
        # "isoform 2" would otherwise match the feature-location pattern
        # below and be taken for a new feature.
        str = $'.rstrip
        prev = cur_ft[3][-1][1].to_s
        if !prev.empty? && prev[-1] != ' ' &&
           !str.empty? && str[0] != ' '
          prev.concat ' '
        end
        prev.concat str
        if prev[-1] == "\""
          # closing quote reached: drop it and end the continuation
          prev.chop!
          cont = false
        end
        cur_ft[3][-1][1] = prev
      elsif /^FT +([^\s]+) +(([^\s]+)\:)?([\<\?]?[0-9]+|\?)(?:\.\.([\>\?]?[0-9]+|\?))?\s*$/ =~ line
        f_name = $1.to_s
        f_from = "#{$2}#{$4}"
        f_to = $5.to_s
        f_to = f_from if f_to.empty?
        cur_ft = [f_name, # Feature Name
                  f_from, # From
                  f_to,   # To
                  []      # Qualifiers
                 ]
        table.push cur_ft
        cont = false
      elsif /^FT +\/([^\s\=]+)(?:\=(\")?(.+))?\s*$/ =~ line
        key = $1
        quoted = ($2 == "\"")
        val = $3
        # The value is optional in the pattern above, so val may be nil.
        val = val.rstrip if val
        cur_ft[3].push [ key, val ]
        cont = false
        if quoted
          if val.to_s[-1] == "\""
            val.chop!
          else
            cont = true
          end
        end
      else
        raise "FT parse error: #{line.inspect}"
      end
    end

    table.each do |feature|
      cur_h = {
        # Keep only the trailing digits: this removes '<', '>' or '?' as
        # well as a leading "sequence-id:" prefix from the FROM/TO endpoint.
        'From' => feature[1][/\d+\z/].to_i,
        'To' => feature[2][/\d+\z/].to_i,
        'diff' => [],
        'original' => feature
      }
      hash[feature[0]] ||= []
      hash[feature[0]].push cur_h
      feature[3].each do |a|
        case a[0]
        when 'From', 'To', 'Description', 'FTId', 'diff', 'original'
          ; # reserved keys are never overwritten by qualifiers
        else
          cur_h[a[0]] = a[1]
        end
      end
      cur_h['FTId'] = cur_h['id'] if cur_h['id']

      case feature[0]
      when 'VARSPLIC', 'VARIANT', 'VAR_SEQ', 'CONFLICT'
        original_res = nil
        changed_res = nil
        case cur_h['note'].to_s
        when /(\w[\w ]*\w*) - ?> (\w[\w ]*\w*)/
          # copy both captures before gsub resets the match variables
          original_res = $1
          changed_res = $2
          original_res = original_res.gsub(/ /, '').strip
          changed_res = changed_res.gsub(/ /, '').strip
        when /Missing/i
          original_res = seq.subseq(cur_h['From'],
                                    cur_h['To'])
          changed_res = ''
        end
        cur_h['diff'] = [original_res, changed_res]
      end
    end
  rescue => e
    raise "Invalid FT Lines(#{e}) in #{entry_id}:, \n'#{get('FT')}'\n"
  end

  hash
end
private :ft_2019_11_parser
1356
+
1357
+ # FT parser for the format before Uniprot release 2019_11
1358
+ def ft_legacy_parser(ftlines)
1200
1359
  table = []
1201
1360
  begin
1202
- get('FT').split("\n").each do |line|
1361
+ ftlines.each do |line|
1203
1362
  if line =~ /^FT \w/
1204
1363
  feature = line.chomp.ljust(74)
1205
1364
  table << [feature[ 5..12].strip, # Feature Name
@@ -1256,10 +1415,9 @@ class UniProtKB < EMBLDB
1256
1415
  raise "Invalid FT Lines(#{$!}) in #{entry_id}:, \n'#{self.get('FT')}'\n"
1257
1416
  end
1258
1417
 
1259
- @data['FT'] = hash
1418
+ hash
1260
1419
  end
1261
-
1262
-
1420
+ private :ft_legacy_parser
1263
1421
 
1264
1422
  # returns a Hash of contents in the SQ lines.
1265
1423
  # * Bio::UniProtKBL#sq -> hsh
data/lib/bio/version.rb CHANGED
@@ -10,7 +10,7 @@
10
10
  module Bio
11
11
 
12
12
  # BioRuby version (Array containing Integer)
13
- BIORUBY_VERSION = [2, 0, 3].extend(Comparable).freeze
13
+ BIORUBY_VERSION = [2, 0, 5].extend(Comparable).freeze
14
14
 
15
15
  # Extra version specifier (String or nil).
16
16
  # Existence of the value indicates development version.
@@ -0,0 +1,127 @@
1
+ ID 1A_AMVLE Reviewed; 1126 AA.
2
+ AC P03589;
3
+ DT 21-JUL-1986, integrated into UniProtKB/Swiss-Prot.
4
+ DT 21-JUL-1986, sequence version 1.
5
+ DT 22-FEB-2023, entry version 78.
6
+ DE RecName: Full=Replication protein 1a;
7
+ DE Includes:
8
+ DE RecName: Full=ATP-dependent helicase;
9
+ DE EC=3.6.4.-;
10
+ DE Includes:
11
+ DE RecName: Full=Methyltransferase;
12
+ DE EC=2.1.1.-;
13
+ GN ORFNames=ORF1a;
14
+ OS Alfalfa mosaic virus (strain 425 / isolate Leiden).
15
+ OC Viruses; Riboviria; Orthornavirae; Kitrinoviricota; Alsuviricetes;
16
+ OC Martellivirales; Bromoviridae; Alfamovirus.
17
+ OX NCBI_TaxID=12322;
18
+ OH NCBI_TaxID=4045; Apium graveolens (Celery).
19
+ OH NCBI_TaxID=83862; Astragalus glycyphyllos (Wild liquorice).
20
+ OH NCBI_TaxID=4072; Capsicum annuum (Capsicum pepper).
21
+ OH NCBI_TaxID=41386; Caryopteris incana.
22
+ OH NCBI_TaxID=3827; Cicer arietinum (Chickpea) (Garbanzo).
23
+ OH NCBI_TaxID=3847; Glycine max (Soybean) (Glycine hispida).
24
+ OH NCBI_TaxID=35936; Lablab purpureus (Hyacinth bean) (Dolichos lablab).
25
+ OH NCBI_TaxID=4236; Lactuca sativa (Garden lettuce).
26
+ OH NCBI_TaxID=3864; Lens culinaris (Lentil) (Cicer lens).
27
+ OH NCBI_TaxID=3869; Lupinus.
28
+ OH NCBI_TaxID=145753; Malva parviflora (Little mallow) (Cheeseweed mallow).
29
+ OH NCBI_TaxID=3879; Medicago sativa (Alfalfa).
30
+ OH NCBI_TaxID=4097; Nicotiana tabacum (Common tobacco).
31
+ OH NCBI_TaxID=3885; Phaseolus vulgaris (Kidney bean) (French bean).
32
+ OH NCBI_TaxID=23113; Philadelphus.
33
+ OH NCBI_TaxID=3888; Pisum sativum (Garden pea).
34
+ OH NCBI_TaxID=4081; Solanum lycopersicum (Tomato) (Lycopersicon esculentum).
35
+ OH NCBI_TaxID=4113; Solanum tuberosum (Potato).
36
+ OH NCBI_TaxID=157662; Teramnus repens.
37
+ OH NCBI_TaxID=60916; Trifolium incarnatum (Crimson clover).
38
+ OH NCBI_TaxID=85293; Viburnum opulus (High-bush cranberry).
39
+ OH NCBI_TaxID=3916; Vigna radiata var. radiata (Mung bean) (Phaseolus aureus).
40
+ OH NCBI_TaxID=3917; Vigna unguiculata (Cowpea).
41
+ RN [1]
42
+ RP NUCLEOTIDE SEQUENCE [GENOMIC RNA].
43
+ RX PubMed=6298738; DOI=10.1093/nar/11.5.1253;
44
+ RA Cornelissen B.J.C., Brederode F.T., Moormann R.J.M., Bol J.F.;
45
+ RT "Complete nucleotide sequence of alfalfa mosaic virus RNA 1.";
46
+ RL Nucleic Acids Res. 11:1253-1265(1983).
47
+ CC -!- FUNCTION: Involved in the virus replication. Contains a helicase domain
48
+ CC and a methyltransferase domain. The methyltransferase domain is
49
+ CC probably involved in viral RNA capping. Involved in the formation of ER
50
+ CC membrane spherular invaginations in which RNA replication complexes
51
+ CC form (By similarity). {ECO:0000250}.
52
+ CC -!- SUBUNIT: Interacts with RNA-directed RNA polymerase 2a. {ECO:0000250}.
53
+ CC -!- SUBCELLULAR LOCATION: Host endoplasmic reticulum membrane
54
+ CC {ECO:0000250}; Peripheral membrane protein {ECO:0000250}.
55
+ CC -!- SIMILARITY: Belongs to the bromoviridae replication protein 1a family.
56
+ CC {ECO:0000305}.
57
+ CC ---------------------------------------------------------------------------
58
+ CC Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms
59
+ CC Distributed under the Creative Commons Attribution (CC BY 4.0) License
60
+ CC ---------------------------------------------------------------------------
61
+ DR EMBL; L00163; AAA46289.1; -; Genomic_RNA.
62
+ DR PIR; A04197; WMFM12.
63
+ DR RefSeq; NP_041192.1; NC_001495.1.
64
+ DR SMR; P03589; -.
65
+ DR GeneID; 962667; -.
66
+ DR KEGG; vg:962667; -.
67
+ DR Proteomes; UP000000358; Genome.
68
+ DR GO; GO:0044167; C:host cell endoplasmic reticulum membrane; IEA:UniProtKB-SubCell.
69
+ DR GO; GO:0016020; C:membrane; IEA:UniProtKB-KW.
70
+ DR GO; GO:0005524; F:ATP binding; IEA:UniProtKB-KW.
71
+ DR GO; GO:0004386; F:helicase activity; IEA:UniProtKB-KW.
72
+ DR GO; GO:0016787; F:hydrolase activity; IEA:UniProtKB-KW.
73
+ DR GO; GO:0008174; F:mRNA methyltransferase activity; IEA:InterPro.
74
+ DR GO; GO:0003723; F:RNA binding; IEA:InterPro.
75
+ DR GO; GO:0006396; P:RNA processing; IEA:InterPro.
76
+ DR Gene3D; 3.40.50.300; P-loop containing nucleotide triphosphate hydrolases; 2.
77
+ DR InterPro; IPR027351; (+)RNA_virus_helicase_core_dom.
78
+ DR InterPro; IPR002588; Alphavirus-like_MT_dom.
79
+ DR InterPro; IPR027417; P-loop_NTPase.
80
+ DR Pfam; PF01443; Viral_helicase1; 1.
81
+ DR Pfam; PF01660; Vmethyltransf; 1.
82
+ DR SUPFAM; SSF52540; P-loop containing nucleoside triphosphate hydrolases; 1.
83
+ DR PROSITE; PS51743; ALPHAVIRUS_MT; 1.
84
+ DR PROSITE; PS51657; PSRV_HELICASE; 1.
85
+ PE 3: Inferred from homology;
86
+ KW ATP-binding; Helicase; Host endoplasmic reticulum; Host membrane;
87
+ KW Hydrolase; Membrane; Methyltransferase; Nucleotide-binding;
88
+ KW Reference proteome; Transferase.
89
+ FT CHAIN 1..1126
90
+ FT /note="Replication protein 1a"
91
+ FT /id="PRO_0000083254"
92
+ FT DOMAIN 90..278
93
+ FT /note="Alphavirus-like MT"
94
+ FT /evidence="ECO:0000255|PROSITE-ProRule:PRU01079"
95
+ FT DOMAIN 806..963
96
+ FT /note="(+)RNA virus helicase ATP-binding"
97
+ FT DOMAIN 964..1125
98
+ FT /note="(+)RNA virus helicase C-terminal"
99
+ FT REGION 69..406
100
+ FT /note="Methyltransferase"
101
+ FT REGION 834..1094
102
+ FT /note="ATP-dependent helicase"
103
+ FT BINDING 838..845
104
+ FT /ligand="ATP"
105
+ FT /ligand_id="ChEBI:CHEBI:30616"
106
+ FT /evidence="ECO:0000255"
107
+ SQ SEQUENCE 1126 AA; 125828 MW; BF5A8019B47D4CBF CRC64;
108
+ MNADAQSTDA SLSMREPLSH ASIQEMLRRV VEKQAADDTT AIGKVFSEAG RAYAQDALPS
109
+ DKGEVLKISF SLDATQQNIL RANFPGRRTV FSNSSSSSHC FAAAHRLLET DFVYRCFGNT
110
+ VDSIIDLGGN FVSHMKVKRH NVHCCCPILD ARDGARLTER ILSLKSYVRK HPEIVGEADY
111
+ CMDTFQKCSR RADYAFAIHS TSDLDVGELA CSLDQKGVMK FICTMMVDAD MLIHNEGEIP
112
+ NFNVRWEIDR KKDLIHFDFI DEPNLGYSHR FSLLKHYLTY NAVDLGHAAY RIERKQDFGG
113
+ VMVIDLTYSL GFVPKMPHSN GRSCAWYNRV KGQMVVHTVN EGYYHHSYQT AVRRKVLVDK
114
+ KVLTRVTEVA FRQFRPNADA HSAIQSIATM LSSSTNHTII GGVTLISGKP LSPDDYIPVA
115
+ TTIYYRVKKL YNAIPEMLSL LDKGERLSTD AVLKGSEGPM WYSGPTFLSA LDKVNVPGDF
116
+ VAKALLSLPK RDLKSLFSRS ATSHSERTPV RDESPIRCTD GVFYPIRMLL KCLGSDKFES
117
+ VTITDPRSNT ETTVDLYQSF QKKIETVFSF ILGKIDGPSP LISDPVYFQS LEDVYYAEWH
118
+ QGNAIDASNY ARTLLDDIRK QKEESLKAKA KEVEDAQKLN RAILQVHAYL EAHPDGGKIE
119
+ GLGLSSQFIA KIPELAIPTP KPLPEFEKNA ETGEILRINP HSDAILEAID YLKSTSANSI
120
+ ITLNKLGDHC QWTTKGLDVV WAGDDKRRAF IPKKNTWVGP TARSYPLAKY ERAMSKDGYV
121
+ TLRWDGEVLD ANCVRSLSQY EIVFVDQSCV FASAEAIIPS LEKALGLEAH FSVTIVDGVA
122
+ GCGKTTNIKQ IARSSGRDVD LILTSNRSSA DELKETIDCS PLTKLHYIRT CDSYLMSASA
123
+ VKAQRLIFDE CFLQHAGLVY AAATLAGCSE VIGFGDTEQI PFVSRNPSFV FRHHKLTGKV
124
+ ERKLITWRSP ADATYCLEKY FYKNKKPVKT NSRVLRSIEV VPINSPVSVE RNTNALYLCH
125
+ TQAEKAVLKA QTHLKGCDNI FTTHEAQGKT FDNVYFCRLT RTSTSLATGR DPINGPCNGL
126
+ VALSRHKKTF KYFTIAHDSD DVIYNACRDA GNTDDSILAR SYNHNF
127
+ //