bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,122 @@
1
+ #
2
+ # bio/db/pdb/residue.rb - residue class for PDB
3
+ #
4
+ # Copyright (C) 2004 Alex Gutteridge <alexg@ebi.ac.uk>
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # $Id: residue.rb,v 1.4 2005/12/18 17:34:47 ngoto Exp $
21
+
22
+ require 'bio/db/pdb'
23
+
24
+ module Bio
25
+
26
+ class PDB
27
+
28
+ #Residue class - id is a composite of resSeq and iCode
29
+ class Residue
30
+
31
+ include Utils
32
+ include AtomFinder
33
+ include Enumerable
34
+ include Comparable
35
+
36
+ attr_reader :resName, :resSeq, :iCode, :id, :chain, :hetatm
37
+ attr_writer :resName, :chain, :hetatm
38
+
39
def initialize(resName = nil, resSeq = nil, iCode = nil,
               chain = nil, hetatm = false)
  # Creates a residue with no atoms.
  #
  # resName - residue name
  # resSeq  - residue sequence number
  # iCode   - insertion code (may be nil or blank)
  # chain   - owning chain object, if any
  # hetatm  - true when the residue comes from HETATM records
  @resName = resName
  @resSeq  = resSeq
  @iCode   = iCode
  @hetatm  = hetatm
  @chain   = chain
  @atoms   = []

  # resSeq alone does not uniquely identify a residue, so the id is
  # resSeq followed by the stripped insertion code, prefixed with
  # 'LIGAND' when the residue is a HETATM.  The id stays nil only when
  # neither resSeq nor iCode was supplied.
  if @resSeq || @iCode
    # to_s guards against a nil iCode when only resSeq is given;
    # calling nil.strip used to raise NoMethodError here.
    @id = "#{@resSeq}#{@iCode.to_s.strip}"
    @id = 'LIGAND' + @id if @hetatm
  else
    @id = nil
  end
end
65
+
66
+ #Keyed access to atoms based on element e.g. ["CA"]
67
def [](key)
  # Returns the first atom whose #element equals key, or nil if none does.
  @atoms.find do |candidate|
    key == candidate.element
  end
end
70
+
71
+ #Need to define these to make sure id is correctly updated
72
def resSeq=(resSeq)
  # Stores resSeq as an Integer and rebuilds the composite id
  # (resSeq + stripped iCode, prefixed with 'LIGAND' for HETATM).
  @resSeq = resSeq.to_i
  # to_s keeps this working when no insertion code has been set (nil).
  @id = "#{@resSeq}#{@iCode.to_s.strip}"
  @id = 'LIGAND' + @id if @hetatm
end
79
+
80
def iCode=(iCode)
  # Stores the insertion code and rebuilds the composite id
  # (resSeq + stripped iCode, prefixed with 'LIGAND' for HETATM).
  @iCode = iCode
  # to_s allows the insertion code to be cleared by assigning nil.
  @id = "#{@resSeq}#{@iCode.to_s.strip}"
  @id = 'LIGAND' + @id if @hetatm
end
87
+
88
+ #Adds an atom to this residue
89
def addAtom(atom)
  # Appends atom to this residue and returns self so calls can be chained.
  # Raises RuntimeError unless atom is a Bio::PDB::Record::ATOM.
  unless atom.is_a?(Bio::PDB::Record::ATOM)
    raise "Expecting ATOM or HETATM"
  end
  @atoms << atom
  self
end
94
+
95
+ #Iterator over the atoms
96
+ def each
97
+ @atoms.each{ |atom| yield atom }
98
+ end
99
+ #Alias to override AtomFinder#each_atom
100
+ alias each_atom each
101
+
102
+ #Sorts based on resSeq and iCode if need be
103
def <=>(other)
  # Orders residues by resSeq; the insertion code breaks ties.
  return @iCode <=> other.iCode if @resSeq == other.resSeq
  @resSeq <=> other.resSeq
end
110
+
111
+ #Stringifies each atom
112
+ def to_s
113
+ string = ""
114
+ @atoms.each{ |atom| string << atom.to_s << "\n" }
115
+ return string
116
+ end
117
+
118
+ end
119
+
120
+ end
121
+
122
+ end
@@ -0,0 +1,234 @@
1
+ #
2
+ # bio/db/pdb/utils.rb - Utility modules for PDB
3
+ #
4
+ # Copyright (C) 2004 Alex Gutteridge <alexg@ebi.ac.uk>
5
+ # Copyright (C) 2004 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
6
+ #
7
+ # This library is free software; you can redistribute it and/or
8
+ # modify it under the terms of the GNU Lesser General Public
9
+ # License as published by the Free Software Foundation; either
10
+ # version 2 of the License, or (at your option) any later version.
11
+ #
12
+ # This library is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ # Lesser General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Lesser General Public
18
+ # License along with this library; if not, write to the Free Software
19
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
+ #
21
+ # $Id: utils.rb,v 1.2 2005/09/08 01:22:11 k Exp $
22
+
23
+ require 'matrix'
24
+ require 'bio/db/pdb'
25
+
26
+ module Bio; class PDB
27
+
28
+ module Utils
29
+ #The methods in this mixin should be applicable to all PDB objects
30
+
31
+ #Returns the coordinates of the geometric centre (average co-ord)
32
+ #of any AtomFinder (or .atoms) implementing object
33
def geometricCentre()
  # Averages the x, y and z of every atom yielded by each_atom and
  # returns the result as a Coordinate.
  sum_x = sum_y = sum_z = 0
  atom_count = 0

  self.each_atom do |atom|
    sum_x += atom.x
    sum_y += atom.y
    sum_z += atom.z
    atom_count += 1
  end

  Coordinate[sum_x / atom_count, sum_y / atom_count, sum_z / atom_count]
end
51
+
52
+ #Returns the coords of the centre of gravity for any
53
+ #AtomFinder implementing object
54
+ #Blleurgh! - working out what element it is from the atom name is
55
+ #tricky - this'll work in most cases but not metals etc...
56
+ #a proper element field is included in some PDB files but not all.
57
+ ElementMass = {
58
+ 'H' => 1,
59
+ 'C' => 12,
60
+ 'N' => 14,
61
+ 'O' => 16,
62
+ 'S' => 32,
63
+ 'P' => 31
64
+ }
65
+
66
def centreOfGravity()
  # Mass-weighted average position of every atom yielded by each_atom,
  # returned as a Coordinate.  The mass is looked up in ElementMass using
  # only the first character of atom.element.
  weighted_x = weighted_y = weighted_z = 0
  mass_sum = 0

  self.each_atom do |atom|
    mass = ElementMass[atom.element[0, 1]]
    mass_sum += mass
    weighted_x += atom.x * mass
    weighted_y += atom.y * mass
    weighted_z += atom.z * mass
  end

  Coordinate[weighted_x / mass_sum, weighted_y / mass_sum, weighted_z / mass_sum]
end
86
+
87
+ #Perhaps distance and dihedral would be better off as class methods?
88
+ #(rather) than instance methods
89
def self.distance(coord1,coord2)
  # Converts both arguments with to_xyz and returns the length (#r) of
  # their difference.
  (to_xyz(coord1) - to_xyz(coord2)).r
end
94
+
95
def self.dihedral_angle(coord1,coord2,coord3,coord4)
  # Angle (radians) between the plane through coord1..coord3 and the plane
  # through coord2..coord4.  The sign is negative when coord4 evaluates
  # below zero in the first plane's equation.
  a1, b1, c1, d = calculatePlane(coord1, coord2, coord3)
  a2, b2, c2    = calculatePlane(coord2, coord3, coord4)

  dot     = a1*a2 + b1*b2 + c1*c2
  lengths = Math.sqrt(a1**2 + b1**2 + c1**2) * Math.sqrt(a2**2 + b2**2 + c2**2)
  torsion = acos(dot/lengths)

  side = a1*coord4.x + b1*coord4.y + c1*coord4.z + d
  side < 0 ? -torsion : torsion
end
108
+
109
+ #Implicit conversion into Vector or Bio::PDB::Coordinate
110
def self.to_xyz(obj)
  # Vectors pass through untouched.  Anything else is asked for #xyz; if
  # that raises NameError (e.g. the method is missing) a Vector is built
  # from obj.to_a instead.
  return obj if obj.is_a?(Vector)

  begin
    obj.xyz
  rescue NameError
    Vector.elements(obj.to_a)
  end
end
120
+
121
+ #Methods required for the dihedral angle calculations
122
+ #perhaps these should go in some separate Math module
123
+ def self.rad2deg(r)
124
+ (r/Math::PI)*180
125
+ end
126
+
127
def self.acos(x)
  # Arc cosine of x in radians, in the range 0..PI.
  #
  # x is expected to be a cosine.  Rounding in the caller can push it
  # marginally outside [-1, 1], which would make 1 - x**2 negative and
  # cause Math.sqrt to raise Math::DomainError, so it is clamped first.
  x = 1.0 if x > 1
  x = -1.0 if x < -1
  Math.atan2(Math.sqrt(1 - x**2), x)
end
130
+
131
def self.calculatePlane(coord1,coord2,coord3)
  # Returns [a, b, c, d], the coefficients of the plane
  # a*x + b*y + c*z + d = 0 passing through the three points.
  x1, y1, z1 = coord1.x, coord1.y, coord1.z
  x2, y2, z2 = coord2.x, coord2.y, coord2.z
  x3, y3, z3 = coord3.x, coord3.y, coord3.z

  a = y1 * (z2 - z3) + y2 * (z3 - z1) + y3 * (z1 - z2)
  b = z1 * (x2 - x3) + z2 * (x3 - x1) + z3 * (x1 - x2)
  c = x1 * (y2 - y3) + x2 * (y3 - y1) + x3 * (y1 - y2)
  d = -1 * (
        (x1 * (y2 * z3 - y3 * z2)) +
        (x2 * (y3 * z1 - y1 * z3)) +
        (x3 * (y1 * z2 - y2 * z1))
      )

  [a, b, c, d]
end
151
+
152
+ #Every class in the hierarchy implements finder, this takes
153
+ #a class which determines which type of object to find, the associated
154
+ #block is then run in classic .find style
155
def finder(findtype,&block)
  # Dispatches to find_atom / find_residue / find_chain / find_model
  # depending on which PDB class is passed, forwarding the block.
  # Raises TypeError for any other findtype.
  return self.find_atom(&block)    if findtype == Bio::PDB::Atom
  return self.find_residue(&block) if findtype == Bio::PDB::Residue
  return self.find_chain(&block)   if findtype == Bio::PDB::Chain
  return self.find_model(&block)   if findtype == Bio::PDB::Model

  raise TypeError, "You can't find a #{findtype}"
end
168
+ end #module Utils
169
+
170
+ #The *Finder modules implement a find_* method which returns
171
+ #an array of anything for which the block evals true
172
+ #(suppose Enumerable#find_all method).
173
+ #The each_* style methods act as classic iterators.
174
module ModelFinder
  # Returns an array of the models (as yielded by each_model) for which
  # the given block returns a true value.
  def find_model()
    matches = []
    self.each_model do |model|
      matches << model if yield(model)
    end
    matches
  end
end
183
+
184
+ #The hierarchical nature of the objects allow us to re-use the
185
+ #methods from the previous level - e.g. A PDB object can use the .models
186
+ #method defined in ModelFinder to iterate through the models to find the
187
+ #chains
188
module ChainFinder
  # Returns an array of the chains for which the given block returns a
  # true value.
  def find_chain()
    matches = []
    self.each_chain do |chain|
      matches << chain if yield(chain)
    end
    matches
  end

  # Yields every chain of every model, in each_model order.
  def each_chain()
    self.each_model do |model|
      model.each do |chain|
        yield chain
      end
    end
  end
end
202
+
203
module ResidueFinder
  # Returns an array of the residues for which the given block returns a
  # true value.
  def find_residue()
    matches = []
    self.each_residue do |residue|
      matches << residue if yield(residue)
    end
    matches
  end

  # Yields every residue of every chain, in each_chain order.
  def each_residue()
    self.each_chain do |chain|
      chain.each do |residue|
        yield residue
      end
    end
  end
end
217
+
218
module AtomFinder
  # Returns an array of the atoms for which the given block returns a
  # true value.
  def find_atom()
    matches = []
    self.each_atom do |atom|
      matches << atom if yield(atom)
    end
    matches
  end

  # Yields every atom of every residue, in each_residue order.
  def each_atom()
    self.each_residue do |residue|
      residue.each do |atom|
        yield atom
      end
    end
  end
end
232
+
233
+ end; end #module Bio; class PDB
234
+
@@ -0,0 +1,616 @@
1
+ #
2
+ # = bio/db/prosite.rb - PROSITE database class
3
+ #
4
+ # Copyright:: Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
5
+ # Licence:: LGPL
6
+ #
7
+ # $Id: prosite.rb,v 0.13 2005/12/18 18:24:08 k Exp $
8
+ #
9
+ # == Description
10
+ #
11
+ #
12
+ # == Example
13
+ # == References
14
+ #--
15
+ #
16
+ # This library is free software; you can redistribute it and/or
17
+ # modify it under the terms of the GNU Lesser General Public
18
+ # License as published by the Free Software Foundation; either
19
+ # version 2 of the License, or (at your option) any later version.
20
+ #
21
+ # This library is distributed in the hope that it will be useful,
22
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
23
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24
+ # Lesser General Public License for more details.
25
+ #
26
+ # You should have received a copy of the GNU Lesser General Public
27
+ # License along with this library; if not, write to the Free Software
28
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29
+ #
30
+ #++
31
+ #
32
+
33
+ require 'bio/db'
34
+
35
+ module Bio
36
+
37
+ class PROSITE < EMBLDB
38
+
39
+ # Delimiter
40
+ DELIMITER = "\n//\n"
41
+
42
+ # Delimiter
43
+ RS = DELIMITER
44
+
45
+ # Bio::DB API
46
+ TAGSIZE = 5
47
+
48
+
49
+ def initialize(entry)
50
+ super(entry, TAGSIZE)
51
+ end
52
+
53
+
54
+ # ID Identification (Begins each entry; 1 per entry)
55
+ #
56
+ # ID ENTRY_NAME; ENTRY_TYPE. (ENTRY_TYPE : PATTERN, MATRIX, RULE)
57
+ #
58
+ # Returns
59
def name
  # Entry name from the ID line.  The first call splits the line
  # ("ENTRY_NAME; ENTRY_TYPE.") and caches both parts under 'ID' and 'TYPE'.
  return @data['ID'] if @data['ID']

  entry_name, entry_type = fetch('ID').chomp('.').split('; ')
  @data['ID'] = entry_name
  @data['TYPE'] = entry_type
  @data['ID']
end
65
+
66
+ # Returns
67
+ def division
68
+ unless @data['TYPE']
69
+ name
70
+ end
71
+ @data['TYPE']
72
+ end
73
+
74
+
75
+ # AC Accession number (1 per entry)
76
+ #
77
+ # AC PSnnnnn;
78
+ #
79
+ # Returns
80
+ def ac
81
+ unless @data['AC']
82
+ @data['AC'] = fetch('AC').chomp(';')
83
+ end
84
+ @data['AC']
85
+ end
86
+
87
+ alias entry_id ac
88
+
89
+
90
+ # DT Date (1 per entry)
91
+ #
92
+ # DT MMM-YYYY (CREATED); MMM-YYYY (DATA UPDATE); MMM-YYYY (INFO UPDATE).
93
+ #
94
+ # Returns
95
+ def dt
96
+ field_fetch('DT')
97
+ end
98
+
99
+ alias date dt
100
+
101
+
102
+ # DE Short description (1 per entry)
103
+ #
104
+ # DE Description.
105
+ #
106
+ # Returns
107
+ def de
108
+ field_fetch('DE')
109
+ end
110
+
111
+ alias definition de
112
+
113
+
114
+ # PA Pattern (>=0 per entry)
115
+ #
116
+ # see - pa2re method
117
+ #
118
+ # Returns
119
def pa
  # PA (pattern) field with all whitespace removed, or nil when absent.
  field_fetch('PA')
  @data['PA'] ||= fetch('PA')
  pattern_text = @data['PA']
  pattern_text.gsub!(/\s+/, '') if pattern_text
  pattern_text
end
125
+
126
+ alias pattern pa
127
+
128
+
129
+ # MA Matrix/profile (>=0 per entry)
130
+ #
131
+ # see - ma2re method
132
+ #
133
+ # Returns
134
+ def ma
135
+ field_fetch('MA')
136
+ end
137
+
138
+ alias profile ma
139
+
140
+
141
+ # RU Rule (>=0 per entry)
142
+ #
143
+ # RU Rule_Description.
144
+ #
145
+ # The rule is described in ordinary English and is free-format.
146
+ #
147
+ # Returns
148
+ def ru
149
+ field_fetch('RU')
150
+ end
151
+
152
+ alias rule ru
153
+
154
+
155
+ # NR Numerical results (>=0 per entry)
156
+ #
157
+ # - SWISS-PROT scan statistics of true and false positives/negatives
158
+ #
159
+ # /RELEASE SWISS-PROT release number and total number of sequence
160
+ # entries in that release.
161
+ # /TOTAL Total number of hits in SWISS-PROT.
162
+ # /POSITIVE Number of hits on proteins that are known to belong to the
163
+ # set in consideration.
164
+ # /UNKNOWN Number of hits on proteins that could possibly belong to
165
+ # the set in consideration.
166
+ # /FALSE_POS Number of false hits (on unrelated proteins).
167
+ # /FALSE_NEG Number of known missed hits.
168
+ # /PARTIAL Number of partial sequences which belong to the set in
169
+ # consideration, but which are not hit by the pattern or
170
+ # profile because they are partial (fragment) sequences.
171
+ #
172
+ # Returns
173
def nr
  # Parses the NR line into a Hash keyed by qualifier name and caches it.
  # Values of the form "n(m)" become [n, m] as Integers, "rel,count"
  # becomes [rel (String), count (Integer)], anything else becomes an
  # Integer via to_i.
  return @data['NR'] if @data['NR']

  stats = {} # temporary hash
  fetch('NR').scan(%r{/(\S+)=([^;]+);}).each do |qualifier, raw|
    stats[qualifier] =
      case raw
      when /^(\d+)\((\d+)\)$/
        [$1.to_i, $2.to_i]  # hits, sequences
      when /([\d\.]+),(\d+)/
        [$1, $2.to_i]       # SWISS-PROT release, SWISS-PROT sequences
      else
        raw.to_i
      end
  end
  @data['NR'] = stats
end
194
+
195
+ alias statistics nr
196
+
197
+ # Returns
198
+ def release
199
+ statistics['RELEASE']
200
+ end
201
+
202
+ # Returns
203
+ def swissprot_release_number
204
+ release.first
205
+ end
206
+
207
+ # Returns
208
+ def swissprot_release_sequences
209
+ release.last
210
+ end
211
+
212
+ # Returns
213
+ def total
214
+ statistics['TOTAL']
215
+ end
216
+
217
+ # Returns
218
+ def total_hits
219
+ total.first
220
+ end
221
+
222
+ # Returns
223
+ def total_sequences
224
+ total.last
225
+ end
226
+
227
+ # Returns
228
+ def positive
229
+ statistics['POSITIVE']
230
+ end
231
+
232
+ # Returns
233
+ def positive_hits
234
+ positive.first
235
+ end
236
+
237
+ # Returns
238
+ def positive_sequences
239
+ positive.last
240
+ end
241
+
242
+ # Returns
243
+ def unknown
244
+ statistics['UNKNOWN']
245
+ end
246
+
247
+ # Returns
248
+ def unknown_hits
249
+ unknown.first
250
+ end
251
+
252
+ # Returns
253
+ def unknown_sequences
254
+ unknown.last
255
+ end
256
+
257
+ # Returns
258
+ def false_pos
259
+ statistics['FALSE_POS']
260
+ end
261
+
262
+ # Returns
263
+ def false_positive_hits
264
+ false_pos.first
265
+ end
266
+
267
+ # Returns
268
+ def false_positive_sequences
269
+ false_pos.last
270
+ end
271
+
272
+ # Returns
273
+ def false_neg
274
+ statistics['FALSE_NEG']
275
+ end
276
+ alias false_negative_hits false_neg
277
+
278
+ # Returns
279
+ def partial
280
+ statistics['PARTIAL']
281
+ end
282
+
283
+
284
+ # CC Comments (>=0 per entry)
285
+ #
286
+ # CC /QUALIFIER=data; /QUALIFIER=data; .......
287
+ #
288
+ # /TAXO-RANGE Taxonomic range.
289
+ # /MAX-REPEAT Maximum known number of repetitions of the pattern in a
290
+ # single protein.
291
+ # /SITE Indication of an `interesting' site in the pattern.
292
+ # /SKIP-FLAG Indication of an entry that can be, in some cases, ignored
293
+ # by a program (because it is too unspecific).
294
+ #
295
+ # Returns
296
def cc
  # Parses the CC line ("/QUALIFIER=data; ...") into a Hash of
  # qualifier => data strings and caches it.
  return @data['CC'] if @data['CC']

  qualifiers = {} # temporary hash
  fetch('CC').scan(%r{/(\S+)=([^;]+);}).each do |pair|
    qualifiers[pair[0]] = pair[1]
  end
  @data['CC'] = qualifiers
end
306
+
307
+ alias comment cc
308
+
309
+ # Returns
310
def taxon_range(expand = nil)
  # Returns the raw TAXO-RANGE comment string, or, when expand is true,
  # an array of taxon names for the recognised one-letter codes
  # (unrecognised characters are skipped).  Returns nil when the entry
  # has no TAXO-RANGE.
  range = comment['TAXO-RANGE']
  return range unless range && expand

  taxon_names = {
    'A' => 'archaebacteria',
    'B' => 'bacteriophages',
    'E' => 'eukaryotes',
    'P' => 'prokaryotes',
    'V' => 'eukaryotic viruses'
  }
  range.scan(/./).map { |code| taxon_names[code] }.compact
end
327
+
328
+ # Returns
329
+ def max_repeat
330
+ comment['MAX-REPEAT'].to_i
331
+ end
332
+
333
+ # Returns
334
def site
  # Splits the SITE comment ("position,description") into
  # [position (Integer), description].  Without a SITE comment the
  # result is [0, nil].
  position = description = nil
  if comment['SITE']
    position, description = comment['SITE'].split(',')
  end
  [position.to_i, description]
end
340
+
341
+ # Returns
342
def skip_flag
  # true when the SKIP-FLAG comment is exactly 'TRUE', otherwise nil.
  true if comment['SKIP-FLAG'] == 'TRUE'
end
347
+
348
+
349
+ # DR Cross-references to SWISS-PROT (>=0 per entry)
350
+ #
351
+ # DR AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C;
352
+ #
353
+ # - `AC_NB' is the SWISS-PROT primary accession number of the entry to
354
+ # which reference is being made.
355
+ # - `ENTRY_NAME' is the SWISS-PROT entry name.
356
+ # - `C' is a one character flag that can be one of the following:
357
+ #
358
+ # T For a true positive.
359
+ # N For a false negative; a sequence which belongs to the set under
360
+ # consideration, but which has not been picked up by the pattern or
361
+ # profile.
362
+ # P For a `potential' hit; a sequence that belongs to the set under
363
+ # consideration, but which was not picked up because the region(s) that
364
+ # are used as a 'fingerprint' (pattern or profile) is not yet available
365
+ # in the data bank (partial sequence).
366
+ # ? For an unknown; a sequence which possibly could belong to the set under
367
+ # consideration.
368
+ # F For a false positive; a sequence which does not belong to the set in
369
+ # consideration.
370
+ #
371
+ # Returns
372
def dr
  # Parses the DR line into a Hash of
  # accession => [entry_name, flag] and caches it.  An absent DR field
  # yields an empty Hash.
  return @data['DR'] if @data['DR']

  xrefs = {} # temporary hash
  if fetch('DR')
    fetch('DR').scan(/(\w+)\s*, (\w+)\s*, (.);/).each do |accession, entry_name, flag|
      xrefs[accession] = [entry_name, flag] # SWISS-PROT : accession, entry, true/false
    end
  end
  @data['DR'] = xrefs
end
384
+
385
+ alias sp_xref dr
386
+
387
+ # Returns
388
def list_xref(flag, by_name = nil)
  # Collects the SWISS-PROT cross-references whose flag equals the given
  # one.  Returns accession numbers, or entry names when by_name is true.
  selected = []
  sp_xref.each do |sp_acc, (sp_name, sp_flag)|
    next unless sp_flag == flag
    selected.push(by_name ? sp_name : sp_acc)
  end
  selected
end
402
+
403
+ # Returns
404
+ def list_truepositive(by_name = nil)
405
+ list_xref('T', by_name)
406
+ end
407
+
408
+ # Returns
409
def list_falsenegative(by_name = nil)
  # Lists the cross-references flagged as false negatives.
  # In the DR line the false-negative flag is 'N' ('F' is a false
  # positive), so 'N' is what must be queried here.
  #
  # by_name - when true, entry names are returned instead of accessions.
  list_xref('N', by_name)
end
412
+
413
+ # Returns
414
def list_falsepositive(by_name = nil)
  # Lists the cross-references flagged as false positives.
  # In the DR line the false-positive flag is 'F' ('P' marks a potential
  # hit and is served by list_potentialhit), so 'F' is queried here.
  #
  # by_name - when true, entry names are returned instead of accessions.
  list_xref('F', by_name)
end
417
+
418
+ # Returns
419
+ def list_potentialhit(by_name = nil)
420
+ list_xref('P', by_name)
421
+ end
422
+
423
+ # Returns
424
+ def list_unknown(by_name = nil)
425
+ list_xref('?', by_name)
426
+ end
427
+
428
+
429
+ # 3D Cross-references to PDB (>=0 per entry)
430
+ #
431
+ # 3D name; [name2;...]
432
+ #
433
+ # Returns
434
def pdb_xref
  # PDB cross-references from the 3D line ("name; [name2;...]") as an
  # array of names; the result is cached.
  @data['3D'] ||= fetch('3D').split(/; */)
  @data['3D']
end
440
+
441
+
442
+ # DO Pointer to the documentation file (1 per entry)
443
+ #
444
+ # DO PDOCnnnnn;
445
+ #
446
+ # Returns
447
+ def pdoc_xref
448
+ @data['DO'] = fetch('DO').chomp(';')
449
+ end
450
+
451
+
452
+ ### prosite pattern to regular expression
453
+ #
454
+ # prosite/prosuser.txt:
455
+ #
456
+ # The PA (PAttern) lines contains the definition of a PROSITE pattern. The
457
+ # patterns are described using the following conventions:
458
+ #
459
+ # 0) The standard IUPAC one-letter codes for the amino acids are used.
460
+ # 0) Ambiguities are indicated by listing the acceptable amino acids for a
461
+ # given position, between square parentheses `[ ]'. For example: [ALT]
462
+ # stands for Ala or Leu or Thr.
463
+ # 1) A period ends the pattern.
464
+ # 2) When a pattern is restricted to either the N- or C-terminal of a
465
+ # sequence, that pattern either starts with a `<' symbol or respectively
466
+ # ends with a `>' symbol.
467
+ # 3) Ambiguities are also indicated by listing between a pair of curly
468
+ # brackets `{ }' the amino acids that are not accepted at a given
469
+ # position. For example: {AM} stands for any amino acid except Ala and
470
+ # Met.
471
+ # 4) Repetition of an element of the pattern can be indicated by following
472
+ # that element with a numerical value or a numerical range between
473
+ # parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to
474
+ # x-x or x-x-x or x-x-x-x.
475
+ # 5) The symbol `x' is used for a position where any amino acid is accepted.
476
+ # 6) Each element in a pattern is separated from its neighbor by a `-'.
477
+ #
478
+ # Examples:
479
+ #
480
+ # PA [AC]-x-V-x(4)-{ED}.
481
+ #
482
+ # This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any
483
+ # but Glu or Asp}
484
+ #
485
+ # PA <A-x-[ST](2)-x(0,1)-V.
486
+ #
487
+ # This pattern, which must be in the N-terminal of the sequence (`<'), is
488
+ # translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val
489
+ #
490
def self.pa2re(pattern)
  # Converts a PROSITE PA pattern string into a Regexp.
  #
  # pattern - the PROSITE pattern, e.g. "[AC]-x-V-x(4)-{ED}."
  #
  # Returns a Regexp.  The argument itself is left untouched: all edits
  # are made on a copy, so frozen strings are accepted and a cached
  # pattern passed in by the caller is not rewritten in place.
  re = pattern.gsub(/\s/, '')     # remove white spaces (returns a new string)
  re.sub!(/\.$/, '')              # (1) remove trailing '.'
  re.sub!(/^</, '^')              # (2) restricted to the N-terminal : `<'
  re.sub!(/>$/, '$')              # (2) restricted to the C-terminal : `>'
  re.gsub!(/\{(\w+)\}/) { |m|
    '[^' + $1 + ']'               # (3) not accepted at a given position : '{}'
  }
  re.gsub!(/\(([\d,]+)\)/) { |m|
    '{' + $1 + '}'                # (4) repetition of an element : (n), (n,m)
  }
  re.tr!('x', '.')                # (5) any amino acid is accepted : 'x'
  re.delete!('-')                 # (6) each element is separated by a '-'
  Regexp.new(re)
end
505
+
506
+ def pa2re(pattern)
507
+ self.class.pa2re(pattern)
508
+ end
509
+
510
+
511
+ ### prosite profile to regular expression
512
+ #
513
+ # prosite/profile.txt:
514
+ #
515
+ # Returns
516
+ def ma2re(matrix)
517
+ raise NotImplementedError
518
+ end
519
+
520
+ end
521
+
522
+ end
523
+
524
+
525
+ if __FILE__ == $0
526
+
527
+ begin
528
+ require 'pp'
529
+ alias p pp
530
+ rescue LoadError
531
+ end
532
+
533
+ ps = Bio::PROSITE.new(ARGF.read)
534
+
535
+ list = %w(
536
+ name
537
+ division
538
+ ac
539
+ entry_id
540
+ dt
541
+ date
542
+ de
543
+ definition
544
+ pa
545
+ pattern
546
+ ma
547
+ profile
548
+ ru
549
+ rule
550
+ nr
551
+ statistics
552
+ release
553
+ swissprot_release_number
554
+ swissprot_release_sequences
555
+ total
556
+ total_hits
557
+ total_sequences
558
+ positive
559
+ positive_hits
560
+ positive_sequences
561
+ unknown
562
+ unknown_hits
563
+ unknown_sequences
564
+ false_pos
565
+ false_positive_hits
566
+ false_positive_sequences
567
+ false_neg
568
+ false_negative_hits
569
+ partial
570
+ cc
571
+ comment
572
+ max_repeat
573
+ site
574
+ skip_flag
575
+ dr
576
+ sp_xref
577
+ pdb_xref
578
+ pdoc_xref
579
+ )
580
+
581
+ list.each do |method|
582
+ puts ">>> #{method}"
583
+ p ps.send(method)
584
+ end
585
+
586
+ puts ">>> taxon_range"
587
+ p ps.taxon_range
588
+ puts ">>> taxon_range(expand)"
589
+ p ps.taxon_range(true)
590
+
591
+ puts ">>> list_truepositive"
592
+ p ps.list_truepositive
593
+ puts ">>> list_truepositive(by_name)"
594
+ p ps.list_truepositive(true)
595
+
596
+ puts ">>> list_falsenegative"
597
+ p ps.list_falsenegative
598
+ puts ">>> list_falsenegative(by_name)"
599
+ p ps.list_falsenegative(true)
600
+
601
+ puts ">>> list_falsepositive"
602
+ p ps.list_falsepositive
603
+ puts ">>> list_falsepositive(by_name)"
604
+ p ps.list_falsepositive(true)
605
+
606
+ puts ">>> list_potentialhit"
607
+ p ps.list_potentialhit
608
+ puts ">>> list_potentialhit(by_name)"
609
+ p ps.list_potentialhit(true)
610
+
611
+ puts ">>> list_unknown"
612
+ p ps.list_unknown
613
+ puts ">>> list_unknown(by_name)"
614
+ p ps.list_unknown(true)
615
+
616
+ end