bio 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,122 @@
1
+ #
2
+ # bio/db/pdb/residue.rb - residue class for PDB
3
+ #
4
+ # Copyright (C) 2004 Alex Gutteridge <alexg@ebi.ac.uk>
5
+ #
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2 of the License, or (at your option) any later version.
10
+ #
11
+ # This library is distributed in the hope that it will be useful,
12
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
13
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14
+ # Lesser General Public License for more details.
15
+ #
16
+ # You should have received a copy of the GNU Lesser General Public
17
+ # License along with this library; if not, write to the Free Software
18
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19
+ #
20
+ # $Id: residue.rb,v 1.4 2005/12/18 17:34:47 ngoto Exp $
21
+
22
+ require 'bio/db/pdb'
23
+
24
+ module Bio
25
+
26
+ class PDB
27
+
28
#Residue class - holds the atoms of one residue together with its
#resName, resSeq, iCode, owning chain and HETATM flag.
#The id is a composite of resSeq and iCode.
class Residue

  include Utils
  include AtomFinder
  include Enumerable
  include Comparable

  attr_reader :resName, :resSeq, :iCode, :id, :chain, :hetatm
  attr_writer :resName, :chain, :hetatm

  #Creates a new residue with no atoms.
  #
  #resName:: residue name
  #resSeq::  residue sequence number (stored as given)
  #iCode::   insertion code; may be nil
  #chain::   object stored as the owning chain
  #hetatm::  true if the residue is a HETATM
  def initialize(resName = nil, resSeq = nil, iCode = nil,
                 chain = nil, hetatm = false)

    @resName = resName
    @resSeq  = resSeq
    @iCode   = iCode

    @hetatm  = hetatm

    update_residue_id

    @chain = chain

    @atoms = Array.new

  end

  #Keyed access to atoms: returns the first atom whose +element+
  #equals +key+, or nil when there is none.
  #NOTE(review): the original comment gives ["CA"] as an example, which
  #looks like an atom name rather than an element - confirm that
  #comparing against atom.element is what is intended.
  def [](key)
    @atoms.find{ |atom| key == atom.element }
  end

  #Need to define these to make sure id is correctly updated

  #Sets the residue sequence number (converted with to_i) and
  #rebuilds the id.
  def resSeq=(resSeq)
    @resSeq = resSeq.to_i
    update_residue_id
  end

  #Sets the insertion code and rebuilds the id.
  def iCode=(iCode)
    @iCode = iCode
    update_residue_id
  end

  #Adds an atom to this residue and returns the residue itself.
  #Raises RuntimeError unless atom is a Bio::PDB::Record::ATOM.
  def addAtom(atom)
    raise "Expecting ATOM or HETATM" unless atom.is_a? Bio::PDB::Record::ATOM
    @atoms.push(atom)
    self
  end

  #Iterator over the atoms
  def each
    @atoms.each{ |atom| yield atom }
  end
  #Alias to override AtomFinder#each_atom
  alias each_atom each

  #Sorts based on resSeq and iCode if need be
  def <=>(other)
    if @resSeq != other.resSeq
      return @resSeq <=> other.resSeq
    else
      return @iCode <=> other.iCode
    end
  end

  #Stringifies each atom, one per line
  def to_s
    string = ""
    @atoms.each{ |atom| string << atom.to_s << "\n" }
    return string
  end

  private

  #Residue id is required because resSeq doesn't uniquely identify
  #a residue. ID is constructed from resSeq and iCode and is prefixed
  #with 'LIGAND' if the residue is a HETATM. The id is nil while both
  #resSeq and iCode are unset. A nil iCode is treated as an empty
  #insertion code instead of raising NoMethodError on nil.strip.
  def update_residue_id
    if (!@resSeq and !@iCode)
      @id = nil
    else
      @id = "#{@resSeq}#{@iCode.to_s.strip}"
      @id = 'LIGAND' + @id if @hetatm
    end
  end

end
119
+
120
+ end
121
+
122
+ end
@@ -0,0 +1,234 @@
1
+ #
2
+ # bio/db/pdb/utils.rb - Utility modules for PDB
3
+ #
4
+ # Copyright (C) 2004 Alex Gutteridge <alexg@ebi.ac.uk>
5
+ # Copyright (C) 2004 GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>
6
+ #
7
+ # This library is free software; you can redistribute it and/or
8
+ # modify it under the terms of the GNU Lesser General Public
9
+ # License as published by the Free Software Foundation; either
10
+ # version 2 of the License, or (at your option) any later version.
11
+ #
12
+ # This library is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15
+ # Lesser General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Lesser General Public
18
+ # License along with this library; if not, write to the Free Software
19
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20
+ #
21
+ # $Id: utils.rb,v 1.2 2005/09/08 01:22:11 k Exp $
22
+
23
+ require 'matrix'
24
+ require 'bio/db/pdb'
25
+
26
+ module Bio; class PDB
27
+
28
module Utils
  #The methods in this mixin should be applicable to all PDB objects

  #Returns the coordinates of the geometric centre (average co-ord)
  #of any AtomFinder (or .atoms) implementing object, as a Coordinate.
  #Raises ZeroDivisionError when the object yields no atoms.
  def geometricCentre()

    x = y = z = count = 0

    self.each_atom{ |atom|
      x += atom.x
      y += atom.y
      z += atom.z
      count += 1
    }

    raise ZeroDivisionError, "divided by 0" if count == 0

    #divide by a Float so that Integer coordinates are not truncated
    count = count.to_f

    Coordinate[x / count, y / count, z / count]

  end

  #Returns the coords of the centre of gravity for any
  #AtomFinder implementing object
  #Blleurgh! - working out what element it is from the atom name is
  #tricky - this'll work in most cases but not metals etc...
  #a proper element field is included in some PDB files but not all.
  ElementMass = {
    'H' => 1,
    'C' => 12,
    'N' => 14,
    'O' => 16,
    'S' => 32,
    'P' => 31
  }

  #Mass-weighted average of the atom coordinates, as a Coordinate.
  #Only the first character of atom.element is used to look up the
  #mass; an element missing from ElementMass gives a nil mass and the
  #accumulation below fails with a TypeError.
  #Raises ZeroDivisionError when the object yields no atoms.
  def centreOfGravity()

    x = y = z = total = 0

    self.each_atom{ |atom|
      element = atom.element[0,1]
      mass    = ElementMass[element]
      total += mass
      x += atom.x * mass
      y += atom.y * mass
      z += atom.z * mass
    }

    raise ZeroDivisionError, "divided by 0" if total == 0

    #divide by a Float so that Integer coordinates are not truncated
    total = total.to_f

    Coordinate[x / total, y / total, z / total]

  end

  #Perhaps distance and dihedral would be better off as class methods?
  #(rather) than instance methods

  #Distance between two points. Each argument is converted
  #with to_xyz first.
  def self.distance(coord1,coord2)
    coord1 = to_xyz(coord1)
    coord2 = to_xyz(coord2)
    (coord1 - coord2).r
  end

  #Dihedral (torsion) angle, in radians, between the plane through
  #coord1,coord2,coord3 and the plane through coord2,coord3,coord4.
  #The arguments must respond to x, y and z. The result is negated
  #when coord4 lies on the negative side of the first plane.
  def self.dihedral_angle(coord1,coord2,coord3,coord4)

    (a1,b1,c1,d) = calculatePlane(coord1,coord2,coord3)
    (a2,b2,c2)   = calculatePlane(coord2,coord3,coord4)

    torsion = acos((a1*a2 + b1*b2 + c1*c2)/(Math.sqrt(a1**2 + b1**2 + c1**2) * Math.sqrt(a2**2 + b2**2 + c2**2)))

    if ((a1*coord4.x + b1*coord4.y + c1*coord4.z + d) < 0)
      -torsion
    else
      torsion
    end
  end

  #Implicit conversion into Vector or Bio::PDB::Coordinate
  #A Vector is returned unchanged; otherwise obj.xyz is used, falling
  #back to Vector.elements(obj.to_a) when calling xyz raises NameError
  #(which includes NoMethodError).
  def self.to_xyz(obj)
    unless obj.is_a?(Vector)
      begin
        obj = obj.xyz
      rescue NameError
        obj = Vector.elements(obj.to_a)
      end
    end
    obj
  end

  #Methods required for the dihedral angle calculations
  #perhaps these should go in some separate Math module

  #Converts radians to degrees.
  def self.rad2deg(r)
    (r/Math::PI)*180
  end

  #Arc cosine, computed via atan2.
  #Rounding in the caller can push a cosine marginally outside
  #[-1, 1]; such values are clamped so that Math.sqrt is never given
  #a negative number (which would raise Math::DomainError).
  #NaN is passed through unchanged because both comparisons are false.
  def self.acos(x)
    x =  1.0 if x >  1
    x = -1.0 if x < -1
    Math.atan2(Math.sqrt(1 - x**2),x)
  end

  #Returns the coefficients [a,b,c,d] of the plane
  #a*x + b*y + c*z + d = 0 passing through the three given points.
  def self.calculatePlane(coord1,coord2,coord3)
    a = coord1.y * (coord2.z - coord3.z) +
        coord2.y * (coord3.z - coord1.z) +
        coord3.y * (coord1.z - coord2.z)
    b = coord1.z * (coord2.x - coord3.x) +
        coord2.z * (coord3.x - coord1.x) +
        coord3.z * (coord1.x - coord2.x)
    c = coord1.x * (coord2.y - coord3.y) +
        coord2.x * (coord3.y - coord1.y) +
        coord3.x * (coord1.y - coord2.y)
    d = -1 *
        (
         (coord1.x * (coord2.y * coord3.z - coord3.y * coord2.z)) +
         (coord2.x * (coord3.y * coord1.z - coord1.y * coord3.z)) +
         (coord3.x * (coord1.y * coord2.z - coord2.y * coord1.z))
        )

    return [a,b,c,d]

  end

  #Every class in the hierarchy implements finder, this takes
  #a class which determines which type of object to find, the associated
  #block is then run in classic .find style
  #Raises TypeError for any findtype other than Bio::PDB::Atom,
  #Bio::PDB::Residue, Bio::PDB::Chain or Bio::PDB::Model.
  def finder(findtype,&block)
    if findtype == Bio::PDB::Atom
      return self.find_atom(&block)
    elsif findtype == Bio::PDB::Residue
      return self.find_residue(&block)
    elsif findtype == Bio::PDB::Chain
      return self.find_chain(&block)
    elsif findtype == Bio::PDB::Model
      return self.find_model(&block)
    else
      raise TypeError, "You can't find a #{findtype}"
    end
  end
end #module Utils
169
+
170
+ #The *Finder modules implement a find_* method which returns
171
+ #an array of anything for which the block evals true
172
+ #(compare with the Enumerable#find_all method).
173
+ #The each_* style methods act as classic iterators.
174
module ModelFinder
  #Collects, in iteration order, every model yielded by each_model
  #for which the given block returns a true value.
  def find_model
    matches = []
    each_model do |model|
      matches << model if yield(model)
    end
    matches
  end
end
183
+
184
+ #The hierarchical nature of the objects allows us to re-use the
185
+ #methods from the previous level - e.g. A PDB object can use the .models
186
+ #method defined in ModelFinder to iterate through the models to find the
187
+ #chains
188
module ChainFinder
  #Collects, in iteration order, every chain yielded by each_chain
  #for which the given block returns a true value.
  def find_chain
    matches = []
    each_chain do |chain|
      matches << chain if yield(chain)
    end
    matches
  end

  #Yields every chain of every model: walks each_model and, for each
  #model, that model's own each.
  def each_chain
    each_model do |model|
      model.each do |chain|
        yield chain
      end
    end
  end
end
202
+
203
module ResidueFinder
  #Collects, in iteration order, every residue yielded by each_residue
  #for which the given block returns a true value.
  def find_residue
    matches = []
    each_residue do |residue|
      matches << residue if yield(residue)
    end
    matches
  end

  #Yields every residue of every chain: walks each_chain and, for each
  #chain, that chain's own each.
  def each_residue
    each_chain do |chain|
      chain.each do |residue|
        yield residue
      end
    end
  end
end
217
+
218
module AtomFinder
  #Collects, in iteration order, every atom yielded by each_atom
  #for which the given block returns a true value.
  def find_atom
    matches = []
    each_atom do |atom|
      matches << atom if yield(atom)
    end
    matches
  end

  #Yields every atom of every residue: walks each_residue and, for
  #each residue, that residue's own each.
  def each_atom
    each_residue do |residue|
      residue.each do |atom|
        yield atom
      end
    end
  end
end
232
+
233
+ end; end #module Bio; class PDB
234
+
@@ -0,0 +1,616 @@
1
+ #
2
+ # = bio/db/prosite.rb - PROSITE database class
3
+ #
4
+ # Copyright:: Copyright (C) 2001 KATAYAMA Toshiaki <k@bioruby.org>
5
+ # Licence:: LGPL
6
+ #
7
+ # $Id: prosite.rb,v 0.13 2005/12/18 18:24:08 k Exp $
8
+ #
9
+ # == Description
10
+ #
11
+ #
12
+ # == Example
13
+ # == References
14
+ #--
15
+ #
16
+ # This library is free software; you can redistribute it and/or
17
+ # modify it under the terms of the GNU Lesser General Public
18
+ # License as published by the Free Software Foundation; either
19
+ # version 2 of the License, or (at your option) any later version.
20
+ #
21
+ # This library is distributed in the hope that it will be useful,
22
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
23
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24
+ # Lesser General Public License for more details.
25
+ #
26
+ # You should have received a copy of the GNU Lesser General Public
27
+ # License along with this library; if not, write to the Free Software
28
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29
+ #
30
+ #++
31
+ #
32
+
33
+ require 'bio/db'
34
+
35
+ module Bio
36
+
37
class PROSITE < EMBLDB

  # Delimiter
  DELIMITER = "\n//\n"

  # Delimiter
  RS = DELIMITER

  # Bio::DB API
  TAGSIZE = 5


  # Creates a new PROSITE entry object from the text of one entry.
  def initialize(entry)
    super(entry, TAGSIZE)
  end


  # ID  Identification                     (Begins each entry; 1 per entry)
  #
  #  ID   ENTRY_NAME; ENTRY_TYPE.  (ENTRY_TYPE : PATTERN, MATRIX, RULE)
  #
  # Returns the ENTRY_NAME part of the ID line as a String.
  def name
    unless @data['ID']
      @data['ID'], @data['TYPE'] = fetch('ID').chomp('.').split('; ')
    end
    @data['ID']
  end

  # Returns the ENTRY_TYPE part of the ID line as a String.
  def division
    unless @data['TYPE']
      name  # parsing the ID line fills @data['TYPE'] as a side effect
    end
    @data['TYPE']
  end


  # AC  Accession number                   (1 per entry)
  #
  #  AC   PSnnnnn;
  #
  # Returns the AC line content with the trailing ';' removed.
  def ac
    unless @data['AC']
      @data['AC'] = fetch('AC').chomp(';')
    end
    @data['AC']
  end

  alias entry_id ac


  # DT  Date                               (1 per entry)
  #
  #  DT   MMM-YYYY (CREATED); MMM-YYYY (DATA UPDATE); MMM-YYYY (INFO UPDATE).
  #
  # Returns the result of field_fetch('DT').
  def dt
    field_fetch('DT')
  end

  alias date dt


  # DE  Short description                  (1 per entry)
  #
  #  DE   Description.
  #
  # Returns the result of field_fetch('DE').
  def de
    field_fetch('DE')
  end

  alias definition de


  # PA  Pattern                            (>=0 per entry)
  #
  #  see - pa2re method
  #
  # Returns the PA field with all white space removed.
  def pa
    # NOTE(review): field_fetch is defined outside this file; it
    # presumably caches the field in @data - confirm before removing
    # either this call or the explicit fetch on the next line.
    field_fetch('PA')
    @data['PA'] = fetch('PA') unless @data['PA']
    @data['PA'].gsub!(/\s+/, '') if @data['PA']
    @data['PA']
  end

  alias pattern pa


  # MA  Matrix/profile                     (>=0 per entry)
  #
  #  see - ma2re method
  #
  # Returns the result of field_fetch('MA').
  def ma
    field_fetch('MA')
  end

  alias profile ma


  # RU  Rule                               (>=0 per entry)
  #
  #  RU   Rule_Description.
  #
  #  The rule is described in ordinary English and is free-format.
  #
  # Returns the result of field_fetch('RU').
  def ru
    field_fetch('RU')
  end

  alias rule ru


  # NR  Numerical results                  (>=0 per entry)
  #
  #   - SWISS-PROT scan statistics of true and false positives/negatives
  #
  # /RELEASE     SWISS-PROT release number and total number of sequence
  #              entries in that release.
  # /TOTAL       Total number of hits in SWISS-PROT.
  # /POSITIVE    Number of hits on proteins that are known to belong to the
  #              set in consideration.
  # /UNKNOWN     Number of hits on proteins that could possibly belong to
  #              the set in consideration.
  # /FALSE_POS   Number of false hits (on unrelated proteins).
  # /FALSE_NEG   Number of known missed hits.
  # /PARTIAL     Number of partial sequences which belong to the set in
  #              consideration, but which are not hit by the pattern or
  #              profile because they are partial (fragment) sequences.
  #
  # Returns a Hash keyed by qualifier name (without the leading '/').
  # A value of the form "n(m)" becomes [n, m] (both Integers), a value
  # of the form "release,count" becomes [release String, count Integer],
  # and anything else is converted with to_i.
  def nr
    unless @data['NR']
      hash = {}                 # temporal hash
      fetch('NR').scan(%r{/(\S+)=([^;]+);}).each do |k, v|
        if v =~ /^(\d+)\((\d+)\)$/
          hits = $1.to_i        # the number of hits
          seqs = $2.to_i        # the number of sequences
          v = [hits, seqs]
        elsif v =~ /([\d\.]+),(\d+)/
          sprel = $1            # the number of SWISS-PROT release
          spseq = $2.to_i       # the number of SWISS-PROT sequences
          v = [sprel, spseq]
        else
          v = v.to_i
        end
        hash[k] = v
      end
      @data['NR'] = hash
    end
    @data['NR']
  end

  alias statistics nr

  # Returns the parsed /RELEASE value of the NR lines.
  def release
    statistics['RELEASE']
  end

  # Returns the first element of the /RELEASE value.
  def swissprot_release_number
    release.first
  end

  # Returns the last element of the /RELEASE value.
  def swissprot_release_sequences
    release.last
  end

  # Returns the parsed /TOTAL value of the NR lines.
  def total
    statistics['TOTAL']
  end

  # Returns the first element of the /TOTAL value (hits).
  def total_hits
    total.first
  end

  # Returns the last element of the /TOTAL value (sequences).
  def total_sequences
    total.last
  end

  # Returns the parsed /POSITIVE value of the NR lines.
  def positive
    statistics['POSITIVE']
  end

  # Returns the first element of the /POSITIVE value (hits).
  def positive_hits
    positive.first
  end

  # Returns the last element of the /POSITIVE value (sequences).
  def positive_sequences
    positive.last
  end

  # Returns the parsed /UNKNOWN value of the NR lines.
  def unknown
    statistics['UNKNOWN']
  end

  # Returns the first element of the /UNKNOWN value (hits).
  def unknown_hits
    unknown.first
  end

  # Returns the last element of the /UNKNOWN value (sequences).
  def unknown_sequences
    unknown.last
  end

  # Returns the parsed /FALSE_POS value of the NR lines.
  def false_pos
    statistics['FALSE_POS']
  end

  # Returns the first element of the /FALSE_POS value (hits).
  def false_positive_hits
    false_pos.first
  end

  # Returns the last element of the /FALSE_POS value (sequences).
  def false_positive_sequences
    false_pos.last
  end

  # Returns the parsed /FALSE_NEG value of the NR lines.
  def false_neg
    statistics['FALSE_NEG']
  end
  alias false_negative_hits false_neg

  # Returns the parsed /PARTIAL value of the NR lines.
  def partial
    statistics['PARTIAL']
  end


  # CC  Comments                           (>=0 per entry)
  #
  #  CC   /QUALIFIER=data; /QUALIFIER=data; .......
  #
  # /TAXO-RANGE  Taxonomic range.
  # /MAX-REPEAT  Maximum known number of repetitions of the pattern in a
  #              single protein.
  # /SITE        Indication of an `interesting' site in the pattern.
  # /SKIP-FLAG   Indication of an entry that can be, in some cases, ignored
  #              by a program (because it is too unspecific).
  #
  # Returns a Hash of qualifier name => data String. When a qualifier
  # occurs more than once only its last value is kept.
  def cc
    unless @data['CC']
      hash = {}                 # temporal hash
      fetch('CC').scan(%r{/(\S+)=([^;]+);}).each do |k, v|
        hash[k] = v
      end
      @data['CC'] = hash
    end
    @data['CC']
  end

  alias comment cc

  # Returns the /TAXO-RANGE value of the CC lines. With a true +expand+
  # the one-letter codes are translated into an Array of taxon names;
  # characters other than A, B, E, P and V are skipped.
  def taxon_range(expand = nil)
    range = comment['TAXO-RANGE']
    if range and expand
      names = []
      range.scan(/./) do |x|
        case x
        when 'A'; names.push('archaebacteria')
        when 'B'; names.push('bacteriophages')
        when 'E'; names.push('eukaryotes')
        when 'P'; names.push('prokaryotes')
        when 'V'; names.push('eukaryotic viruses')
        end
      end
      range = names
    end
    return range
  end

  # Returns the /MAX-REPEAT value as an Integer (0 when it is absent).
  def max_repeat
    comment['MAX-REPEAT'].to_i
  end

  # Returns the /SITE value split at ',' as [number, description];
  # [0, nil] when the qualifier is absent.
  def site
    if comment['SITE']
      num, desc = comment['SITE'].split(',')
    end
    return [num.to_i, desc]
  end

  # Returns true when /SKIP-FLAG is 'TRUE', nil otherwise.
  def skip_flag
    if comment['SKIP-FLAG'] == 'TRUE'
      return true
    end
  end


  # DR  Cross-references to SWISS-PROT     (>=0 per entry)
  #
  #  DR   AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C;
  #
  # - `AC_NB' is the SWISS-PROT primary accession number of the entry to
  #   which reference is being made.
  # - `ENTRY_NAME' is the SWISS-PROT entry name.
  # - `C' is a one character flag that can be one of the following:
  #
  # T For a true positive.
  # N For a false negative; a sequence which belongs to the set under
  #   consideration, but which has not been picked up by the pattern or
  #   profile.
  # P For a `potential' hit; a sequence that belongs to the set under
  #   consideration, but which was not picked up because the region(s) that
  #   are used as a 'fingerprint' (pattern or profile) is not yet available
  #   in the data bank (partial sequence).
  # ? For an unknown; a sequence which possibly could belong to the set under
  #   consideration.
  # F For a false positive; a sequence which does not belong to the set in
  #   consideration.
  #
  # Returns a Hash of accession => [entry name, flag].
  def dr
    unless @data['DR']
      hash = {}                 # temporal hash
      if fetch('DR')
        fetch('DR').scan(/(\w+)\s*, (\w+)\s*, (.);/).each do |a, e, c|
          hash[a] = [e, c]      # SWISS-PROT : accession, entry, true/false
        end
      end
      @data['DR'] = hash
    end
    @data['DR']
  end

  alias sp_xref dr

  # Returns an Array of the cross-references whose flag equals +flag+:
  # accession numbers by default, entry names when +by_name+ is true.
  def list_xref(flag, by_name = nil)
    ary = []
    sp_xref.each do |sp_acc, value|
      if value[1] == flag
        if by_name
          sp_name = value[0]
          ary.push(sp_name)
        else
          ary.push(sp_acc)
        end
      end
    end
    return ary
  end

  # Returns the cross-references flagged 'T' (true positive).
  def list_truepositive(by_name = nil)
    list_xref('T', by_name)
  end

  # Returns the cross-references flagged 'N' (false negative).
  def list_falsenegative(by_name = nil)
    list_xref('N', by_name)
  end

  # Returns the cross-references flagged 'F' (false positive).
  def list_falsepositive(by_name = nil)
    list_xref('F', by_name)
  end

  # Returns the cross-references flagged 'P' (potential hit).
  def list_potentialhit(by_name = nil)
    list_xref('P', by_name)
  end

  # Returns the cross-references flagged '?' (unknown).
  def list_unknown(by_name = nil)
    list_xref('?', by_name)
  end


  # 3D  Cross-references to PDB            (>=0 per entry)
  #
  #  3D   name; [name2;...]
  #
  # Returns an Array of the names on the 3D lines.
  def pdb_xref
    unless @data['3D']
      @data['3D'] = fetch('3D').split(/; */)
    end
    @data['3D']
  end


  # DO  Pointer to the documentation file  (1 per entry)
  #
  #  DO   PDOCnnnnn;
  #
  # Returns the DO line content with the trailing ';' removed.
  def pdoc_xref
    @data['DO'] = fetch('DO').chomp(';')
  end


  ### prosite pattern to regular expression
  #
  # prosite/prosuser.txt:
  #
  # The PA (PAttern) lines contains the definition of a PROSITE pattern. The
  # patterns are described using the following conventions:
  #
  # 0) The standard IUPAC one-letter codes for the amino acids are used.
  # 0) Ambiguities are indicated by listing the acceptable amino acids for a
  #    given position, between square parentheses `[ ]'. For example: [ALT]
  #    stands for Ala or Leu or Thr.
  # 1) A period ends the pattern.
  # 2) When a pattern is restricted to either the N- or C-terminal of a
  #    sequence, that pattern either starts with a `<' symbol or respectively
  #    ends with a `>' symbol.
  # 3) Ambiguities are also indicated by listing between a pair of curly
  #    brackets `{ }' the amino acids that are not accepted at a given
  #    position. For example: {AM} stands for any amino acid except Ala and
  #    Met.
  # 4) Repetition of an element of the pattern can be indicated by following
  #    that element with a numerical value or a numerical range between
  #    parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to
  #    x-x or x-x-x or x-x-x-x.
  # 5) The symbol `x' is used for a position where any amino acid is accepted.
  # 6) Each element in a pattern is separated from its neighbor by a `-'.
  #
  # Examples:
  #
  # PA   [AC]-x-V-x(4)-{ED}.
  #
  # This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any
  # but Glu or Asp}
  #
  # PA   <A-x-[ST](2)-x(0,1)-V.
  #
  # This pattern, which must be in the N-terminal of the sequence (`<'), is
  # translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val
  #
  # Returns a Regexp. The conversion works on a copy, so the String
  # passed in (which may be frozen, or the cached PA field of an
  # entry) is left untouched.
  def self.pa2re(pattern)
    re = pattern.gsub(/\s/, '')             # remove white spaces (new String)
    re.sub!(/\.$/, '')                      # (1) remove trailing '.'
    re.sub!(/^</, '^')                      # (2) restricted to the N-terminal : `<'
    re.sub!(/>$/, '$')                      # (2) restricted to the C-terminal : `>'
    re.gsub!(/\{(\w+)\}/) { |m|
      '[^' + $1 + ']'                       # (3) not accepted at a given position : '{}'
    }
    re.gsub!(/\(([\d,]+)\)/) { |m|
      '{' + $1 + '}'                        # (4) repetition of an element : (n), (n,m)
    }
    re.tr!('x', '.')                        # (5) any amino acid is accepted : 'x'
    re.delete!('-')                         # (6) each element is separated by a '-'
    Regexp.new(re)
  end

  # Instance-level shortcut for PROSITE.pa2re.
  def pa2re(pattern)
    self.class.pa2re(pattern)
  end


  ### prosite profile to regular expression
  #
  # prosite/profile.txt:
  #
  # Not implemented: always raises NotImplementedError.
  def ma2re(matrix)
    raise NotImplementedError
  end

end
521
+
522
+ end
523
+
524
+
525
# Demo driver: when this file is run directly, parse the PROSITE entry
# read from ARGF and print the result of every accessor.
if __FILE__ == $0

  # Pretty-print with pp when it can be loaded; otherwise keep plain p.
  begin
    require 'pp'
    alias p pp
  rescue LoadError
  end

  ps = Bio::PROSITE.new(ARGF.read)

  # Accessors that take no argument, in the order they are printed.
  accessors = %w(
    name division ac entry_id dt date de definition
    pa pattern ma profile ru rule
    nr statistics release
    swissprot_release_number swissprot_release_sequences
    total total_hits total_sequences
    positive positive_hits positive_sequences
    unknown unknown_hits unknown_sequences
    false_pos false_positive_hits false_positive_sequences
    false_neg false_negative_hits partial
    cc comment max_repeat site skip_flag
    dr sp_xref pdb_xref pdoc_xref
  )

  accessors.each do |accessor|
    puts ">>> #{accessor}"
    p ps.send(accessor)
  end

  puts ">>> taxon_range"
  p ps.taxon_range
  puts ">>> taxon_range(expand)"
  p ps.taxon_range(true)

  # Cross-reference listings: first by accession, then by entry name.
  xref_lists = %w(
    list_truepositive list_falsenegative list_falsepositive
    list_potentialhit list_unknown
  )

  xref_lists.each do |lister|
    puts ">>> #{lister}"
    p ps.send(lister)
    puts ">>> #{lister}(by_name)"
    p ps.send(lister, true)
  end

end