bio 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201) hide show
  1. data/bin/bioruby +107 -0
  2. data/bin/br_biofetch.rb +59 -0
  3. data/bin/br_bioflat.rb +294 -0
  4. data/bin/br_biogetseq.rb +57 -0
  5. data/bin/br_pmfetch.rb +431 -0
  6. data/doc/BioRuby.rd.ja +225 -0
  7. data/doc/Changes-0.7.rd +236 -0
  8. data/doc/Design.rd.ja +341 -0
  9. data/doc/KEGG_API.rd +1437 -0
  10. data/doc/KEGG_API.rd.ja +1399 -0
  11. data/doc/TODO.rd.ja +138 -0
  12. data/doc/Tutorial.rd +1138 -0
  13. data/doc/Tutorial.rd.ja +2110 -0
  14. data/etc/bioinformatics/seqdatabase.ini +210 -0
  15. data/lib/bio.rb +256 -0
  16. data/lib/bio/alignment.rb +1906 -0
  17. data/lib/bio/appl/bl2seq/report.rb +350 -0
  18. data/lib/bio/appl/blast.rb +269 -0
  19. data/lib/bio/appl/blast/format0.rb +1402 -0
  20. data/lib/bio/appl/blast/format8.rb +95 -0
  21. data/lib/bio/appl/blast/report.rb +652 -0
  22. data/lib/bio/appl/blast/rexml.rb +151 -0
  23. data/lib/bio/appl/blast/wublast.rb +553 -0
  24. data/lib/bio/appl/blast/xmlparser.rb +222 -0
  25. data/lib/bio/appl/blat/report.rb +392 -0
  26. data/lib/bio/appl/clustalw.rb +191 -0
  27. data/lib/bio/appl/clustalw/report.rb +154 -0
  28. data/lib/bio/appl/emboss.rb +68 -0
  29. data/lib/bio/appl/fasta.rb +262 -0
  30. data/lib/bio/appl/fasta/format10.rb +428 -0
  31. data/lib/bio/appl/fasta/format6.rb +37 -0
  32. data/lib/bio/appl/genscan/report.rb +570 -0
  33. data/lib/bio/appl/hmmer.rb +129 -0
  34. data/lib/bio/appl/hmmer/report.rb +556 -0
  35. data/lib/bio/appl/mafft.rb +222 -0
  36. data/lib/bio/appl/mafft/report.rb +119 -0
  37. data/lib/bio/appl/psort.rb +555 -0
  38. data/lib/bio/appl/psort/report.rb +473 -0
  39. data/lib/bio/appl/sim4.rb +134 -0
  40. data/lib/bio/appl/sim4/report.rb +501 -0
  41. data/lib/bio/appl/sosui/report.rb +166 -0
  42. data/lib/bio/appl/spidey/report.rb +604 -0
  43. data/lib/bio/appl/targetp/report.rb +283 -0
  44. data/lib/bio/appl/tmhmm/report.rb +238 -0
  45. data/lib/bio/command.rb +166 -0
  46. data/lib/bio/data/aa.rb +354 -0
  47. data/lib/bio/data/codontable.rb +740 -0
  48. data/lib/bio/data/na.rb +226 -0
  49. data/lib/bio/db.rb +340 -0
  50. data/lib/bio/db/aaindex.rb +280 -0
  51. data/lib/bio/db/embl/common.rb +332 -0
  52. data/lib/bio/db/embl/embl.rb +446 -0
  53. data/lib/bio/db/embl/sptr.rb +954 -0
  54. data/lib/bio/db/embl/swissprot.rb +32 -0
  55. data/lib/bio/db/embl/trembl.rb +31 -0
  56. data/lib/bio/db/embl/uniprot.rb +32 -0
  57. data/lib/bio/db/fantom.rb +604 -0
  58. data/lib/bio/db/fasta.rb +869 -0
  59. data/lib/bio/db/genbank/common.rb +299 -0
  60. data/lib/bio/db/genbank/ddbj.rb +34 -0
  61. data/lib/bio/db/genbank/genbank.rb +354 -0
  62. data/lib/bio/db/genbank/genpept.rb +73 -0
  63. data/lib/bio/db/genbank/refseq.rb +31 -0
  64. data/lib/bio/db/gff.rb +106 -0
  65. data/lib/bio/db/go.rb +497 -0
  66. data/lib/bio/db/kegg/brite.rb +51 -0
  67. data/lib/bio/db/kegg/cell.rb +88 -0
  68. data/lib/bio/db/kegg/compound.rb +130 -0
  69. data/lib/bio/db/kegg/enzyme.rb +125 -0
  70. data/lib/bio/db/kegg/expression.rb +173 -0
  71. data/lib/bio/db/kegg/genes.rb +293 -0
  72. data/lib/bio/db/kegg/genome.rb +362 -0
  73. data/lib/bio/db/kegg/glycan.rb +213 -0
  74. data/lib/bio/db/kegg/keggtab.rb +418 -0
  75. data/lib/bio/db/kegg/kgml.rb +299 -0
  76. data/lib/bio/db/kegg/ko.rb +178 -0
  77. data/lib/bio/db/kegg/reaction.rb +97 -0
  78. data/lib/bio/db/litdb.rb +131 -0
  79. data/lib/bio/db/medline.rb +317 -0
  80. data/lib/bio/db/nbrf.rb +199 -0
  81. data/lib/bio/db/pdb.rb +38 -0
  82. data/lib/bio/db/pdb/atom.rb +60 -0
  83. data/lib/bio/db/pdb/chain.rb +117 -0
  84. data/lib/bio/db/pdb/model.rb +106 -0
  85. data/lib/bio/db/pdb/pdb.rb +1682 -0
  86. data/lib/bio/db/pdb/residue.rb +122 -0
  87. data/lib/bio/db/pdb/utils.rb +234 -0
  88. data/lib/bio/db/prosite.rb +616 -0
  89. data/lib/bio/db/rebase.rb +417 -0
  90. data/lib/bio/db/transfac.rb +387 -0
  91. data/lib/bio/feature.rb +201 -0
  92. data/lib/bio/io/brdb.rb +103 -0
  93. data/lib/bio/io/das.rb +471 -0
  94. data/lib/bio/io/dbget.rb +212 -0
  95. data/lib/bio/io/ddbjxml.rb +614 -0
  96. data/lib/bio/io/fastacmd.rb +123 -0
  97. data/lib/bio/io/fetch.rb +114 -0
  98. data/lib/bio/io/flatfile.rb +496 -0
  99. data/lib/bio/io/flatfile/bdb.rb +266 -0
  100. data/lib/bio/io/flatfile/index.rb +1308 -0
  101. data/lib/bio/io/flatfile/indexer.rb +778 -0
  102. data/lib/bio/io/higet.rb +92 -0
  103. data/lib/bio/io/keggapi.rb +863 -0
  104. data/lib/bio/io/pubmed.rb +189 -0
  105. data/lib/bio/io/registry.rb +308 -0
  106. data/lib/bio/io/soapwsdl.rb +114 -0
  107. data/lib/bio/io/sql.rb +428 -0
  108. data/lib/bio/location.rb +650 -0
  109. data/lib/bio/pathway.rb +991 -0
  110. data/lib/bio/reference.rb +308 -0
  111. data/lib/bio/sequence.rb +593 -0
  112. data/lib/bio/shell.rb +51 -0
  113. data/lib/bio/shell/core.rb +512 -0
  114. data/lib/bio/shell/plugin/codon.rb +228 -0
  115. data/lib/bio/shell/plugin/entry.rb +85 -0
  116. data/lib/bio/shell/plugin/flatfile.rb +119 -0
  117. data/lib/bio/shell/plugin/keggapi.rb +187 -0
  118. data/lib/bio/shell/plugin/midi.rb +448 -0
  119. data/lib/bio/shell/plugin/obda.rb +63 -0
  120. data/lib/bio/shell/plugin/seq.rb +238 -0
  121. data/lib/bio/shell/session.rb +214 -0
  122. data/lib/bio/util/color_scheme.rb +214 -0
  123. data/lib/bio/util/color_scheme/buried.rb +78 -0
  124. data/lib/bio/util/color_scheme/helix.rb +78 -0
  125. data/lib/bio/util/color_scheme/hydropathy.rb +83 -0
  126. data/lib/bio/util/color_scheme/nucleotide.rb +50 -0
  127. data/lib/bio/util/color_scheme/strand.rb +78 -0
  128. data/lib/bio/util/color_scheme/taylor.rb +69 -0
  129. data/lib/bio/util/color_scheme/turn.rb +78 -0
  130. data/lib/bio/util/color_scheme/zappo.rb +69 -0
  131. data/lib/bio/util/contingency_table.rb +337 -0
  132. data/lib/bio/util/sirna.rb +306 -0
  133. data/lib/bioruby.rb +34 -0
  134. data/sample/biofetch.rb +475 -0
  135. data/sample/color_scheme_na.rb +99 -0
  136. data/sample/dbget +37 -0
  137. data/sample/fasta2tab.rb +99 -0
  138. data/sample/fsplit.rb +51 -0
  139. data/sample/gb2fasta.rb +31 -0
  140. data/sample/gb2tab.rb +325 -0
  141. data/sample/gbtab2mysql.rb +161 -0
  142. data/sample/genes2nuc.rb +33 -0
  143. data/sample/genes2pep.rb +33 -0
  144. data/sample/genes2tab.rb +81 -0
  145. data/sample/genome2rb.rb +29 -0
  146. data/sample/genome2tab.rb +76 -0
  147. data/sample/goslim.rb +311 -0
  148. data/sample/gt2fasta.rb +47 -0
  149. data/sample/pmfetch.rb +42 -0
  150. data/sample/pmsearch.rb +42 -0
  151. data/sample/psortplot_html.rb +222 -0
  152. data/sample/ssearch2tab.rb +96 -0
  153. data/sample/tdiary.rb +158 -0
  154. data/sample/tfastx2tab.rb +100 -0
  155. data/sample/vs-genes.rb +212 -0
  156. data/test/data/SOSUI/sample.report +11 -0
  157. data/test/data/TMHMM/sample.report +21 -0
  158. data/test/data/blast/eco:b0002.faa +15 -0
  159. data/test/data/blast/eco:b0002.faa.m0 +128 -0
  160. data/test/data/blast/eco:b0002.faa.m7 +65 -0
  161. data/test/data/blast/eco:b0002.faa.m8 +1 -0
  162. data/test/data/embl/AB090716.embl +65 -0
  163. data/test/data/genscan/sample.report +63 -0
  164. data/test/data/prosite/prosite.dat +2233 -0
  165. data/test/data/refseq/nm_126355.entret +64 -0
  166. data/test/data/uniprot/p53_human.uniprot +1456 -0
  167. data/test/runner.rb +10 -0
  168. data/test/unit/bio/appl/blast/test_report.rb +427 -0
  169. data/test/unit/bio/appl/blast/test_xmlparser.rb +400 -0
  170. data/test/unit/bio/appl/genscan/test_report.rb +195 -0
  171. data/test/unit/bio/appl/sosui/test_report.rb +94 -0
  172. data/test/unit/bio/appl/targetp/test_report.rb +159 -0
  173. data/test/unit/bio/appl/test_blast.rb +159 -0
  174. data/test/unit/bio/appl/test_fasta.rb +142 -0
  175. data/test/unit/bio/appl/tmhmm/test_report.rb +139 -0
  176. data/test/unit/bio/data/test_aa.rb +103 -0
  177. data/test/unit/bio/data/test_codontable.rb +120 -0
  178. data/test/unit/bio/data/test_na.rb +89 -0
  179. data/test/unit/bio/db/embl/test_common.rb +130 -0
  180. data/test/unit/bio/db/embl/test_embl.rb +227 -0
  181. data/test/unit/bio/db/embl/test_sptr.rb +268 -0
  182. data/test/unit/bio/db/embl/test_uniprot.rb +44 -0
  183. data/test/unit/bio/db/kegg/test_genes.rb +58 -0
  184. data/test/unit/bio/db/test_fasta.rb +263 -0
  185. data/test/unit/bio/db/test_gff.rb +140 -0
  186. data/test/unit/bio/db/test_prosite.rb +1450 -0
  187. data/test/unit/bio/io/test_ddbjxml.rb +87 -0
  188. data/test/unit/bio/io/test_soapwsdl.rb +45 -0
  189. data/test/unit/bio/shell/plugin/test_seq.rb +175 -0
  190. data/test/unit/bio/test_alignment.rb +1028 -0
  191. data/test/unit/bio/test_command.rb +71 -0
  192. data/test/unit/bio/test_db.rb +109 -0
  193. data/test/unit/bio/test_feature.rb +128 -0
  194. data/test/unit/bio/test_location.rb +51 -0
  195. data/test/unit/bio/test_pathway.rb +485 -0
  196. data/test/unit/bio/test_sequence.rb +386 -0
  197. data/test/unit/bio/test_shell.rb +31 -0
  198. data/test/unit/bio/util/test_color_scheme.rb +45 -0
  199. data/test/unit/bio/util/test_contingency_table.rb +106 -0
  200. data/test/unit/bio/util/test_sirna.rb +258 -0
  201. metadata +295 -0
@@ -0,0 +1,650 @@
1
+ #
2
+ # = bio/location.rb - Locations/Location class (GenBank location format)
3
+ #
4
+ # Copyright:: Copyright (C) 2001, 2005
5
+ # KATAYAMA Toshiaki <k@bioruby.org>
6
+ # License:: LGPL
7
+ #
8
+ # $Id: location.rb,v 0.22 2005/12/18 15:50:06 k Exp $
9
+ #
10
+ # == Appendix : GenBank location descriptor classification
11
+ #
12
+ # === Definition of the position notation of the GenBank location format
13
+ #
14
+ # According to the GenBank manual 'gbrel.txt', I classified position notations
15
+ # into 10 patterns - (A) to (J).
16
+ #
17
+ # 3.4.12.2 Feature Location
18
+ #
19
+ # The second column of the feature descriptor line designates the
20
+ # location of the feature in the sequence. The location descriptor
21
+ # begins at position 22. Several conventions are used to indicate
22
+ # sequence location.
23
+ #
24
+ # Base numbers in location descriptors refer to numbering in the entry,
25
+ # which is not necessarily the same as the numbering scheme used in the
26
+ # published report. The first base in the presented sequence is numbered
27
+ # base 1. Sequences are presented in the 5' to 3' direction.
28
+ #
29
+ # Location descriptors can be one of the following:
30
+ #
31
+ # (A) 1. A single base;
32
+ #
33
+ # (B) 2. A contiguous span of bases;
34
+ #
35
+ # (C) 3. A site between two bases;
36
+ #
37
+ # (D) 4. A single base chosen from a range of bases;
38
+ #
39
+ # (E) 5. A single base chosen from among two or more specified bases;
40
+ #
41
+ # (F) 6. A joining of sequence spans;
42
+ #
43
+ # (G) 7. A reference to an entry other than the one to which the feature
44
+ # belongs (i.e., a remote entry), followed by a location descriptor
45
+ # referring to the remote sequence;
46
+ #
47
+ # (H) 8. A literal sequence (a string of bases enclosed in quotation marks).
48
+ #
49
+ #
50
+ # (C) A site between two residues, such as an endonuclease cleavage site, is
51
+ # indicated by listing the two bases separated by a caret (e.g., 23^24).
52
+ #
53
+ # (D) A single residue chosen from a range of residues is indicated by the
54
+ # number of the first and last bases in the range separated by a single
55
+ # period (e.g., 23.79). The symbols < and > indicate that the end point
56
+ # (I) of the range is beyond the specified base number.
57
+ #
58
+ # (B) A contiguous span of bases is indicated by the number of the first and
59
+ # last bases in the range separated by two periods (e.g., 23..79). The
60
+ # (I) symbols < and > indicate that the end point of the range is beyond the
61
+ # specified base number. Starting and ending positions can be indicated
62
+ # by base number or by one of the operators described below.
63
+ #
64
+ # Operators are prefixes that specify what must be done to the indicated
65
+ # sequence to locate the feature. The following are the operators
66
+ # available, along with their most common format and a description.
67
+ #
68
+ # (J) complement (location): The feature is complementary to the location
69
+ # indicated. Complementary strands are read 5' to 3'.
70
+ #
71
+ # (F) join (location, location, .. location): The indicated elements should
72
+ # be placed end to end to form one contiguous sequence.
73
+ #
74
+ # (F) order (location, location, .. location): The elements are found in the
75
+ # specified order in the 5' to 3' direction, but nothing is implied about
76
+ # the rationality of joining them.
77
+ #
78
+ # (F) group (location, location, .. location): The elements are related and
79
+ # should be grouped together, but no order is implied.
80
+ #
81
+ # (E) one-of (location, location, .. location): The element can be any one,
82
+ # but only one, of the items listed.
83
+ #
84
+ # === Reduction strategy of the position notations
85
+ #
86
+ # (A) Location n
87
+ #
88
+ # (B) Location n..m
89
+ #
90
+ # (C) Location n^m
91
+ #
92
+ # (D) (n.m) => Location n
93
+ #
94
+ # (E) one-of(n,m,..) => Location n
95
+ # one-of(n..m,..) => Location n..m
96
+ #
97
+ # (F) order(loc,loc,..) => join(loc, loc,..)
98
+ # group(loc,loc,..) => join(loc, loc,..)
99
+ # join(loc,loc,..) => Sequence
100
+ #
101
+ # (G) ID:loc => Location with ID
102
+ #
103
+ # (H) "atgc" => Location only with Sequence
104
+ #
105
+ # (I) <n => Location n with lt flag
106
+ # >n => Location n with gt flag
107
+ # <n..m => Location n..m with lt flag
108
+ # n..>m => Location n..m with gt flag
109
+ # <n..>m => Location n..m with lt, gt flag
110
+ #
111
+ # (J) complement(loc) => Sequence
112
+ #
113
+ # (K) replace(loc, str) => Location with replacement Sequence
114
+ #
115
+ # === GenBank location examples
116
+ #
117
+ # (C) n^m
118
+ #
119
+ # * [AB015179] 754^755
120
+ # * [AF179299] complement(53^54)
121
+ # * [CELXOL1ES] replace(4480^4481,"")
122
+ # * [ECOUW87] replace(4792^4793,"a")
123
+ # * [APLPCII] replace(1905^1906,"acaaagacaccgccctacgcc")
124
+ #
125
+ # (D) (n.m)
126
+ #
127
+ # * [HACSODA] 157..(800.806)
128
+ # * [HALSODB] (67.68)..(699.703)
129
+ # * [AP001918] (45934.45974)..46135
130
+ # * [BACSPOJ] <180..(731.761)
131
+ # * [BBU17998] (88.89)..>1122
132
+ # * [ECHTGA] complement((1700.1708)..(1715.1721))
133
+ # * [ECPAP17] complement(<22..(255.275))
134
+ # * [LPATOVGNS] complement((64.74)..1525)
135
+ # * [PIP404CG] join((8298.8300)..10206,1..855)
136
+ # * [BOVMHDQBY4] join(M30006.1:(392.467)..575,M30005.1:415..681,M30004.1:129..410,M30004.1:907..1017,521..534)
137
+ # * [HUMMIC2A] replace((651.655)..(651.655),"")
138
+ # * [HUMSOD102] order(L44135.1:(454.445)..>538,<1..181)
139
+ #
140
+ # (E) one-of
141
+ #
142
+ # * [ECU17136] one-of(898,900)..983
143
+ # * [CELCYT1A] one-of(5971..6308,5971..6309)
144
+ # * [DMU17742] 8050..one-of(10731,10758,10905,11242)
145
+ # * [PFU27807] one-of(623,627,632)..one-of(628,633,637)
146
+ # * [BTBAINH1] one-of(845,953,963,1078,1104)..1354
147
+ # * [ATU39449] join(one-of(969..1094,970..1094,995..1094,1018..1094),1518..1587,1726..2119,2220..2833,2945..3215)
148
+ #
149
+ # (F) join, order, group
150
+ #
151
+ # * [AB037374S2] join(AB037374.1:1..177,1..807)
152
+ # * [AP000001] join(complement(1..61),complement(AP000007.1:252907..253505))
153
+ # * [ASNOS11] join(AF130124.1:<2563..2964,AF130125.1:21..157,AF130126.1:12..174,AF130127.1:21..112,AF130128.1:21..162,AF130128.1:281..595,AF130128.1:661..842,AF130128.1:916..1030,AF130129.1:21..115,AF130130.1:21..165,AF130131.1:21..125,AF130132.1:21..428,AF130132.1:492..746,AF130133.1:21..168,AF130133.1:232..401,AF130133.1:475..906,AF130133.1:970..1107,AF130133.1:1176..1367,21..>128)
154
+ #
155
+ # * [AARPOB2] order(AF194507.1:<1..510,1..>871)
156
+ # * [AF006691] order(912..1918,20410..21416)
157
+ # * [AF024666] order(complement(18919..19224),complement(13965..14892))
158
+ # * [AF264948] order(27066..27076,27089..27099,27283..27314,27330..27352)
159
+ # * [D63363] order(3..26,complement(964..987))
160
+ # * [ECOCURLI2] order(complement(1009..>1260),complement(AF081827.1:<1..177))
161
+ # * [S72388S2] order(join(S72388.1:757..911,S72388.1:609..1542),1..>139)
162
+ # * [HEYRRE07] order(complement(1..38),complement(M82666.1:1..140),complement(M82665.1:1..176),complement(M82664.1:1..215),complement(M82663.1:1..185),complement(M82662.1:1..49),complement(M82661.1:1..133))
163
+ # * [COL11A1G34] order(AF101079.1:558..1307,AF101080.1:1..749,AF101081.1:1..898,AF101082.1:1..486,AF101083.1:1..942,AF101084.1:1..1734,AF101085.1:1..2385,AF101086.1:1..1813,AF101087.1:1..2287,AF101088.1:1..1073,AF101089.1:1..989,AF101090.1:1..5017,AF101091.1:1..3401,AF101092.1:1..1225,AF101093.1:1..1072,AF101094.1:1..989,AF101095.1:1..1669,AF101096.1:1..918,AF101097.1:1..1114,AF101098.1:1..1074,AF101099.1:1..1709,AF101100.1:1..986,AF101101.1:1..1934,AF101102.1:1..1699,AF101103.1:1..940,AF101104.1:1..2330,AF101105.1:1..4467,AF101106.1:1..1876,AF101107.1:1..2465,AF101108.1:1..1150,AF101109.1:1..1170,AF101110.1:1..1158,AF101111.1:1..1193,1..611)
164
+ #
165
+ # group() are found in the COMMENT field only (in GenBank 122.0)
166
+ #
167
+ # gbpat2.seq: FT repeat_region group(598..606,611..619)
168
+ # gbpat2.seq: FT repeat_region group(8..16,1457..1464).
169
+ # gbpat2.seq: FT variation group(t1,t2)
170
+ # gbpat2.seq: FT variation group(t1,t3)
171
+ # gbpat2.seq: FT variation group(t1,t2,t3)
172
+ # gbpat2.seq: FT repeat_region group(11..202,203..394)
173
+ # gbpri9.seq:COMMENT Residues reported = 'group(1..2145);'.
174
+ #
175
+ # (G) ID:location
176
+ #
177
+ # * [AARPOB2] order(AF194507.1:<1..510,1..>871)
178
+ # * [AF178221S4] join(AF178221.1:<1..60,AF178222.1:1..63,AF178223.1:1..42,1..>90)
179
+ # * [BOVMHDQBY4] join(M30006.1:(392.467)..575,M30005.1:415..681,M30004.1:129..410,M30004.1:907..1017,521..534)
180
+ # * [HUMSOD102] order(L44135.1:(454.445)..>538,<1..181)
181
+ # * [SL16SRRN1] order(<1..>267,X67092.1:<1..>249,X67093.1:<1..>233)
182
+ #
183
+ # (I) <, >
184
+ #
185
+ # * [A5U48871] <1..>318
186
+ # * [AA23SRRNP] <1..388
187
+ # * [AA23SRRNP] 503..>1010
188
+ # * [AAM5961] complement(<1..229)
189
+ # * [AAM5961] complement(5231..>5598)
190
+ # * [AF043934] join(<1,60..99,161..241,302..370,436..594,676..887,993..1141,1209..1329,1387..1559,1626..1646,1708..>1843)
191
+ # * [BACSPOJ] <180..(731.761)
192
+ # * [BBU17998] (88.89)..>1122
193
+ # * [AARPOB2] order(AF194507.1:<1..510,1..>871)
194
+ # * [SL16SRRN1] order(<1..>267,X67092.1:<1..>249,X67093.1:<1..>233)
195
+ #
196
+ # (J) complement
197
+ #
198
+ # * [AF179299] complement(53^54) <= hoge insertion site etc.
199
+ # * [AP000001] join(complement(1..61),complement(AP000007.1:252907..253505))
200
+ # * [AF209868S2] order(complement(1..>308),complement(AF209868.1:75..336))
201
+ # * [AP000001] join(complement(1..61),complement(AP000007.1:252907..253505))
202
+ # * [CPPLCG] complement(<1..(1093.1098))
203
+ # * [D63363] order(3..26,complement(964..987))
204
+ # * [ECHTGA] complement((1700.1708)..(1715.1721))
205
+ # * [ECOUXW] order(complement(1658..1663),complement(1636..1641))
206
+ # * [LPATOVGNS] complement((64.74)..1525)
207
+ # * [AF129075] complement(join(71606..71829,75327..75446,76039..76203,76282..76353,76914..77029,77114..77201,77276..77342,78138..78316,79755..79892,81501..81562,81676..81856,82341..82490,84208..84287,85032..85122,88316..88403))
208
+ # * [ZFDYST2] join(AF137145.1:<1..18,complement(<1..99))
209
+ #
210
+ # (K) replace
211
+ #
212
+ # * [CSU27710] replace(64,"A")
213
+ # * [CELXOL1ES] replace(5256,"t")
214
+ # * [ANICPC] replace(1..468,"")
215
+ # * [CSU27710] replace(67..68,"GC")
216
+ # * [CELXOL1ES] replace(4480^4481,"") <= ? only one case in GenBank 122.0
217
+ # * [ECOUW87] replace(4792^4793,"a")
218
+ # * [CEU34893] replace(1..22,"ggttttaacccagttactcaag")
219
+ # * [APLPCII] replace(1905^1906,"acaaagacaccgccctacgcc")
220
+ # * [MBDR3S1] replace(1400..>9281,"")
221
+ # * [HUMMHDPB1F] replace(complement(36..37),"ttc")
222
+ # * [HUMMIC2A] replace((651.655)..(651.655),"")
223
+ # * [LEIMDRPGP] replace(1..1554,"L01572")
224
+ # * [TRBND3] replace(376..395,"atttgtgtgtggtaatta")
225
+ # * [TRBND3] replace(376..395,"atttgtgtgggtaatttta")
226
+ # * [TRBND3] replace(376..395,"attttgttgttgttttgttttgaatta")
227
+ # * [TRBND3] replace(376..395,"atgtgtggtgaatta")
228
+ # * [TRBND3] replace(376..395,"atgtgtgtggtaatta")
229
+ # * [TRBND3] replace(376..395,"gatttgttgtggtaatttta")
230
+ # * [MSU09460] replace(193, <= replace(193, "t")
231
+ # * [HUMMAGE12X] replace(3002..3003, <= replace(3002..3003, "GC")
232
+ # * [ADR40FIB] replace(510..520, <= replace(510..520, "taatcctaccg")
233
+ # * [RATDYIIAAB] replace(1306..1443,"aagaacatccacggagtcagaactgggctcttcacgccggatttggcgttcgaggccattgtgaaaaagcaggcaatgcaccagcaagctcagttcctacccctgcgtggacctggttatccaggagctaatcagtacagttaggtggtcaagctgaaagagccctgtctgaaa")
234
+ #
235
+ #--
236
+ #
237
+ # This library is free software; you can redistribute it and/or
238
+ # modify it under the terms of the GNU Lesser General Public
239
+ # License as published by the Free Software Foundation; either
240
+ # version 2 of the License, or (at your option) any later version.
241
+ #
242
+ # This library is distributed in the hope that it will be useful,
243
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
244
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
245
+ # Lesser General Public License for more details.
246
+ #
247
+ # You should have received a copy of the GNU Lesser General Public
248
+ # License along with this library; if not, write to the Free Software
249
+ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
250
+ #
251
+ #++
252
+ #
253
+
254
+ module Bio
255
+
256
# A single segment of a GenBank style location: a start/end pair (or a
# literal sequence) together with its strand, its open-end flags and an
# optional reference to another entry.
class Location

  attr_accessor :from, :to, :strand, :sequence, :lt, :gt, :xref_id

  # Parses one 'location' segment.  The segment has the form
  # 'ID:' + ('n' or 'n..m' or 'n^m' or "seq"), where the 'ID:' part is
  # optional and positions may be prefixed by '<' or '>'.
  #
  # Called without an argument (or with nil) it builds an empty Location
  # whose +from+, +to+ and +sequence+ are all nil.
  #
  # Raises RuntimeError when the segment matches none of the known forms.
  # Suspicious ranges (n..m with m < n, or n^m with m != n + 1) are only
  # reported on $stderr, and only when $DEBUG is set.
  def initialize(location = nil)
    xref_id = lt = gt = sequence = nil
    first = last = nil

    if location
      # (G) ID:location -- detach the reference to the other entry
      xref_id, location = location.split(':') if location =~ /:/
      # (I) <,> -- remember which open-end markers were present
      lt = true if location =~ /</
      gt = true if location =~ />/
    end

    case location
    when nil
      # nothing to parse: every attribute except the strand stays nil
    when /^[<>]?(\d+)$/                         # (A, I) n
      first = last = $1.to_i
    when /^[<>]?(\d+)\.\.[<>]?(\d+)$/           # (B, I) n..m
      first, last = $1.to_i, $2.to_i
      report_invalid_range(location) if last < first
    when /^[<>]?(\d+)\^[<>]?(\d+)$/             # (C, I) n^m
      first, last = $1.to_i, $2.to_i
      report_invalid_range(location) unless last - first == 1
    when /^"?([ATGCatgc]+)"?$/                  # (H) literal sequence
      sequence = $1.downcase
    else
      raise "Error: unknown location format : #{location}"
    end

    @from     = first      # start position of the location
    @to       = last       # end position of the location
    @strand   = 1          # forward => 1 or complement => -1
    @sequence = sequence   # literal sequence of the location
    @lt       = lt         # true if the position contains '<'
    @gt       = gt         # true if the position contains '>'
    @xref_id  = xref_id    # link to the external entry as GenBank ID
  end

  # Flips the strand of this location in place and returns the
  # Location itself.
  def complement
    @strand = -@strand
    self
  end

  # Stores the lower-cased +sequence+ as the literal sequence of this
  # location and returns the Location itself.
  def replace(sequence)
    @sequence = sequence.downcase
    self
  end

  # Returns the segment as a Range object (from..to).
  def range
    Range.new(@from, @to)
  end

  private

  # Emits the invalid-range warning on $stderr when running with $DEBUG.
  def report_invalid_range(location)
    $stderr.puts "[Warning] invalid range : #{location}" if $DEBUG
  end

end # class location
331
+
332
+
333
# An ordered, Enumerable list of Location objects built from one GenBank
# style position string such as 'join(1..5,complement(10..20))'.
#
# The position string is first normalised (white space removed, '(n.m)' and
# 'one-of()' reduced to a single position, 'order()' / 'group()' rewritten
# as 'join()') and then parsed recursively into Location objects.
class Locations

  include Enumerable

  # Parses a GenBank style position string and stores the resulting
  # Location objects.  An Array is stored as-is and is expected to already
  # contain Location objects.
  #
  # The string passed in is never modified, so frozen strings are accepted.
  # An unrecognised segment makes Location.new raise.
  def initialize(position)
    if position.is_a? Array
      @locations = position
    else
      position = gbl_cleanup(position)      # preprocessing
      @locations = gbl_pos2loc(position)    # create an Array of Location
    end
  end
  attr_accessor :locations

  # Iterates on each Location object.
  def each
    @locations.each do |location|
      yield(location)
    end
  end

  # Returns nth Location object.
  def [](n)
    @locations[n]
  end

  # Returns first Location object.
  def first
    @locations.first
  end

  # Returns last Location object.
  def last
    @locations.last
  end

  # Returns an Array containing overall min and max position [min, max]
  # of this Locations object.
  def span
    span_min = @locations.min { |a, b| a.from <=> b.from }
    span_max = @locations.max { |a, b| a.to <=> b.to }
    return span_min.from, span_max.to
  end

  # Similar to span, but returns a Range object min..max
  def range
    min, max = span
    min..max
  end

  # Returns the total length of all segments, i.e. the length of the
  # spliced RNA.  A segment carrying a literal sequence contributes the
  # size of that sequence instead of (to - from + 1).
  def length
    @locations.inject(0) { |sum, location| sum + gbl_segment_length(location) }
  end
  alias size length

  # Convert absolute position in DNA (na) to relative position in RNA (na).
  # If type == :aa,
  # convert absolute position in DNA (na) to relative position in Protein (aa).
  #
  # Returns nil when +n+ lies outside every segment.  The :location type is
  # accepted but not implemented and always yields nil.
  def relative(n, type = nil)
    case type
    when :location
      nil
    when :aa
      na = abs2rel(n)
      na ? (na - 1) / 3 + 1 : nil
    else
      abs2rel(n)
    end
  end

  # Convert relative position in RNA (na) to absolute position in DNA (na).
  # If type == :aa,
  # convert relative position in Protein (aa) -> absolute position in DNA (na).
  #
  # Returns nil when +n+ is out of range.  The :location type is accepted
  # but not implemented and always yields nil.
  #
  # * Examples
  #
  #   loc = Bio::Locations.new('complement(12838..13533)')
  #   loc.absolute(10)          #=> 13524 (rel2abs)
  #   loc.relative(13524)       #=> 10    (abs2rel)
  #   loc.absolute(10, :aa)     #=> 13506 (rel2abs)
  #   loc.relative(13506, :aa)  #=> 10    (abs2rel)
  #
  def absolute(n, type = nil)
    case type
    when :location
      nil
    when :aa
      rel2abs((n - 1) * 3 + 1)    # first base of the n-th codon
    else
      rel2abs(n)
    end
  end


  private


  # Preprocessing to clean up the position notation.  Works on a copy, so
  # the caller's string is left untouched.
  def gbl_cleanup(position)
    # sometimes position contains white spaces...
    # (non-destructive gsub: this also creates the private working copy)
    position = position.gsub(/\s+/, '')

    # select one base                                   # (D) n.m
    #                ..       n          m
    #     <match>    $1   (   $2         $3
    # The look-ahead rejects any match that is directly followed by more
    # position characters and a ':'; without the character class the engine
    # could backtrack into an 'ACCESSION.version:' prefix whose version has
    # two or more digits and mangle it.
    position.gsub!(/(\.{2})?\(?([<>\d]+)\.([<>\d]+)(?![<>\d]*:)\)?/) do |match|
      if $1
        $1 + $3                 # ..(n.m)  => ..m
      else
        $2                      # (?n.m)?  => n
      end
    end

    # select the 1st location                           # (E) one-of()
    #     <match>    ..   one-of  ($2     ,$3     )
    position.gsub!(/(\.{2})?one-of\(([^,]+),([^)]+)\)/) do |match|
      if $1
        $1 + $3.gsub(/.*,(.*)/, '\1')   # ..one-of(n,m) => ..m
      else
        $2                              # one-of(n,m)   => n
      end
    end

    # substitute order(), group() by join()             # (F) group(), order()
    position.gsub!(/(order|group)/, 'join')

    return position
  end


  # Parse position notation and create Location objects.
  # Returns a flat Array of Location objects.
  def gbl_pos2loc(position)
    ary = []

    case position

    when /^join\((.*)\)$/                               # (F) join()
      gbl_split_list($1).each do |sub_position|
        ary << gbl_pos2loc(sub_position)
      end

    when /^complement\((.*)\)$/                         # (J) complement()
      # the segments of a complemented list are read in reverse order
      gbl_pos2loc($1).reverse_each do |location|
        ary << location.complement
      end

    when /^replace\(([^,]+),"?([^"]*)"?\)/              # (K) replace()
      # copy both captures before recursing, which overwrites $1 and $2
      target   = $1
      sequence = $2
      ary << gbl_pos2loc(target).first.replace(sequence)

    else                                                # (A, B, C, G, H, I)
      ary << Location.new(position)

    end

    return ary.flatten
  end


  # Splits the argument list of a join() on its top-level commas only.
  # Pieces are glued back together until the parentheses opened so far are
  # balanced again, so nested operators at any depth stay in one element.
  # A trailing group whose parentheses are never closed is dropped.
  def gbl_split_list(list)
    elements = []
    pending  = []     # pieces of the element currently being assembled
    depth    = 0      # number of parentheses still open

    list.split(',').each do |piece|
      pending << piece
      depth += piece.count('(') - piece.count(')')
      next if depth > 0

      elements << pending.join(',')
      pending.clear   # start the next element from scratch
      depth = 0
    end

    elements
  end


  # Number of bases one Location contributes to the spliced sequence.
  def gbl_segment_length(location)
    if location.sequence
      location.sequence.size
    else
      location.to - location.from + 1
    end
  end


  # Convert the relative position to the absolute position
  def rel2abs(n)
    return nil unless n > 0                     # out of range

    cursor = 0    # bases covered by the segments already passed
    @locations.each do |x|
      len = gbl_segment_length(x)
      if n > cursor + len
        cursor += len
      elsif x.strand < 0
        return x.to - (n - cursor - 1)          # count down from the end
      else
        return x.from + (n - cursor - 1)
      end
    end
    return nil                                  # out of range
  end

  # Convert the absolute position to the relative position
  def abs2rel(n)
    return nil unless n > 0                     # out of range

    cursor = 0    # bases covered by the segments already passed
    @locations.each do |x|
      len = gbl_segment_length(x)
      if n < x.from || n > x.to
        cursor += len
      elsif x.strand < 0
        return x.to - (n - cursor - 1)          # count down from the end
      else
        return n + cursor + 1 - x.from
      end
    end
    return nil                                  # out of range
  end

end # class Locations
586
+
587
+ end # module Bio
588
+
589
+
590
# Self test: runs only when this file is executed directly.  It prints the
# parse results of a set of sample GenBank positions, followed by round
# trips through absolute() / relative() for a few spliced locations.
if __FILE__ == $0
  parse_samples = [
    '754^755',
    'complement(53^54)',
    'replace(4792^4793,"a")',
    'replace(1905^1906,"acaaagacaccgccctacgcc")',
    '157..(800.806)',
    '(67.68)..(699.703)',
    '(45934.45974)..46135',
    '<180..(731.761)',
    '(88.89)..>1122',
    'complement((1700.1708)..(1715.1721))',
    'complement(<22..(255.275))',
    'complement((64.74)..1525)',
    'join((8298.8300)..10206,1..855)',
    'replace((651.655)..(651.655),"")',
    'one-of(898,900)..983',
    'one-of(5971..6308,5971..6309)',
    '8050..one-of(10731,10758,10905,11242)',
    'one-of(623,627,632)..one-of(628,633,637)',
    'one-of(845,953,963,1078,1104)..1354',
    'join(2035..2050,complement(1775..1818),13..345,414..992,1232..1253,1024..1157)',
    'join(complement(1..61),complement(AP000007.1:252907..253505))',
    'complement(join(71606..71829,75327..75446,76039..76203))',
    'order(3..26,complement(964..987))',
    'order(L44135.1:(454.445)..>538,<1..181)',
    '<200001..<318389',
  ]

  conversion_samples = [
    '6..15',
    'join(6..10,16..30)',
    'complement(join(6..10,16..30))',
    'join(complement(6..10),complement(16..30))',
    'join(6..10,complement(16..30))',
  ]

  puts "Test new & span methods"
  parse_samples.each do |pos|
    p pos
    # a fresh object is parsed for every line of output
    p Bio::Locations.new(pos).span
    p Bio::Locations.new(pos).range
    p Bio::Locations.new(pos)
  end

  puts "Test rel2abs/abs2rel method"
  conversion_samples.each do |pos|
    loc = Bio::Locations.new(pos)
    p pos
    # p loc
    1.upto(21) do |x|
      # nucleotide coordinates: relative -> absolute -> relative
      y = loc.absolute(x)
      print "absolute(#{x}) #=> ", y, "\n"
      back = y ? loc.relative(y) : y
      print "relative(#{y}) #=> ", back, "\n"

      # amino acid coordinates: relative -> absolute -> relative
      y = loc.absolute(x, :aa)
      print "absolute(#{x}, :aa) #=> ", y, "\n"
      back = y ? loc.relative(y, :aa) : y
      print "relative(#{y}, :aa) #=> ", back, "\n"
    end
  end

  # show one individual Location and its Range
  pos = 'join(complement(6..10),complement(16..30))'
  loc = Bio::Locations.new(pos)
  print "pos : "
  p pos
  print "`- loc[1] : "
  p loc[1]
  print " `- range : "
  p loc[1].range
end
650
+