bio 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. data/ChangeLog +3421 -0
  2. data/KNOWN_ISSUES.rdoc +88 -0
  3. data/README.rdoc +252 -0
  4. data/README_DEV.rdoc +285 -0
  5. data/Rakefile +143 -0
  6. data/bin/bioruby +0 -0
  7. data/bin/br_biofetch.rb +0 -0
  8. data/bin/br_bioflat.rb +12 -1
  9. data/bin/br_biogetseq.rb +0 -0
  10. data/bin/br_pmfetch.rb +4 -3
  11. data/bioruby.gemspec +477 -0
  12. data/bioruby.gemspec.erb +117 -0
  13. data/doc/Changes-0.7.rd +7 -0
  14. data/doc/Changes-1.3.rdoc +239 -0
  15. data/doc/Tutorial.rd +296 -184
  16. data/doc/Tutorial.rd.html +1031 -0
  17. data/doc/Tutorial.rd.ja +111 -45
  18. data/doc/Tutorial.rd.ja.html +2225 -0
  19. data/doc/bioruby.css +281 -0
  20. data/extconf.rb +2 -0
  21. data/lib/bio.rb +29 -4
  22. data/lib/bio/appl/blast.rb +306 -121
  23. data/lib/bio/appl/blast/ddbj.rb +142 -0
  24. data/lib/bio/appl/blast/format0.rb +35 -25
  25. data/lib/bio/appl/blast/format8.rb +2 -2
  26. data/lib/bio/appl/blast/genomenet.rb +263 -0
  27. data/lib/bio/appl/blast/ncbioptions.rb +220 -0
  28. data/lib/bio/appl/blast/remote.rb +106 -0
  29. data/lib/bio/appl/blast/report.rb +260 -9
  30. data/lib/bio/appl/blast/rexml.rb +12 -5
  31. data/lib/bio/appl/blast/rpsblast.rb +277 -0
  32. data/lib/bio/appl/blast/wublast.rb +133 -12
  33. data/lib/bio/appl/blast/xmlparser.rb +35 -18
  34. data/lib/bio/appl/blat/report.rb +46 -5
  35. data/lib/bio/appl/emboss.rb +62 -13
  36. data/lib/bio/appl/fasta.rb +9 -11
  37. data/lib/bio/appl/genscan/report.rb +3 -3
  38. data/lib/bio/appl/hmmer.rb +1 -1
  39. data/lib/bio/appl/hmmer/report.rb +10 -10
  40. data/lib/bio/appl/paml/baseml.rb +95 -0
  41. data/lib/bio/appl/paml/baseml/report.rb +32 -0
  42. data/lib/bio/appl/paml/codeml.rb +242 -0
  43. data/lib/bio/appl/paml/codeml/rates.rb +67 -0
  44. data/lib/bio/appl/paml/codeml/report.rb +67 -0
  45. data/lib/bio/appl/paml/common.rb +348 -0
  46. data/lib/bio/appl/paml/common_report.rb +38 -0
  47. data/lib/bio/appl/paml/yn00.rb +103 -0
  48. data/lib/bio/appl/paml/yn00/report.rb +32 -0
  49. data/lib/bio/appl/psort.rb +2 -2
  50. data/lib/bio/appl/pts1.rb +5 -5
  51. data/lib/bio/appl/tmhmm/report.rb +10 -1
  52. data/lib/bio/command.rb +297 -41
  53. data/lib/bio/compat/features.rb +157 -0
  54. data/lib/bio/compat/references.rb +128 -0
  55. data/lib/bio/db/biosql/biosql_to_biosequence.rb +67 -0
  56. data/lib/bio/db/biosql/sequence.rb +508 -0
  57. data/lib/bio/db/embl/common.rb +28 -12
  58. data/lib/bio/db/embl/embl.rb +107 -9
  59. data/lib/bio/db/embl/embl_to_biosequence.rb +85 -0
  60. data/lib/bio/db/embl/format_embl.rb +190 -0
  61. data/lib/bio/db/embl/sptr.rb +15 -16
  62. data/lib/bio/db/fantom.rb +6 -8
  63. data/lib/bio/db/fasta.rb +10 -507
  64. data/lib/bio/db/fasta/defline.rb +532 -0
  65. data/lib/bio/db/fasta/fasta_to_biosequence.rb +63 -0
  66. data/lib/bio/db/fasta/format_fasta.rb +97 -0
  67. data/lib/bio/db/genbank/common.rb +25 -8
  68. data/lib/bio/db/genbank/format_genbank.rb +187 -0
  69. data/lib/bio/db/genbank/genbank.rb +36 -1
  70. data/lib/bio/db/genbank/genbank_to_biosequence.rb +86 -0
  71. data/lib/bio/db/gff.rb +1791 -119
  72. data/lib/bio/db/kegg/glycan.rb +2 -6
  73. data/lib/bio/db/lasergene.rb +3 -3
  74. data/lib/bio/db/medline.rb +4 -1
  75. data/lib/bio/db/newick.rb +10 -10
  76. data/lib/bio/db/pdb/chain.rb +6 -2
  77. data/lib/bio/db/pdb/pdb.rb +12 -3
  78. data/lib/bio/db/rebase.rb +7 -8
  79. data/lib/bio/db/soft.rb +3 -3
  80. data/lib/bio/feature.rb +1 -88
  81. data/lib/bio/io/biosql/biodatabase.rb +64 -0
  82. data/lib/bio/io/biosql/bioentry.rb +29 -0
  83. data/lib/bio/io/biosql/bioentry_dbxref.rb +11 -0
  84. data/lib/bio/io/biosql/bioentry_path.rb +12 -0
  85. data/lib/bio/io/biosql/bioentry_qualifier_value.rb +10 -0
  86. data/lib/bio/io/biosql/bioentry_reference.rb +10 -0
  87. data/lib/bio/io/biosql/bioentry_relationship.rb +10 -0
  88. data/lib/bio/io/biosql/biosequence.rb +11 -0
  89. data/lib/bio/io/biosql/comment.rb +7 -0
  90. data/lib/bio/io/biosql/config/database.yml +20 -0
  91. data/lib/bio/io/biosql/dbxref.rb +13 -0
  92. data/lib/bio/io/biosql/dbxref_qualifier_value.rb +12 -0
  93. data/lib/bio/io/biosql/location.rb +32 -0
  94. data/lib/bio/io/biosql/location_qualifier_value.rb +11 -0
  95. data/lib/bio/io/biosql/ontology.rb +10 -0
  96. data/lib/bio/io/biosql/reference.rb +9 -0
  97. data/lib/bio/io/biosql/seqfeature.rb +32 -0
  98. data/lib/bio/io/biosql/seqfeature_dbxref.rb +11 -0
  99. data/lib/bio/io/biosql/seqfeature_path.rb +11 -0
  100. data/lib/bio/io/biosql/seqfeature_qualifier_value.rb +20 -0
  101. data/lib/bio/io/biosql/seqfeature_relationship.rb +11 -0
  102. data/lib/bio/io/biosql/taxon.rb +12 -0
  103. data/lib/bio/io/biosql/taxon_name.rb +9 -0
  104. data/lib/bio/io/biosql/term.rb +27 -0
  105. data/lib/bio/io/biosql/term_dbxref.rb +11 -0
  106. data/lib/bio/io/biosql/term_path.rb +12 -0
  107. data/lib/bio/io/biosql/term_relationship.rb +13 -0
  108. data/lib/bio/io/biosql/term_relationship_term.rb +11 -0
  109. data/lib/bio/io/biosql/term_synonym.rb +10 -0
  110. data/lib/bio/io/das.rb +7 -7
  111. data/lib/bio/io/ddbjxml.rb +57 -0
  112. data/lib/bio/io/ensembl.rb +2 -2
  113. data/lib/bio/io/fetch.rb +28 -14
  114. data/lib/bio/io/flatfile.rb +17 -853
  115. data/lib/bio/io/flatfile/autodetection.rb +545 -0
  116. data/lib/bio/io/flatfile/buffer.rb +237 -0
  117. data/lib/bio/io/flatfile/index.rb +17 -7
  118. data/lib/bio/io/flatfile/indexer.rb +30 -12
  119. data/lib/bio/io/flatfile/splitter.rb +297 -0
  120. data/lib/bio/io/hinv.rb +442 -0
  121. data/lib/bio/io/keggapi.rb +2 -2
  122. data/lib/bio/io/ncbirest.rb +733 -0
  123. data/lib/bio/io/pubmed.rb +34 -80
  124. data/lib/bio/io/registry.rb +2 -2
  125. data/lib/bio/io/sql.rb +178 -357
  126. data/lib/bio/io/togows.rb +458 -0
  127. data/lib/bio/location.rb +106 -11
  128. data/lib/bio/pathway.rb +120 -14
  129. data/lib/bio/reference.rb +115 -101
  130. data/lib/bio/sequence.rb +164 -183
  131. data/lib/bio/sequence/adapter.rb +108 -0
  132. data/lib/bio/sequence/common.rb +22 -45
  133. data/lib/bio/sequence/compat.rb +2 -2
  134. data/lib/bio/sequence/dblink.rb +54 -0
  135. data/lib/bio/sequence/format.rb +254 -77
  136. data/lib/bio/sequence/format_raw.rb +23 -0
  137. data/lib/bio/shell.rb +3 -1
  138. data/lib/bio/shell/core.rb +2 -2
  139. data/lib/bio/shell/plugin/entry.rb +33 -4
  140. data/lib/bio/shell/plugin/ncbirest.rb +64 -0
  141. data/lib/bio/shell/plugin/togows.rb +40 -0
  142. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/bioruby_generator.rb +0 -0
  143. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_classes.rhtml +0 -0
  144. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_log.rhtml +0 -0
  145. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_methods.rhtml +0 -0
  146. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_modules.rhtml +0 -0
  147. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/_variables.rhtml +0 -0
  148. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-bg.gif +0 -0
  149. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-gem.png +0 -0
  150. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby-link.gif +0 -0
  151. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.css +0 -0
  152. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby.rhtml +0 -0
  153. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_controller.rb +0 -0
  154. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/bioruby_helper.rb +0 -0
  155. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/commands.rhtml +0 -0
  156. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/history.rhtml +0 -0
  157. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/index.rhtml +0 -0
  158. data/lib/bio/shell/rails/vendor/plugins/{generators → bioruby/generators}/bioruby/templates/spinner.gif +0 -0
  159. data/lib/bio/tree.rb +4 -2
  160. data/lib/bio/util/color_scheme.rb +2 -2
  161. data/lib/bio/util/contingency_table.rb +2 -2
  162. data/lib/bio/util/restriction_enzyme.rb +2 -2
  163. data/lib/bio/util/restriction_enzyme/single_strand.rb +6 -5
  164. data/lib/bio/version.rb +25 -0
  165. data/rdoc.zsh +8 -0
  166. data/sample/any2fasta.rb +0 -0
  167. data/sample/biofetch.rb +0 -0
  168. data/sample/dbget +0 -0
  169. data/sample/demo_sequence.rb +158 -0
  170. data/sample/enzymes.rb +0 -0
  171. data/sample/fasta2tab.rb +0 -0
  172. data/sample/fastagrep.rb +72 -0
  173. data/sample/fastasort.rb +54 -0
  174. data/sample/fsplit.rb +0 -0
  175. data/sample/gb2fasta.rb +2 -3
  176. data/sample/gb2tab.rb +0 -0
  177. data/sample/gbtab2mysql.rb +0 -0
  178. data/sample/genes2nuc.rb +0 -0
  179. data/sample/genes2pep.rb +0 -0
  180. data/sample/genes2tab.rb +0 -0
  181. data/sample/genome2rb.rb +0 -0
  182. data/sample/genome2tab.rb +0 -0
  183. data/sample/goslim.rb +0 -0
  184. data/sample/gt2fasta.rb +0 -0
  185. data/sample/na2aa.rb +34 -0
  186. data/sample/pmfetch.rb +0 -0
  187. data/sample/pmsearch.rb +0 -0
  188. data/sample/ssearch2tab.rb +0 -0
  189. data/sample/tfastx2tab.rb +0 -0
  190. data/sample/vs-genes.rb +0 -0
  191. data/setup.rb +1596 -0
  192. data/test/data/blast/blastp-multi.m7 +188 -0
  193. data/test/data/command/echoarg2.bat +1 -0
  194. data/test/data/paml/codeml/control_file.txt +30 -0
  195. data/test/data/paml/codeml/output.txt +78 -0
  196. data/test/data/paml/codeml/rates +217 -0
  197. data/test/data/rpsblast/misc.rpsblast +193 -0
  198. data/test/data/soft/GDS100_partial.soft +0 -0
  199. data/test/data/soft/GSE3457_family_partial.soft +0 -0
  200. data/test/functional/bio/appl/test_pts1.rb +115 -0
  201. data/test/functional/bio/io/test_ensembl.rb +123 -80
  202. data/test/functional/bio/io/test_togows.rb +267 -0
  203. data/test/functional/bio/sequence/test_output_embl.rb +51 -0
  204. data/test/functional/bio/test_command.rb +301 -0
  205. data/test/runner.rb +17 -1
  206. data/test/unit/bio/appl/blast/test_ncbioptions.rb +112 -0
  207. data/test/unit/bio/appl/blast/test_report.rb +753 -35
  208. data/test/unit/bio/appl/blast/test_rpsblast.rb +398 -0
  209. data/test/unit/bio/appl/paml/codeml/test_rates.rb +45 -0
  210. data/test/unit/bio/appl/paml/codeml/test_report.rb +45 -0
  211. data/test/unit/bio/appl/paml/test_codeml.rb +174 -0
  212. data/test/unit/bio/appl/test_blast.rb +135 -4
  213. data/test/unit/bio/appl/test_fasta.rb +2 -2
  214. data/test/unit/bio/appl/test_pts1.rb +1 -64
  215. data/test/unit/bio/db/embl/test_common.rb +15 -15
  216. data/test/unit/bio/db/embl/test_embl.rb +4 -4
  217. data/test/unit/bio/db/embl/test_embl_rel89.rb +5 -5
  218. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +203 -0
  219. data/test/unit/bio/db/embl/test_sptr.rb +38 -1
  220. data/test/unit/bio/db/pdb/test_pdb.rb +2 -2
  221. data/test/unit/bio/db/test_gff.rb +1151 -25
  222. data/test/unit/bio/db/test_medline.rb +127 -0
  223. data/test/unit/bio/db/test_nexus.rb +5 -1
  224. data/test/unit/bio/db/test_prosite.rb +4 -4
  225. data/test/unit/bio/io/flatfile/test_autodetection.rb +375 -0
  226. data/test/unit/bio/io/flatfile/test_buffer.rb +251 -0
  227. data/test/unit/bio/io/flatfile/test_splitter.rb +369 -0
  228. data/test/unit/bio/io/test_ddbjxml.rb +8 -3
  229. data/test/unit/bio/io/test_fastacmd.rb +5 -5
  230. data/test/unit/bio/io/test_flatfile.rb +357 -106
  231. data/test/unit/bio/io/test_soapwsdl.rb +2 -2
  232. data/test/unit/bio/io/test_togows.rb +161 -0
  233. data/test/unit/bio/sequence/test_common.rb +210 -11
  234. data/test/unit/bio/sequence/test_compat.rb +3 -3
  235. data/test/unit/bio/sequence/test_dblink.rb +58 -0
  236. data/test/unit/bio/sequence/test_na.rb +2 -2
  237. data/test/unit/bio/test_command.rb +111 -50
  238. data/test/unit/bio/test_feature.rb +29 -1
  239. data/test/unit/bio/test_location.rb +566 -6
  240. data/test/unit/bio/test_pathway.rb +91 -65
  241. data/test/unit/bio/test_reference.rb +67 -13
  242. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +3 -3
  243. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +3 -3
  244. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +3 -3
  245. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -3
  246. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +3 -3
  247. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +3 -3
  248. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +3 -3
  249. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +3 -3
  250. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +3 -3
  251. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +3 -3
  252. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -4
  253. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +3 -3
  254. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +3 -3
  255. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +3 -3
  256. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +3 -3
  257. data/test/unit/bio/util/test_restriction_enzyme.rb +3 -3
  258. metadata +202 -167
  259. data/test/unit/bio/appl/blast/test_xmlparser.rb +0 -388
@@ -0,0 +1,532 @@
1
+ #
2
+ # = bio/db/fasta/defline.rb - FASTA defline parser class
3
+ #
4
+ # Copyright:: Copyright (C) 2001, 2002
5
+ # GOTO Naohisa <ngoto@gen-info.osaka-u.ac.jp>,
6
+ # Toshiaki Katayama <k@bioruby.org>
7
+ # License:: The Ruby License
8
+ #
9
+ # $Id: defline.rb,v 1.1.2.1 2008/06/20 13:22:32 ngoto Exp $
10
+ #
11
+ # == Description
12
+ #
13
+ # Bio::FastaDefline is a parser class for definition line (defline)
14
+ # of the FASTA format.
15
+ #
16
+ # == Examples
17
+ #
18
+ # rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
19
+ # rub.entry_id ==> 'gi|671595'
20
+ # rub.get('emb') ==> 'CAA85678.1'
21
+ # rub.emb ==> 'CAA85678.1'
22
+ # rub.gi ==> '671595'
23
+ # rub.accession ==> 'CAA85678'
24
+ # rub.accessions ==> [ 'CAA85678' ]
25
+ # rub.acc_version ==> 'CAA85678.1'
26
+ # rub.locus ==> nil
27
+ # rub.list_ids ==> [["gi", "671595"],
28
+ # ["emb", "CAA85678.1", nil],
29
+ # ["Perovskia abrotanoides"]]
30
+ #
31
+ # ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
32
+ # ckr.entry_id ==> "gi|2495000"
33
+ # ckr.sp ==> "CCKR_CAVPO"
34
+ # ckr.pir ==> "I51898"
35
+ # ckr.gb ==> "AAB29504.1"
36
+ # ckr.gi ==> "2495000"
37
+ # ckr.accession ==> "AAB29504"
38
+ # ckr.accessions ==> ["Q63931", "AAB29504"]
39
+ # ckr.acc_version ==> "AAB29504.1"
40
+ # ckr.locus ==> nil
41
+ # ckr.description ==>
42
+ # "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
43
+ # ckr.descriptions ==>
44
+ # ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
45
+ # "cholecystokinin A receptor - guinea pig",
46
+ # "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
47
+ # ckr.words ==>
48
+ # ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
49
+ # "receptor", "type"]
50
+ # ckr.id_strings ==>
51
+ # ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
52
+ # "544724", "AAB29504.1", "Cavia"]
53
+ # ckr.list_ids ==>
54
+ # [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
55
+ # ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
56
+ # ["gb", "AAB29504.1", nil], ["Cavia"]]
57
+ #
58
+ # == References
59
+ #
60
+ # * FASTA format (Wikipedia)
61
+ # http://en.wikipedia.org/wiki/FASTA_format
62
+ #
63
+ # * Fasta format description (NCBI)
64
+ # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
65
+ #
66
+
67
module Bio

  #--
  # split from fasta.rb revision 1.28
  #++

  # Parses a FASTA definition line (defline) and extracts IDs and other
  # information from it.
  # IDs are NSIDs (NCBI standard FASTA sequence identifiers)
  # or ":"-separated IDs.
  #
  # Several deflines concatenated with the "\x01" (Ctrl-A) character are
  # accepted; each one is parsed in turn by #add_defline.
  #
  # specs are described in:
  # ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  # http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
  #
  # === Examples
  #
  #   rub = Bio::FastaDefline.new('>gi|671595|emb|CAA85678.1| rubisco large subunit [Perovskia abrotanoides]')
  #   rub.entry_id       ==> 'gi|671595'
  #   rub.get('emb')     ==> 'CAA85678.1'
  #   rub.emb            ==> 'CAA85678.1'
  #   rub.gi             ==> '671595'
  #   rub.accession      ==> 'CAA85678'
  #   rub.accessions     ==> [ 'CAA85678' ]
  #   rub.acc_version    ==> 'CAA85678.1'
  #   rub.locus          ==> nil
  #   rub.list_ids       ==> [["gi", "671595"],
  #                           ["emb", "CAA85678.1", nil],
  #                           ["Perovskia abrotanoides"]]
  #
  #   ckr = Bio::FastaDefline.new(">gi|2495000|sp|Q63931|CCKR_CAVPO CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)\001gi|2147182|pir||I51898 cholecystokinin A receptor - guinea pig\001gi|544724|gb|AAB29504.1| cholecystokinin A receptor; CCK-A receptor [Cavia]")
  #   ckr.entry_id      ==> "gi|2495000"
  #   ckr.sp            ==> "CCKR_CAVPO"
  #   ckr.pir           ==> "I51898"
  #   ckr.gb            ==> "AAB29504.1"
  #   ckr.gi            ==> "2495000"
  #   ckr.accession     ==> "AAB29504"
  #   ckr.accessions    ==> ["Q63931", "AAB29504"]
  #   ckr.acc_version   ==> "AAB29504.1"
  #   ckr.locus         ==> nil
  #   ckr.description   ==>
  #     "CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)"
  #   ckr.descriptions  ==>
  #     ["CHOLECYSTOKININ TYPE A RECEPTOR (CCK-A RECEPTOR) (CCK-AR)",
  #      "cholecystokinin A receptor - guinea pig",
  #      "cholecystokinin A receptor; CCK-A receptor [Cavia]"]
  #   ckr.words         ==>
  #     ["cavia", "cck-a", "cck-ar", "cholecystokinin", "guinea", "pig",
  #      "receptor", "type"]
  #   ckr.id_strings    ==>
  #     ["2495000", "Q63931", "CCKR_CAVPO", "2147182", "I51898",
  #      "544724", "AAB29504.1", "Cavia"]
  #   ckr.list_ids      ==>
  #     [["gi", "2495000"], ["sp", "Q63931", "CCKR_CAVPO"],
  #      ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
  #      ["gb", "AAB29504.1", nil], ["Cavia"]]
  #
  # === References
  #
  # * Fasta format description (NCBI)
  #   http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
  #
  # * Frequently Asked Questions: Indexing of Sequence Identifiers (by Warren R. Gish.)
  #   http://blast.wustl.edu/doc/FAQ-Indexing.html#Identifiers
  #
  # * README.formatdb
  #   ftp://ftp.ncbi.nih.gov/blast/documents/README.formatdb
  #
  class FastaDefline

    # Maps each known database tag to the names of the fields that follow
    # the tag in a "|"-separated identifier.
    NSIDs = {
      # NCBI and WU-BLAST
      'gi'  => [ 'gi' ],                      # NCBI GI
      'gb'  => [ 'acc_version', 'locus' ],    # GenBank
      'emb' => [ 'acc_version', 'locus' ],    # EMBL
      'dbj' => [ 'acc_version', 'locus' ],    # DDBJ
      'sp'  => [ 'accession', 'entry_id' ],   # SWISS-PROT
      'pdb' => [ 'entry_id', 'chain' ],       # PDB
      'bbs' => [ 'number' ],                  # GenInfo Backbone Id
      'gnl' => [ 'database' , 'entry_id' ],   # General database identifier
      'ref' => [ 'acc_version' , 'locus' ],   # NCBI Reference Sequence
      'lcl' => [ 'entry_id' ],                # Local Sequence identifier

      # WU-BLAST and NCBI
      'pir' => [ 'accession', 'entry_id' ],   # PIR
      'prf' => [ 'accession', 'entry_id' ],   # Protein Research Foundation
      'pat' => [ 'country', 'number', 'serial' ], # Patents

      # WU-BLAST only
      'bbm' => [ 'number' ],                  # NCBI GenInfo Backbone database identifier
      'gim' => [ 'number' ],                  # NCBI GenInfo Import identifier
      'gp'  => [ 'acc_version', 'locus' ],    # GenPept
      'oth' => [ 'accession', 'name', 'release' ], # Other (user-definable) identifier
      'tpd' => [ 'accession', 'name' ],       # Third party annotation, DDBJ
      'tpe' => [ 'accession', 'name' ],       # Third party annotation, EMBL
      'tpg' => [ 'accession', 'name' ],       # Third party annotation, GenBank

      # Original
      'ri'  => [ 'entry_id', 'rearray_id', 'len' ], # RIKEN FANTOM DB
    }

    # Shows array that contains IDs (or ID-like strings).
    # Returns an array of arrays of strings.
    attr_reader :list_ids

    # Shows a possibly unique identifier.
    # Returns a string (nil when nothing has been parsed).
    attr_reader :entry_id

    # Parses given string.
    # The string is split at "\x01" and every part is handed to
    # #add_defline.
    def initialize(str)
      @deflines = []
      @info     = {}
      @list_ids = []
      @entry_id = nil

      str.split("\x01").each { |line| add_defline(line) }
    end #def initialize

    # Parses given string and adds parsed data.
    #
    # Three identifier styles are recognized, tried in this order:
    # "|"-separated NSIDs, "db:id" colon-separated IDs, and a single
    # whitespace-free token. The first identifier found over all calls
    # becomes #entry_id.
    def add_defline(str)
      case str
      when /^\>?\s*((?:[^\|\s]*\|)+[^\s]+)\s*(.*)$/
        # NSIDs
        # examples:
        # >gi|9910844|sp|Q9UWG2|RL3_METVA 50S ribosomal protein L3P
        #
        # note: regexp (?:) means grouping without backreferences
        raw_id = $1
        d = $2
        tks = raw_id.split('|')
        # String#split drops a trailing empty field; restore it so that
        # "emb|CAA85678.1|" yields an explicit empty (nil) locus.
        tks << '' if raw_id[-1, 1] == '|'
        a = parse_NSIDs(tks)
        # parse_NSIDs returns [] when the first token is empty
        # (e.g. ">|abc"); fall back to the raw identifier string
        # instead of calling join on nil.
        i = a.empty? ? raw_id : a[0].join('|')
        a.unshift('|')
        # tokens not consumed by parse_NSIDs belong to the description
        d = tks.join('|') + ' ' + d unless tks.empty?
        a << d
        this_line = a
        match_EC(d)
        parse_square_brackets(d).each do |x|
          if !match_EC(x, false) && x =~ /\A[A-Z]/
            @list_ids << [ x ]
            @info['organism'] = x unless @info['organism']
          end
        end

      when /^\>?\s*([a-zA-Z0-9]+\:[^\s]+)\s*(.*)$/
        # examples:
        # >sce:YBR160W  CDC28, SRM5; cyclin-dependent protein kinase catalytic subunit [EC:2.7.1.-] [SP:CC28_YEAST]
        # >emb:CACDC28 [X80034] C.albicans CDC28 gene
        i = $1
        d = $2
        a = parse_ColonSepID(i)
        i = a.join(':')
        this_line = [ ':', a, d ]
        match_EC(d)
        parse_square_brackets(d).each do |x|
          if !match_EC(x, false) && x =~ /:/
            parse_ColonSepID(x)
          elsif x =~ /\A\s*([A-Z][A-Z0-9_\.]+)\s*\z/
            @list_ids << [ $1 ]
          end
        end

      when /^\>?\s*(\S+)(?:\s+(.+))?$/
        # examples:
        # >ABC12345 this is test
        i = $1
        d = $2.to_s
        @list_ids << [ i.chomp('.') ]
        this_line = [ '', [ i ], d ]
        match_EC(d)

      else
        i = str
        d = ''
        match_EC(i)
        this_line = [ '', [ i ], d ]
      end

      @deflines << this_line
      @entry_id = i unless @entry_id
    end

    # Scans str for EC numbers written as "EC:n.n.n.n" (each field made
    # of digits and/or "-"). Returns [ 'EC', number ] for the last match,
    # or nil when there is none. When write_flag is true every match is
    # appended to @list_ids, and @info['ec'] is set unless it already
    # holds a value without "-".
    def match_EC(str, write_flag = true)
      di = nil
      # (?: ) groups without capturing, so the only capture is the
      # whole EC number.
      str.scan(/EC\:((?:[\-\d]+\.){3}(?:[\-\d]+))/i) do |m|
        di = [ 'EC', m[0] ]
        if write_flag
          @info['ec'] = di[1] if !@info['ec'] || @info['ec'].to_s =~ /\-/
          @list_ids << di
        end
      end
      di
    end
    private :match_EC

    # Returns the contents of every "[...]" group found in str.
    def parse_square_brackets(str)
      r = []
      str.scan(/\[([^\]]*)\]/) { |x| r << x[0] }
      r
    end
    private :parse_square_brackets

    # Splits "db:id" at the first colon, records the pair in @list_ids
    # and returns it. The pair is padded with nil when str has no colon.
    def parse_ColonSepID(str)
      di = str.split(':', 2)
      di << nil if di.size <= 1
      @list_ids << di
      di
    end
    private :parse_ColonSepID

    # Consumes tokens of a "|"-separated identifier from the front of ary
    # and groups them into [ tag, field, ... ] arrays according to NSIDs.
    # Empty fields are stored as nil. Parsing stops at the first token
    # that is not a known tag; a non-empty one is kept as an uncontrolled
    # identifier. The groups are appended to @list_ids and returned.
    def parse_NSIDs(ary)
      # this method destroys ary
      data = []
      while token = ary.shift
        if labels = self.class::NSIDs[token]
          di = [ token ]
          labels.each do |x|
            token = ary.shift
            break unless token
            if self.class::NSIDs[token]
              # next identifier starts here; put the tag back
              ary.unshift(token)
              break #each
            end
            di << (token.length > 0 ? token : nil)
          end
          data << di
        else
          if token.length > 0
            # UCID (uncontrolled identifiers)
            data << [ token ]
            @info['ucid'] = token unless @info['ucid']
          end
          break #while
        end
      end #while
      @list_ids.concat data
      data
    end #def parse_NSIDs
    private :parse_NSIDs


    # Shows original string.
    # Note that the result of this method may be different from
    # original string which is given in FastaDefline.new method.
    def to_s
      @deflines.collect { |a|
        s = a[0]
        (a[1..-2].collect { |x| x.join(s) }.join(s) + ' ' + a[-1]).strip
      }.join("\x01")
    end

    # Shows description (of the first defline).
    # Returns nil when no defline has been parsed.
    def description
      @deflines[0].to_a[-1]
    end

    # Returns descriptions, one per parsed defline.
    def descriptions
      @deflines.collect { |a| a[-1] }
    end

    # Shows ID-like strings.
    # Returns an array of strings.
    def id_strings
      r = []
      @list_ids.each do |a|
        if a.size >= 2
          r.concat a[1..-1].find_all { |x| x }
        elsif a[0].to_s.size > 0 && a[0] =~ /\A[A-Za-z0-9\.\-\_]+\z/
          r << a[0]
        end
      end
      # also pick up ID-like words from the descriptions
      r.concat( words(true, []).find_all do |x|
                  x =~ /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/ or
                    x =~ /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
                end)
      r
    end

    # Words that #words drops by default.
    KillWords = [
      'an', 'the', 'this', 'that',
      'is', 'are', 'were', 'was', 'be', 'can', 'may', 'might',
      'as', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'with',
      'from', 'and', 'or', 'not',
      'dna', 'rna', 'mrna', 'cdna', 'orf',
      'aa', 'nt', 'pct', 'id', 'ec', 'sp', 'subsp',
      'similar', 'involved', 'identical', 'identity',
      'cds', 'clone', 'library', 'contig', 'contigs',
      'homolog', 'homologue', 'homologs', 'homologous',
      'protein', 'proteins', 'gene', 'genes',
      'product', 'products', 'sequence', 'sequences',
      'strain', 'strains', 'region', 'regions',
    ]
    KillWordsHash = {}
    KillWords.each { |x| KillWordsHash[x] = true }

    # Patterns of words that #words drops by default.
    KillRegexpArray = [
      /\A\d{1,3}\%?\z/,
      /\A[A-Z][A-Za-z0-9\_]*[0-9]+[A-Za-z0-9\_]+\z/,
      /\A[A-Z][A-Z0-9]*\_[A-Z0-9\_]+\z/
    ]

    # Shows words used in the defline. Returns a sorted Array without
    # duplicates.
    #
    # Words of one character or less, words found in kwhash (compared in
    # lower case) and words matching any regexp in kill_regexp are
    # removed. Unless case_sensitive is true the result is lower-cased.
    def words(case_sensitive = nil, kill_regexp = self.class::KillRegexpArray,
              kwhash = self.class::KillWordsHash)
      a = descriptions.join(' ').split(/[\.\,\;\:\(\)\[\]\{\}\<\>\"\'\`\~\/\|\?\!\&\@\#\s\x00-\x1f\x7f]+/)
      a.collect! do |x|
        x.sub!(/\A[\$\*\-\+]+/, '')
        x.sub!(/[\$\*\-\=]+\z/, '')
        if x.size <= 1 || kwhash[x.downcase] ||
            kill_regexp.find { |expr| expr =~ x }
          nil
        else
          x
        end
      end
      a.compact!
      a.collect! { |x| x.downcase } unless case_sensitive
      a.sort!
      a.uniq!
      a
    end

    # Returns an identifier by a database name.
    # A value that was found is cached in @info.
    def get(dbname)
      db = dbname.to_s
      cached = @info[db]
      return cached if cached

      di = @list_ids.find { |x| x[0] == db }
      return nil unless di

      if di.size <= 2
        r = di[-1]
      else
        # Prefer the most ID-like field of the entry.
        # (an unknown tag has no labels; treat it as having none)
        labels = self.class::NSIDs[db] || []
        r = nil
        [ 'acc_version', 'entry_id',
          'locus', 'accession', 'number' ].each do |x|
          if i = labels.index(x)
            r = di[i + 1]
            break if r
          end
        end
        r = di[1..-1].find { |x| x } unless r
      end
      @info[db] = r if r
      r
    end

    # Returns an identifier by given type.
    # The field of the first entry in #list_ids whose database defines
    # the type is returned, even when that field is nil.
    def get_by_type(type_str)
      @list_ids.each do |x|
        labels = self.class::NSIDs[x[0]]
        next unless labels
        i = labels.index(type_str)
        return x[i + 1] if i
      end
      nil
    end

    # Returns identifiers by given type(s). nil fields are skipped.
    def get_all_by_type(*type_strarg)
      d = []
      @list_ids.each do |x|
        labels = self.class::NSIDs[x[0]]
        next unless labels
        type_strarg.each do |y|
          i = labels.index(y)
          d << x[i + 1] if i && x[i + 1]
        end
      end
      d
    end

    # Shows locus.
    # If the entry has two or more such IDs,
    # only the first ID is shown.
    # Returns a string or nil.
    def locus
      @locus = get_by_type('locus') unless defined?(@locus)
      @locus
    end

    # Shows GI.
    # If the entry has two or more such IDs,
    # only the first ID is shown.
    # Returns a string or nil.
    def gi
      @gi = get_by_type('gi') unless defined?(@gi)
      @gi
    end

    # Shows accession with version number.
    # If the entry has two or more such IDs,
    # only the first ID is shown.
    # Returns a string or nil.
    def acc_version
      unless defined?(@acc_version)
        @acc_version = get_by_type('acc_version')
      end
      @acc_version
    end

    # Shows accession numbers (version suffixes removed).
    # Returns an array of strings.
    def accessions
      unless defined?(@accessions)
        @accessions = get_all_by_type('accession', 'acc_version')
        @accessions.collect! { |x| x.sub(/\..*\z/, '') }
      end
      @accessions
    end

    # Shows an accession number.
    # Taken from #acc_version when available, otherwise the first of
    # #accessions.
    def accession
      unless defined?(@accession)
        if acc_version
          @accession = acc_version.split('.')[0]
        else
          @accession = accessions[0]
        end
      end
      @accession
    end

    # Treats an unknown method name as a database name and looks it up
    # with #get, so that e.g. obj.emb equals obj.get('emb').
    # Raises RuntimeError when nothing is found and the name is not a key
    # of NSIDs.
    def method_missing(name, *args)
      # raise ArgumentError,
      # "wrong # of arguments(#{args.size} for 1)" if args.size >= 2
      r = get(name, *args)
      if !r && !(self.class::NSIDs[name.to_s])
        raise "NameError: undefined method `#{name.inspect}'"
      end
      r
    end

    # Keeps respond_to? consistent with #method_missing for names that
    # never raise there: keys of NSIDs and keys already stored in @info.
    def respond_to_missing?(name, include_private = false)
      key = name.to_s
      self.class::NSIDs.key?(key) || @info.key?(key) || super
    end
    private :respond_to_missing?

  end #class FastaDefline

end #module Bio
532
+