bio 1.3.1 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (303) hide show
  1. data/ChangeLog +2105 -3728
  2. data/KNOWN_ISSUES.rdoc +35 -3
  3. data/README.rdoc +8 -2
  4. data/RELEASE_NOTES.rdoc +166 -0
  5. data/bin/bioruby +4 -1
  6. data/bioruby.gemspec +146 -1
  7. data/bioruby.gemspec.erb +3 -1
  8. data/doc/ChangeLog-before-1.3.1 +3961 -0
  9. data/doc/Tutorial.rd +154 -22
  10. data/doc/Tutorial.rd.html +125 -68
  11. data/lib/bio.rb +21 -6
  12. data/lib/bio/appl/bl2seq/report.rb +11 -202
  13. data/lib/bio/appl/blast/format0.rb +0 -193
  14. data/lib/bio/appl/blast/report.rb +2 -147
  15. data/lib/bio/appl/blast/wublast.rb +0 -208
  16. data/lib/bio/appl/fasta.rb +4 -19
  17. data/lib/bio/appl/fasta/format10.rb +0 -14
  18. data/lib/bio/appl/genscan/report.rb +0 -176
  19. data/lib/bio/appl/hmmer.rb +1 -15
  20. data/lib/bio/appl/hmmer/report.rb +0 -100
  21. data/lib/bio/appl/meme/mast.rb +156 -0
  22. data/lib/bio/appl/meme/mast/report.rb +91 -0
  23. data/lib/bio/appl/meme/motif.rb +48 -0
  24. data/lib/bio/appl/psort.rb +0 -111
  25. data/lib/bio/appl/psort/report.rb +1 -45
  26. data/lib/bio/appl/pts1.rb +2 -4
  27. data/lib/bio/appl/sosui/report.rb +5 -54
  28. data/lib/bio/appl/targetp/report.rb +1 -104
  29. data/lib/bio/appl/tmhmm/report.rb +0 -36
  30. data/lib/bio/command.rb +94 -10
  31. data/lib/bio/data/aa.rb +1 -77
  32. data/lib/bio/data/codontable.rb +1 -95
  33. data/lib/bio/data/na.rb +1 -26
  34. data/lib/bio/db/aaindex.rb +1 -38
  35. data/lib/bio/db/fasta.rb +1 -134
  36. data/lib/bio/db/fasta/format_qual.rb +204 -0
  37. data/lib/bio/db/fasta/qual.rb +102 -0
  38. data/lib/bio/db/fastq.rb +645 -0
  39. data/lib/bio/db/fastq/fastq_to_biosequence.rb +40 -0
  40. data/lib/bio/db/fastq/format_fastq.rb +175 -0
  41. data/lib/bio/db/genbank/genbank.rb +1 -86
  42. data/lib/bio/db/gff.rb +0 -17
  43. data/lib/bio/db/go.rb +4 -72
  44. data/lib/bio/db/kegg/common.rb +112 -0
  45. data/lib/bio/db/kegg/compound.rb +29 -20
  46. data/lib/bio/db/kegg/drug.rb +74 -34
  47. data/lib/bio/db/kegg/enzyme.rb +26 -5
  48. data/lib/bio/db/kegg/genes.rb +128 -15
  49. data/lib/bio/db/kegg/genome.rb +3 -41
  50. data/lib/bio/db/kegg/glycan.rb +19 -24
  51. data/lib/bio/db/kegg/orthology.rb +16 -56
  52. data/lib/bio/db/kegg/reaction.rb +81 -28
  53. data/lib/bio/db/kegg/taxonomy.rb +1 -52
  54. data/lib/bio/db/litdb.rb +1 -16
  55. data/lib/bio/db/phyloxml/phyloxml.xsd +582 -0
  56. data/lib/bio/db/phyloxml/phyloxml_elements.rb +1174 -0
  57. data/lib/bio/db/phyloxml/phyloxml_parser.rb +954 -0
  58. data/lib/bio/db/phyloxml/phyloxml_writer.rb +228 -0
  59. data/lib/bio/db/prosite.rb +2 -95
  60. data/lib/bio/db/rebase.rb +5 -6
  61. data/lib/bio/db/sanger_chromatogram/abif.rb +120 -0
  62. data/lib/bio/db/sanger_chromatogram/chromatogram.rb +133 -0
  63. data/lib/bio/db/sanger_chromatogram/chromatogram_to_biosequence.rb +32 -0
  64. data/lib/bio/db/sanger_chromatogram/scf.rb +210 -0
  65. data/lib/bio/io/das.rb +0 -44
  66. data/lib/bio/io/ddbjxml.rb +1 -181
  67. data/lib/bio/io/flatfile.rb +1 -7
  68. data/lib/bio/io/flatfile/autodetection.rb +6 -0
  69. data/lib/bio/io/keggapi.rb +0 -442
  70. data/lib/bio/io/ncbirest.rb +130 -132
  71. data/lib/bio/io/ncbisoap.rb +2 -1
  72. data/lib/bio/io/pubmed.rb +0 -88
  73. data/lib/bio/location.rb +0 -73
  74. data/lib/bio/pathway.rb +0 -171
  75. data/lib/bio/sequence.rb +18 -1
  76. data/lib/bio/sequence/adapter.rb +3 -0
  77. data/lib/bio/sequence/format.rb +16 -0
  78. data/lib/bio/sequence/quality_score.rb +205 -0
  79. data/lib/bio/tree.rb +70 -5
  80. data/lib/bio/util/restriction_enzyme/single_strand.rb +3 -2
  81. data/lib/bio/util/sirna.rb +1 -23
  82. data/lib/bio/version.rb +1 -1
  83. data/sample/demo_aaindex.rb +67 -0
  84. data/sample/demo_aminoacid.rb +101 -0
  85. data/sample/demo_bl2seq_report.rb +220 -0
  86. data/sample/demo_blast_report.rb +285 -0
  87. data/sample/demo_codontable.rb +119 -0
  88. data/sample/demo_das.rb +105 -0
  89. data/sample/demo_ddbjxml.rb +212 -0
  90. data/sample/demo_fasta_remote.rb +51 -0
  91. data/sample/demo_fastaformat.rb +105 -0
  92. data/sample/demo_genbank.rb +132 -0
  93. data/sample/demo_genscan_report.rb +202 -0
  94. data/sample/demo_gff1.rb +49 -0
  95. data/sample/demo_go.rb +98 -0
  96. data/sample/demo_hmmer_report.rb +149 -0
  97. data/sample/demo_kegg_compound.rb +57 -0
  98. data/sample/demo_kegg_drug.rb +65 -0
  99. data/sample/demo_kegg_genome.rb +74 -0
  100. data/sample/demo_kegg_glycan.rb +72 -0
  101. data/sample/demo_kegg_orthology.rb +62 -0
  102. data/sample/demo_kegg_reaction.rb +66 -0
  103. data/sample/demo_kegg_taxonomy.rb +92 -0
  104. data/sample/demo_keggapi.rb +502 -0
  105. data/sample/demo_litdb.rb +42 -0
  106. data/sample/demo_locations.rb +99 -0
  107. data/sample/demo_ncbi_rest.rb +130 -0
  108. data/sample/demo_nucleicacid.rb +49 -0
  109. data/sample/demo_pathway.rb +196 -0
  110. data/sample/demo_prosite.rb +120 -0
  111. data/sample/demo_psort.rb +138 -0
  112. data/sample/demo_psort_report.rb +70 -0
  113. data/sample/demo_pubmed.rb +118 -0
  114. data/sample/demo_sirna.rb +63 -0
  115. data/sample/demo_sosui_report.rb +89 -0
  116. data/sample/demo_targetp_report.rb +135 -0
  117. data/sample/demo_tmhmm_report.rb +68 -0
  118. data/sample/pmfetch.rb +13 -4
  119. data/sample/pmsearch.rb +15 -4
  120. data/sample/test_phyloxml_big.rb +205 -0
  121. data/test/bioruby_test_helper.rb +61 -0
  122. data/test/data/KEGG/1.1.1.1.enzyme +935 -0
  123. data/test/data/KEGG/C00025.compound +102 -0
  124. data/test/data/KEGG/D00063.drug +104 -0
  125. data/test/data/KEGG/G00024.glycan +47 -0
  126. data/test/data/KEGG/G01366.glycan +18 -0
  127. data/test/data/KEGG/K02338.orthology +902 -0
  128. data/test/data/KEGG/R00006.reaction +14 -0
  129. data/test/data/fastq/README.txt +109 -0
  130. data/test/data/fastq/error_diff_ids.fastq +20 -0
  131. data/test/data/fastq/error_double_qual.fastq +22 -0
  132. data/test/data/fastq/error_double_seq.fastq +22 -0
  133. data/test/data/fastq/error_long_qual.fastq +20 -0
  134. data/test/data/fastq/error_no_qual.fastq +20 -0
  135. data/test/data/fastq/error_qual_del.fastq +20 -0
  136. data/test/data/fastq/error_qual_escape.fastq +20 -0
  137. data/test/data/fastq/error_qual_null.fastq +0 -0
  138. data/test/data/fastq/error_qual_space.fastq +21 -0
  139. data/test/data/fastq/error_qual_tab.fastq +21 -0
  140. data/test/data/fastq/error_qual_unit_sep.fastq +20 -0
  141. data/test/data/fastq/error_qual_vtab.fastq +20 -0
  142. data/test/data/fastq/error_short_qual.fastq +20 -0
  143. data/test/data/fastq/error_spaces.fastq +20 -0
  144. data/test/data/fastq/error_tabs.fastq +21 -0
  145. data/test/data/fastq/error_trunc_at_plus.fastq +19 -0
  146. data/test/data/fastq/error_trunc_at_qual.fastq +19 -0
  147. data/test/data/fastq/error_trunc_at_seq.fastq +18 -0
  148. data/test/data/fastq/error_trunc_in_plus.fastq +19 -0
  149. data/test/data/fastq/error_trunc_in_qual.fastq +20 -0
  150. data/test/data/fastq/error_trunc_in_seq.fastq +18 -0
  151. data/test/data/fastq/error_trunc_in_title.fastq +17 -0
  152. data/test/data/fastq/illumina_full_range_as_illumina.fastq +8 -0
  153. data/test/data/fastq/illumina_full_range_as_sanger.fastq +8 -0
  154. data/test/data/fastq/illumina_full_range_as_solexa.fastq +8 -0
  155. data/test/data/fastq/illumina_full_range_original_illumina.fastq +8 -0
  156. data/test/data/fastq/longreads_as_illumina.fastq +40 -0
  157. data/test/data/fastq/longreads_as_sanger.fastq +40 -0
  158. data/test/data/fastq/longreads_as_solexa.fastq +40 -0
  159. data/test/data/fastq/longreads_original_sanger.fastq +120 -0
  160. data/test/data/fastq/misc_dna_as_illumina.fastq +16 -0
  161. data/test/data/fastq/misc_dna_as_sanger.fastq +16 -0
  162. data/test/data/fastq/misc_dna_as_solexa.fastq +16 -0
  163. data/test/data/fastq/misc_dna_original_sanger.fastq +16 -0
  164. data/test/data/fastq/misc_rna_as_illumina.fastq +16 -0
  165. data/test/data/fastq/misc_rna_as_sanger.fastq +16 -0
  166. data/test/data/fastq/misc_rna_as_solexa.fastq +16 -0
  167. data/test/data/fastq/misc_rna_original_sanger.fastq +16 -0
  168. data/test/data/fastq/sanger_full_range_as_illumina.fastq +8 -0
  169. data/test/data/fastq/sanger_full_range_as_sanger.fastq +8 -0
  170. data/test/data/fastq/sanger_full_range_as_solexa.fastq +8 -0
  171. data/test/data/fastq/sanger_full_range_original_sanger.fastq +8 -0
  172. data/test/data/fastq/solexa_full_range_as_illumina.fastq +8 -0
  173. data/test/data/fastq/solexa_full_range_as_sanger.fastq +8 -0
  174. data/test/data/fastq/solexa_full_range_as_solexa.fastq +8 -0
  175. data/test/data/fastq/solexa_full_range_original_solexa.fastq +8 -0
  176. data/test/data/fastq/wrapping_as_illumina.fastq +12 -0
  177. data/test/data/fastq/wrapping_as_sanger.fastq +12 -0
  178. data/test/data/fastq/wrapping_as_solexa.fastq +12 -0
  179. data/test/data/fastq/wrapping_original_sanger.fastq +24 -0
  180. data/test/data/meme/db +0 -0
  181. data/test/data/meme/mast +0 -0
  182. data/test/data/meme/mast.out +13 -0
  183. data/test/data/meme/meme.out +3 -0
  184. data/test/data/phyloxml/apaf.xml +666 -0
  185. data/test/data/phyloxml/bcl_2.xml +2097 -0
  186. data/test/data/phyloxml/made_up.xml +144 -0
  187. data/test/data/phyloxml/ncbi_taxonomy_mollusca_short.xml +65 -0
  188. data/test/data/phyloxml/phyloxml_examples.xml +415 -0
  189. data/test/data/sanger_chromatogram/test_chromatogram_abif.ab1 +0 -0
  190. data/test/data/sanger_chromatogram/test_chromatogram_scf_v2.scf +0 -0
  191. data/test/data/sanger_chromatogram/test_chromatogram_scf_v3.scf +0 -0
  192. data/test/functional/bio/appl/test_pts1.rb +7 -5
  193. data/test/functional/bio/io/test_ensembl.rb +4 -3
  194. data/test/functional/bio/io/test_pubmed.rb +9 -3
  195. data/test/functional/bio/io/test_soapwsdl.rb +5 -4
  196. data/test/functional/bio/io/test_togows.rb +5 -4
  197. data/test/functional/bio/sequence/test_output_embl.rb +6 -4
  198. data/test/functional/bio/test_command.rb +54 -5
  199. data/test/runner.rb +5 -3
  200. data/test/unit/bio/appl/bl2seq/test_report.rb +5 -4
  201. data/test/unit/bio/appl/blast/test_ncbioptions.rb +4 -2
  202. data/test/unit/bio/appl/blast/test_report.rb +5 -4
  203. data/test/unit/bio/appl/blast/test_rpsblast.rb +5 -4
  204. data/test/unit/bio/appl/gcg/test_msf.rb +5 -5
  205. data/test/unit/bio/appl/genscan/test_report.rb +8 -9
  206. data/test/unit/bio/appl/hmmer/test_report.rb +5 -4
  207. data/test/unit/bio/appl/iprscan/test_report.rb +6 -5
  208. data/test/unit/bio/appl/mafft/test_report.rb +6 -5
  209. data/test/unit/bio/appl/meme/mast/test_report.rb +46 -0
  210. data/test/unit/bio/appl/meme/test_mast.rb +103 -0
  211. data/test/unit/bio/appl/meme/test_motif.rb +38 -0
  212. data/test/unit/bio/appl/paml/codeml/test_rates.rb +5 -4
  213. data/test/unit/bio/appl/paml/codeml/test_report.rb +5 -4
  214. data/test/unit/bio/appl/paml/test_codeml.rb +5 -4
  215. data/test/unit/bio/appl/sim4/test_report.rb +5 -4
  216. data/test/unit/bio/appl/sosui/test_report.rb +6 -5
  217. data/test/unit/bio/appl/targetp/test_report.rb +5 -3
  218. data/test/unit/bio/appl/test_blast.rb +5 -4
  219. data/test/unit/bio/appl/test_fasta.rb +4 -2
  220. data/test/unit/bio/appl/test_pts1.rb +4 -2
  221. data/test/unit/bio/appl/tmhmm/test_report.rb +6 -5
  222. data/test/unit/bio/data/test_aa.rb +5 -3
  223. data/test/unit/bio/data/test_codontable.rb +5 -4
  224. data/test/unit/bio/data/test_na.rb +5 -3
  225. data/test/unit/bio/db/biosql/tc_biosql.rb +5 -1
  226. data/test/unit/bio/db/embl/test_common.rb +4 -2
  227. data/test/unit/bio/db/embl/test_embl.rb +6 -6
  228. data/test/unit/bio/db/embl/test_embl_rel89.rb +6 -6
  229. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +7 -8
  230. data/test/unit/bio/db/embl/test_sptr.rb +6 -8
  231. data/test/unit/bio/db/embl/test_uniprot.rb +6 -5
  232. data/test/unit/bio/db/fasta/test_format_qual.rb +346 -0
  233. data/test/unit/bio/db/kegg/test_compound.rb +146 -0
  234. data/test/unit/bio/db/kegg/test_drug.rb +194 -0
  235. data/test/unit/bio/db/kegg/test_enzyme.rb +241 -0
  236. data/test/unit/bio/db/kegg/test_genes.rb +32 -4
  237. data/test/unit/bio/db/kegg/test_glycan.rb +260 -0
  238. data/test/unit/bio/db/kegg/test_orthology.rb +50 -0
  239. data/test/unit/bio/db/kegg/test_reaction.rb +96 -0
  240. data/test/unit/bio/db/pdb/test_pdb.rb +4 -2
  241. data/test/unit/bio/db/sanger_chromatogram/test_abif.rb +76 -0
  242. data/test/unit/bio/db/sanger_chromatogram/test_scf.rb +98 -0
  243. data/test/unit/bio/db/test_aaindex.rb +6 -6
  244. data/test/unit/bio/db/test_fasta.rb +5 -46
  245. data/test/unit/bio/db/test_fastq.rb +829 -0
  246. data/test/unit/bio/db/test_gff.rb +4 -2
  247. data/test/unit/bio/db/test_lasergene.rb +7 -5
  248. data/test/unit/bio/db/test_medline.rb +4 -2
  249. data/test/unit/bio/db/test_newick.rb +6 -6
  250. data/test/unit/bio/db/test_nexus.rb +4 -2
  251. data/test/unit/bio/db/test_phyloxml.rb +769 -0
  252. data/test/unit/bio/db/test_phyloxml_writer.rb +328 -0
  253. data/test/unit/bio/db/test_prosite.rb +6 -5
  254. data/test/unit/bio/db/test_qual.rb +63 -0
  255. data/test/unit/bio/db/test_rebase.rb +5 -3
  256. data/test/unit/bio/db/test_soft.rb +7 -6
  257. data/test/unit/bio/io/flatfile/test_autodetection.rb +6 -7
  258. data/test/unit/bio/io/flatfile/test_buffer.rb +6 -5
  259. data/test/unit/bio/io/flatfile/test_splitter.rb +4 -4
  260. data/test/unit/bio/io/test_ddbjxml.rb +4 -3
  261. data/test/unit/bio/io/test_ensembl.rb +5 -3
  262. data/test/unit/bio/io/test_fastacmd.rb +4 -3
  263. data/test/unit/bio/io/test_flatfile.rb +6 -5
  264. data/test/unit/bio/io/test_soapwsdl.rb +4 -3
  265. data/test/unit/bio/io/test_togows.rb +4 -2
  266. data/test/unit/bio/sequence/test_aa.rb +5 -3
  267. data/test/unit/bio/sequence/test_common.rb +4 -2
  268. data/test/unit/bio/sequence/test_compat.rb +4 -2
  269. data/test/unit/bio/sequence/test_dblink.rb +5 -3
  270. data/test/unit/bio/sequence/test_na.rb +4 -2
  271. data/test/unit/bio/sequence/test_quality_score.rb +330 -0
  272. data/test/unit/bio/shell/plugin/test_seq.rb +5 -3
  273. data/test/unit/bio/test_alignment.rb +5 -3
  274. data/test/unit/bio/test_command.rb +4 -3
  275. data/test/unit/bio/test_db.rb +5 -3
  276. data/test/unit/bio/test_feature.rb +4 -2
  277. data/test/unit/bio/test_location.rb +4 -2
  278. data/test/unit/bio/test_map.rb +5 -3
  279. data/test/unit/bio/test_pathway.rb +4 -2
  280. data/test/unit/bio/test_reference.rb +4 -2
  281. data/test/unit/bio/test_sequence.rb +5 -3
  282. data/test/unit/bio/test_shell.rb +5 -3
  283. data/test/unit/bio/test_tree.rb +6 -6
  284. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +4 -2
  285. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +4 -2
  286. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +4 -2
  287. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -2
  288. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +4 -2
  289. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +4 -2
  290. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +4 -2
  291. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +4 -2
  292. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +4 -2
  293. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +4 -2
  294. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -2
  295. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +4 -2
  296. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +17 -13
  297. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +17 -13
  298. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +4 -2
  299. data/test/unit/bio/util/test_color_scheme.rb +5 -3
  300. data/test/unit/bio/util/test_contingency_table.rb +5 -3
  301. data/test/unit/bio/util/test_restriction_enzyme.rb +4 -2
  302. data/test/unit/bio/util/test_sirna.rb +6 -4
  303. metadata +147 -2
@@ -0,0 +1,133 @@
1
+ #
2
+ # = bio/db/sanger_chromatogram/chromatogram.rb - Sanger Chromatogram class
3
+ #
4
+ # Copyright:: Copyright (C) 2009 Anthony Underwood <anthony.underwood@hpa.org.uk>, <email2ants@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ # $Id:$
8
+ #
9
+ require 'bio/sequence/adapter'
10
+ module Bio
11
+ # == Description
12
+ #
13
+ # This is the Superclass for the Abif and Scf classes that allow importing of the common scf
14
+ # and abi sequence chromatogram formats.
15
+ # The following attributes are Common to both the Abif and Scf subclasses
16
+ #
17
+ # * *chromatogram_type* (String): This is extracted from the chromatogram file itself and will
18
+ # probably be either .scf or ABIF for Scf and Abif files respectively.
19
+ # * *version* (String): The version of the Scf or Abif file
20
+ # * *sequence* (String): the sequence contained within the chromatogram as a string.
21
+ # * *qualities* (Array): the quality scores of each base as an array of integers. These will
22
+ # probably be phred scores.
23
+ # * *peak_indices* (Array): if the sequence traces contained within the chromatogram are imagined
24
+ # as being plotted on an x,y graph, the peak indices are the x positions of the peaks that
25
+ # represent the nucleotide bases found in the sequence from the chromatogram. For example if
26
+ # the peak_indices are [16,24,37,49 ....] and the sequence is AGGT...., at position 16 the
27
+ # traces in the chromatogram were base-called as an A, position 24 a G, position 37 a G,
28
+ # position 49 a T etc
29
+ # * *atrace*, *ctrace*, *gtrace*, *ttrace* (Array): If the sequence traces contained within
30
+ # the chromatogram are imagined as being plotted on an x,y graph, these attributes are arrays of
31
+ # y positions for each of the 4 nucleotide bases along the length of the x axis. If these were
32
+ # plotted joined by lines of different colours then the resulting graph should look like the
33
+ # original chromatogram file when viewed in a chromatogram viewer such as Chromas, 4Peaks or
34
+ # FinchTV.
35
+ # * *dye_mobility* (String): The mobility of the dye used when sequencing. This can influence the
36
+ # base calling
37
+ #
38
+ # == Usage
39
+ # filename = "path/to/sequence_chromatogram_file"
40
+ #
41
+ # for Abif files
42
+ # chromatogram_ff = Bio::Abif.open(filename)
43
+ # for Scf files
44
+ # chromatogram_ff = Bio::Scf.open(filename)
45
+ #
46
+ # chromatogram = chromatogram_ff.next_entry
47
+ # chromatogram.to_seq # => returns a Bio::Sequence object
48
+ # chromatogram.sequence # => returns the sequence contained within the chromatogram as a string
49
+ # chromatogram.qualities # => returns an array of quality values for each base
50
+ # chromatogram.atrace # => returns an array of the a trace y positions
51
+ #
52
+ class SangerChromatogram
53
+ # The type of chromatogram file: .scf for Scf files and ABIF for Abif files
54
+ attr_accessor :chromatogram_type
55
+ # The Version of the Scf or Abif file (String)
56
+ attr_accessor :version
57
+ # The sequence contained within the chromatogram (String)
58
+ attr_accessor :sequence
59
+ # An array of quality scores for each base in the sequence (Array)
60
+ attr_accessor :qualities
61
+ # An array of 'x' positions (see description) on the trace where the bases occur/have been called (Array)
62
+ attr_accessor :peak_indices
63
+ # An array of 'y' positions (see description) for the 'A' trace from the chromatogram (Array)
64
+ attr_accessor :atrace
65
+ # An array of 'y' positions (see description) for the 'C' trace from the chromatogram (Array)
66
+ attr_accessor :ctrace
67
+ # An array of 'y' positions (see description) for the 'G' trace from the chromatogram (Array)
68
+ attr_accessor :gtrace
69
+ # An array of 'y' positions (see description) for the 'T' trace from the chromatogram (Array)
70
+ attr_accessor :ttrace
71
+ # The mobility of the dye used when sequencing (String)
72
+ attr_accessor :dye_mobility
73
+
74
+ def self.open(filename)
75
+ Bio::FlatFile.open(self, filename)
76
+ end
77
+
78
+ # Returns a Bio::Sequence::NA object based on the sequence from the chromatogram
79
+ def seq
80
+ Bio::Sequence::NA.new(@sequence)
81
+ end
82
+
83
+ # Returns a Bio::Sequence object based on the sequence from the chromatogram
84
+ def to_biosequence
85
+ Bio::Sequence.adapter(self, Bio::Sequence::Adapter::SangerChromatogram)
86
+ end
87
+ alias :to_seq :to_biosequence
88
+
89
+ # Returns the sequence from the chromatogram as a string
90
+ def sequence_string
91
+ @sequence
92
+ end
93
+
94
+ # Reverses and complements the current chromatogram object including its sequence, traces
95
+ # and qualities
96
+ def complement!
97
+ # reverse traces
98
+ tmp_trace = @atrace
99
+ @atrace = @ttrace.reverse
100
+ @ttrace = tmp_trace.reverse
101
+ tmp_trace = @ctrace
102
+ @ctrace = @gtrace.reverse
103
+ @gtrace = tmp_trace.reverse
104
+
105
+ # reverse base qualities
106
+ if !@aqual.nil? # if qualities exist
107
+ tmp_qual = @aqual
108
+ @aqual = @tqual.reverse
109
+ @tqual = tmp_qual.reverse
110
+ tmp_qual = @cqual
111
+ @cqual = @gqual.reverse
112
+ @gqual = tmp_qual.reverse
113
+ end
114
+
115
+ #reverse qualities
116
+ @qualities = @qualities.reverse
117
+
118
+ #reverse peak indices
119
+ @peak_indices = @peak_indices.map{|index| @atrace.size - index}
120
+ @peak_indices.reverse!
121
+
122
+ # reverse sequence
123
+ @sequence = @sequence.reverse.tr('atgcnrykmswbvdh','tacgnyrmkswvbhd')
124
+ end
125
+ # Returns a new chromatogram object of the appropriate subclass (scf or abi) where the
126
+ # sequence, traces and qualities have all been reversed and complemented
127
+ def complement
128
+ chromatogram = self.dup
129
+ chromatogram.complement!
130
+ return chromatogram
131
+ end
132
+ end
133
+ end
@@ -0,0 +1,32 @@
1
+ #
2
+ # = bio/db/sanger_chromatogram/chromatogram_to_biosequence.rb - Bio::SangerChromatogram to Bio::Sequence adapter module
3
+ #
4
+ # Copyright:: Copyright (C) 2009 Anthony Underwood <anthony.underwood@hpa.org.uk>, <email2ants@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ # $Id:$
8
+ #
9
+
10
+ require 'bio/sequence'
11
+ require 'bio/sequence/adapter'
12
+
13
+ # Internal use only. Normal users should not use this module.
14
+ #
15
+ # Bio::SangerChromatogram to Bio::Sequence adapter module.
16
+ # It is internally used in Bio::SangerChromatogram#to_biosequence.
17
+ #
18
+ module Bio::Sequence::Adapter::SangerChromatogram
19
+
20
+ extend Bio::Sequence::Adapter
21
+
22
+ private
23
+
24
+ def_biosequence_adapter :seq
25
+
26
+ # primary accession
27
+ def_biosequence_adapter :primary_accession do |orig|
28
+ orig.version
29
+ end
30
+
31
+ end #module Bio::Sequence::Adapter::SangerChromatogram
32
+
@@ -0,0 +1,210 @@
1
+ #
2
+ # = bio/db/sanger_chromatogram/scf.rb - Scf class
3
+ #
4
+ # Copyright:: Copyright (C) 2009 Anthony Underwood <anthony.underwood@hpa.org.uk>, <email2ants@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+
8
+ require 'bio/db/sanger_chromatogram/chromatogram'
9
+
10
+ module Bio
11
+ # == Description
12
+ #
13
+ # This class inherits from the SangerChromatogram superclass. It captures the information contained
14
+ # within an scf format chromatogram file generated by DNA sequencing. See the SangerChromatogram class
15
+ # for usage
16
+ class Scf < SangerChromatogram
17
+ # sequence attributes
18
+
19
+ # The quality of each base at each position along the length of the sequence is captured
20
+ # by the nqual attributes where n is one of a, c, g or t. Generally the quality will be
21
+ # high for the base that is called at a particular position and low for all the other bases.
22
+ # However at positions of poor sequence quality, more than one base may have similar top scores.
23
+ # By analysing the nqual attributes it may be possible to determine if the base calling was
24
+ # correct or not.
25
+ # The quality of the A base at each sequence position
26
+ attr_accessor :aqual
27
+ # The quality of the C base at each sequence position
28
+ attr_accessor :cqual
29
+ # The quality of the G base at each sequence position
30
+ attr_accessor :gqual
31
+ # The quality of the T base at each sequence position
32
+ attr_accessor :tqual
33
+ # A hash of extra information extracted from the chromatogram file
34
+ attr_accessor :comments
35
+
36
+ # see SangerChromatogram class for how to create an Scf object and its usage
37
+ def initialize(string)
38
+ header = string.slice(0,128)
39
+ # read in header info
40
+ @chromatogram_type, @samples, @sample_offset, @bases, @bases_left_clip, @bases_right_clip, @bases_offset, @comment_size, @comments_offset, @version, @sample_size, @code_set, @header_spare = header.unpack("a4 NNNNNNNN a4 NN N20")
41
+ get_traces(string)
42
+ get_bases_peakIndices_and_qualities(string)
43
+ get_comments(string)
44
+ if @comments["DYEP"]
45
+ @dye_mobility = @comments["DYEP"]
46
+ else
47
+ @dye_mobility = "Unnown"
48
+ end
49
+ end
50
+
51
+ private
52
+
53
+ def get_traces(string)
54
+ if @version == "3.00"
55
+ # read in trace info
56
+ offset = @sample_offset
57
+ length = @samples * @sample_size
58
+ # determine whether the data is stored in 1 byte as an unsigned byte or 2 bytes as an unsigned short
59
+ @sample_size == 2 ? byte = "n" : byte = "c"
60
+ for base in ["a" , "c" , "g" , "t"]
61
+ trace_read = string.slice(offset,length).unpack("#{byte}#{@samples}")
62
+ # convert offsets
63
+ for sample_num in (0..trace_read.size-1)
64
+ if trace_read[sample_num] > 30000
65
+ trace_read[sample_num] = trace_read[sample_num] - 65536
66
+ end
67
+ end
68
+ # For 8-bit data we need to emulate a signed/unsigned
69
+ # cast that is implicit in the C implementations.....
70
+ if @sample_size == 1
71
+ for sample_num in (0..trace_read.size-1)
72
+ trace_read[sample_num] += 256 if trace_read[sample_num] < 0
73
+ end
74
+ end
75
+ trace_read = convert_deltas_to_values(trace_read)
76
+ self.instance_variable_set("@#{base}trace", trace_read)
77
+ offset += length
78
+ end
79
+ elsif @version == "2.00"
80
+ @atrace = []
81
+ @ctrace = []
82
+ @gtrace = []
83
+ @ttrace = []
84
+ # read in trace info
85
+ offset = @sample_offset
86
+ length = @samples * @sample_size * 4
87
+ # determine whether the data is stored in 1 byte as an unsigned byte or 2 bytes as an unsigned short
88
+ @sample_size == 2 ? byte = "n" : byte = "c"
89
+ trace_read = string.slice(offset,length).unpack("#{byte}#{@samples*4}")
90
+ (0..(@samples-1)*4).step(4) do |offset2|
91
+ @atrace << trace_read[offset2]
92
+ @ctrace << trace_read[offset2+1]
93
+ @gtrace << trace_read[offset2+2]
94
+ @ttrace << trace_read[offset2+3]
95
+ end
96
+ end
97
+ end
98
+ def get_bases_peakIndices_and_qualities(string)
99
+ if @version == "3.00"
100
+ # now go and get the peak index information
101
+ offset = @bases_offset
102
+ length = @bases * 4
103
+ get_v3_peak_indices(string,offset,length)
104
+
105
+ # now go and get the accuracy information
106
+ offset += length;
107
+ get_v3_accuracies(string,offset,length)
108
+
109
+ # OK, now go and get the base information.
110
+ offset += length;
111
+ length = @bases;
112
+ get_v3_sequence(string,offset,length)
113
+
114
+ #combine accuracies to get quality scores
115
+ @qualities= convert_accuracies_to_qualities
116
+ elsif @version == "2.00"
117
+ @peak_indices = []
118
+ @aqual = []
119
+ @cqual = []
120
+ @gqual = []
121
+ @tqual = []
122
+ @qualities = []
123
+ @sequence = ""
124
+ # now go and get the base information
125
+ offset = @bases_offset
126
+ length = @bases * 12
127
+ all_bases_info = string.slice(offset,length)
128
+
129
+ (0..length-1).step(12) do |offset2|
130
+ base_info = all_bases_info.slice(offset2,12).unpack("N C C C C a C3")
131
+ @peak_indices << base_info[0]
132
+ @aqual << base_info[1]
133
+ @cqual << base_info[2]
134
+ @gqual << base_info[3]
135
+ @tqual << base_info[4]
136
+ @sequence += base_info[5].downcase
137
+ case base_info[5].downcase
138
+ when "a"
139
+ @qualities << base_info[1]
140
+ when "c"
141
+ @qualities << base_info[2]
142
+ when "g"
143
+ @qualities << base_info[3]
144
+ when "t"
145
+ @qualities << base_info[4]
146
+ else
147
+ @qualities << 0
148
+ end
149
+ end
150
+ end
151
+ end
152
+ def get_v3_peak_indices(string,offset,length)
153
+ @peak_indices = string.slice(offset,length).unpack("N#{length/4}")
154
+ end
155
+ def get_v3_accuracies(string,offset,length)
156
+ qualities = string.slice(offset,length)
157
+ qual_length = length/4;
158
+ qual_offset = 0;
159
+ for base in ["a" , "c" , "g" , "t"]
160
+ self.instance_variable_set("@#{base}qual",qualities.slice(qual_offset,qual_length).unpack("C#{qual_length}"))
161
+ qual_offset += qual_length
162
+ end
163
+ end
164
+ def get_v3_sequence(string,offset,length)
165
+ @sequence = string.slice(offset,length).unpack("a#{length}").join('').downcase
166
+ end
167
+
168
+ def convert_deltas_to_values(trace_read)
169
+ p_sample = 0;
170
+ for sample_num in (0..trace_read.size-1)
171
+ trace_read[sample_num] = trace_read[sample_num] + p_sample
172
+ p_sample = trace_read[sample_num];
173
+ end
174
+ p_sample = 0;
175
+ for sample_num in (0..trace_read.size-1)
176
+ trace_read[sample_num] = trace_read[sample_num] + p_sample
177
+ p_sample = trace_read[sample_num];
178
+ end
179
+ return trace_read
180
+ end
181
+ def convert_accuracies_to_qualities
182
+ qualities = Array.new
183
+ for base_pos in (0..@sequence.length-1)
184
+ case sequence.slice(base_pos,1)
185
+ when "a"
186
+ qualities << @aqual[base_pos]
187
+ when "c"
188
+ qualities << @cqual[base_pos]
189
+ when "g"
190
+ qualities << @gqual[base_pos]
191
+ when "t"
192
+ qualities << @tqual[base_pos]
193
+ else
194
+ qualities << 0
195
+ end
196
+ end
197
+ return qualities
198
+ end
199
+ def get_comments(string)
200
+ @comments = Hash.new
201
+ comment_string = string.slice(@comments_offset,@comment_size)
202
+ comment_string.gsub!(/\0/, "")
203
+ comment_array = comment_string.split("\n")
204
+ comment_array.each do |comment|
205
+ comment =~ /(\w+)=(.*)/
206
+ @comments[$1] = $2
207
+ end
208
+ end
209
+ end
210
+ end
@@ -415,47 +415,3 @@ end
415
415
 
416
416
  end # module Bio
417
417
 
418
-
419
- if __FILE__ == $0
420
-
421
- # begin
422
- # require 'pp'
423
- # alias p pp
424
- # rescue LoadError
425
- # end
426
-
427
- puts "### WormBase"
428
- wormbase = Bio::DAS.new('http://www.wormbase.org/db/')
429
-
430
- puts ">>> test get_dsn"
431
- p wormbase.get_dsn
432
-
433
- puts ">>> create segment obj Bio::DAS::SEGMENT.region('I', 1, 1000)"
434
- seg = Bio::DAS::SEGMENT.region('I', 1, 1000)
435
- p seg
436
-
437
- puts ">>> test get_dna"
438
- p wormbase.get_dna('elegans', seg)
439
-
440
- puts "### test get_features"
441
- p wormbase.get_features('elegans', seg)
442
-
443
- puts "### KEGG DAS"
444
- kegg_das = Bio::DAS.new("http://das.hgc.jp/cgi-bin/")
445
-
446
- dsn_list = kegg_das.get_dsn
447
- org_list = dsn_list.collect {|x| x.source}
448
-
449
- puts ">>> dsn : entry_points"
450
- org_list.each do |org|
451
- print "#{org} : "
452
- list = kegg_das.get_entry_points(org)
453
- list.segments.each do |seg|
454
- print " #{seg.entry_id}"
455
- end
456
- puts
457
- end
458
-
459
- end
460
-
461
-
@@ -5,7 +5,7 @@
5
5
  # Toshiaki Katayama <k@bioruby.org>
6
6
  # License:: The Ruby License
7
7
  #
8
- # $Id: ddbjxml.rb,v 1.14 2007/04/05 23:35:41 trevor Exp $
8
+ # $Id:$
9
9
  #
10
10
 
11
11
  require 'bio/io/soapwsdl'
@@ -456,183 +456,3 @@ end # XML
456
456
  end # DDBJ
457
457
  end # Bio
458
458
 
459
-
460
-
461
- if __FILE__ == $0
462
-
463
- begin
464
- require 'pp'
465
- alias p pp
466
- rescue LoadError
467
- end
468
-
469
- puts ">>> Bio::DDBJ::XML::Blast"
470
- serv = Bio::DDBJ::XML::Blast.new
471
- # serv.log = STDERR
472
-
473
- query = "MSSRIARALALVVTLLHLTRLALSTCPAACHCPLEAPKCAPGVGLVRDGCGCCKVCAKQL"
474
-
475
- puts "### searchSimple('blastp', 'SWISS', query)"
476
- puts serv.searchSimple('blastp', 'SWISS', query)
477
-
478
- puts "### searchParam('tblastn', 'ddbjvrl', query, '-m 8')"
479
- puts serv.searchParam('tblastn', 'ddbjvrl', query, '-m 8')
480
-
481
-
482
- puts ">>> Bio::DDBJ::XML::ClustalW"
483
- serv = Bio::DDBJ::XML::ClustalW.new
484
-
485
- query = <<END
486
- > RABSTOUT rabbit Guinness receptor
487
- LKMHLMGHLKMGLKMGLKGMHLMHLKHMHLMTYTYTTYRRWPLWMWLPDFGHAS
488
- ADSCVCAHGFAVCACFAHFDVCFGAVCFHAVCFAHVCFAAAVCFAVCAC
489
- > MUSNOSE mouse nose drying factor
490
- mhkmmhkgmkhmhgmhmhglhmkmhlkmgkhmgkmkytytytryrwtqtqwtwyt
491
- fdgfdsgafdagfdgfsagdfavdfdvgavfsvfgvdfsvdgvagvfdv
492
- > HSHEAVEN human Guinness receptor repeat
493
- mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
494
- fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
495
- mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
496
- fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
497
- END
498
-
499
- puts "### analyzeSimple(query)"
500
- puts serv.analyzeSimple(query)
501
-
502
- puts "### analyzeParam(query, '-align -matrix=blosum')"
503
- puts serv.analyzeParam(query, '-align -matrix=blosum')
504
-
505
-
506
- puts ">>> Bio::DDBJ::XML::DDBJ"
507
- serv = Bio::DDBJ::XML::DDBJ.new
508
-
509
- puts "### getFFEntry('AB000050')"
510
- puts serv.getFFEntry('AB000050')
511
-
512
- puts "### getXMLEntry('AB000050')"
513
- puts serv.getXMLEntry('AB000050')
514
-
515
- puts "### getFeatureInfo('AB000050', 'cds')"
516
- puts serv.getFeatureInfo('AB000050', 'cds')
517
-
518
- puts "### getAllFeatures('AB000050')"
519
- puts serv.getAllFeatures('AB000050')
520
-
521
- puts "### getRelatedFeatures('AL121903', '59000', '64000')"
522
- puts serv.getRelatedFeatures('AL121903', '59000', '64000')
523
-
524
- puts "### getRelatedFeaturesSeq('AL121903', '59000', '64000')"
525
- puts serv.getRelatedFeaturesSeq('AL121903', '59000', '64000')
526
-
527
-
528
- puts ">>> Bio::DDBJ::XML::Fasta"
529
- serv = Bio::DDBJ::XML::Fasta.new
530
-
531
- query = ">Test\nMSDGAVQPDG GQPAVRNERA TGSGNGSGGG GGGGSGGVGI"
532
-
533
- puts "### searchSimple('fasta34', 'PDB', query)"
534
- puts serv.searchSimple('fasta34', 'PDB', query)
535
-
536
- query = ">Test\nAGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"
537
-
538
- puts "### searchParam('fastx34_t', 'PDB', query, '-n')"
539
- puts serv.searchParam('fastx34_t', 'PDB', query, '-n')
540
-
541
-
542
- puts ">>> Bio::DDBJ::XML::GetEntry"
543
- serv = Bio::DDBJ::XML::GetEntry.new
544
-
545
- puts "### getDDBJEntry('AB000050')"
546
- puts serv.getDDBJEntry('AB000050')
547
-
548
- puts "### getPDBEntry('1AAR')"
549
- puts serv. getPDBEntry('1AAR')
550
-
551
-
552
- puts ">>> Bio::DDBJ::XML::Gib"
553
- serv = Bio::DDBJ::XML::Gib.new
554
-
555
- puts "### getOrganismList"
556
- puts serv.getOrganismList
557
-
558
- puts "### getChIDList"
559
- puts serv.getChIDList
560
-
561
- puts "### getOrganismNameFromChid('Sent_CT18:')"
562
- puts serv.getOrganismNameFromChid('Sent_CT18:')
563
-
564
- puts "### getChIDFromOrganismName('Aquifex aeolicus VF5')"
565
- puts serv.getChIDFromOrganismName('Aquifex aeolicus VF5')
566
-
567
- puts "### getAccession('Ecol_K12_MG1655:')"
568
- puts serv.getAccession('Ecol_K12_MG1655:')
569
-
570
- puts "### getPieceNumber('Mgen_G37:')"
571
- puts serv.getPieceNumber('Mgen_G37:')
572
-
573
- puts "### getDivision('Mgen_G37:')"
574
- puts serv.getDivision('Mgen_G37:')
575
-
576
- puts "### getType('Mgen_G37:')"
577
- puts serv.getType('Mgen_G37:')
578
-
579
- puts "### getCDS('Aaeo_VF5:ece1')"
580
- puts serv.getCDS('Aaeo_VF5:ece1')
581
-
582
- puts "### getFlatFile('Nost_PCC7120:pCC7120zeta')"
583
- puts serv.getFlatFile('Nost_PCC7120:pCC7120zeta')
584
-
585
- puts "### getFastaFile('Nost_PCC7120:pCC7120zeta')"
586
- puts serv.getFastaFile('Nost_PCC7120:pCC7120zeta', 'cdsaa')
587
-
588
-
589
- puts ">>> Bio::DDBJ::XML::Gtop"
590
- serv = Bio::DDBJ::XML::Gtop.new
591
-
592
- puts "### getOrganismList"
593
- puts serv.getOrganismList
594
-
595
- puts "### getMasterInfo"
596
- puts serv.getMasterInfo('thrA', 'ecol0')
597
-
598
-
599
- # puts ">>> Bio::DDBJ::XML::PML"
600
- # serv = Bio::DDBJ::XML::PML.new
601
- #
602
- # puts "### getVariation('1')"
603
- # puts serv.getVariation('1')
604
-
605
-
606
- puts ">>> Bio::DDBJ::XML::SRS"
607
- serv = Bio::DDBJ::XML::SRS.new
608
-
609
- puts "### searchSimple('[pathway-des:sugar]')"
610
- puts serv.searchSimple('[pathway-des:sugar]')
611
-
612
- puts "### searchParam('[swissprot-des:cohesin]', '-f seq -sf fasta')"
613
- puts serv.searchParam('[swissprot-des:cohesin]', '-f seq -sf fasta')
614
-
615
-
616
- puts ">>> Bio::DDBJ::XML::TxSearch"
617
- serv = Bio::DDBJ::XML::TxSearch.new
618
-
619
- puts "### searchSimple('*coli')"
620
- puts serv.searchSimple('*coli')
621
-
622
- puts "### searchSimple('*tardigrada*')"
623
- puts serv.searchSimple('*tardigrada*')
624
-
625
- puts "### getTxId('Escherichia coli')"
626
- puts serv.getTxId('Escherichia coli')
627
-
628
- puts "### getTxName('562')"
629
- puts serv.getTxName('562')
630
-
631
- query = "Campylobacter coli\nEscherichia coli"
632
- rank = "family\ngenus"
633
-
634
- puts "### searchLineage(query, rank, 'Bacteria')"
635
- puts serv.searchLineage(query, rank, 'Bacteria')
636
-
637
- end
638
-