bio 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (303) hide show
  1. data/ChangeLog +2105 -3728
  2. data/KNOWN_ISSUES.rdoc +35 -3
  3. data/README.rdoc +8 -2
  4. data/RELEASE_NOTES.rdoc +166 -0
  5. data/bin/bioruby +4 -1
  6. data/bioruby.gemspec +146 -1
  7. data/bioruby.gemspec.erb +3 -1
  8. data/doc/ChangeLog-before-1.3.1 +3961 -0
  9. data/doc/Tutorial.rd +154 -22
  10. data/doc/Tutorial.rd.html +125 -68
  11. data/lib/bio.rb +21 -6
  12. data/lib/bio/appl/bl2seq/report.rb +11 -202
  13. data/lib/bio/appl/blast/format0.rb +0 -193
  14. data/lib/bio/appl/blast/report.rb +2 -147
  15. data/lib/bio/appl/blast/wublast.rb +0 -208
  16. data/lib/bio/appl/fasta.rb +4 -19
  17. data/lib/bio/appl/fasta/format10.rb +0 -14
  18. data/lib/bio/appl/genscan/report.rb +0 -176
  19. data/lib/bio/appl/hmmer.rb +1 -15
  20. data/lib/bio/appl/hmmer/report.rb +0 -100
  21. data/lib/bio/appl/meme/mast.rb +156 -0
  22. data/lib/bio/appl/meme/mast/report.rb +91 -0
  23. data/lib/bio/appl/meme/motif.rb +48 -0
  24. data/lib/bio/appl/psort.rb +0 -111
  25. data/lib/bio/appl/psort/report.rb +1 -45
  26. data/lib/bio/appl/pts1.rb +2 -4
  27. data/lib/bio/appl/sosui/report.rb +5 -54
  28. data/lib/bio/appl/targetp/report.rb +1 -104
  29. data/lib/bio/appl/tmhmm/report.rb +0 -36
  30. data/lib/bio/command.rb +94 -10
  31. data/lib/bio/data/aa.rb +1 -77
  32. data/lib/bio/data/codontable.rb +1 -95
  33. data/lib/bio/data/na.rb +1 -26
  34. data/lib/bio/db/aaindex.rb +1 -38
  35. data/lib/bio/db/fasta.rb +1 -134
  36. data/lib/bio/db/fasta/format_qual.rb +204 -0
  37. data/lib/bio/db/fasta/qual.rb +102 -0
  38. data/lib/bio/db/fastq.rb +645 -0
  39. data/lib/bio/db/fastq/fastq_to_biosequence.rb +40 -0
  40. data/lib/bio/db/fastq/format_fastq.rb +175 -0
  41. data/lib/bio/db/genbank/genbank.rb +1 -86
  42. data/lib/bio/db/gff.rb +0 -17
  43. data/lib/bio/db/go.rb +4 -72
  44. data/lib/bio/db/kegg/common.rb +112 -0
  45. data/lib/bio/db/kegg/compound.rb +29 -20
  46. data/lib/bio/db/kegg/drug.rb +74 -34
  47. data/lib/bio/db/kegg/enzyme.rb +26 -5
  48. data/lib/bio/db/kegg/genes.rb +128 -15
  49. data/lib/bio/db/kegg/genome.rb +3 -41
  50. data/lib/bio/db/kegg/glycan.rb +19 -24
  51. data/lib/bio/db/kegg/orthology.rb +16 -56
  52. data/lib/bio/db/kegg/reaction.rb +81 -28
  53. data/lib/bio/db/kegg/taxonomy.rb +1 -52
  54. data/lib/bio/db/litdb.rb +1 -16
  55. data/lib/bio/db/phyloxml/phyloxml.xsd +582 -0
  56. data/lib/bio/db/phyloxml/phyloxml_elements.rb +1174 -0
  57. data/lib/bio/db/phyloxml/phyloxml_parser.rb +954 -0
  58. data/lib/bio/db/phyloxml/phyloxml_writer.rb +228 -0
  59. data/lib/bio/db/prosite.rb +2 -95
  60. data/lib/bio/db/rebase.rb +5 -6
  61. data/lib/bio/db/sanger_chromatogram/abif.rb +120 -0
  62. data/lib/bio/db/sanger_chromatogram/chromatogram.rb +133 -0
  63. data/lib/bio/db/sanger_chromatogram/chromatogram_to_biosequence.rb +32 -0
  64. data/lib/bio/db/sanger_chromatogram/scf.rb +210 -0
  65. data/lib/bio/io/das.rb +0 -44
  66. data/lib/bio/io/ddbjxml.rb +1 -181
  67. data/lib/bio/io/flatfile.rb +1 -7
  68. data/lib/bio/io/flatfile/autodetection.rb +6 -0
  69. data/lib/bio/io/keggapi.rb +0 -442
  70. data/lib/bio/io/ncbirest.rb +130 -132
  71. data/lib/bio/io/ncbisoap.rb +2 -1
  72. data/lib/bio/io/pubmed.rb +0 -88
  73. data/lib/bio/location.rb +0 -73
  74. data/lib/bio/pathway.rb +0 -171
  75. data/lib/bio/sequence.rb +18 -1
  76. data/lib/bio/sequence/adapter.rb +3 -0
  77. data/lib/bio/sequence/format.rb +16 -0
  78. data/lib/bio/sequence/quality_score.rb +205 -0
  79. data/lib/bio/tree.rb +70 -5
  80. data/lib/bio/util/restriction_enzyme/single_strand.rb +3 -2
  81. data/lib/bio/util/sirna.rb +1 -23
  82. data/lib/bio/version.rb +1 -1
  83. data/sample/demo_aaindex.rb +67 -0
  84. data/sample/demo_aminoacid.rb +101 -0
  85. data/sample/demo_bl2seq_report.rb +220 -0
  86. data/sample/demo_blast_report.rb +285 -0
  87. data/sample/demo_codontable.rb +119 -0
  88. data/sample/demo_das.rb +105 -0
  89. data/sample/demo_ddbjxml.rb +212 -0
  90. data/sample/demo_fasta_remote.rb +51 -0
  91. data/sample/demo_fastaformat.rb +105 -0
  92. data/sample/demo_genbank.rb +132 -0
  93. data/sample/demo_genscan_report.rb +202 -0
  94. data/sample/demo_gff1.rb +49 -0
  95. data/sample/demo_go.rb +98 -0
  96. data/sample/demo_hmmer_report.rb +149 -0
  97. data/sample/demo_kegg_compound.rb +57 -0
  98. data/sample/demo_kegg_drug.rb +65 -0
  99. data/sample/demo_kegg_genome.rb +74 -0
  100. data/sample/demo_kegg_glycan.rb +72 -0
  101. data/sample/demo_kegg_orthology.rb +62 -0
  102. data/sample/demo_kegg_reaction.rb +66 -0
  103. data/sample/demo_kegg_taxonomy.rb +92 -0
  104. data/sample/demo_keggapi.rb +502 -0
  105. data/sample/demo_litdb.rb +42 -0
  106. data/sample/demo_locations.rb +99 -0
  107. data/sample/demo_ncbi_rest.rb +130 -0
  108. data/sample/demo_nucleicacid.rb +49 -0
  109. data/sample/demo_pathway.rb +196 -0
  110. data/sample/demo_prosite.rb +120 -0
  111. data/sample/demo_psort.rb +138 -0
  112. data/sample/demo_psort_report.rb +70 -0
  113. data/sample/demo_pubmed.rb +118 -0
  114. data/sample/demo_sirna.rb +63 -0
  115. data/sample/demo_sosui_report.rb +89 -0
  116. data/sample/demo_targetp_report.rb +135 -0
  117. data/sample/demo_tmhmm_report.rb +68 -0
  118. data/sample/pmfetch.rb +13 -4
  119. data/sample/pmsearch.rb +15 -4
  120. data/sample/test_phyloxml_big.rb +205 -0
  121. data/test/bioruby_test_helper.rb +61 -0
  122. data/test/data/KEGG/1.1.1.1.enzyme +935 -0
  123. data/test/data/KEGG/C00025.compound +102 -0
  124. data/test/data/KEGG/D00063.drug +104 -0
  125. data/test/data/KEGG/G00024.glycan +47 -0
  126. data/test/data/KEGG/G01366.glycan +18 -0
  127. data/test/data/KEGG/K02338.orthology +902 -0
  128. data/test/data/KEGG/R00006.reaction +14 -0
  129. data/test/data/fastq/README.txt +109 -0
  130. data/test/data/fastq/error_diff_ids.fastq +20 -0
  131. data/test/data/fastq/error_double_qual.fastq +22 -0
  132. data/test/data/fastq/error_double_seq.fastq +22 -0
  133. data/test/data/fastq/error_long_qual.fastq +20 -0
  134. data/test/data/fastq/error_no_qual.fastq +20 -0
  135. data/test/data/fastq/error_qual_del.fastq +20 -0
  136. data/test/data/fastq/error_qual_escape.fastq +20 -0
  137. data/test/data/fastq/error_qual_null.fastq +0 -0
  138. data/test/data/fastq/error_qual_space.fastq +21 -0
  139. data/test/data/fastq/error_qual_tab.fastq +21 -0
  140. data/test/data/fastq/error_qual_unit_sep.fastq +20 -0
  141. data/test/data/fastq/error_qual_vtab.fastq +20 -0
  142. data/test/data/fastq/error_short_qual.fastq +20 -0
  143. data/test/data/fastq/error_spaces.fastq +20 -0
  144. data/test/data/fastq/error_tabs.fastq +21 -0
  145. data/test/data/fastq/error_trunc_at_plus.fastq +19 -0
  146. data/test/data/fastq/error_trunc_at_qual.fastq +19 -0
  147. data/test/data/fastq/error_trunc_at_seq.fastq +18 -0
  148. data/test/data/fastq/error_trunc_in_plus.fastq +19 -0
  149. data/test/data/fastq/error_trunc_in_qual.fastq +20 -0
  150. data/test/data/fastq/error_trunc_in_seq.fastq +18 -0
  151. data/test/data/fastq/error_trunc_in_title.fastq +17 -0
  152. data/test/data/fastq/illumina_full_range_as_illumina.fastq +8 -0
  153. data/test/data/fastq/illumina_full_range_as_sanger.fastq +8 -0
  154. data/test/data/fastq/illumina_full_range_as_solexa.fastq +8 -0
  155. data/test/data/fastq/illumina_full_range_original_illumina.fastq +8 -0
  156. data/test/data/fastq/longreads_as_illumina.fastq +40 -0
  157. data/test/data/fastq/longreads_as_sanger.fastq +40 -0
  158. data/test/data/fastq/longreads_as_solexa.fastq +40 -0
  159. data/test/data/fastq/longreads_original_sanger.fastq +120 -0
  160. data/test/data/fastq/misc_dna_as_illumina.fastq +16 -0
  161. data/test/data/fastq/misc_dna_as_sanger.fastq +16 -0
  162. data/test/data/fastq/misc_dna_as_solexa.fastq +16 -0
  163. data/test/data/fastq/misc_dna_original_sanger.fastq +16 -0
  164. data/test/data/fastq/misc_rna_as_illumina.fastq +16 -0
  165. data/test/data/fastq/misc_rna_as_sanger.fastq +16 -0
  166. data/test/data/fastq/misc_rna_as_solexa.fastq +16 -0
  167. data/test/data/fastq/misc_rna_original_sanger.fastq +16 -0
  168. data/test/data/fastq/sanger_full_range_as_illumina.fastq +8 -0
  169. data/test/data/fastq/sanger_full_range_as_sanger.fastq +8 -0
  170. data/test/data/fastq/sanger_full_range_as_solexa.fastq +8 -0
  171. data/test/data/fastq/sanger_full_range_original_sanger.fastq +8 -0
  172. data/test/data/fastq/solexa_full_range_as_illumina.fastq +8 -0
  173. data/test/data/fastq/solexa_full_range_as_sanger.fastq +8 -0
  174. data/test/data/fastq/solexa_full_range_as_solexa.fastq +8 -0
  175. data/test/data/fastq/solexa_full_range_original_solexa.fastq +8 -0
  176. data/test/data/fastq/wrapping_as_illumina.fastq +12 -0
  177. data/test/data/fastq/wrapping_as_sanger.fastq +12 -0
  178. data/test/data/fastq/wrapping_as_solexa.fastq +12 -0
  179. data/test/data/fastq/wrapping_original_sanger.fastq +24 -0
  180. data/test/data/meme/db +0 -0
  181. data/test/data/meme/mast +0 -0
  182. data/test/data/meme/mast.out +13 -0
  183. data/test/data/meme/meme.out +3 -0
  184. data/test/data/phyloxml/apaf.xml +666 -0
  185. data/test/data/phyloxml/bcl_2.xml +2097 -0
  186. data/test/data/phyloxml/made_up.xml +144 -0
  187. data/test/data/phyloxml/ncbi_taxonomy_mollusca_short.xml +65 -0
  188. data/test/data/phyloxml/phyloxml_examples.xml +415 -0
  189. data/test/data/sanger_chromatogram/test_chromatogram_abif.ab1 +0 -0
  190. data/test/data/sanger_chromatogram/test_chromatogram_scf_v2.scf +0 -0
  191. data/test/data/sanger_chromatogram/test_chromatogram_scf_v3.scf +0 -0
  192. data/test/functional/bio/appl/test_pts1.rb +7 -5
  193. data/test/functional/bio/io/test_ensembl.rb +4 -3
  194. data/test/functional/bio/io/test_pubmed.rb +9 -3
  195. data/test/functional/bio/io/test_soapwsdl.rb +5 -4
  196. data/test/functional/bio/io/test_togows.rb +5 -4
  197. data/test/functional/bio/sequence/test_output_embl.rb +6 -4
  198. data/test/functional/bio/test_command.rb +54 -5
  199. data/test/runner.rb +5 -3
  200. data/test/unit/bio/appl/bl2seq/test_report.rb +5 -4
  201. data/test/unit/bio/appl/blast/test_ncbioptions.rb +4 -2
  202. data/test/unit/bio/appl/blast/test_report.rb +5 -4
  203. data/test/unit/bio/appl/blast/test_rpsblast.rb +5 -4
  204. data/test/unit/bio/appl/gcg/test_msf.rb +5 -5
  205. data/test/unit/bio/appl/genscan/test_report.rb +8 -9
  206. data/test/unit/bio/appl/hmmer/test_report.rb +5 -4
  207. data/test/unit/bio/appl/iprscan/test_report.rb +6 -5
  208. data/test/unit/bio/appl/mafft/test_report.rb +6 -5
  209. data/test/unit/bio/appl/meme/mast/test_report.rb +46 -0
  210. data/test/unit/bio/appl/meme/test_mast.rb +103 -0
  211. data/test/unit/bio/appl/meme/test_motif.rb +38 -0
  212. data/test/unit/bio/appl/paml/codeml/test_rates.rb +5 -4
  213. data/test/unit/bio/appl/paml/codeml/test_report.rb +5 -4
  214. data/test/unit/bio/appl/paml/test_codeml.rb +5 -4
  215. data/test/unit/bio/appl/sim4/test_report.rb +5 -4
  216. data/test/unit/bio/appl/sosui/test_report.rb +6 -5
  217. data/test/unit/bio/appl/targetp/test_report.rb +5 -3
  218. data/test/unit/bio/appl/test_blast.rb +5 -4
  219. data/test/unit/bio/appl/test_fasta.rb +4 -2
  220. data/test/unit/bio/appl/test_pts1.rb +4 -2
  221. data/test/unit/bio/appl/tmhmm/test_report.rb +6 -5
  222. data/test/unit/bio/data/test_aa.rb +5 -3
  223. data/test/unit/bio/data/test_codontable.rb +5 -4
  224. data/test/unit/bio/data/test_na.rb +5 -3
  225. data/test/unit/bio/db/biosql/tc_biosql.rb +5 -1
  226. data/test/unit/bio/db/embl/test_common.rb +4 -2
  227. data/test/unit/bio/db/embl/test_embl.rb +6 -6
  228. data/test/unit/bio/db/embl/test_embl_rel89.rb +6 -6
  229. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +7 -8
  230. data/test/unit/bio/db/embl/test_sptr.rb +6 -8
  231. data/test/unit/bio/db/embl/test_uniprot.rb +6 -5
  232. data/test/unit/bio/db/fasta/test_format_qual.rb +346 -0
  233. data/test/unit/bio/db/kegg/test_compound.rb +146 -0
  234. data/test/unit/bio/db/kegg/test_drug.rb +194 -0
  235. data/test/unit/bio/db/kegg/test_enzyme.rb +241 -0
  236. data/test/unit/bio/db/kegg/test_genes.rb +32 -4
  237. data/test/unit/bio/db/kegg/test_glycan.rb +260 -0
  238. data/test/unit/bio/db/kegg/test_orthology.rb +50 -0
  239. data/test/unit/bio/db/kegg/test_reaction.rb +96 -0
  240. data/test/unit/bio/db/pdb/test_pdb.rb +4 -2
  241. data/test/unit/bio/db/sanger_chromatogram/test_abif.rb +76 -0
  242. data/test/unit/bio/db/sanger_chromatogram/test_scf.rb +98 -0
  243. data/test/unit/bio/db/test_aaindex.rb +6 -6
  244. data/test/unit/bio/db/test_fasta.rb +5 -46
  245. data/test/unit/bio/db/test_fastq.rb +829 -0
  246. data/test/unit/bio/db/test_gff.rb +4 -2
  247. data/test/unit/bio/db/test_lasergene.rb +7 -5
  248. data/test/unit/bio/db/test_medline.rb +4 -2
  249. data/test/unit/bio/db/test_newick.rb +6 -6
  250. data/test/unit/bio/db/test_nexus.rb +4 -2
  251. data/test/unit/bio/db/test_phyloxml.rb +769 -0
  252. data/test/unit/bio/db/test_phyloxml_writer.rb +328 -0
  253. data/test/unit/bio/db/test_prosite.rb +6 -5
  254. data/test/unit/bio/db/test_qual.rb +63 -0
  255. data/test/unit/bio/db/test_rebase.rb +5 -3
  256. data/test/unit/bio/db/test_soft.rb +7 -6
  257. data/test/unit/bio/io/flatfile/test_autodetection.rb +6 -7
  258. data/test/unit/bio/io/flatfile/test_buffer.rb +6 -5
  259. data/test/unit/bio/io/flatfile/test_splitter.rb +4 -4
  260. data/test/unit/bio/io/test_ddbjxml.rb +4 -3
  261. data/test/unit/bio/io/test_ensembl.rb +5 -3
  262. data/test/unit/bio/io/test_fastacmd.rb +4 -3
  263. data/test/unit/bio/io/test_flatfile.rb +6 -5
  264. data/test/unit/bio/io/test_soapwsdl.rb +4 -3
  265. data/test/unit/bio/io/test_togows.rb +4 -2
  266. data/test/unit/bio/sequence/test_aa.rb +5 -3
  267. data/test/unit/bio/sequence/test_common.rb +4 -2
  268. data/test/unit/bio/sequence/test_compat.rb +4 -2
  269. data/test/unit/bio/sequence/test_dblink.rb +5 -3
  270. data/test/unit/bio/sequence/test_na.rb +4 -2
  271. data/test/unit/bio/sequence/test_quality_score.rb +330 -0
  272. data/test/unit/bio/shell/plugin/test_seq.rb +5 -3
  273. data/test/unit/bio/test_alignment.rb +5 -3
  274. data/test/unit/bio/test_command.rb +4 -3
  275. data/test/unit/bio/test_db.rb +5 -3
  276. data/test/unit/bio/test_feature.rb +4 -2
  277. data/test/unit/bio/test_location.rb +4 -2
  278. data/test/unit/bio/test_map.rb +5 -3
  279. data/test/unit/bio/test_pathway.rb +4 -2
  280. data/test/unit/bio/test_reference.rb +4 -2
  281. data/test/unit/bio/test_sequence.rb +5 -3
  282. data/test/unit/bio/test_shell.rb +5 -3
  283. data/test/unit/bio/test_tree.rb +6 -6
  284. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +4 -2
  285. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +4 -2
  286. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +4 -2
  287. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -2
  288. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +4 -2
  289. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +4 -2
  290. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +4 -2
  291. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +4 -2
  292. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +4 -2
  293. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +4 -2
  294. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -2
  295. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +4 -2
  296. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +17 -13
  297. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +17 -13
  298. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +4 -2
  299. data/test/unit/bio/util/test_color_scheme.rb +5 -3
  300. data/test/unit/bio/util/test_contingency_table.rb +5 -3
  301. data/test/unit/bio/util/test_restriction_enzyme.rb +4 -2
  302. data/test/unit/bio/util/test_sirna.rb +6 -4
  303. metadata +147 -2
@@ -0,0 +1,133 @@
1
+ #
2
+ # = bio/db/sanger_chromatogram/chromatogram.rb - Sanger Chromatogram class
3
+ #
4
+ # Copyright:: Copyright (C) 2009 Anthony Underwood <anthony.underwood@hpa.org.uk>, <email2ants@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ # $Id:$
8
+ #
9
+ require 'bio/sequence/adapter'
10
+ module Bio
11
+ # == Description
12
+ #
13
+ # This is the Superclass for the Abif and Scf classes that allow importing of the common scf
14
+ # and abi sequence chromatogram formats
15
+ # The following attributes are Common to both the Abif and Scf subclasses
16
+ #
17
+ # * *chromatogram_type* (String): This is extracted from the chromatogram file itself and will
18
+ # probably be either .scf or ABIF for Scf and Abif files respectively.
19
+ # * *version* (String): The version of the Scf or Abif file
20
+ # * *sequence* (String): the sequence contained within the chromatogram as a string.
21
+ # * *qualities* (Array): the quality scores of each base as an array of integers. These will
22
+ # probably be phred scores.
23
+ # * *peak_indices* (Array): if the sequence traces contained within the chromatogram are imagined
24
+ # as being plotted on an x,y graph, the peak indices are the x positions of the peaks that
25
+ # represent the nucleotide bases found in the sequence from the chromatogram. For example if
26
+ # the peak_indices are [16,24,37,49 ....] and the sequence is AGGT...., at position 16 the
27
+ # traces in the chromatogram were base-called as an A, position 24 a G, position 37 a G,
28
+ # position 49 a T etc
29
+ # * *atrace*, *ctrace*, *gtrace*, *ttrace* (Array): If the sequence traces contained within
30
+ # the chromatogram are imagined as being plotted on an x,y graph, these attributes are arrays of
31
+ # y positions for each of the 4 nucleotide bases along the length of the x axis. If these were
32
+ # plotted joined by lines of different colours then the resulting graph should look like the
33
+ # original chromatogram file when viewed in a chromatogram viewer such as Chromas, 4Peaks or
34
+ # FinchTV.
35
+ # * *dye_mobility* (String): The mobility of the dye used when sequencing. This can influence the
36
+ # base calling
37
+ #
38
+ # == Usage
39
+ # filename = "path/to/sequence_chromatogram_file"
40
+ #
41
+ # for Abif files
42
+ # chromatogram_ff = Bio::Abif.open(filename)
43
+ # for Scf files
44
+ # chromatogram_ff = Bio::Scf.open(filename)
45
+ #
46
+ # chromatogram = chromatogram_ff.next_entry
47
+ # chromatogram.to_seq # => returns a Bio::Sequence object
48
+ # chromatogram.sequence # => returns the sequence contained within the chromatogram as a string
49
+ # chromatogram.qualities # => returns an array of quality values for each base
50
+ # chromatogram.atrace # => returns an array of the a trace y positions
51
+ #
52
+ class SangerChromatogram
53
+ # The type of chromatogram file: .scf for Scf files and ABIF for Abif files
54
+ attr_accessor :chromatogram_type
55
+ # The Version of the Scf or Abif file (String)
56
+ attr_accessor :version
57
+ # The sequence contained within the chromatogram (String)
58
+ attr_accessor :sequence
59
+ # An array of quality scores for each base in the sequence (Array)
60
+ attr_accessor :qualities
61
+ # An array of 'x' positions (see description) on the trace where the bases occur/have been called (Array)
62
+ attr_accessor :peak_indices
63
+ # An array of 'y' positions (see description) for the 'A' trace from the chromatogram (Array)
64
+ attr_accessor :atrace
65
+ # An array of 'y' positions (see description) for the 'C' trace from the chromatogram (Array)
66
+ attr_accessor :ctrace
67
+ # An array of 'y' positions (see description) for the 'G' trace from the chromatogram (Array)
68
+ attr_accessor :gtrace
69
+ # An array of 'y' positions (see description) for the 'T' trace from the chromatogram (Array)
70
+ attr_accessor :ttrace
71
+ #The mobility of the dye used when sequencing (String)
72
+ attr_accessor :dye_mobility
73
+
74
+ def self.open(filename)
75
+ Bio::FlatFile.open(self, filename)
76
+ end
77
+
78
+ # Returns a Bio::Sequence::NA object based on the sequence from the chromatogram
79
+ def seq
80
+ Bio::Sequence::NA.new(@sequence)
81
+ end
82
+
83
+ # Returns a Bio::Sequence object based on the sequence from the chromatogram
84
+ def to_biosequence
85
+ Bio::Sequence.adapter(self, Bio::Sequence::Adapter::SangerChromatogram)
86
+ end
87
+ alias :to_seq :to_biosequence
88
+
89
+ # Returns the sequence from the chromatogram as a string
90
+ def sequence_string
91
+ @sequence
92
+ end
93
+
94
+ # Reverses and complements the current chromatogram object including its sequence, traces
95
+ # and qualities
96
+ def complement!
97
+ # reverse traces
98
+ tmp_trace = @atrace
99
+ @atrace = @ttrace.reverse
100
+ @ttrace = tmp_trace.reverse
101
+ tmp_trace = @ctrace
102
+ @ctrace = @gtrace.reverse
103
+ @gtrace = tmp_trace.reverse
104
+
105
+ # reverse base qualities
106
+ if !@aqual.nil? # if qualities exist
107
+ tmp_qual = @aqual
108
+ @aqual = @tqual.reverse
109
+ @tqual = tmp_qual.reverse
110
+ tmp_qual = @cqual
111
+ @cqual = @gqual.reverse
112
+ @gqual = tmp_qual.reverse
113
+ end
114
+
115
+ #reverse qualities
116
+ @qualities = @qualities.reverse
117
+
118
+ #reverse peak indices
119
+ @peak_indices = @peak_indices.map{|index| @atrace.size - index}
120
+ @peak_indices.reverse!
121
+
122
+ # reverse sequence
123
+ @sequence = @sequence.reverse.tr('atgcnrykmswbvdh','tacgnyrmkswvbhd')
124
+ end
125
+ # Returns a new chromatogram object of the appropriate subclass (scf or abi) where the
126
+ # sequence, traces and qualities have all been reversed and complemented
127
+ def complement
128
+ chromatogram = self.dup
129
+ chromatogram.complement!
130
+ return chromatogram
131
+ end
132
+ end
133
+ end
@@ -0,0 +1,32 @@
1
+ #
2
+ # = bio/db/sanger_chromatogram/chromatogram_to_biosequence.rb - Bio::SangerChromatogram to Bio::Sequence adapter module
3
+ #
4
+ # Copyright:: Copyright (C) 2009 Anthony Underwood <anthony.underwood@hpa.org.uk>, <email2ants@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+ # $Id:$
8
+ #
9
+
10
+ require 'bio/sequence'
11
+ require 'bio/sequence/adapter'
12
+
13
+ # Internal use only. Normal users should not use this module.
14
+ #
15
+ # Bio::SangerChromatogram to Bio::Sequence adapter module.
16
+ # It is internally used in Bio::SangerChromatogram#to_biosequence.
17
+ #
18
+ module Bio::Sequence::Adapter::SangerChromatogram
19
+
20
+ extend Bio::Sequence::Adapter
21
+
22
+ private
23
+
24
+ def_biosequence_adapter :seq
25
+
26
+ # primary accession
27
+ def_biosequence_adapter :primary_accession do |orig|
28
+ orig.version
29
+ end
30
+
31
+ end #module Bio::Sequence::Adapter::SangerChromatogram
32
+
@@ -0,0 +1,210 @@
1
+ #
2
+ # = bio/db/sanger_chromatogram/scf.rb - Scf class
3
+ #
4
+ # Copyright:: Copyright (C) 2009 Anthony Underwood <anthony.underwood@hpa.org.uk>, <email2ants@gmail.com>
5
+ # License:: The Ruby License
6
+ #
7
+
8
+ require 'bio/db/sanger_chromatogram/chromatogram'
9
+
10
+ module Bio
11
+ # == Description
12
+ #
13
+ # This class inherits from the SangerChromatogram superclass. It captures the information contained
14
+ # within an scf format chromatogram file generated by DNA sequencing. See the SangerChromatogram class
15
+ # for usage
16
+ class Scf < SangerChromatogram
17
+ # sequence attributes
18
+
19
+ # The quality of each base at each position along the length of the sequence is captured
20
+ # by the nqual attributes where n is one of a, c, g or t. Generally the quality will be
21
+ # high for the base that is called at a particular position and low for all the other bases.
22
+ # However at positions of poor sequence quality, more than one base may have similar top scores.
23
+ # By analysing the nqual attributes it may be possible to determine if the base calling was
24
+ # correct or not.
25
+ # The quality of the A base at each sequence position
26
+ attr_accessor :aqual
27
+ # The quality of the C base at each sequence position
28
+ attr_accessor :cqual
29
+ # The quality of the G base at each sequence position
30
+ attr_accessor :gqual
31
+ # The quality of the T base at each sequence position
32
+ attr_accessor :tqual
33
+ # A hash of extra information extracted from the chromatogram file
34
+ attr_accessor :comments
35
+
36
+ # see SangerChromatogram class for how to create an Scf object and its usage
37
+ def initialize(string)
38
+ header = string.slice(0,128)
39
+ # read in header info
40
+ @chromatogram_type, @samples, @sample_offset, @bases, @bases_left_clip, @bases_right_clip, @bases_offset, @comment_size, @comments_offset, @version, @sample_size, @code_set, @header_spare = header.unpack("a4 NNNNNNNN a4 NN N20")
41
+ get_traces(string)
42
+ get_bases_peakIndices_and_qualities(string)
43
+ get_comments(string)
44
+ if @comments["DYEP"]
45
+ @dye_mobility = @comments["DYEP"]
46
+ else
47
+ @dye_mobility = "Unnown"
48
+ end
49
+ end
50
+
51
+ private
52
+
53
+ def get_traces(string)
54
+ if @version == "3.00"
55
+ # read in trace info
56
+ offset = @sample_offset
57
+ length = @samples * @sample_size
58
+ # determine whether the data is stored in 1 byte as an unsigned byte or 2 bytes as an unsigned short
59
+ @sample_size == 2 ? byte = "n" : byte = "c"
60
+ for base in ["a" , "c" , "g" , "t"]
61
+ trace_read = string.slice(offset,length).unpack("#{byte}#{@samples}")
62
+ # convert offsets
63
+ for sample_num in (0..trace_read.size-1)
64
+ if trace_read[sample_num] > 30000
65
+ trace_read[sample_num] = trace_read[sample_num] - 65536
66
+ end
67
+ end
68
+ # For 8-bit data we need to emulate a signed/unsigned
69
+ # cast that is implicit in the C implementations.....
70
+ if @sample_size == 1
71
+ for sample_num in (0..trace_read.size-1)
72
+ trace_read[sample_num] += 256 if trace_read[sample_num] < 0
73
+ end
74
+ end
75
+ trace_read = convert_deltas_to_values(trace_read)
76
+ self.instance_variable_set("@#{base}trace", trace_read)
77
+ offset += length
78
+ end
79
+ elsif @version == "2.00"
80
+ @atrace = []
81
+ @ctrace = []
82
+ @gtrace = []
83
+ @ttrace = []
84
+ # read in trace info
85
+ offset = @sample_offset
86
+ length = @samples * @sample_size * 4
87
+ # determine whether the data is stored in 1 byte as an unsigned byte or 2 bytes as an unsigned short
88
+ @sample_size == 2 ? byte = "n" : byte = "c"
89
+ trace_read = string.slice(offset,length).unpack("#{byte}#{@samples*4}")
90
+ (0..(@samples-1)*4).step(4) do |offset2|
91
+ @atrace << trace_read[offset2]
92
+ @ctrace << trace_read[offset2+1]
93
+ @gtrace << trace_read[offset2+2]
94
+ @ttrace << trace_read[offset2+3]
95
+ end
96
+ end
97
+ end
98
+ def get_bases_peakIndices_and_qualities(string)
99
+ if @version == "3.00"
100
+ # now go and get the peak index information
101
+ offset = @bases_offset
102
+ length = @bases * 4
103
+ get_v3_peak_indices(string,offset,length)
104
+
105
+ # now go and get the accuracy information
106
+ offset += length;
107
+ get_v3_accuracies(string,offset,length)
108
+
109
+ # OK, now go and get the base information.
110
+ offset += length;
111
+ length = @bases;
112
+ get_v3_sequence(string,offset,length)
113
+
114
+ #combine accuracies to get quality scores
115
+ @qualities= convert_accuracies_to_qualities
116
+ elsif @version == "2.00"
117
+ @peak_indices = []
118
+ @aqual = []
119
+ @cqual = []
120
+ @gqual = []
121
+ @tqual = []
122
+ @qualities = []
123
+ @sequence = ""
124
+ # now go and get the base information
125
+ offset = @bases_offset
126
+ length = @bases * 12
127
+ all_bases_info = string.slice(offset,length)
128
+
129
+ (0..length-1).step(12) do |offset2|
130
+ base_info = all_bases_info.slice(offset2,12).unpack("N C C C C a C3")
131
+ @peak_indices << base_info[0]
132
+ @aqual << base_info[1]
133
+ @cqual << base_info[2]
134
+ @gqual << base_info[3]
135
+ @tqual << base_info[4]
136
+ @sequence += base_info[5].downcase
137
+ case base_info[5].downcase
138
+ when "a"
139
+ @qualities << base_info[1]
140
+ when "c"
141
+ @qualities << base_info[2]
142
+ when "g"
143
+ @qualities << base_info[3]
144
+ when "t"
145
+ @qualities << base_info[4]
146
+ else
147
+ @qualities << 0
148
+ end
149
+ end
150
+ end
151
+ end
152
+ def get_v3_peak_indices(string,offset,length)
153
+ @peak_indices = string.slice(offset,length).unpack("N#{length/4}")
154
+ end
155
+ def get_v3_accuracies(string,offset,length)
156
+ qualities = string.slice(offset,length)
157
+ qual_length = length/4;
158
+ qual_offset = 0;
159
+ for base in ["a" , "c" , "g" , "t"]
160
+ self.instance_variable_set("@#{base}qual",qualities.slice(qual_offset,qual_length).unpack("C#{qual_length}"))
161
+ qual_offset += qual_length
162
+ end
163
+ end
164
+ def get_v3_sequence(string,offset,length)
165
+ @sequence = string.slice(offset,length).unpack("a#{length}").join('').downcase
166
+ end
167
+
168
+ def convert_deltas_to_values(trace_read)
169
+ p_sample = 0;
170
+ for sample_num in (0..trace_read.size-1)
171
+ trace_read[sample_num] = trace_read[sample_num] + p_sample
172
+ p_sample = trace_read[sample_num];
173
+ end
174
+ p_sample = 0;
175
+ for sample_num in (0..trace_read.size-1)
176
+ trace_read[sample_num] = trace_read[sample_num] + p_sample
177
+ p_sample = trace_read[sample_num];
178
+ end
179
+ return trace_read
180
+ end
181
+ def convert_accuracies_to_qualities
182
+ qualities = Array.new
183
+ for base_pos in (0..@sequence.length-1)
184
+ case sequence.slice(base_pos,1)
185
+ when "a"
186
+ qualities << @aqual[base_pos]
187
+ when "c"
188
+ qualities << @cqual[base_pos]
189
+ when "g"
190
+ qualities << @gqual[base_pos]
191
+ when "t"
192
+ qualities << @tqual[base_pos]
193
+ else
194
+ qualities << 0
195
+ end
196
+ end
197
+ return qualities
198
+ end
199
+ def get_comments(string)
200
+ @comments = Hash.new
201
+ comment_string = string.slice(@comments_offset,@comment_size)
202
+ comment_string.gsub!(/\0/, "")
203
+ comment_array = comment_string.split("\n")
204
+ comment_array.each do |comment|
205
+ comment =~ /(\w+)=(.*)/
206
+ @comments[$1] = $2
207
+ end
208
+ end
209
+ end
210
+ end
@@ -415,47 +415,3 @@ end
415
415
 
416
416
  end # module Bio
417
417
 
418
-
419
- if __FILE__ == $0
420
-
421
- # begin
422
- # require 'pp'
423
- # alias p pp
424
- # rescue LoadError
425
- # end
426
-
427
- puts "### WormBase"
428
- wormbase = Bio::DAS.new('http://www.wormbase.org/db/')
429
-
430
- puts ">>> test get_dsn"
431
- p wormbase.get_dsn
432
-
433
- puts ">>> create segment obj Bio::DAS::SEGMENT.region('I', 1, 1000)"
434
- seg = Bio::DAS::SEGMENT.region('I', 1, 1000)
435
- p seg
436
-
437
- puts ">>> test get_dna"
438
- p wormbase.get_dna('elegans', seg)
439
-
440
- puts "### test get_features"
441
- p wormbase.get_features('elegans', seg)
442
-
443
- puts "### KEGG DAS"
444
- kegg_das = Bio::DAS.new("http://das.hgc.jp/cgi-bin/")
445
-
446
- dsn_list = kegg_das.get_dsn
447
- org_list = dsn_list.collect {|x| x.source}
448
-
449
- puts ">>> dsn : entry_points"
450
- org_list.each do |org|
451
- print "#{org} : "
452
- list = kegg_das.get_entry_points(org)
453
- list.segments.each do |seg|
454
- print " #{seg.entry_id}"
455
- end
456
- puts
457
- end
458
-
459
- end
460
-
461
-
@@ -5,7 +5,7 @@
5
5
  # Toshiaki Katayama <k@bioruby.org>
6
6
  # License:: The Ruby License
7
7
  #
8
- # $Id: ddbjxml.rb,v 1.14 2007/04/05 23:35:41 trevor Exp $
8
+ # $Id:$
9
9
  #
10
10
 
11
11
  require 'bio/io/soapwsdl'
@@ -456,183 +456,3 @@ end # XML
456
456
  end # DDBJ
457
457
  end # Bio
458
458
 
459
-
460
-
461
- if __FILE__ == $0
462
-
463
- begin
464
- require 'pp'
465
- alias p pp
466
- rescue LoadError
467
- end
468
-
469
- puts ">>> Bio::DDBJ::XML::Blast"
470
- serv = Bio::DDBJ::XML::Blast.new
471
- # serv.log = STDERR
472
-
473
- query = "MSSRIARALALVVTLLHLTRLALSTCPAACHCPLEAPKCAPGVGLVRDGCGCCKVCAKQL"
474
-
475
- puts "### searchSimple('blastp', 'SWISS', query)"
476
- puts serv.searchSimple('blastp', 'SWISS', query)
477
-
478
- puts "### searchParam('tblastn', 'ddbjvrl', query, '-m 8')"
479
- puts serv.searchParam('tblastn', 'ddbjvrl', query, '-m 8')
480
-
481
-
482
- puts ">>> Bio::DDBJ::XML::ClustalW"
483
- serv = Bio::DDBJ::XML::ClustalW.new
484
-
485
- query = <<END
486
- > RABSTOUT rabbit Guinness receptor
487
- LKMHLMGHLKMGLKMGLKGMHLMHLKHMHLMTYTYTTYRRWPLWMWLPDFGHAS
488
- ADSCVCAHGFAVCACFAHFDVCFGAVCFHAVCFAHVCFAAAVCFAVCAC
489
- > MUSNOSE mouse nose drying factor
490
- mhkmmhkgmkhmhgmhmhglhmkmhlkmgkhmgkmkytytytryrwtqtqwtwyt
491
- fdgfdsgafdagfdgfsagdfavdfdvgavfsvfgvdfsvdgvagvfdv
492
- > HSHEAVEN human Guinness receptor repeat
493
- mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
494
- fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
495
- mhkmmhkgmkhmhgmhmhg lhmkmhlkmgkhmgkmk ytytytryrwtqtqwtwyt
496
- fdgfdsgafdagfdgfsag dfavdfdvgavfsvfgv dfsvdgvagvfdv
497
- END
498
-
499
- puts "### analyzeSimple(query)"
500
- puts serv.analyzeSimple(query)
501
-
502
- puts "### analyzeParam(query, '-align -matrix=blosum')"
503
- puts serv.analyzeParam(query, '-align -matrix=blosum')
504
-
505
-
506
- puts ">>> Bio::DDBJ::XML::DDBJ"
507
- serv = Bio::DDBJ::XML::DDBJ.new
508
-
509
- puts "### getFFEntry('AB000050')"
510
- puts serv.getFFEntry('AB000050')
511
-
512
- puts "### getXMLEntry('AB000050')"
513
- puts serv.getXMLEntry('AB000050')
514
-
515
- puts "### getFeatureInfo('AB000050', 'cds')"
516
- puts serv.getFeatureInfo('AB000050', 'cds')
517
-
518
- puts "### getAllFeatures('AB000050')"
519
- puts serv.getAllFeatures('AB000050')
520
-
521
- puts "### getRelatedFeatures('AL121903', '59000', '64000')"
522
- puts serv.getRelatedFeatures('AL121903', '59000', '64000')
523
-
524
- puts "### getRelatedFeaturesSeq('AL121903', '59000', '64000')"
525
- puts serv.getRelatedFeaturesSeq('AL121903', '59000', '64000')
526
-
527
-
528
- puts ">>> Bio::DDBJ::XML::Fasta"
529
- serv = Bio::DDBJ::XML::Fasta.new
530
-
531
- query = ">Test\nMSDGAVQPDG GQPAVRNERA TGSGNGSGGG GGGGSGGVGI"
532
-
533
- puts "### searchSimple('fasta34', 'PDB', query)"
534
- puts serv.searchSimple('fasta34', 'PDB', query)
535
-
536
- query = ">Test\nAGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"
537
-
538
- puts "### searchParam('fastx34_t', 'PDB', query, '-n')"
539
- puts serv.searchParam('fastx34_t', 'PDB', query, '-n')
540
-
541
-
542
- puts ">>> Bio::DDBJ::XML::GetEntry"
543
- serv = Bio::DDBJ::XML::GetEntry.new
544
-
545
- puts "### getDDBJEntry('AB000050')"
546
- puts serv.getDDBJEntry('AB000050')
547
-
548
- puts "### getPDBEntry('1AAR')"
549
- puts serv. getPDBEntry('1AAR')
550
-
551
-
552
- puts ">>> Bio::DDBJ::XML::Gib"
553
- serv = Bio::DDBJ::XML::Gib.new
554
-
555
- puts "### getOrganismList"
556
- puts serv.getOrganismList
557
-
558
- puts "### getChIDList"
559
- puts serv.getChIDList
560
-
561
- puts "### getOrganismNameFromChid('Sent_CT18:')"
562
- puts serv.getOrganismNameFromChid('Sent_CT18:')
563
-
564
- puts "### getChIDFromOrganismName('Aquifex aeolicus VF5')"
565
- puts serv.getChIDFromOrganismName('Aquifex aeolicus VF5')
566
-
567
- puts "### getAccession('Ecol_K12_MG1655:')"
568
- puts serv.getAccession('Ecol_K12_MG1655:')
569
-
570
- puts "### getPieceNumber('Mgen_G37:')"
571
- puts serv.getPieceNumber('Mgen_G37:')
572
-
573
- puts "### getDivision('Mgen_G37:')"
574
- puts serv.getDivision('Mgen_G37:')
575
-
576
- puts "### getType('Mgen_G37:')"
577
- puts serv.getType('Mgen_G37:')
578
-
579
- puts "### getCDS('Aaeo_VF5:ece1')"
580
- puts serv.getCDS('Aaeo_VF5:ece1')
581
-
582
- puts "### getFlatFile('Nost_PCC7120:pCC7120zeta')"
583
- puts serv.getFlatFile('Nost_PCC7120:pCC7120zeta')
584
-
585
- puts "### getFastaFile('Nost_PCC7120:pCC7120zeta')"
586
- puts serv.getFastaFile('Nost_PCC7120:pCC7120zeta', 'cdsaa')
587
-
588
-
589
- puts ">>> Bio::DDBJ::XML::Gtop"
590
- serv = Bio::DDBJ::XML::Gtop.new
591
-
592
- puts "### getOrganismList"
593
- puts serv.getOrganismList
594
-
595
- puts "### getMasterInfo"
596
- puts serv.getMasterInfo('thrA', 'ecol0')
597
-
598
-
599
- # puts ">>> Bio::DDBJ::XML::PML"
600
- # serv = Bio::DDBJ::XML::PML.new
601
- #
602
- # puts "### getVariation('1')"
603
- # puts serv.getVariation('1')
604
-
605
-
606
- puts ">>> Bio::DDBJ::XML::SRS"
607
- serv = Bio::DDBJ::XML::SRS.new
608
-
609
- puts "### searchSimple('[pathway-des:sugar]')"
610
- puts serv.searchSimple('[pathway-des:sugar]')
611
-
612
- puts "### searchParam('[swissprot-des:cohesin]', '-f seq -sf fasta')"
613
- puts serv.searchParam('[swissprot-des:cohesin]', '-f seq -sf fasta')
614
-
615
-
616
- puts ">>> Bio::DDBJ::XML::TxSearch"
617
- serv = Bio::DDBJ::XML::TxSearch.new
618
-
619
- puts "### searchSimple('*coli')"
620
- puts serv.searchSimple('*coli')
621
-
622
- puts "### searchSimple('*tardigrada*')"
623
- puts serv.searchSimple('*tardigrada*')
624
-
625
- puts "### getTxId('Escherichia coli')"
626
- puts serv.getTxId('Escherichia coli')
627
-
628
- puts "### getTxName('562')"
629
- puts serv.getTxName('562')
630
-
631
- query = "Campylobacter coli\nEscherichia coli"
632
- rank = "family\ngenus"
633
-
634
- puts "### searchLineage(query, rank, 'Bacteria')"
635
- puts serv.searchLineage(query, rank, 'Bacteria')
636
-
637
- end
638
-