bio 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (303) hide show
  1. data/ChangeLog +2105 -3728
  2. data/KNOWN_ISSUES.rdoc +35 -3
  3. data/README.rdoc +8 -2
  4. data/RELEASE_NOTES.rdoc +166 -0
  5. data/bin/bioruby +4 -1
  6. data/bioruby.gemspec +146 -1
  7. data/bioruby.gemspec.erb +3 -1
  8. data/doc/ChangeLog-before-1.3.1 +3961 -0
  9. data/doc/Tutorial.rd +154 -22
  10. data/doc/Tutorial.rd.html +125 -68
  11. data/lib/bio.rb +21 -6
  12. data/lib/bio/appl/bl2seq/report.rb +11 -202
  13. data/lib/bio/appl/blast/format0.rb +0 -193
  14. data/lib/bio/appl/blast/report.rb +2 -147
  15. data/lib/bio/appl/blast/wublast.rb +0 -208
  16. data/lib/bio/appl/fasta.rb +4 -19
  17. data/lib/bio/appl/fasta/format10.rb +0 -14
  18. data/lib/bio/appl/genscan/report.rb +0 -176
  19. data/lib/bio/appl/hmmer.rb +1 -15
  20. data/lib/bio/appl/hmmer/report.rb +0 -100
  21. data/lib/bio/appl/meme/mast.rb +156 -0
  22. data/lib/bio/appl/meme/mast/report.rb +91 -0
  23. data/lib/bio/appl/meme/motif.rb +48 -0
  24. data/lib/bio/appl/psort.rb +0 -111
  25. data/lib/bio/appl/psort/report.rb +1 -45
  26. data/lib/bio/appl/pts1.rb +2 -4
  27. data/lib/bio/appl/sosui/report.rb +5 -54
  28. data/lib/bio/appl/targetp/report.rb +1 -104
  29. data/lib/bio/appl/tmhmm/report.rb +0 -36
  30. data/lib/bio/command.rb +94 -10
  31. data/lib/bio/data/aa.rb +1 -77
  32. data/lib/bio/data/codontable.rb +1 -95
  33. data/lib/bio/data/na.rb +1 -26
  34. data/lib/bio/db/aaindex.rb +1 -38
  35. data/lib/bio/db/fasta.rb +1 -134
  36. data/lib/bio/db/fasta/format_qual.rb +204 -0
  37. data/lib/bio/db/fasta/qual.rb +102 -0
  38. data/lib/bio/db/fastq.rb +645 -0
  39. data/lib/bio/db/fastq/fastq_to_biosequence.rb +40 -0
  40. data/lib/bio/db/fastq/format_fastq.rb +175 -0
  41. data/lib/bio/db/genbank/genbank.rb +1 -86
  42. data/lib/bio/db/gff.rb +0 -17
  43. data/lib/bio/db/go.rb +4 -72
  44. data/lib/bio/db/kegg/common.rb +112 -0
  45. data/lib/bio/db/kegg/compound.rb +29 -20
  46. data/lib/bio/db/kegg/drug.rb +74 -34
  47. data/lib/bio/db/kegg/enzyme.rb +26 -5
  48. data/lib/bio/db/kegg/genes.rb +128 -15
  49. data/lib/bio/db/kegg/genome.rb +3 -41
  50. data/lib/bio/db/kegg/glycan.rb +19 -24
  51. data/lib/bio/db/kegg/orthology.rb +16 -56
  52. data/lib/bio/db/kegg/reaction.rb +81 -28
  53. data/lib/bio/db/kegg/taxonomy.rb +1 -52
  54. data/lib/bio/db/litdb.rb +1 -16
  55. data/lib/bio/db/phyloxml/phyloxml.xsd +582 -0
  56. data/lib/bio/db/phyloxml/phyloxml_elements.rb +1174 -0
  57. data/lib/bio/db/phyloxml/phyloxml_parser.rb +954 -0
  58. data/lib/bio/db/phyloxml/phyloxml_writer.rb +228 -0
  59. data/lib/bio/db/prosite.rb +2 -95
  60. data/lib/bio/db/rebase.rb +5 -6
  61. data/lib/bio/db/sanger_chromatogram/abif.rb +120 -0
  62. data/lib/bio/db/sanger_chromatogram/chromatogram.rb +133 -0
  63. data/lib/bio/db/sanger_chromatogram/chromatogram_to_biosequence.rb +32 -0
  64. data/lib/bio/db/sanger_chromatogram/scf.rb +210 -0
  65. data/lib/bio/io/das.rb +0 -44
  66. data/lib/bio/io/ddbjxml.rb +1 -181
  67. data/lib/bio/io/flatfile.rb +1 -7
  68. data/lib/bio/io/flatfile/autodetection.rb +6 -0
  69. data/lib/bio/io/keggapi.rb +0 -442
  70. data/lib/bio/io/ncbirest.rb +130 -132
  71. data/lib/bio/io/ncbisoap.rb +2 -1
  72. data/lib/bio/io/pubmed.rb +0 -88
  73. data/lib/bio/location.rb +0 -73
  74. data/lib/bio/pathway.rb +0 -171
  75. data/lib/bio/sequence.rb +18 -1
  76. data/lib/bio/sequence/adapter.rb +3 -0
  77. data/lib/bio/sequence/format.rb +16 -0
  78. data/lib/bio/sequence/quality_score.rb +205 -0
  79. data/lib/bio/tree.rb +70 -5
  80. data/lib/bio/util/restriction_enzyme/single_strand.rb +3 -2
  81. data/lib/bio/util/sirna.rb +1 -23
  82. data/lib/bio/version.rb +1 -1
  83. data/sample/demo_aaindex.rb +67 -0
  84. data/sample/demo_aminoacid.rb +101 -0
  85. data/sample/demo_bl2seq_report.rb +220 -0
  86. data/sample/demo_blast_report.rb +285 -0
  87. data/sample/demo_codontable.rb +119 -0
  88. data/sample/demo_das.rb +105 -0
  89. data/sample/demo_ddbjxml.rb +212 -0
  90. data/sample/demo_fasta_remote.rb +51 -0
  91. data/sample/demo_fastaformat.rb +105 -0
  92. data/sample/demo_genbank.rb +132 -0
  93. data/sample/demo_genscan_report.rb +202 -0
  94. data/sample/demo_gff1.rb +49 -0
  95. data/sample/demo_go.rb +98 -0
  96. data/sample/demo_hmmer_report.rb +149 -0
  97. data/sample/demo_kegg_compound.rb +57 -0
  98. data/sample/demo_kegg_drug.rb +65 -0
  99. data/sample/demo_kegg_genome.rb +74 -0
  100. data/sample/demo_kegg_glycan.rb +72 -0
  101. data/sample/demo_kegg_orthology.rb +62 -0
  102. data/sample/demo_kegg_reaction.rb +66 -0
  103. data/sample/demo_kegg_taxonomy.rb +92 -0
  104. data/sample/demo_keggapi.rb +502 -0
  105. data/sample/demo_litdb.rb +42 -0
  106. data/sample/demo_locations.rb +99 -0
  107. data/sample/demo_ncbi_rest.rb +130 -0
  108. data/sample/demo_nucleicacid.rb +49 -0
  109. data/sample/demo_pathway.rb +196 -0
  110. data/sample/demo_prosite.rb +120 -0
  111. data/sample/demo_psort.rb +138 -0
  112. data/sample/demo_psort_report.rb +70 -0
  113. data/sample/demo_pubmed.rb +118 -0
  114. data/sample/demo_sirna.rb +63 -0
  115. data/sample/demo_sosui_report.rb +89 -0
  116. data/sample/demo_targetp_report.rb +135 -0
  117. data/sample/demo_tmhmm_report.rb +68 -0
  118. data/sample/pmfetch.rb +13 -4
  119. data/sample/pmsearch.rb +15 -4
  120. data/sample/test_phyloxml_big.rb +205 -0
  121. data/test/bioruby_test_helper.rb +61 -0
  122. data/test/data/KEGG/1.1.1.1.enzyme +935 -0
  123. data/test/data/KEGG/C00025.compound +102 -0
  124. data/test/data/KEGG/D00063.drug +104 -0
  125. data/test/data/KEGG/G00024.glycan +47 -0
  126. data/test/data/KEGG/G01366.glycan +18 -0
  127. data/test/data/KEGG/K02338.orthology +902 -0
  128. data/test/data/KEGG/R00006.reaction +14 -0
  129. data/test/data/fastq/README.txt +109 -0
  130. data/test/data/fastq/error_diff_ids.fastq +20 -0
  131. data/test/data/fastq/error_double_qual.fastq +22 -0
  132. data/test/data/fastq/error_double_seq.fastq +22 -0
  133. data/test/data/fastq/error_long_qual.fastq +20 -0
  134. data/test/data/fastq/error_no_qual.fastq +20 -0
  135. data/test/data/fastq/error_qual_del.fastq +20 -0
  136. data/test/data/fastq/error_qual_escape.fastq +20 -0
  137. data/test/data/fastq/error_qual_null.fastq +0 -0
  138. data/test/data/fastq/error_qual_space.fastq +21 -0
  139. data/test/data/fastq/error_qual_tab.fastq +21 -0
  140. data/test/data/fastq/error_qual_unit_sep.fastq +20 -0
  141. data/test/data/fastq/error_qual_vtab.fastq +20 -0
  142. data/test/data/fastq/error_short_qual.fastq +20 -0
  143. data/test/data/fastq/error_spaces.fastq +20 -0
  144. data/test/data/fastq/error_tabs.fastq +21 -0
  145. data/test/data/fastq/error_trunc_at_plus.fastq +19 -0
  146. data/test/data/fastq/error_trunc_at_qual.fastq +19 -0
  147. data/test/data/fastq/error_trunc_at_seq.fastq +18 -0
  148. data/test/data/fastq/error_trunc_in_plus.fastq +19 -0
  149. data/test/data/fastq/error_trunc_in_qual.fastq +20 -0
  150. data/test/data/fastq/error_trunc_in_seq.fastq +18 -0
  151. data/test/data/fastq/error_trunc_in_title.fastq +17 -0
  152. data/test/data/fastq/illumina_full_range_as_illumina.fastq +8 -0
  153. data/test/data/fastq/illumina_full_range_as_sanger.fastq +8 -0
  154. data/test/data/fastq/illumina_full_range_as_solexa.fastq +8 -0
  155. data/test/data/fastq/illumina_full_range_original_illumina.fastq +8 -0
  156. data/test/data/fastq/longreads_as_illumina.fastq +40 -0
  157. data/test/data/fastq/longreads_as_sanger.fastq +40 -0
  158. data/test/data/fastq/longreads_as_solexa.fastq +40 -0
  159. data/test/data/fastq/longreads_original_sanger.fastq +120 -0
  160. data/test/data/fastq/misc_dna_as_illumina.fastq +16 -0
  161. data/test/data/fastq/misc_dna_as_sanger.fastq +16 -0
  162. data/test/data/fastq/misc_dna_as_solexa.fastq +16 -0
  163. data/test/data/fastq/misc_dna_original_sanger.fastq +16 -0
  164. data/test/data/fastq/misc_rna_as_illumina.fastq +16 -0
  165. data/test/data/fastq/misc_rna_as_sanger.fastq +16 -0
  166. data/test/data/fastq/misc_rna_as_solexa.fastq +16 -0
  167. data/test/data/fastq/misc_rna_original_sanger.fastq +16 -0
  168. data/test/data/fastq/sanger_full_range_as_illumina.fastq +8 -0
  169. data/test/data/fastq/sanger_full_range_as_sanger.fastq +8 -0
  170. data/test/data/fastq/sanger_full_range_as_solexa.fastq +8 -0
  171. data/test/data/fastq/sanger_full_range_original_sanger.fastq +8 -0
  172. data/test/data/fastq/solexa_full_range_as_illumina.fastq +8 -0
  173. data/test/data/fastq/solexa_full_range_as_sanger.fastq +8 -0
  174. data/test/data/fastq/solexa_full_range_as_solexa.fastq +8 -0
  175. data/test/data/fastq/solexa_full_range_original_solexa.fastq +8 -0
  176. data/test/data/fastq/wrapping_as_illumina.fastq +12 -0
  177. data/test/data/fastq/wrapping_as_sanger.fastq +12 -0
  178. data/test/data/fastq/wrapping_as_solexa.fastq +12 -0
  179. data/test/data/fastq/wrapping_original_sanger.fastq +24 -0
  180. data/test/data/meme/db +0 -0
  181. data/test/data/meme/mast +0 -0
  182. data/test/data/meme/mast.out +13 -0
  183. data/test/data/meme/meme.out +3 -0
  184. data/test/data/phyloxml/apaf.xml +666 -0
  185. data/test/data/phyloxml/bcl_2.xml +2097 -0
  186. data/test/data/phyloxml/made_up.xml +144 -0
  187. data/test/data/phyloxml/ncbi_taxonomy_mollusca_short.xml +65 -0
  188. data/test/data/phyloxml/phyloxml_examples.xml +415 -0
  189. data/test/data/sanger_chromatogram/test_chromatogram_abif.ab1 +0 -0
  190. data/test/data/sanger_chromatogram/test_chromatogram_scf_v2.scf +0 -0
  191. data/test/data/sanger_chromatogram/test_chromatogram_scf_v3.scf +0 -0
  192. data/test/functional/bio/appl/test_pts1.rb +7 -5
  193. data/test/functional/bio/io/test_ensembl.rb +4 -3
  194. data/test/functional/bio/io/test_pubmed.rb +9 -3
  195. data/test/functional/bio/io/test_soapwsdl.rb +5 -4
  196. data/test/functional/bio/io/test_togows.rb +5 -4
  197. data/test/functional/bio/sequence/test_output_embl.rb +6 -4
  198. data/test/functional/bio/test_command.rb +54 -5
  199. data/test/runner.rb +5 -3
  200. data/test/unit/bio/appl/bl2seq/test_report.rb +5 -4
  201. data/test/unit/bio/appl/blast/test_ncbioptions.rb +4 -2
  202. data/test/unit/bio/appl/blast/test_report.rb +5 -4
  203. data/test/unit/bio/appl/blast/test_rpsblast.rb +5 -4
  204. data/test/unit/bio/appl/gcg/test_msf.rb +5 -5
  205. data/test/unit/bio/appl/genscan/test_report.rb +8 -9
  206. data/test/unit/bio/appl/hmmer/test_report.rb +5 -4
  207. data/test/unit/bio/appl/iprscan/test_report.rb +6 -5
  208. data/test/unit/bio/appl/mafft/test_report.rb +6 -5
  209. data/test/unit/bio/appl/meme/mast/test_report.rb +46 -0
  210. data/test/unit/bio/appl/meme/test_mast.rb +103 -0
  211. data/test/unit/bio/appl/meme/test_motif.rb +38 -0
  212. data/test/unit/bio/appl/paml/codeml/test_rates.rb +5 -4
  213. data/test/unit/bio/appl/paml/codeml/test_report.rb +5 -4
  214. data/test/unit/bio/appl/paml/test_codeml.rb +5 -4
  215. data/test/unit/bio/appl/sim4/test_report.rb +5 -4
  216. data/test/unit/bio/appl/sosui/test_report.rb +6 -5
  217. data/test/unit/bio/appl/targetp/test_report.rb +5 -3
  218. data/test/unit/bio/appl/test_blast.rb +5 -4
  219. data/test/unit/bio/appl/test_fasta.rb +4 -2
  220. data/test/unit/bio/appl/test_pts1.rb +4 -2
  221. data/test/unit/bio/appl/tmhmm/test_report.rb +6 -5
  222. data/test/unit/bio/data/test_aa.rb +5 -3
  223. data/test/unit/bio/data/test_codontable.rb +5 -4
  224. data/test/unit/bio/data/test_na.rb +5 -3
  225. data/test/unit/bio/db/biosql/tc_biosql.rb +5 -1
  226. data/test/unit/bio/db/embl/test_common.rb +4 -2
  227. data/test/unit/bio/db/embl/test_embl.rb +6 -6
  228. data/test/unit/bio/db/embl/test_embl_rel89.rb +6 -6
  229. data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +7 -8
  230. data/test/unit/bio/db/embl/test_sptr.rb +6 -8
  231. data/test/unit/bio/db/embl/test_uniprot.rb +6 -5
  232. data/test/unit/bio/db/fasta/test_format_qual.rb +346 -0
  233. data/test/unit/bio/db/kegg/test_compound.rb +146 -0
  234. data/test/unit/bio/db/kegg/test_drug.rb +194 -0
  235. data/test/unit/bio/db/kegg/test_enzyme.rb +241 -0
  236. data/test/unit/bio/db/kegg/test_genes.rb +32 -4
  237. data/test/unit/bio/db/kegg/test_glycan.rb +260 -0
  238. data/test/unit/bio/db/kegg/test_orthology.rb +50 -0
  239. data/test/unit/bio/db/kegg/test_reaction.rb +96 -0
  240. data/test/unit/bio/db/pdb/test_pdb.rb +4 -2
  241. data/test/unit/bio/db/sanger_chromatogram/test_abif.rb +76 -0
  242. data/test/unit/bio/db/sanger_chromatogram/test_scf.rb +98 -0
  243. data/test/unit/bio/db/test_aaindex.rb +6 -6
  244. data/test/unit/bio/db/test_fasta.rb +5 -46
  245. data/test/unit/bio/db/test_fastq.rb +829 -0
  246. data/test/unit/bio/db/test_gff.rb +4 -2
  247. data/test/unit/bio/db/test_lasergene.rb +7 -5
  248. data/test/unit/bio/db/test_medline.rb +4 -2
  249. data/test/unit/bio/db/test_newick.rb +6 -6
  250. data/test/unit/bio/db/test_nexus.rb +4 -2
  251. data/test/unit/bio/db/test_phyloxml.rb +769 -0
  252. data/test/unit/bio/db/test_phyloxml_writer.rb +328 -0
  253. data/test/unit/bio/db/test_prosite.rb +6 -5
  254. data/test/unit/bio/db/test_qual.rb +63 -0
  255. data/test/unit/bio/db/test_rebase.rb +5 -3
  256. data/test/unit/bio/db/test_soft.rb +7 -6
  257. data/test/unit/bio/io/flatfile/test_autodetection.rb +6 -7
  258. data/test/unit/bio/io/flatfile/test_buffer.rb +6 -5
  259. data/test/unit/bio/io/flatfile/test_splitter.rb +4 -4
  260. data/test/unit/bio/io/test_ddbjxml.rb +4 -3
  261. data/test/unit/bio/io/test_ensembl.rb +5 -3
  262. data/test/unit/bio/io/test_fastacmd.rb +4 -3
  263. data/test/unit/bio/io/test_flatfile.rb +6 -5
  264. data/test/unit/bio/io/test_soapwsdl.rb +4 -3
  265. data/test/unit/bio/io/test_togows.rb +4 -2
  266. data/test/unit/bio/sequence/test_aa.rb +5 -3
  267. data/test/unit/bio/sequence/test_common.rb +4 -2
  268. data/test/unit/bio/sequence/test_compat.rb +4 -2
  269. data/test/unit/bio/sequence/test_dblink.rb +5 -3
  270. data/test/unit/bio/sequence/test_na.rb +4 -2
  271. data/test/unit/bio/sequence/test_quality_score.rb +330 -0
  272. data/test/unit/bio/shell/plugin/test_seq.rb +5 -3
  273. data/test/unit/bio/test_alignment.rb +5 -3
  274. data/test/unit/bio/test_command.rb +4 -3
  275. data/test/unit/bio/test_db.rb +5 -3
  276. data/test/unit/bio/test_feature.rb +4 -2
  277. data/test/unit/bio/test_location.rb +4 -2
  278. data/test/unit/bio/test_map.rb +5 -3
  279. data/test/unit/bio/test_pathway.rb +4 -2
  280. data/test/unit/bio/test_reference.rb +4 -2
  281. data/test/unit/bio/test_sequence.rb +5 -3
  282. data/test/unit/bio/test_shell.rb +5 -3
  283. data/test/unit/bio/test_tree.rb +6 -6
  284. data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +4 -2
  285. data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +4 -2
  286. data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +4 -2
  287. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -2
  288. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +4 -2
  289. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +4 -2
  290. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +4 -2
  291. data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +4 -2
  292. data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +4 -2
  293. data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +4 -2
  294. data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -2
  295. data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +4 -2
  296. data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +17 -13
  297. data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +17 -13
  298. data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +4 -2
  299. data/test/unit/bio/util/test_color_scheme.rb +5 -3
  300. data/test/unit/bio/util/test_contingency_table.rb +5 -3
  301. data/test/unit/bio/util/test_restriction_enzyme.rb +4 -2
  302. data/test/unit/bio/util/test_sirna.rb +6 -4
  303. metadata +147 -2
@@ -0,0 +1,102 @@
1
+ #
2
+ # = bio/db/fasta/qual.rb - Qual format, FASTA formatted numeric entry
3
+ #
4
+ # Copyright:: Copyright (C) 2001, 2002, 2009
5
+ # Naohisa Goto <ng@bioruby.org>,
6
+ # Toshiaki Katayama <k@bioruby.org>
7
+ # License:: The Ruby License
8
+ #
9
+ # $Id:$
10
+ #
11
+ # == Description
12
+ #
13
+ # QUAL format, FASTA formatted numeric entry.
14
+ #
15
+ # == Examples
16
+ #
17
+ # See documents of Bio::FastaNumericFormat class.
18
+ #
19
+ # == References
20
+ #
21
+ # * FASTA format (Wikipedia)
22
+ # http://en.wikipedia.org/wiki/FASTA_format
23
+ #
24
+ # * Phred quality score (Wikipedia)
25
+ # http://en.wikipedia.org/wiki/Phred_quality_score
26
+ #
27
+ # * Fasta format description (NCBI)
28
+ # http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
29
+ #
30
+
31
+ require 'bio/db/fasta'
32
+
33
+ module Bio
34
+
35
+ # Treats a FASTA formatted numerical entry, such as:
36
+ #
37
+ # >id and/or some comments <== comment line
38
+ # 24 15 23 29 20 13 20 21 21 23 22 25 13 <== numerical data
39
+ # 22 17 15 25 27 32 26 32 29 29 25
40
+ #
41
+ # The preceding '>' can be omitted and the trailing '>' will be removed
42
+ # automatically.
43
+ #
44
+ # --- Bio::FastaNumericFormat.new(entry)
45
+ #
46
+ # Stores the comment and the list of the numerical data.
47
+ #
48
+ # --- Bio::FastaNumericFormat#definition
49
+ #
50
+ # The comment line of the FASTA formatted data.
51
+ #
52
+ # * FASTA format (Wikipedia)
53
+ # http://en.wikipedia.org/wiki/FASTA_format
54
+ #
55
+ # * Phred quality score (Wikipedia)
56
+ # http://en.wikipedia.org/wiki/Phred_quality_score
57
+ #
58
+ class FastaNumericFormat < FastaFormat
59
+
60
+ # Returns the list of the numerical data (typically the quality score
61
+ # of its corresponding sequence) as an Array.
62
+ # ---
63
+ # *Returns*:: (Array containing Integer) numbers
64
+ def data
65
+ unless defined?(@list)
66
+ @list = @data.strip.split(/\s+/).map {|x| x.to_i}
67
+ end
68
+ @list
69
+ end
70
+
71
+ # Returns the number of elements in the numerical data,
72
+ # which will be the same as its corresponding sequence length.
73
+ # ---
74
+ # *Returns*:: (Integer) the number of elements
75
+ def length
76
+ data.length
77
+ end
78
+
79
+ # Yields each element of the numerical data.
80
+ # ---
81
+ # *Yields*:: (Integer) a numerical data element
82
+ # *Returns*:: (undefined)
83
+ def each
84
+ data.each do |x|
85
+ yield x
86
+ end
87
+ end
88
+
89
+ # Returns the n-th element. If out of range, returns nil.
90
+ # ---
91
+ # *Arguments*:
92
+ # * (required) _n_: (Integer) position
93
+ # *Returns*:: (Integer or nil) the value
94
+ def [](n)
95
+ data[n]
96
+ end
97
+
98
+ undef query, blast, fasta, seq, naseq, nalen, aaseq, aalen
99
+
100
+ end #class FastaNumericFormat
101
+
102
+ end #module Bio
@@ -0,0 +1,645 @@
1
+ #
2
+ # = bio/db/fastq.rb - FASTQ format parser class
3
+ #
4
+ # Copyright:: Copyright (C) 2009
5
+ # Naohisa Goto <ng@bioruby.org>
6
+ # License:: The Ruby License
7
+ #
8
+ # == Description
9
+ #
10
+ # FASTQ format parser class.
11
+ #
12
+ # Be careful that it is for the fastQ format, not for the fastA format.
13
+ #
14
+ # == Examples
15
+ #
16
+ # See documents of Bio::Fastq class.
17
+ #
18
+ # == References
19
+ #
20
+ # * FASTQ format specification
21
+ # http://maq.sourceforge.net/fastq.shtml
22
+ #
23
+
24
+ require "strscan"
25
+ require "singleton"
26
+
27
+ require 'bio/sequence'
28
+ require 'bio/io/flatfile'
29
+
30
+ module Bio
31
+
32
+ # Bio::Fastq is a parser for FASTQ format.
33
+ #
34
+ class Fastq
35
+
36
+ # Bio::Fastq::FormatData is a data class to store Fastq format parameters
37
+ # and quality calculation methods.
38
+ # Bio::Fastq internal use only.
39
+ class FormatData
40
+
41
+ # Format name. Should be redefined in subclass.
42
+ NAME = nil
43
+
44
+ # Offset. Should be redefined in subclass.
45
+ OFFSET = nil
46
+
47
+ # Range of score. Should be redefined in subclass.
48
+ # The range must not exclude end value, i.e. it must be X..Y,
49
+ # and must not be X...Y.
50
+ SCORE_RANGE = nil
51
+
52
+ def initialize
53
+ @name = self.class::NAME
54
+ @symbol = @name.gsub(/\-/, '_').to_sym
55
+ @offset = self.class::OFFSET
56
+ @score_range = self.class::SCORE_RANGE
57
+ end
58
+
59
+ # Format name
60
+ attr_reader :name
61
+
62
+ # Format name symbol.
63
+ # Note that "-" in the format name is substituted to "_" because
64
+ # "-" in a symbol is relatively difficult to handle.
65
+ attr_reader :symbol
66
+
67
+ # Offset when converting a score to a character
68
+ attr_reader :offset
69
+
70
+ # Allowed range of a score value
71
+ attr_reader :score_range
72
+
73
+ # Type of quality scores. May be one of :phred or :solexa.
74
+ attr_reader :quality_score_type if false # for RDoc
75
+
76
+ # Converts quality string to scores.
77
+ # No overflow/underflow checks will be performed.
78
+ # ---
79
+ # *Arguments*:
80
+ # * (required) _str_: (String) quality string
81
+ # *Returns*:: (Array containing Integer) score values
82
+ def str2scores(str)
83
+ a = str.unpack('C*')
84
+ a.collect! { |i| i - @offset }
85
+ a
86
+ end
87
+
88
+ # Converts scores to a string.
89
+ # Overflow/underflow checks will be performed.
90
+ # If a block is given, when overflow/underflow is detected,
91
+ # the score value is passed to the block, and the returned value is used
92
+ # as the score. If no block is given, the value is silently clamped to the allowed range.
93
+ #
94
+ # ---
95
+ # *Arguments*:
96
+ # * (required) _a_: (Array containing Integer) score values
97
+ # *Returns*:: (String) quality string
98
+ def scores2str(a)
99
+ if block_given? then
100
+ tmp = a.collect do |i|
101
+ i = yield(i) unless @score_range.include?(i)
102
+ i + @offset
103
+ end
104
+ else
105
+ min = @score_range.begin
106
+ max = @score_range.end
107
+ tmp = a.collect do |i|
108
+ if i < min then
109
+ i = min
110
+ elsif i > max then
111
+ i = max
112
+ end
113
+ i + @offset
114
+ end
115
+ end
116
+ tmp.pack('C*')
117
+ end
118
+
119
+ # Format information for "fastq-sanger".
120
+ # Bio::Fastq internal use only.
121
+ class FASTQ_SANGER < FormatData
122
+ include Singleton
123
+
124
+ include Bio::Sequence::QualityScore::Phred
125
+
126
+ # format name
127
+ NAME = 'fastq-sanger'.freeze
128
+ # offset
129
+ OFFSET = 33
130
+ # score range
131
+ SCORE_RANGE = 0..93
132
+
133
+ end #class FASTQ_SANGER
134
+
135
+ # Format information for "fastq-solexa"
136
+ # Bio::Fastq internal use only.
137
+ class FASTQ_SOLEXA < FormatData
138
+ include Singleton
139
+
140
+ include Bio::Sequence::QualityScore::Solexa
141
+
142
+ # format name
143
+ NAME = 'fastq-solexa'.freeze
144
+ # offset
145
+ OFFSET = 64
146
+ # score range
147
+ SCORE_RANGE = (-5)..62
148
+
149
+ end #class FASTQ_SOLEXA
150
+
151
+ # Format information for "fastq-illumina"
152
+ # Bio::Fastq internal use only.
153
+ class FASTQ_ILLUMINA < FormatData
154
+ include Singleton
155
+
156
+ include Bio::Sequence::QualityScore::Phred
157
+
158
+ # format name
159
+ NAME = 'fastq-illumina'.freeze
160
+ # offset
161
+ OFFSET = 64
162
+ # score range
163
+ SCORE_RANGE = 0..62
164
+
165
+ end #class FASTQ_ILLUMINA
166
+
167
+ end #class FormatData
168
+
169
+
170
+ # Available format names.
171
+ FormatNames = {
172
+ "fastq-sanger" => FormatData::FASTQ_SANGER,
173
+ "fastq-solexa" => FormatData::FASTQ_SOLEXA,
174
+ "fastq-illumina" => FormatData::FASTQ_ILLUMINA
175
+ }.freeze
176
+
177
+ # Available format name symbols.
178
+ Formats = {
179
+ :fastq_sanger => FormatData::FASTQ_SANGER,
180
+ :fastq_solexa => FormatData::FASTQ_SOLEXA,
181
+ :fastq_illumina => FormatData::FASTQ_ILLUMINA
182
+ }.freeze
183
+
184
+ # Default format name
185
+ DefaultFormatName = 'fastq-sanger'.freeze
186
+
187
+ # Splitter for Bio::FlatFile
188
+ FLATFILE_SPLITTER = Bio::FlatFile::Splitter::LineOriented
189
+
190
+
191
+ # Base exception class of all Bio::Fastq::Error::XXXX.
192
+ # Bio::Fastq internal use only.
193
+ class Error < RuntimeError
194
+
195
+ private
196
+ # default error message for this exception
197
+ def default_message(i)
198
+ "FASTQ error #{i}"
199
+ end
200
+
201
+ # Creates a new object.
202
+ # If error message is not given, default error message is stored.
203
+ # If error message is an Integer value, it is treated as the
204
+ # position inside the sequence or the quality, and default
205
+ # error message including the position is stored.
206
+ # ---
207
+ # *Arguments*:
208
+ # * (optional) <em>error_message</em>: error message (see above)
209
+ def initialize(error_message = nil)
210
+ if !error_message or error_message.kind_of?(Integer) then
211
+ error_message = default_message(error_message)
212
+ end
213
+ super(error_message)
214
+ end
215
+
216
+ # Error::No_atmark -- the first identifier does not begin with "@"
217
+ class No_atmark < Error
218
+ private
219
+ # default error message for this exception
220
+ def default_message(i)
221
+ 'the first identifier does not begin with "@"'
222
+ end
223
+ end
224
+
225
+ # Error::No_ids -- sequence identifier not found
226
+ class No_ids < Error
227
+ private
228
+ # default error message for this exception
229
+ def default_message(i)
230
+ 'sequence identifier not found'
231
+ end
232
+ end
233
+
234
+ # Error::Diff_ids -- the identifiers in the two lines are different
235
+ class Diff_ids < Error
236
+ private
237
+ # default error message for this exception
238
+ def default_message(i)
239
+ 'the identifier in the two lines are different'
240
+ end
241
+ end
242
+
243
+ # Error::Long_qual -- length of quality is longer than the sequence
244
+ class Long_qual < Error
245
+ private
246
+ # default error message for this exception
247
+ def default_message(i)
248
+ 'length of quality is longer than the sequence'
249
+ end
250
+ end
251
+
252
+ # Error::Short_qual -- length of quality is shorter than the sequence
253
+ class Short_qual < Error
254
+ private
255
+ # default error message for this exception
256
+ def default_message(i)
257
+ 'length of quality is shorter than the sequence'
258
+ end
259
+ end
260
+
261
+ # Error::No_qual -- no quality characters found
262
+ class No_qual < Error
263
+ private
264
+ # default error message for this exception
265
+ def default_message(i)
266
+ 'no quality characters found'
267
+ end
268
+ end
269
+
270
+ # Error::No_seq -- no sequence found
271
+ class No_seq < Error
272
+ private
273
+ # default error message for this exception
274
+ def default_message(i)
275
+ 'no sequence found'
276
+ end
277
+ end
278
+
279
+ # Error::Qual_char -- invalid character in the quality
280
+ class Qual_char < Error
281
+ private
282
+ # default error message for this exception
283
+ def default_message(i)
284
+ pos = i ? " at [#{i}]" : ''
285
+ "invalid character in the quality#{pos}"
286
+ end
287
+ end
288
+
289
+ # Error::Seq_char -- invalid character in the sequence
290
+ class Seq_char < Error
291
+ private
292
+ # default error message for this exception
293
+ def default_message(i)
294
+ pos = i ? " at [#{i}]" : ''
295
+ "invalid character in the sequence#{pos}"
296
+ end
297
+ end
298
+
299
+ # Error::Qual_range -- quality score value out of range
300
+ class Qual_range < Error
301
+ private
302
+ # default error message for this exception
303
+ def default_message(i)
304
+ pos = i ? " at [#{i}]" : ''
305
+ "quality score value out of range#{pos}"
306
+ end
307
+ end
308
+
309
+ # Error::Skipped_unformatted_lines -- the parser skipped unformatted
310
+ # lines that could not be recognized as FASTQ format
311
+ class Skipped_unformatted_lines < Error
312
+ private
313
+ # default error message for this exception
314
+ def default_message(i)
315
+ "the parser skipped unformatted lines that could not be recognized as FASTQ format"
316
+ end
317
+ end
318
+ end #class Error
319
+
320
+ # Adds a header line if the header data is not yet given and
321
+ # the given line is suitable for header.
322
+ # Returns self if the header line was added successfully.
323
+ # Otherwise, returns false (the line is not added).
324
+ def add_header_line(line)
325
+ @header ||= ""
326
+ if line[0,1] == "@" then
327
+ false
328
+ else
329
+ @header.concat line
330
+ self
331
+ end
332
+ end
333
+
334
+ # misc lines before the entry (String or nil)
335
+ attr_reader :header
336
+
337
+ # Adds a line to the entry if the given line is regarded as
338
+ # a part of the current entry.
339
+ def add_line(line)
340
+ line = line.chomp
341
+ if !defined? @definition then
342
+ if line[0, 1] == "@" then
343
+ @definition = line[1..-1]
344
+ else
345
+ @definition = line
346
+ @parse_errors ||= []
347
+ @parse_errors.push Error::No_atmark.new
348
+ end
349
+ return self
350
+ end
351
+ if defined? @definition2 then
352
+ @quality_string ||= ''
353
+ if line[0, 1] == "@" and
354
+ @quality_string.size >= @sequence_string.size then
355
+ return false
356
+ else
357
+ @quality_string.concat line
358
+ return self
359
+ end
360
+ else
361
+ @sequence_string ||= ''
362
+ if line[0, 1] == '+' then
363
+ @definition2 = line[1..-1]
364
+ else
365
+ @sequence_string.concat line
366
+ end
367
+ return self
368
+ end
369
+ raise "Bug: should not reach here!"
370
+ end
371
+
372
+ # Text left unparsed after this entry (String), or nil when no text was given to new.
373
+ attr_reader :entry_overrun
374
+
375
+ # Creates a new Fastq object from formatted text string.
376
+ #
377
+ # The format of quality scores should be specified later
378
+ # by using <tt>format=</tt> method.
379
+ #
380
+ # ---
381
+ # *Arguments*:
382
+ # * _str_: Formatted string (String)
383
+ def initialize(str = nil)
384
+ return unless str
385
+ sc = StringScanner.new(str)
386
+ while !sc.eos? and line = sc.scan(/.*(?:\n|\r|\r\n)?/)
387
+ unless add_header_line(line) then
388
+ sc.unscan
389
+ break
390
+ end
391
+ end
392
+ while !sc.eos? and line = sc.scan(/.*(?:\n|\r|\r\n)?/)
393
+ unless add_line(line) then
394
+ sc.unscan
395
+ break
396
+ end
397
+ end
398
+ @entry_overrun = sc.rest
399
+ end
400
+
401
+ # definition; ID line (begins with @)
402
+ attr_reader :definition
403
+
404
+ # quality as a string
405
+ attr_reader :quality_string
406
+
407
+ # raw sequence data as a String object
408
+ attr_reader :sequence_string
409
+
410
+ # returns Bio::Sequence::NA
411
+ def naseq
412
+ unless defined? @naseq then
413
+ @naseq = Bio::Sequence::NA.new(@sequence_string)
414
+ end
415
+ @naseq
416
+ end
417
+
418
+ # length of naseq
419
+ def nalen
420
+ naseq.length
421
+ end
422
+
423
+ # returns Bio::Sequence::Generic
424
+ def seq
425
+ unless defined? @seq then
426
+ @seq = Bio::Sequence::Generic.new(@sequence_string)
427
+ end
428
+ @seq
429
+ end
430
+
431
+ # Identifier of the entry. Normally, the first word of the ID line.
432
+ def entry_id
433
+ unless defined? @entry_id then
434
+ eid = @definition.strip.split(/\s+/)[0] || @definition
435
+ @entry_id = eid
436
+ end
437
+ @entry_id
438
+ end
439
+
440
+ # (private) reset internal state
441
+ def reset_state
442
+ if defined? @quality_scores then
443
+ remove_instance_variable(:@quality_scores)
444
+ end
445
+ if defined? @error_probabilities then
446
+ remove_instance_variable(:@error_probabilities)
447
+ end
448
+ end
449
+ private :reset_state
450
+
451
# Specify the format. If the format is not found, raises RuntimeError.
#
# Available formats are:
#   "fastq-sanger" or :fastq_sanger
#   "fastq-solexa" or :fastq_solexa
#   "fastq-illumina" or :fastq_illumina
#
# The name is looked up in FormatNames first and then in Formats.
# Cached quality scores and error probabilities are discarded
# whenever a format is successfully set, and also when nil is given
# (in which case the current format itself is left untouched).
#
# ---
# *Arguments*:
# * (required) _name_: format name (String or Symbol).
# *Returns*:: (String) format name
def format=(name)
  unless name
    reset_state
    return nil
  end

  format_class = FormatNames[name] || Formats[name]
  raise "unknown format" unless format_class

  reset_state
  @format = format_class.instance
  self.format
end
477
+
478
# Name of the current format:
# one of "fastq-sanger", "fastq-solexa", "fastq-illumina",
# or nil when no format has been specified.
# ---
# *Returns*:: (String or nil) format name
def format
  return nil unless @format
  @format.name
end
486
+
487
+
488
# The meaning of the quality scores.
# It may be one of :phred, :solexa, or nil.
# When no format has been specified yet, the format named by the
# class's DefaultFormatName is selected first.
def quality_score_type
  self.format = self.class::DefaultFormatName unless self.format
  @format.quality_score_type
end
494
+
495
# Quality score for each base.
# For "fastq-sanger" or "fastq-illumina", it is PHRED score.
# For "fastq-solexa", it is Solexa score.
#
# The scores are decoded from the quality string by the current
# format on first use and cached. When no format has been specified
# yet, the format named by the class's DefaultFormatName is selected.
#
# ---
# *Returns*:: (Array containing Integer) quality score values
def quality_scores
  return @quality_scores if defined?(@quality_scores)
  self.format = self.class::DefaultFormatName unless self.format
  @quality_scores = @format.str2scores(@quality_string)
end

alias qualities quality_scores
511
+
512
# Estimated probability of error for each base.
# The values are converted from the quality scores by the current
# format on first use and cached. When no format has been specified
# yet, the format named by the class's DefaultFormatName is selected.
# ---
# *Returns*:: (Array containing Float) error probability values
def error_probabilities
  return @error_probabilities if defined?(@error_probabilities)
  self.format = self.class::DefaultFormatName unless self.format
  @error_probabilities = @format.q2p(quality_scores)
end
523
+
524
# Format validation.
#
# If an array is given as the argument, when errors are found,
# error objects are pushed to the array.
# This method itself may create the following errors.
# (All errors are under the Bio::Fastq namespace, for example,
# Bio::Fastq::Error::Diff_ids).
#
#   Error::Skipped_unformatted_lines -- the parser skipped unformatted lines that could not be recognized as FASTQ format
#   Error::No_ids -- sequence identifier not found
#   Error::Diff_ids -- the identifier in the two lines are different
#   Error::No_seq -- no sequence found
#   Error::No_qual -- no quality characters found
#   Error::Short_qual -- length of quality is shorter than the sequence
#   Error::Long_qual -- length of quality is longer than the sequence
#   Error::Seq_char -- invalid character in the sequence
#   Error::Qual_char -- invalid character in the quality
#
# In addition, all errors stored in @parse_errors are added.
# NOTE(review): Error::Qual_range (quality score value out of range)
# and Error::No_atmark (the first identifier does not begin with "@")
# are never created here; presumably they can only arrive through
# @parse_errors -- confirm against the parser.
#
# Characters are checked byte by byte on a binary copy of each
# string, so non-ASCII input is reported as Seq_char / Qual_char
# (one error per offending byte) instead of raising an encoding
# error. The position given to these errors is a byte offset.
#
# ---
# *Arguments*:
# * (optional) _errors_: (Array or nil) an array for pushing error messages. The array should be empty.
# *Returns*:: true:no error, false: containing error.
def validate_format(errors = nil)
  err = []

  # if header exists, the format might be broken.
  if defined?(@header) && @header && !@header.strip.empty?
    err.push Error::Skipped_unformatted_lines.new
  end

  # if parse errors exist, adding them
  err.concat @parse_errors if defined?(@parse_errors) && @parse_errors

  # check if identifier exists, and identifier matches
  if !defined?(@definition) || !@definition
    err.push Error::No_ids.new
  elsif defined?(@definition2) && !@definition2.to_s.empty? &&
        @definition != @definition2
    err.push Error::Diff_ids.new
  end

  # check if sequence exists
  has_seq = (defined?(@sequence_string) && @sequence_string) ? true : false
  err.push Error::No_seq.new unless has_seq

  # check if quality exists
  has_qual = (defined?(@quality_string) && @quality_string) ? true : false
  err.push Error::No_qual.new unless has_qual

  # sequence and quality length check
  if has_seq && has_qual
    slen = @sequence_string.length
    qlen = @quality_string.length
    if slen > qlen
      err.push Error::Short_qual.new
    elsif qlen > slen
      err.push Error::Long_qual.new
    end
  end

  # sequence character check
  if has_seq
    sc = validation_byte_scanner(@sequence_string)
    while sc.scan_until(/[ \x00-\x1f\x7f-\xff]/n)
      err.push Error::Seq_char.new(sc.pos - sc.matched_size)
    end
  end

  # quality character check
  if has_qual
    fmt = (defined?(@format) && @format) ? @format.name : nil
    # the range of valid quality characters depends on the format
    re = case fmt
         when 'fastq-sanger'
           /[^\x21-\x7e]/n
         when 'fastq-solexa'
           /[^\x3b-\x7e]/n
         when 'fastq-illumina'
           /[^\x40-\x7e]/n
         else
           /[ \x00-\x1f\x7f-\xff]/n
         end
    sc = validation_byte_scanner(@quality_string)
    while sc.scan_until(re)
      err.push Error::Qual_char.new(sc.pos - sc.matched_size)
    end
  end

  # if "errors" is given, set errors
  errors.concat err if errors
  # returns true if no error; otherwise, returns false
  err.empty?
end

# (private) Returns a StringScanner over a binary (ASCII-8BIT) copy
# of the given string. The binary regular expressions used by
# validate_format cannot be matched against a string in another
# encoding that contains non-ASCII bytes. On Ruby versions without
# String#force_encoding, the copy is used as it is.
def validation_byte_scanner(str)
  bytes = str.dup
  bytes.force_encoding('ASCII-8BIT') if bytes.respond_to?(:force_encoding)
  StringScanner.new(bytes)
end
private :validation_byte_scanner
631
+
632
# Converts this entry to a Bio::Sequence object, by way of
# Bio::Sequence.adapter with the Bio::Sequence::Adapter::Fastq adapter.
#
# Note: If you modify the returned Bio::Sequence object,
# the sequence or definition in this Fastq object
# might also be changed (but not always be changed)
# because of efficiency.
#
def to_biosequence
  adapter_module = Bio::Sequence::Adapter::Fastq
  Bio::Sequence.adapter(self, adapter_module)
end
642
+
643
+ end #class Fastq
644
+
645
+ end #module Bio