bio 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +2105 -3728
- data/KNOWN_ISSUES.rdoc +35 -3
- data/README.rdoc +8 -2
- data/RELEASE_NOTES.rdoc +166 -0
- data/bin/bioruby +4 -1
- data/bioruby.gemspec +146 -1
- data/bioruby.gemspec.erb +3 -1
- data/doc/ChangeLog-before-1.3.1 +3961 -0
- data/doc/Tutorial.rd +154 -22
- data/doc/Tutorial.rd.html +125 -68
- data/lib/bio.rb +21 -6
- data/lib/bio/appl/bl2seq/report.rb +11 -202
- data/lib/bio/appl/blast/format0.rb +0 -193
- data/lib/bio/appl/blast/report.rb +2 -147
- data/lib/bio/appl/blast/wublast.rb +0 -208
- data/lib/bio/appl/fasta.rb +4 -19
- data/lib/bio/appl/fasta/format10.rb +0 -14
- data/lib/bio/appl/genscan/report.rb +0 -176
- data/lib/bio/appl/hmmer.rb +1 -15
- data/lib/bio/appl/hmmer/report.rb +0 -100
- data/lib/bio/appl/meme/mast.rb +156 -0
- data/lib/bio/appl/meme/mast/report.rb +91 -0
- data/lib/bio/appl/meme/motif.rb +48 -0
- data/lib/bio/appl/psort.rb +0 -111
- data/lib/bio/appl/psort/report.rb +1 -45
- data/lib/bio/appl/pts1.rb +2 -4
- data/lib/bio/appl/sosui/report.rb +5 -54
- data/lib/bio/appl/targetp/report.rb +1 -104
- data/lib/bio/appl/tmhmm/report.rb +0 -36
- data/lib/bio/command.rb +94 -10
- data/lib/bio/data/aa.rb +1 -77
- data/lib/bio/data/codontable.rb +1 -95
- data/lib/bio/data/na.rb +1 -26
- data/lib/bio/db/aaindex.rb +1 -38
- data/lib/bio/db/fasta.rb +1 -134
- data/lib/bio/db/fasta/format_qual.rb +204 -0
- data/lib/bio/db/fasta/qual.rb +102 -0
- data/lib/bio/db/fastq.rb +645 -0
- data/lib/bio/db/fastq/fastq_to_biosequence.rb +40 -0
- data/lib/bio/db/fastq/format_fastq.rb +175 -0
- data/lib/bio/db/genbank/genbank.rb +1 -86
- data/lib/bio/db/gff.rb +0 -17
- data/lib/bio/db/go.rb +4 -72
- data/lib/bio/db/kegg/common.rb +112 -0
- data/lib/bio/db/kegg/compound.rb +29 -20
- data/lib/bio/db/kegg/drug.rb +74 -34
- data/lib/bio/db/kegg/enzyme.rb +26 -5
- data/lib/bio/db/kegg/genes.rb +128 -15
- data/lib/bio/db/kegg/genome.rb +3 -41
- data/lib/bio/db/kegg/glycan.rb +19 -24
- data/lib/bio/db/kegg/orthology.rb +16 -56
- data/lib/bio/db/kegg/reaction.rb +81 -28
- data/lib/bio/db/kegg/taxonomy.rb +1 -52
- data/lib/bio/db/litdb.rb +1 -16
- data/lib/bio/db/phyloxml/phyloxml.xsd +582 -0
- data/lib/bio/db/phyloxml/phyloxml_elements.rb +1174 -0
- data/lib/bio/db/phyloxml/phyloxml_parser.rb +954 -0
- data/lib/bio/db/phyloxml/phyloxml_writer.rb +228 -0
- data/lib/bio/db/prosite.rb +2 -95
- data/lib/bio/db/rebase.rb +5 -6
- data/lib/bio/db/sanger_chromatogram/abif.rb +120 -0
- data/lib/bio/db/sanger_chromatogram/chromatogram.rb +133 -0
- data/lib/bio/db/sanger_chromatogram/chromatogram_to_biosequence.rb +32 -0
- data/lib/bio/db/sanger_chromatogram/scf.rb +210 -0
- data/lib/bio/io/das.rb +0 -44
- data/lib/bio/io/ddbjxml.rb +1 -181
- data/lib/bio/io/flatfile.rb +1 -7
- data/lib/bio/io/flatfile/autodetection.rb +6 -0
- data/lib/bio/io/keggapi.rb +0 -442
- data/lib/bio/io/ncbirest.rb +130 -132
- data/lib/bio/io/ncbisoap.rb +2 -1
- data/lib/bio/io/pubmed.rb +0 -88
- data/lib/bio/location.rb +0 -73
- data/lib/bio/pathway.rb +0 -171
- data/lib/bio/sequence.rb +18 -1
- data/lib/bio/sequence/adapter.rb +3 -0
- data/lib/bio/sequence/format.rb +16 -0
- data/lib/bio/sequence/quality_score.rb +205 -0
- data/lib/bio/tree.rb +70 -5
- data/lib/bio/util/restriction_enzyme/single_strand.rb +3 -2
- data/lib/bio/util/sirna.rb +1 -23
- data/lib/bio/version.rb +1 -1
- data/sample/demo_aaindex.rb +67 -0
- data/sample/demo_aminoacid.rb +101 -0
- data/sample/demo_bl2seq_report.rb +220 -0
- data/sample/demo_blast_report.rb +285 -0
- data/sample/demo_codontable.rb +119 -0
- data/sample/demo_das.rb +105 -0
- data/sample/demo_ddbjxml.rb +212 -0
- data/sample/demo_fasta_remote.rb +51 -0
- data/sample/demo_fastaformat.rb +105 -0
- data/sample/demo_genbank.rb +132 -0
- data/sample/demo_genscan_report.rb +202 -0
- data/sample/demo_gff1.rb +49 -0
- data/sample/demo_go.rb +98 -0
- data/sample/demo_hmmer_report.rb +149 -0
- data/sample/demo_kegg_compound.rb +57 -0
- data/sample/demo_kegg_drug.rb +65 -0
- data/sample/demo_kegg_genome.rb +74 -0
- data/sample/demo_kegg_glycan.rb +72 -0
- data/sample/demo_kegg_orthology.rb +62 -0
- data/sample/demo_kegg_reaction.rb +66 -0
- data/sample/demo_kegg_taxonomy.rb +92 -0
- data/sample/demo_keggapi.rb +502 -0
- data/sample/demo_litdb.rb +42 -0
- data/sample/demo_locations.rb +99 -0
- data/sample/demo_ncbi_rest.rb +130 -0
- data/sample/demo_nucleicacid.rb +49 -0
- data/sample/demo_pathway.rb +196 -0
- data/sample/demo_prosite.rb +120 -0
- data/sample/demo_psort.rb +138 -0
- data/sample/demo_psort_report.rb +70 -0
- data/sample/demo_pubmed.rb +118 -0
- data/sample/demo_sirna.rb +63 -0
- data/sample/demo_sosui_report.rb +89 -0
- data/sample/demo_targetp_report.rb +135 -0
- data/sample/demo_tmhmm_report.rb +68 -0
- data/sample/pmfetch.rb +13 -4
- data/sample/pmsearch.rb +15 -4
- data/sample/test_phyloxml_big.rb +205 -0
- data/test/bioruby_test_helper.rb +61 -0
- data/test/data/KEGG/1.1.1.1.enzyme +935 -0
- data/test/data/KEGG/C00025.compound +102 -0
- data/test/data/KEGG/D00063.drug +104 -0
- data/test/data/KEGG/G00024.glycan +47 -0
- data/test/data/KEGG/G01366.glycan +18 -0
- data/test/data/KEGG/K02338.orthology +902 -0
- data/test/data/KEGG/R00006.reaction +14 -0
- data/test/data/fastq/README.txt +109 -0
- data/test/data/fastq/error_diff_ids.fastq +20 -0
- data/test/data/fastq/error_double_qual.fastq +22 -0
- data/test/data/fastq/error_double_seq.fastq +22 -0
- data/test/data/fastq/error_long_qual.fastq +20 -0
- data/test/data/fastq/error_no_qual.fastq +20 -0
- data/test/data/fastq/error_qual_del.fastq +20 -0
- data/test/data/fastq/error_qual_escape.fastq +20 -0
- data/test/data/fastq/error_qual_null.fastq +0 -0
- data/test/data/fastq/error_qual_space.fastq +21 -0
- data/test/data/fastq/error_qual_tab.fastq +21 -0
- data/test/data/fastq/error_qual_unit_sep.fastq +20 -0
- data/test/data/fastq/error_qual_vtab.fastq +20 -0
- data/test/data/fastq/error_short_qual.fastq +20 -0
- data/test/data/fastq/error_spaces.fastq +20 -0
- data/test/data/fastq/error_tabs.fastq +21 -0
- data/test/data/fastq/error_trunc_at_plus.fastq +19 -0
- data/test/data/fastq/error_trunc_at_qual.fastq +19 -0
- data/test/data/fastq/error_trunc_at_seq.fastq +18 -0
- data/test/data/fastq/error_trunc_in_plus.fastq +19 -0
- data/test/data/fastq/error_trunc_in_qual.fastq +20 -0
- data/test/data/fastq/error_trunc_in_seq.fastq +18 -0
- data/test/data/fastq/error_trunc_in_title.fastq +17 -0
- data/test/data/fastq/illumina_full_range_as_illumina.fastq +8 -0
- data/test/data/fastq/illumina_full_range_as_sanger.fastq +8 -0
- data/test/data/fastq/illumina_full_range_as_solexa.fastq +8 -0
- data/test/data/fastq/illumina_full_range_original_illumina.fastq +8 -0
- data/test/data/fastq/longreads_as_illumina.fastq +40 -0
- data/test/data/fastq/longreads_as_sanger.fastq +40 -0
- data/test/data/fastq/longreads_as_solexa.fastq +40 -0
- data/test/data/fastq/longreads_original_sanger.fastq +120 -0
- data/test/data/fastq/misc_dna_as_illumina.fastq +16 -0
- data/test/data/fastq/misc_dna_as_sanger.fastq +16 -0
- data/test/data/fastq/misc_dna_as_solexa.fastq +16 -0
- data/test/data/fastq/misc_dna_original_sanger.fastq +16 -0
- data/test/data/fastq/misc_rna_as_illumina.fastq +16 -0
- data/test/data/fastq/misc_rna_as_sanger.fastq +16 -0
- data/test/data/fastq/misc_rna_as_solexa.fastq +16 -0
- data/test/data/fastq/misc_rna_original_sanger.fastq +16 -0
- data/test/data/fastq/sanger_full_range_as_illumina.fastq +8 -0
- data/test/data/fastq/sanger_full_range_as_sanger.fastq +8 -0
- data/test/data/fastq/sanger_full_range_as_solexa.fastq +8 -0
- data/test/data/fastq/sanger_full_range_original_sanger.fastq +8 -0
- data/test/data/fastq/solexa_full_range_as_illumina.fastq +8 -0
- data/test/data/fastq/solexa_full_range_as_sanger.fastq +8 -0
- data/test/data/fastq/solexa_full_range_as_solexa.fastq +8 -0
- data/test/data/fastq/solexa_full_range_original_solexa.fastq +8 -0
- data/test/data/fastq/wrapping_as_illumina.fastq +12 -0
- data/test/data/fastq/wrapping_as_sanger.fastq +12 -0
- data/test/data/fastq/wrapping_as_solexa.fastq +12 -0
- data/test/data/fastq/wrapping_original_sanger.fastq +24 -0
- data/test/data/meme/db +0 -0
- data/test/data/meme/mast +0 -0
- data/test/data/meme/mast.out +13 -0
- data/test/data/meme/meme.out +3 -0
- data/test/data/phyloxml/apaf.xml +666 -0
- data/test/data/phyloxml/bcl_2.xml +2097 -0
- data/test/data/phyloxml/made_up.xml +144 -0
- data/test/data/phyloxml/ncbi_taxonomy_mollusca_short.xml +65 -0
- data/test/data/phyloxml/phyloxml_examples.xml +415 -0
- data/test/data/sanger_chromatogram/test_chromatogram_abif.ab1 +0 -0
- data/test/data/sanger_chromatogram/test_chromatogram_scf_v2.scf +0 -0
- data/test/data/sanger_chromatogram/test_chromatogram_scf_v3.scf +0 -0
- data/test/functional/bio/appl/test_pts1.rb +7 -5
- data/test/functional/bio/io/test_ensembl.rb +4 -3
- data/test/functional/bio/io/test_pubmed.rb +9 -3
- data/test/functional/bio/io/test_soapwsdl.rb +5 -4
- data/test/functional/bio/io/test_togows.rb +5 -4
- data/test/functional/bio/sequence/test_output_embl.rb +6 -4
- data/test/functional/bio/test_command.rb +54 -5
- data/test/runner.rb +5 -3
- data/test/unit/bio/appl/bl2seq/test_report.rb +5 -4
- data/test/unit/bio/appl/blast/test_ncbioptions.rb +4 -2
- data/test/unit/bio/appl/blast/test_report.rb +5 -4
- data/test/unit/bio/appl/blast/test_rpsblast.rb +5 -4
- data/test/unit/bio/appl/gcg/test_msf.rb +5 -5
- data/test/unit/bio/appl/genscan/test_report.rb +8 -9
- data/test/unit/bio/appl/hmmer/test_report.rb +5 -4
- data/test/unit/bio/appl/iprscan/test_report.rb +6 -5
- data/test/unit/bio/appl/mafft/test_report.rb +6 -5
- data/test/unit/bio/appl/meme/mast/test_report.rb +46 -0
- data/test/unit/bio/appl/meme/test_mast.rb +103 -0
- data/test/unit/bio/appl/meme/test_motif.rb +38 -0
- data/test/unit/bio/appl/paml/codeml/test_rates.rb +5 -4
- data/test/unit/bio/appl/paml/codeml/test_report.rb +5 -4
- data/test/unit/bio/appl/paml/test_codeml.rb +5 -4
- data/test/unit/bio/appl/sim4/test_report.rb +5 -4
- data/test/unit/bio/appl/sosui/test_report.rb +6 -5
- data/test/unit/bio/appl/targetp/test_report.rb +5 -3
- data/test/unit/bio/appl/test_blast.rb +5 -4
- data/test/unit/bio/appl/test_fasta.rb +4 -2
- data/test/unit/bio/appl/test_pts1.rb +4 -2
- data/test/unit/bio/appl/tmhmm/test_report.rb +6 -5
- data/test/unit/bio/data/test_aa.rb +5 -3
- data/test/unit/bio/data/test_codontable.rb +5 -4
- data/test/unit/bio/data/test_na.rb +5 -3
- data/test/unit/bio/db/biosql/tc_biosql.rb +5 -1
- data/test/unit/bio/db/embl/test_common.rb +4 -2
- data/test/unit/bio/db/embl/test_embl.rb +6 -6
- data/test/unit/bio/db/embl/test_embl_rel89.rb +6 -6
- data/test/unit/bio/db/embl/test_embl_to_bioseq.rb +7 -8
- data/test/unit/bio/db/embl/test_sptr.rb +6 -8
- data/test/unit/bio/db/embl/test_uniprot.rb +6 -5
- data/test/unit/bio/db/fasta/test_format_qual.rb +346 -0
- data/test/unit/bio/db/kegg/test_compound.rb +146 -0
- data/test/unit/bio/db/kegg/test_drug.rb +194 -0
- data/test/unit/bio/db/kegg/test_enzyme.rb +241 -0
- data/test/unit/bio/db/kegg/test_genes.rb +32 -4
- data/test/unit/bio/db/kegg/test_glycan.rb +260 -0
- data/test/unit/bio/db/kegg/test_orthology.rb +50 -0
- data/test/unit/bio/db/kegg/test_reaction.rb +96 -0
- data/test/unit/bio/db/pdb/test_pdb.rb +4 -2
- data/test/unit/bio/db/sanger_chromatogram/test_abif.rb +76 -0
- data/test/unit/bio/db/sanger_chromatogram/test_scf.rb +98 -0
- data/test/unit/bio/db/test_aaindex.rb +6 -6
- data/test/unit/bio/db/test_fasta.rb +5 -46
- data/test/unit/bio/db/test_fastq.rb +829 -0
- data/test/unit/bio/db/test_gff.rb +4 -2
- data/test/unit/bio/db/test_lasergene.rb +7 -5
- data/test/unit/bio/db/test_medline.rb +4 -2
- data/test/unit/bio/db/test_newick.rb +6 -6
- data/test/unit/bio/db/test_nexus.rb +4 -2
- data/test/unit/bio/db/test_phyloxml.rb +769 -0
- data/test/unit/bio/db/test_phyloxml_writer.rb +328 -0
- data/test/unit/bio/db/test_prosite.rb +6 -5
- data/test/unit/bio/db/test_qual.rb +63 -0
- data/test/unit/bio/db/test_rebase.rb +5 -3
- data/test/unit/bio/db/test_soft.rb +7 -6
- data/test/unit/bio/io/flatfile/test_autodetection.rb +6 -7
- data/test/unit/bio/io/flatfile/test_buffer.rb +6 -5
- data/test/unit/bio/io/flatfile/test_splitter.rb +4 -4
- data/test/unit/bio/io/test_ddbjxml.rb +4 -3
- data/test/unit/bio/io/test_ensembl.rb +5 -3
- data/test/unit/bio/io/test_fastacmd.rb +4 -3
- data/test/unit/bio/io/test_flatfile.rb +6 -5
- data/test/unit/bio/io/test_soapwsdl.rb +4 -3
- data/test/unit/bio/io/test_togows.rb +4 -2
- data/test/unit/bio/sequence/test_aa.rb +5 -3
- data/test/unit/bio/sequence/test_common.rb +4 -2
- data/test/unit/bio/sequence/test_compat.rb +4 -2
- data/test/unit/bio/sequence/test_dblink.rb +5 -3
- data/test/unit/bio/sequence/test_na.rb +4 -2
- data/test/unit/bio/sequence/test_quality_score.rb +330 -0
- data/test/unit/bio/shell/plugin/test_seq.rb +5 -3
- data/test/unit/bio/test_alignment.rb +5 -3
- data/test/unit/bio/test_command.rb +4 -3
- data/test/unit/bio/test_db.rb +5 -3
- data/test/unit/bio/test_feature.rb +4 -2
- data/test/unit/bio/test_location.rb +4 -2
- data/test/unit/bio/test_map.rb +5 -3
- data/test/unit/bio/test_pathway.rb +4 -2
- data/test/unit/bio/test_reference.rb +4 -2
- data/test/unit/bio/test_sequence.rb +5 -3
- data/test/unit/bio/test_shell.rb +5 -3
- data/test/unit/bio/test_tree.rb +6 -6
- data/test/unit/bio/util/restriction_enzyme/analysis/test_calculated_cuts.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/analysis/test_cut_ranges.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/analysis/test_sequence_range.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_aligned_strands.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_location_pair_in_enzyme_notation.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/double_stranded/test_cut_locations_in_enzyme_notation.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/single_strand/test_cut_locations_in_enzyme_notation.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/test_analysis.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/test_cut_symbol.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/test_double_stranded.rb +4 -2
- data/test/unit/bio/util/restriction_enzyme/test_single_strand.rb +17 -13
- data/test/unit/bio/util/restriction_enzyme/test_single_strand_complement.rb +17 -13
- data/test/unit/bio/util/restriction_enzyme/test_string_formatting.rb +4 -2
- data/test/unit/bio/util/test_color_scheme.rb +5 -3
- data/test/unit/bio/util/test_contingency_table.rb +5 -3
- data/test/unit/bio/util/test_restriction_enzyme.rb +4 -2
- data/test/unit/bio/util/test_sirna.rb +6 -4
- metadata +147 -2
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/fasta/qual.rb - Qual format, FASTA formatted numeric entry
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2001, 2002, 2009
|
|
5
|
+
# Naohisa Goto <ng@bioruby.org>,
|
|
6
|
+
# Toshiaki Katayama <k@bioruby.org>
|
|
7
|
+
# License:: The Ruby License
|
|
8
|
+
#
|
|
9
|
+
# $Id:$
|
|
10
|
+
#
|
|
11
|
+
# == Description
|
|
12
|
+
#
|
|
13
|
+
# QUAL format, FASTA formatted numeric entry.
|
|
14
|
+
#
|
|
15
|
+
# == Examples
|
|
16
|
+
#
|
|
17
|
+
# See documents of Bio::FastaNumericFormat class.
|
|
18
|
+
#
|
|
19
|
+
# == References
|
|
20
|
+
#
|
|
21
|
+
# * FASTA format (WikiPedia)
|
|
22
|
+
# http://en.wikipedia.org/wiki/FASTA_format
|
|
23
|
+
#
|
|
24
|
+
# * Phred quality score (WikiPedia)
|
|
25
|
+
# http://en.wikipedia.org/wiki/Phred_quality_score
|
|
26
|
+
#
|
|
27
|
+
# * Fasta format description (NCBI)
|
|
28
|
+
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
|
|
29
|
+
#
|
|
30
|
+
|
|
31
|
+
require 'bio/db/fasta'
|
|
32
|
+
|
|
33
|
+
module Bio
|
|
34
|
+
|
|
35
|
+
# Treats a FASTA formatted numerical entry, such as:
|
|
36
|
+
#
|
|
37
|
+
# >id and/or some comments <== comment line
|
|
38
|
+
# 24 15 23 29 20 13 20 21 21 23 22 25 13 <== numerical data
|
|
39
|
+
# 22 17 15 25 27 32 26 32 29 29 25
|
|
40
|
+
#
|
|
41
|
+
# The leading '>' can be omitted and the trailing '>' will be removed
|
|
42
|
+
# automatically.
|
|
43
|
+
#
|
|
44
|
+
# --- Bio::FastaNumericFormat.new(entry)
|
|
45
|
+
#
|
|
46
|
+
# Stores the comment and the list of the numerical data.
|
|
47
|
+
#
|
|
48
|
+
# --- Bio::FastaNumericFormat#definition
|
|
49
|
+
#
|
|
50
|
+
# The comment line of the FASTA formatted data.
|
|
51
|
+
#
|
|
52
|
+
# * FASTA format (Wikipedia)
|
|
53
|
+
# http://en.wikipedia.org/wiki/FASTA_format
|
|
54
|
+
#
|
|
55
|
+
# * Phred quality score (WikiPedia)
|
|
56
|
+
# http://en.wikipedia.org/wiki/Phred_quality_score
|
|
57
|
+
#
|
|
58
|
+
class FastaNumericFormat < FastaFormat

  # Returns the numerical data (typically the quality scores of the
  # corresponding sequence) as an Array.
  #
  # The raw text in @data is stripped, split on runs of whitespace and
  # each token is converted with String#to_i. The resulting Array is
  # cached in @list, so the conversion is done only once.
  # NOTE(review): @data is presumably populated by the FastaFormat
  # superclass -- confirm against bio/db/fasta.rb.
  # ---
  # *Returns*:: (Array containing Integer) numbers
  def data
    return @list if defined?(@list)
    @list = @data.strip.split(/\s+/).map(&:to_i)
  end

  # Returns how many numbers the entry holds (the size of #data).
  # ---
  # *Returns*:: (Integer) the number of elements
  def length
    data.size
  end

  # Passes every number of #data, in order, to the given block.
  # ---
  # *Yields*:: (Integer) a numerical data element
  # *Returns*:: (undefined)
  def each
    data.each { |value| yield value }
  end

  # Looks up the n-th number of #data. Returns nil when out of range.
  # ---
  # *Arguments*:
  # * (required) _n_: (Integer) position
  # *Returns*:: (Integer or nil) the value
  def [](n)
    data[n]
  end

  # Sequence-oriented methods inherited from FastaFormat are removed
  # from this class.
  undef query, blast, fasta, seq, naseq, nalen, aaseq, aalen

end #class FastaNumericFormat
|
|
101
|
+
|
|
102
|
+
end #module Bio
|
data/lib/bio/db/fastq.rb
ADDED
|
@@ -0,0 +1,645 @@
|
|
|
1
|
+
#
|
|
2
|
+
# = bio/db/fastq.rb - FASTQ format parser class
|
|
3
|
+
#
|
|
4
|
+
# Copyright:: Copyright (C) 2009
|
|
5
|
+
# Naohisa Goto <ng@bioruby.org>
|
|
6
|
+
# License:: The Ruby License
|
|
7
|
+
#
|
|
8
|
+
# == Description
|
|
9
|
+
#
|
|
10
|
+
# FASTQ format parser class.
|
|
11
|
+
#
|
|
12
|
+
# Be careful that it is for the fastQ format, not for the fastA format.
|
|
13
|
+
#
|
|
14
|
+
# == Examples
|
|
15
|
+
#
|
|
16
|
+
# See documents of Bio::Fastq class.
|
|
17
|
+
#
|
|
18
|
+
# == References
|
|
19
|
+
#
|
|
20
|
+
# * FASTQ format specification
|
|
21
|
+
# http://maq.sourceforge.net/fastq.shtml
|
|
22
|
+
#
|
|
23
|
+
|
|
24
|
+
require "strscan"
|
|
25
|
+
require "singleton"
|
|
26
|
+
|
|
27
|
+
require 'bio/sequence'
|
|
28
|
+
require 'bio/io/flatfile'
|
|
29
|
+
|
|
30
|
+
module Bio
|
|
31
|
+
|
|
32
|
+
# Bio::Fastq is a parser for FASTQ format.
|
|
33
|
+
#
|
|
34
|
+
class Fastq
|
|
35
|
+
|
|
36
|
+
# Bio::Fastq::FormatData is a data class to store Fastq format parameters
|
|
37
|
+
# and quality calculation methods.
|
|
38
|
+
# Bio::Fastq internal use only.
|
|
39
|
+
class FormatData

  # Format name. Should be redefined in subclass.
  NAME = nil

  # Offset. Should be redefined in subclass.
  OFFSET = nil

  # Range of score. Should be redefined in subclass.
  # The range must include its end value, i.e. it must be X..Y,
  # and must not be X...Y.
  SCORE_RANGE = nil

  # Copies the NAME, OFFSET and SCORE_RANGE constants of the concrete
  # class into instance variables, and derives the format symbol from
  # the name by replacing every "-" with "_".
  def initialize
    klass = self.class
    @name        = klass::NAME
    @offset      = klass::OFFSET
    @score_range = klass::SCORE_RANGE
    @symbol      = @name.tr('-', '_').to_sym
  end

  # Format name
  attr_reader :name

  # Format name symbol.
  # Note that "-" in the format name is substituted to "_" because
  # "-" in a symbol is relatively difficult to handle.
  attr_reader :symbol

  # Offset when converting a score to a character
  attr_reader :offset

  # Allowed range of a score value
  attr_reader :score_range

  # Type of quality scores. Maybe one of :phred or :solexa.
  attr_reader :quality_score_type if false # for RDoc

  # Converts quality string to scores: each byte of the string has
  # the format offset subtracted from it.
  # No overflow/underflow checks will be performed.
  # ---
  # *Arguments*:
  # * (required) _str_: (String) quality string
  # *Returns*:: (Array containing Integer) score values
  def str2scores(str)
    str.unpack('C*').collect { |byte| byte - @offset }
  end

  # Converts scores to a string: the format offset is added to each
  # score and the results are packed as bytes.
  # Overflow/underflow checks will be performed.
  # If a block is given, every score outside the allowed range is
  # passed to the block and the block's return value is used instead.
  # Without a block, out-of-range scores are silently clipped to the
  # nearest end of the allowed range.
  #
  # ---
  # *Arguments*:
  # * (required) _a_: (Array containing Integer) score values
  # *Returns*:: (String) quality string
  def scores2str(a)
    lower = @score_range.begin
    upper = @score_range.end
    bytes = a.collect do |score|
      if block_given? then
        # let the caller decide how to repair an out-of-range score
        score = yield(score) unless @score_range.include?(score)
      elsif score < lower then
        score = lower
      elsif score > upper then
        score = upper
      end
      score + @offset
    end
    bytes.pack('C*')
  end

  # Format information for "fastq-sanger".
  # Bio::Fastq internal use only.
  class FASTQ_SANGER < FormatData
    include Singleton

    include Bio::Sequence::QualityScore::Phred

    # format name
    NAME = 'fastq-sanger'.freeze
    # offset
    OFFSET = 33
    # score range
    SCORE_RANGE = 0..93

  end #class FASTQ_SANGER

  # Format information for "fastq-solexa"
  # Bio::Fastq internal use only.
  class FASTQ_SOLEXA < FormatData
    include Singleton

    include Bio::Sequence::QualityScore::Solexa

    # format name
    NAME = 'fastq-solexa'.freeze
    # offset
    OFFSET = 64
    # score range
    SCORE_RANGE = (-5)..62

  end #class FASTQ_SOLEXA

  # Format information for "fastq-illumina"
  # Bio::Fastq internal use only.
  class FASTQ_ILLUMINA < FormatData
    include Singleton

    include Bio::Sequence::QualityScore::Phred

    # format name
    NAME = 'fastq-illumina'.freeze
    # offset
    OFFSET = 64
    # score range
    SCORE_RANGE = 0..62

  end #class FASTQ_ILLUMINA

end #class FormatData
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# Available format names.
|
|
171
|
+
FormatNames = {
  'fastq-sanger'   => FormatData::FASTQ_SANGER,
  'fastq-solexa'   => FormatData::FASTQ_SOLEXA,
  'fastq-illumina' => FormatData::FASTQ_ILLUMINA
}.freeze

# Available format name symbols ("-" in the name replaced by "_"),
# mapped to the same format classes as FormatNames.
Formats = {
  :fastq_sanger   => FormatData::FASTQ_SANGER,
  :fastq_solexa   => FormatData::FASTQ_SOLEXA,
  :fastq_illumina => FormatData::FASTQ_ILLUMINA
}.freeze

# Default format name
DefaultFormatName = "fastq-sanger".freeze

# Splitter for Bio::FlatFile
FLATFILE_SPLITTER = Bio::FlatFile::Splitter::LineOriented
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# Basic exception class of all Bio::Fastq::Error::XXXX.
|
|
192
|
+
# Bio::Fastq internal use only.
|
|
193
|
+
class Error < RuntimeError

  private

  # Fallback message used when no explicit message text is given.
  # _i_ is the value passed to the constructor (nil, false or an
  # Integer position).
  def default_message(i)
    "FASTQ error #{i}"
  end

  # Creates a new object.
  # When no message is given, or when an Integer is given, the
  # message is produced by default_message. An Integer is treated
  # as a position inside the sequence or the quality; subclasses
  # that report positions embed it in their default message.
  # Any other value is used as the message as it is.
  # ---
  # *Arguments*:
  # * (optional) <em>error_message</em>: error message (see above)
  def initialize(error_message = nil)
    case error_message
    when nil, false, Integer
      error_message = default_message(error_message)
    end
    super(error_message)
  end

  # Error::No_atmark -- the first identifier does not begin with "@"
  class No_atmark < Error
    private

    # fixed message; the position argument is ignored
    def default_message(i)
      'the first identifier does not begin with "@"'
    end
  end

  # Error::No_ids -- sequence identifier not found
  class No_ids < Error
    private

    # fixed message; the position argument is ignored
    def default_message(i)
      'sequence identifier not found'
    end
  end

  # Error::Diff_ids -- the identifier in the two lines are different
  class Diff_ids < Error
    private

    # fixed message; the position argument is ignored
    def default_message(i)
      'the identifier in the two lines are different'
    end
  end

  # Error::Long_qual -- length of quality is longer than the sequence
  class Long_qual < Error
    private

    # fixed message; the position argument is ignored
    def default_message(i)
      'length of quality is longer than the sequence'
    end
  end

  # Error::Short_qual -- length of quality is shorter than the sequence
  class Short_qual < Error
    private

    # fixed message; the position argument is ignored
    def default_message(i)
      'length of quality is shorter than the sequence'
    end
  end

  # Error::No_qual -- no quality characters found
  class No_qual < Error
    private

    # fixed message; the position argument is ignored
    def default_message(i)
      'no quality characters found'
    end
  end

  # Error::No_seq -- no sequence found
  class No_seq < Error
    private

    # fixed message; the position argument is ignored
    def default_message(i)
      'no sequence found'
    end
  end

  # Error::Qual_char -- invalid character in the quality
  class Qual_char < Error
    private

    # message with the position appended when one is given
    def default_message(i)
      "invalid character in the quality" + (i ? " at [#{i}]" : '')
    end
  end

  # Error::Seq_char -- invalid character in the sequence
  class Seq_char < Error
    private

    # message with the position appended when one is given
    def default_message(i)
      "invalid character in the sequence" + (i ? " at [#{i}]" : '')
    end
  end

  # Error::Qual_range -- quality score value out of range
  class Qual_range < Error
    private

    # message with the position appended when one is given
    def default_message(i)
      "quality score value out of range" + (i ? " at [#{i}]" : '')
    end
  end

  # Error::Skipped_unformatted_lines -- the parser skipped unformatted
  # lines that could not be recognized as FASTQ format
  class Skipped_unformatted_lines < Error
    private

    # fixed message; the position argument is ignored
    def default_message(i)
      "the parser skipped unformatted lines that could not be recognized as FASTQ format"
    end
  end
end #class Error
|
|
319
|
+
|
|
320
|
+
# Adds a header line if the header data is not yet given and
|
|
321
|
+
# the given line is suitable for header.
|
|
322
|
+
# Returns self if the header line was added successfully.
|
|
323
|
+
# Otherwise, returns false (the line is not added).
|
|
324
|
+
def add_header_line(line)
  # make sure the header buffer exists even when nothing gets added
  @header ||= ""
  # a line starting with "@" is not header text: refuse it
  return false if line[0, 1] == "@"
  @header.concat line
  self
end
|
|
333
|
+
|
|
334
|
+
# misc lines before the entry (String or nil)
|
|
335
|
+
attr_reader :header
|
|
336
|
+
|
|
337
|
+
# Adds a line to the entry if the given line is regarded as
|
|
338
|
+
# a part of the current entry.
|
|
339
|
+
def add_line(line)
  line = line.chomp
  first_char = line[0, 1]

  # Stage 1: the very first line is the definition (ID) line.
  unless defined? @definition then
    if first_char == "@" then
      @definition = line[1..-1]
    else
      # keep the line as the definition but record the format error
      @definition = line
      @parse_errors ||= []
      @parse_errors.push Error::No_atmark.new
    end
    return self
  end

  # Stage 2: sequence lines, until the "+" line is seen.
  unless defined? @definition2 then
    @sequence_string ||= ''
    if first_char == '+' then
      @definition2 = line[1..-1]
    else
      @sequence_string.concat line
    end
    return self
  end

  # Stage 3: quality lines. "@" may also occur inside the quality
  # string, so a line beginning with "@" is taken as the start of the
  # next entry only when the quality collected so far is at least as
  # long as the sequence.
  @quality_string ||= ''
  if first_char != "@" or
      @quality_string.size < @sequence_string.size then
    @quality_string.concat line
    return self
  else
    return false
  end

  # defensive guard kept from the original code; every path above returns
  raise "Bug: should not reach here!"
end
|
|
371
|
+
|
|
372
|
+
# entry_overrun
|
|
373
|
+
attr_reader :entry_overrun
|
|
374
|
+
|
|
375
|
+
# Creates a new Fastq object from formatted text string.
|
|
376
|
+
#
|
|
377
|
+
# The format of quality scores should be specified later
|
|
378
|
+
# by using <tt>format=</tt> method.
|
|
379
|
+
#
|
|
380
|
+
# ---
|
|
381
|
+
# *Arguments*:
|
|
382
|
+
# * _str_: Formatted string (String)
|
|
383
|
+
def initialize(str = nil)
  # Without text an empty object is created; lines can still be fed
  # in later through add_header_line and add_line.
  return unless str

  # One line including its terminator (LF, CRLF or a lone CR).
  # "." also matches "\r", so a pattern starting with ".*" would
  # swallow lone CR terminators and read CR-delimited text as one
  # line; the line body therefore excludes both "\r" and "\n", and
  # "\r\n" is tried before the single-character terminators.
  line_pattern = /[^\r\n]*(?:\r\n|\n|\r)?/

  sc = StringScanner.new(str)

  # Leading lines are offered to add_header_line until it refuses one
  # (it refuses lines beginning with "@").
  while !sc.eos? and line = sc.scan(line_pattern)
    unless add_header_line(line) then
      sc.unscan  # give the refused line back to the entry parser
      break
    end
  end

  # Entry lines are offered to add_line until it refuses one.
  while !sc.eos? and line = sc.scan(line_pattern)
    unless add_line(line) then
      sc.unscan  # the refused line stays in the unread remainder
      break
    end
  end

  # Text that was not consumed by this entry.
  @entry_overrun = sc.rest
end
|
|
400
|
+
|
|
401
|
+
# definition; ID line (begins with @)
|
|
402
|
+
attr_reader :definition
|
|
403
|
+
|
|
404
|
+
# quality as a string
|
|
405
|
+
attr_reader :quality_string
|
|
406
|
+
|
|
407
|
+
# raw sequence data as a String object
|
|
408
|
+
attr_reader :sequence_string
|
|
409
|
+
|
|
410
|
+
# returns Bio::Sequence::NA
|
|
411
|
+
# Sequence as a nucleic acid sequence object.
# The object is created from the raw sequence string on the first call
# and the same object is returned afterwards.
# ---
# *Returns*:: Bio::Sequence::NA
def naseq
  return @naseq if defined?(@naseq)
  @naseq = Bio::Sequence::NA.new(@sequence_string)
end
|
|
417
|
+
|
|
418
|
+
# length of naseq
|
|
419
|
+
# Length of the object returned by naseq.
def nalen
  na = naseq
  na.length
end
|
|
422
|
+
|
|
423
|
+
# returns Bio::Sequence::Generic
|
|
424
|
+
# Sequence as a generic sequence object.
# The object is created from the raw sequence string on the first call
# and the same object is returned afterwards.
# ---
# *Returns*:: Bio::Sequence::Generic
def seq
  return @seq if defined?(@seq)
  @seq = Bio::Sequence::Generic.new(@sequence_string)
end
|
|
430
|
+
|
|
431
|
+
# Identifier of the entry. Normally, the first word of the ID line.
|
|
432
|
+
# Identifier of the entry: the first whitespace-delimited word of the
# definition. When the definition contains no word at all, the
# definition itself is returned. The result is cached.
#
# ---
# *Returns*:: (String) identifier, or nil when no definition has been
#             read yet
def entry_id
  return @entry_id if defined?(@entry_id)

  # No ID line yet: answer nil instead of failing on nil.strip, and do not
  # cache it, so a definition added later is still picked up.
  return nil unless defined?(@definition) && @definition

  @entry_id = @definition.strip.split(/\s+/)[0] || @definition
end
|
|
439
|
+
|
|
440
|
+
# (private) reset internal state
|
|
441
|
+
# Drops the cached quality scores and error probabilities, if present,
# so that they are computed again on the next request.
def reset_state
  removed = nil
  [:@quality_scores, :@error_probabilities].each do |ivar|
    removed = instance_variable_defined?(ivar) ? remove_instance_variable(ivar) : nil
  end
  removed
end
|
|
449
|
+
private :reset_state
|
|
450
|
+
|
|
451
|
+
# Specify the format. If the format is not found, raises RuntimeError.
|
|
452
|
+
#
|
|
453
|
+
# Available formats are:
|
|
454
|
+
# "fastq-sanger" or :fastq_sanger
|
|
455
|
+
# "fastq-solexa" or :fastq_solexa
|
|
456
|
+
# "fastq-illumina" or :fastq_illumina
|
|
457
|
+
#
|
|
458
|
+
# ---
|
|
459
|
+
# *Arguments*:
|
|
460
|
+
# * (required) _name_: format name (String or Symbol).
|
|
461
|
+
# *Returns*:: (String) format name
|
|
462
|
+
# Selects the format by name, looking the name up in FormatNames and
# then in Formats. Cached values are discarded through reset_state.
#
# ---
# *Arguments*:
# * (required) _name_: format name (String or Symbol), or nil
# *Returns*:: (String) format name, or nil when _name_ is nil
# *Raises*:: RuntimeError ("unknown format") when the name is not found
def format=(name)
  unless name
    # NOTE(review): a nil name discards cached values but leaves the
    # current @format in place -- confirm this is intended.
    reset_state
    return nil
  end

  format_class = FormatNames[name] || Formats[name]
  raise "unknown format" unless format_class

  reset_state
  @format = format_class.instance
  self.format
end
|
|
477
|
+
|
|
478
|
+
# Format name.
|
|
479
|
+
# One of "fastq-sanger", "fastq-solexa", "fastq-illumina",
|
|
480
|
+
# or nil (when not specified).
|
|
481
|
+
# ---
|
|
482
|
+
# *Returns*:: (String or nil) format name
|
|
483
|
+
# Name of the currently selected format.
# ---
# *Returns*:: (String or nil) format name; nil when no format is set
def format
  return nil unless @format
  @format.name
end
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
# The meaning of the quality scores.
|
|
489
|
+
# It may be one of :phred, :solexa, or nil.
|
|
490
|
+
# Meaning of the quality scores, as reported by the format object.
# When no format has been chosen yet, the class's DefaultFormatName
# is selected first.
def quality_score_type
  self.format = self.class::DefaultFormatName unless self.format
  @format.quality_score_type
end
|
|
494
|
+
|
|
495
|
+
# Quality score for each base.
|
|
496
|
+
# For "fastq-sanger" or "fastq-illumina", it is PHRED score.
|
|
497
|
+
# For "fastq-solexa", it is Solexa score.
|
|
498
|
+
#
|
|
499
|
+
# ---
|
|
500
|
+
# *Returns*:: (Array containing Integer) quality score values
|
|
501
|
+
# Quality score values decoded from the quality string by the format
# object. When no format has been chosen yet, the class's
# DefaultFormatName is selected first. The result is cached.
#
# ---
# *Returns*:: the value returned by the format's str2scores
def quality_scores
  return @quality_scores if defined?(@quality_scores)

  self.format = self.class::DefaultFormatName unless self.format
  @quality_scores = @format.str2scores(@quality_string)
end
|
|
509
|
+
|
|
510
|
+
alias qualities quality_scores
|
|
511
|
+
|
|
512
|
+
# Estimated probability of error for each base.
|
|
513
|
+
# ---
|
|
514
|
+
# *Returns*:: (Array containing Float) error probability values
|
|
515
|
+
# Error probabilities obtained by passing the quality scores to the
# format's q2p. When no format has been chosen yet, the class's
# DefaultFormatName is selected first. The result is cached.
#
# ---
# *Returns*:: the value returned by the format's q2p
def error_probabilities
  return @error_probabilities if defined?(@error_probabilities)

  self.format = self.class::DefaultFormatName unless self.format
  @error_probabilities = @format.q2p(self.quality_scores)
end
|
|
523
|
+
|
|
524
|
+
# Format validation.
|
|
525
|
+
#
|
|
526
|
+
# If an array is given as the argument, when errors are found,
|
|
527
|
+
# error objects are pushed to the array.
|
|
528
|
+
# Currently, the following errors may be added to the array.
|
|
529
|
+
# (All errors are under the Bio::Fastq namespace, for example,
|
|
530
|
+
# Bio::Fastq::Error::Diff_ids).
|
|
531
|
+
#
|
|
532
|
+
# Error::Diff_ids -- the identifiers in the two lines are different
|
|
533
|
+
# Error::Long_qual -- length of quality is longer than the sequence
|
|
534
|
+
# Error::Short_qual -- length of quality is shorter than the sequence
|
|
535
|
+
# Error::No_qual -- no quality characters found
|
|
536
|
+
# Error::No_seq -- no sequence found
|
|
537
|
+
# Error::Qual_char -- invalid character in the quality
|
|
538
|
+
# Error::Seq_char -- invalid character in the sequence
|
|
539
|
+
# Error::Qual_range -- quality score value out of range
|
|
540
|
+
# Error::No_ids -- sequence identifier not found
|
|
541
|
+
# Error::No_atmark -- the first identifier does not begin with "@"
|
|
542
|
+
# Error::Skipped_unformatted_lines -- the parser skipped unformatted lines that could not be recognized as FASTQ format
|
|
543
|
+
#
|
|
544
|
+
# ---
|
|
545
|
+
# *Arguments*:
|
|
546
|
+
# * (optional) _errors_: (Array or nil) an array for pushing error messages. The array should be empty.
|
|
547
|
+
# *Returns*:: true:no error, false: containing error.
|
|
548
|
+
# Checks the parsed entry and collects every problem that is found.
#
# Checks are made in this order: skipped header lines, errors recorded
# while parsing, identifiers, presence of sequence and quality, their
# relative lengths, invalid sequence characters, invalid quality
# characters. The set of valid quality characters depends on the
# currently selected format name.
#
# ---
# *Arguments*:
# * (optional) _errors_: (Array or nil) array that receives the error objects
# *Returns*:: true when no error was found, false otherwise
def validate_format(errors = nil)
  err = []

  # A non-blank header means that leading lines were skipped by the parser,
  # so the format might be broken.
  if defined?(@header) && @header && !@header.strip.empty?
    err.push Error::Skipped_unformatted_lines.new
  end

  # Errors already recorded while parsing.
  err.concat @parse_errors if defined?(@parse_errors) && @parse_errors

  # The identifier must exist; a non-empty second identifier must match it.
  if !defined?(@definition) || !@definition
    err.push Error::No_ids.new
  elsif defined?(@definition2) && !@definition2.to_s.empty? &&
        @definition != @definition2
    err.push Error::Diff_ids.new
  end

  # Presence of the sequence.
  has_seq = (defined?(@sequence_string) && @sequence_string) ? true : false
  err.push Error::No_seq.new unless has_seq

  # Presence of the quality.
  has_qual = (defined?(@quality_string) && @quality_string) ? true : false
  err.push Error::No_qual.new unless has_qual

  # Sequence and quality must have the same length.
  if has_seq && has_qual
    slen = @sequence_string.length
    qlen = @quality_string.length
    if slen > qlen
      err.push Error::Short_qual.new
    elsif qlen > slen
      err.push Error::Long_qual.new
    end
  end

  # Sequence character check: spaces, control and non-ASCII bytes are invalid.
  if has_seq
    invalid_char_positions(@sequence_string, /[ \x00-\x1f\x7f-\xff]/n).each do |pos|
      err.push Error::Seq_char.new(pos)
    end
  end

  # Quality character check: the valid byte range depends on the format.
  if has_qual
    fmt = (defined?(@format) && @format) ? @format.name : nil
    re = case fmt
         when 'fastq-sanger'
           /[^\x21-\x7e]/n
         when 'fastq-solexa'
           /[^\x3b-\x7e]/n
         when 'fastq-illumina'
           /[^\x40-\x7e]/n
         else
           /[ \x00-\x1f\x7f-\xff]/n
         end
    invalid_char_positions(@quality_string, re).each do |pos|
      err.push Error::Qual_char.new(pos)
    end
  end

  # Hand the collected errors to the caller when an array was given.
  errors.concat err if errors
  err.empty?
end

# (private) Byte offsets of every character in _str_ matching _regexp_.
def invalid_char_positions(str, regexp)
  # The patterns are binary (/n). Scanning a binary-tagged copy lets
  # non-ASCII input be reported as invalid characters instead of failing
  # with an encoding error. Strings without encodings are scanned as is.
  target = str.respond_to?(:force_encoding) ? str.dup.force_encoding('BINARY') : str
  scanner = StringScanner.new(target)
  positions = []
  positions.push(scanner.pos - scanner.matched_size) while scanner.scan_until(regexp)
  positions
end
private :invalid_char_positions
|
|
631
|
+
|
|
632
|
+
# Returns sequence as a Bio::Sequence object.
|
|
633
|
+
#
|
|
634
|
+
# Note: If you modify the returned Bio::Sequence object,
|
|
635
|
+
# the sequence or definition in this Fastq object
|
|
636
|
+
# might also be changed (though not necessarily)
|
|
637
|
+
# because of efficiency.
|
|
638
|
+
#
|
|
639
|
+
# Wraps this entry through Bio::Sequence.adapter, using the
# Bio::Sequence::Adapter::Fastq adapter module.
# ---
# *Returns*:: the value returned by Bio::Sequence.adapter
def to_biosequence
  adapter_module = Bio::Sequence::Adapter::Fastq
  Bio::Sequence.adapter(self, adapter_module)
end
|
|
642
|
+
|
|
643
|
+
end #class Fastq
|
|
644
|
+
|
|
645
|
+
end #module Bio
|