bio 1.4.2 → 1.4.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +66 -0
- data/ChangeLog +989 -4524
- data/KNOWN_ISSUES.rdoc +67 -2
- data/README.rdoc +89 -23
- data/README_DEV.rdoc +93 -2
- data/RELEASE_NOTES.rdoc +167 -95
- data/Rakefile +199 -7
- data/bioruby.gemspec +27 -12
- data/bioruby.gemspec.erb +6 -3
- data/doc/ChangeLog-before-1.4.2 +5013 -0
- data/doc/RELEASE_NOTES-1.4.2.rdoc +132 -0
- data/doc/Tutorial.rd +21 -3
- data/doc/Tutorial.rd.html +20 -12
- data/etc/bioinformatics/seqdatabase.ini +13 -196
- data/gemfiles/Gemfile.travis-jruby1.8 +7 -0
- data/gemfiles/Gemfile.travis-jruby1.9 +10 -0
- data/gemfiles/Gemfile.travis-ruby1.8 +7 -0
- data/gemfiles/Gemfile.travis-ruby1.9 +10 -0
- data/gemfiles/modify-Gemfile.rb +28 -0
- data/gemfiles/prepare-gemspec.rb +25 -0
- data/lib/bio/alignment.rb +1 -1
- data/lib/bio/appl/bl2seq/report.rb +3 -3
- data/lib/bio/appl/blast/ddbj.rb +0 -3
- data/lib/bio/appl/blast/format0.rb +4 -22
- data/lib/bio/appl/blast/genomenet.rb +33 -16
- data/lib/bio/appl/blast/ncbioptions.rb +8 -3
- data/lib/bio/appl/blast/remote.rb +6 -5
- data/lib/bio/appl/blast/report.rb +10 -6
- data/lib/bio/appl/blast/rpsblast.rb +3 -2
- data/lib/bio/appl/blast/wublast.rb +3 -3
- data/lib/bio/command.rb +118 -36
- data/lib/bio/data/na.rb +1 -1
- data/lib/bio/db/embl/embl.rb +74 -0
- data/lib/bio/db/embl/format_embl.rb +0 -4
- data/lib/bio/db/fasta.rb +57 -45
- data/lib/bio/db/fasta/defline.rb +1 -1
- data/lib/bio/db/fasta/format_fasta.rb +0 -4
- data/lib/bio/db/fasta/format_qual.rb +0 -5
- data/lib/bio/db/fastq/format_fastq.rb +0 -1
- data/lib/bio/db/genbank/format_genbank.rb +0 -4
- data/lib/bio/db/gff.rb +41 -12
- data/lib/bio/db/kegg/genes.rb +3 -3
- data/lib/bio/db/kegg/kgml.rb +465 -64
- data/lib/bio/db/newick.rb +0 -244
- data/lib/bio/db/pdb.rb +1 -4
- data/lib/bio/db/pdb/atom.rb +3 -2
- data/lib/bio/db/pdb/chain.rb +2 -3
- data/lib/bio/db/pdb/chemicalcomponent.rb +3 -2
- data/lib/bio/db/pdb/model.rb +2 -2
- data/lib/bio/db/pdb/pdb.rb +2 -1
- data/lib/bio/db/pdb/residue.rb +2 -2
- data/lib/bio/db/pdb/utils.rb +7 -4
- data/lib/bio/db/phyloxml/phyloxml_parser.rb +52 -5
- data/lib/bio/feature.rb +2 -3
- data/lib/bio/io/flatfile/autodetection.rb +1 -1
- data/lib/bio/io/flatfile/buffer.rb +84 -0
- data/lib/bio/sequence.rb +6 -4
- data/lib/bio/sequence/aa.rb +3 -5
- data/lib/bio/sequence/adapter.rb +6 -6
- data/lib/bio/sequence/common.rb +3 -3
- data/lib/bio/sequence/compat.rb +2 -7
- data/lib/bio/sequence/dblink.rb +6 -5
- data/lib/bio/sequence/format.rb +0 -6
- data/lib/bio/sequence/format_raw.rb +0 -4
- data/lib/bio/sequence/generic.rb +3 -4
- data/lib/bio/sequence/na.rb +4 -6
- data/lib/bio/sequence/quality_score.rb +2 -0
- data/lib/bio/sequence/sequence_masker.rb +3 -0
- data/lib/bio/shell/core.rb +1 -0
- data/lib/bio/tree.rb +1 -2
- data/lib/bio/tree/output.rb +264 -0
- data/lib/bio/util/restriction_enzyme.rb +1 -3
- data/lib/bio/util/restriction_enzyme/analysis.rb +8 -5
- data/lib/bio/util/restriction_enzyme/analysis_basic.rb +4 -3
- data/lib/bio/util/restriction_enzyme/cut_symbol.rb +3 -2
- data/lib/bio/util/restriction_enzyme/dense_int_array.rb +3 -0
- data/lib/bio/util/restriction_enzyme/double_stranded.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/aligned_strands.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/cut_location_pair.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/cut_location_pair_in_enzyme_notation.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/cut_locations.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/cut_locations_in_enzyme_notation.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/cut_range.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/cut_ranges.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/horizontal_cut_range.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/sequence_range.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/sequence_range/calculated_cuts.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/sequence_range/fragment.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/sequence_range/fragments.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/vertical_cut_range.rb +3 -4
- data/lib/bio/util/restriction_enzyme/single_strand.rb +3 -3
- data/lib/bio/util/restriction_enzyme/single_strand/cut_locations_in_enzyme_notation.rb +3 -4
- data/lib/bio/util/restriction_enzyme/single_strand_complement.rb +3 -4
- data/lib/bio/util/restriction_enzyme/sorted_num_array.rb +3 -0
- data/lib/bio/util/restriction_enzyme/string_formatting.rb +3 -4
- data/lib/bio/version.rb +11 -2
- data/sample/seqdatabase.ini +210 -0
- data/test/bioruby_test_helper.rb +37 -12
- data/test/data/KEGG/test.kgml +37 -0
- data/test/data/command/echoarg2.bat +0 -0
- data/test/data/command/echoarg2.sh +4 -0
- data/test/functional/bio/test_command.rb +58 -28
- data/test/{functional → network}/bio/appl/blast/test_remote.rb +0 -0
- data/test/{functional → network}/bio/appl/test_blast.rb +0 -0
- data/test/{functional → network}/bio/appl/test_pts1.rb +0 -0
- data/test/{functional → network}/bio/io/test_ddbjrest.rb +0 -0
- data/test/{functional → network}/bio/io/test_ensembl.rb +0 -0
- data/test/{functional → network}/bio/io/test_pubmed.rb +0 -0
- data/test/{functional → network}/bio/io/test_soapwsdl.rb +0 -0
- data/test/{functional → network}/bio/io/test_togows.rb +0 -0
- data/test/network/bio/test_command.rb +35 -0
- data/test/runner.rb +16 -6
- data/test/unit/bio/appl/blast/test_report.rb +119 -0
- data/test/unit/bio/appl/blast/test_rpsblast.rb +1 -0
- data/test/unit/bio/data/test_na.rb +1 -1
- data/test/unit/bio/db/embl/test_embl.rb +2 -7
- data/test/unit/bio/db/embl/test_embl_rel89.rb +2 -7
- data/test/unit/bio/db/fasta/test_defline.rb +1 -1
- data/test/unit/bio/db/genbank/test_genpept.rb +1 -1
- data/test/unit/bio/db/kegg/test_drug.rb +1 -1
- data/test/unit/bio/db/kegg/test_genome.rb +1 -1
- data/test/unit/bio/db/kegg/test_glycan.rb +1 -1
- data/test/unit/bio/db/kegg/test_kgml.rb +1022 -0
- data/test/unit/bio/db/sanger_chromatogram/test_abif.rb +2 -1
- data/test/unit/bio/db/sanger_chromatogram/test_scf.rb +4 -2
- data/test/unit/bio/db/test_newick.rb +2 -0
- data/test/unit/bio/db/test_phyloxml.rb +54 -2
- data/test/unit/bio/db/test_phyloxml_writer.rb +15 -9
- data/test/unit/bio/db/test_soft.rb +1 -1
- data/test/unit/bio/io/flatfile/test_autodetection.rb +6 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +141 -0
- data/test/unit/bio/sequence/test_common.rb +36 -4
- data/test/unit/bio/sequence/test_na.rb +1 -1
- data/test/unit/bio/test_command.rb +9 -4
- data/test/unit/bio/test_sequence.rb +2 -2
- data/test/unit/bio/test_tree.rb +11 -11
- data/test/unit/bio/util/test_restriction_enzyme.rb +1 -1
- metadata +1428 -655
- data/rdoc.zsh +0 -8
data/lib/bio/data/na.rb
CHANGED
data/lib/bio/db/embl/embl.rb
CHANGED
@@ -196,53 +196,127 @@ class EMBL < EMBLDB
|
|
196
196
|
|
197
197
|
|
198
198
|
|
199
|
+
#--
|
199
200
|
##
|
200
201
|
# DE Line; description (>=1)
|
201
202
|
#
|
203
|
+
#++
|
202
204
|
|
203
205
|
|
206
|
+
#--
|
204
207
|
##
|
205
208
|
# KW Line; keyword (>=1)
|
206
209
|
# KW [Keyword;]+
|
207
210
|
#
|
208
211
|
# Bio::EMBLDB#kw -> Array
|
209
212
|
# #keywords -> Array
|
213
|
+
#++
|
210
214
|
|
211
215
|
|
216
|
+
#--
|
212
217
|
##
|
213
218
|
# OS Line; organism species (>=1)
|
214
219
|
# OS Genus species (name)
|
215
220
|
# "OS Trifolium repens (white clover)"
|
216
221
|
#
|
217
222
|
# Bio::EMBLDB#os -> Array
|
223
|
+
#++
|
218
224
|
|
225
|
+
# returns contents in the OS line.
|
226
|
+
# * Bio::EMBL#os -> Array of <OS Hash>
|
227
|
+
# where <OS Hash> is:
|
228
|
+
# [{'name'=>'Human', 'os'=>'Homo sapiens'},
|
229
|
+
# {'name'=>'Rat', 'os'=>'Rattus norvegicus'}]
|
230
|
+
# * Bio::EMBL#os[0]['name'] => "Human"
|
231
|
+
# * Bio::EMBL#os[0] => {'name'=>"Human", 'os'=>'Homo sapiens'}
|
232
|
+
#--
|
233
|
+
# * Bio::EMBL#os(0) => "Homo sapiens (Human)"
|
234
|
+
#++
|
235
|
+
#
|
236
|
+
# OS Line; organism species (>=1)
|
237
|
+
# OS Trifolium repens (white clover)
|
238
|
+
#
|
239
|
+
# Typically, OS line shows "Genus species (name)" style:
|
240
|
+
# OS Genus species (name)
|
241
|
+
#
|
242
|
+
# Other examples:
|
243
|
+
# OS uncultured bacterium
|
244
|
+
# OS xxxxxx metagenome
|
245
|
+
# OS Cloning vector xxxxxxxx
|
246
|
+
# Complicated examples:
|
247
|
+
# OS Poeciliopsis gracilis (Poeciliopsis gracilis (Heckel, 1848))
|
248
|
+
# OS Etmopterus sp. B Last & Stevens, 1994 (bristled lanternshark)
|
249
|
+
# OS Galaxias sp. D (Allibone et al., 1996) (Pool Burn galaxias)
|
250
|
+
# OS Sicydiinae sp. 'Keith et al., 2010'
|
251
|
+
# OS Acanthopagrus sp. 'Jean & Lee, 2008'
|
252
|
+
# OS Gaussia princeps (T. Scott, 1894)
|
253
|
+
# OS Rana sp. 8 Hillis & Wilcox, 2005
|
254
|
+
# OS Contracaecum rudolphii C D'Amelio et al., 2007
|
255
|
+
# OS Partula sp. 'Mt. Marau, Tahiti'
|
256
|
+
# OS Leptocephalus sp. 'type II larva' (Smith, 1989)
|
257
|
+
# OS Tayloria grandis (D.G.Long) Goffinet & A.J.Shaw, 2002
|
258
|
+
# OS Non-A, non-B hepatitis virus
|
259
|
+
# OS Canidae (dog, coyote, wolf, fox)
|
260
|
+
# OS Salmonella enterica subsp. enterica serovar 4,[5],12:i:-
|
261
|
+
# OS Yersinia enterocolitica (type O:5,27)
|
262
|
+
# OS Influenza A virus (A/green-winged teal/OH/72/99(H6N1,4))
|
263
|
+
# OS Influenza A virus (A/Beijing/352/1989,(highgrowth reassortant NIB26)(H3N2))
|
264
|
+
# OS Recombinant Hepatitis C virus H77(5'UTR-NS2)/JFH1_V787A,Q1247L
|
265
|
+
#
|
266
|
+
def os(num = nil)
|
267
|
+
unless @data['OS']
|
268
|
+
os = Array.new
|
269
|
+
tmp = fetch('OS')
|
270
|
+
if /([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d]) *\(([\w\d ]+)\)\s*\z/ =~ tmp
|
271
|
+
org = $1
|
272
|
+
os.push({'name' => $2, 'os' => $1})
|
273
|
+
else
|
274
|
+
os.push({'name' => nil, 'os' => tmp})
|
275
|
+
end
|
276
|
+
@data['OS'] = os
|
277
|
+
end
|
278
|
+
if num
|
279
|
+
# EX. "Trifolium repens (white clover)"
|
280
|
+
"#{@data['OS'][num]['os']} {#data['OS'][num]['name']"
|
281
|
+
end
|
282
|
+
@data['OS']
|
283
|
+
end
|
219
284
|
|
285
|
+
|
286
|
+
#--
|
220
287
|
##
|
221
288
|
# OC Line; organism classification (>=1)
|
222
289
|
#
|
223
290
|
# Bio::EMBLDB#oc -> Array
|
291
|
+
#++
|
224
292
|
|
225
293
|
|
294
|
+
#--
|
226
295
|
##
|
227
296
|
# OG Line; organelle (0 or 1/entry)
|
228
297
|
# ["Mitochondrion", "Chloroplast","Kinetoplast", "Cyanelle", "Plastid"]
|
229
298
|
# or a plasmid name (e.g. "Plasmid pBR322").
|
230
299
|
#
|
231
300
|
# Bio::EMBLDB#og -> String
|
301
|
+
#++
|
232
302
|
|
233
303
|
|
304
|
+
#--
|
234
305
|
##
|
235
306
|
# R Lines
|
236
307
|
# RN RC RP RX RA RT RL
|
237
308
|
#
|
238
309
|
# Bio::EMBLDB#ref
|
310
|
+
#++
|
239
311
|
|
240
312
|
|
313
|
+
#--
|
241
314
|
##
|
242
315
|
# DR Line; database cross-reference (>=0)
|
243
316
|
# "DR database_identifier; primary_identifier; secondary_identifier."
|
244
317
|
#
|
245
318
|
# Bio::EMBLDB#dr
|
319
|
+
#++
|
246
320
|
|
247
321
|
|
248
322
|
# returns feature table header (String) in the feature header (FH) line.
|
data/lib/bio/db/fasta.rb
CHANGED
@@ -35,7 +35,7 @@ module Bio
|
|
35
35
|
|
36
36
|
# Treats a FASTA formatted entry, such as:
|
37
37
|
#
|
38
|
-
# >id and/or some comments <==
|
38
|
+
# >id and/or some comments <== definition line
|
39
39
|
# ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines
|
40
40
|
# ATGCATGCATGCATGCATGCATGCATGCATGCATGC
|
41
41
|
# ATGCATGCATGC
|
@@ -45,52 +45,64 @@ module Bio
|
|
45
45
|
#
|
46
46
|
# === Examples
|
47
47
|
#
|
48
|
-
#
|
49
|
-
# >
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
#
|
55
|
-
# QWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES
|
56
|
-
# >sce:YBR274W CHK1; probable serine/threonine-protein kinase [EC:2.7.1.-] [SP:KB9S_YEAST]
|
57
|
-
# MSLSQVSPLPHIKDVVLGDTVGQGAFACVKNAHLQMDPSIILAVKFIHVP
|
58
|
-
# TCKKMGLSDKDITKEVVLQSKCSKHPNVLRLIDCNVSKEYMWIILEMADG
|
59
|
-
# GDLFDKIEPDVGVDSDVAQFYFQQLVSAINYLHVECGVAHRDIKPENILL
|
60
|
-
# DKNGNLKLADFGLASQFRRKDGTLRVSMDQRGSPPYMAPEVLYSEEGYYA
|
61
|
-
# DRTDIWSIGILLFVLLTGQTPWELPSLENEDFVFFIENDGNLNWGPWSKI
|
62
|
-
# EFTHLNLLRKILQPDPNKRVTLKALKLHPWVLRRASFSGDDGLCNDPELL
|
63
|
-
# AKKLFSHLKVSLSNENYLKFTQDTNSNNRYISTQPIGNELAELEHDSMHF
|
64
|
-
# QTVSNTQRAFTSYDSNTNYNSGTGMTQEAKWTQFISYDIAALQFHSDEND
|
65
|
-
# CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
|
66
|
-
# FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
|
67
|
-
# KTGDPLEWRRLFKKISTICRDIILIPN
|
48
|
+
# fasta_string = <<END_OF_STRING
|
49
|
+
# >gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]
|
50
|
+
# MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI
|
51
|
+
# VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ
|
52
|
+
# NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP
|
53
|
+
# IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP
|
54
|
+
# INRISARRAAIHPYFQES
|
68
55
|
# END_OF_STRING
|
69
56
|
#
|
70
|
-
# f = Bio::FastaFormat.new(
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
75
|
-
#
|
76
|
-
#
|
77
|
-
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
81
|
-
#
|
82
|
-
#
|
83
|
-
#
|
84
|
-
#
|
85
|
-
#
|
86
|
-
#
|
87
|
-
#
|
88
|
-
#
|
89
|
-
#
|
90
|
-
#
|
91
|
-
#
|
92
|
-
#
|
93
|
-
#
|
57
|
+
# f = Bio::FastaFormat.new(fasta_string)
|
58
|
+
#
|
59
|
+
# f.entry #=> ">gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]\n"+
|
60
|
+
# # "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\n"+
|
61
|
+
# # "VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\n"+
|
62
|
+
# # "NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\n"+
|
63
|
+
# # "IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\n"+
|
64
|
+
# # "INRISARRAAIHPYFQES"
|
65
|
+
#
|
66
|
+
# ==== Methods related to the name of the sequence
|
67
|
+
#
|
68
|
+
# A larger range of methods for dealing with Fasta definition lines can be found in FastaDefline, accessed through the FastaFormat#identifiers method.
|
69
|
+
#
|
70
|
+
# f.entry_id #=> "gi|398365175"
|
71
|
+
# f.definition #=> "gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]"
|
72
|
+
# f.identifiers #=> Bio::FastaDefline instance
|
73
|
+
# f.accession #=> "NP_009718"
|
74
|
+
# f.accessions #=> ["NP_009718"]
|
75
|
+
# f.acc_version #=> "NP_009718.3"
|
76
|
+
# f.comment #=> nil
|
77
|
+
#
|
78
|
+
# ==== Methods related to the actual sequence
|
79
|
+
#
|
80
|
+
# f.seq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES"
|
81
|
+
# f.data #=> "\nMSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\nVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\nNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\nIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\nINRISARRAAIHPYFQES\n"
|
82
|
+
# f.length #=> 298
|
83
|
+
# f.aaseq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES"
|
84
|
+
# f.aaseq.composition #=> {"M"=>5, "S"=>15, "G"=>21, "E"=>16, "L"=>36, "A"=>17, "N"=>8, "Y"=>13, "K"=>22, "R"=>20, "V"=>18, "T"=>7, "D"=>23, "P"=>17, "Q"=>10, "I"=>23, "H"=>7, "F"=>12, "C"=>4, "W"=>4}
|
85
|
+
# f.aalen #=> 298
|
86
|
+
#
|
87
|
+
#
|
88
|
+
# === A less structured fasta entry
|
89
|
+
#
|
90
|
+
# f.entry #=> ">abc 123 456\nASDF"
|
91
|
+
#
|
92
|
+
# f.entry_id #=> "abc"
|
93
|
+
# f.definition #=> "abc 123 456"
|
94
|
+
# f.comment #=> nil
|
95
|
+
# f.accession #=> nil
|
96
|
+
# f.accessions #=> []
|
97
|
+
# f.acc_version #=> nil
|
98
|
+
#
|
99
|
+
# f.seq #=> "ASDF"
|
100
|
+
# f.data #=> "\nASDF\n"
|
101
|
+
# f.length #=> 4
|
102
|
+
# f.aaseq #=> "ASDF"
|
103
|
+
# f.aaseq.composition #=> {"A"=>1, "S"=>1, "D"=>1, "F"=>1}
|
104
|
+
# f.aalen #=> 4
|
105
|
+
#
|
94
106
|
#
|
95
107
|
# === References
|
96
108
|
#
|
data/lib/bio/db/fasta/defline.rb
CHANGED
@@ -120,7 +120,7 @@ module Bio
|
|
120
120
|
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
|
121
121
|
# ["gb", "AAB29504.1", nil], ["Cavia"]]
|
122
122
|
#
|
123
|
-
# ===
|
123
|
+
# === References
|
124
124
|
#
|
125
125
|
# * Fasta format description (NCBI)
|
126
126
|
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
|
data/lib/bio/db/gff.rb
CHANGED
@@ -454,7 +454,8 @@ module Bio
|
|
454
454
|
|
455
455
|
# Return the record as a GFF2 compatible string
|
456
456
|
def to_s
|
457
|
-
cmnt = if @comment and
|
457
|
+
cmnt = if defined?(@comment) and @comment and
|
458
|
+
!@comment.to_s.strip.empty? then
|
458
459
|
@comment.gsub(/[\r\n]+/, ' ')
|
459
460
|
else
|
460
461
|
false
|
@@ -996,21 +997,46 @@ module Bio
|
|
996
997
|
str.empty? ? '.' : str
|
997
998
|
end
|
998
999
|
|
1000
|
+
if URI.const_defined?(:Parser) then
|
1001
|
+
# (private) URI::Parser object used to escape/unescape GFF3 columns
|
1002
|
+
URI_PARSER = URI::Parser.new
|
1003
|
+
|
1004
|
+
# (private) the same as URI::Parser#escape(str, unsafe)
|
1005
|
+
def _escape(str, unsafe)
|
1006
|
+
URI_PARSER.escape(str, unsafe)
|
1007
|
+
end
|
1008
|
+
|
1009
|
+
# (private) the same as URI::Parser#unescape(str)
|
1010
|
+
def _unescape(str)
|
1011
|
+
URI_PARSER.unescape(str)
|
1012
|
+
end
|
1013
|
+
else
|
1014
|
+
# (private) the same as URI.escape(str, unsafe)
|
1015
|
+
def _escape(str, unsafe)
|
1016
|
+
URI.escape(str, unsafe)
|
1017
|
+
end
|
1018
|
+
|
1019
|
+
# (private) the same as URI.unescape(str)
|
1020
|
+
def _unescape(str)
|
1021
|
+
URI.unescape(str)
|
1022
|
+
end
|
1023
|
+
end
|
1024
|
+
|
999
1025
|
# Return the string corresponding to these characters unescaped
|
1000
1026
|
def unescape(string)
|
1001
|
-
|
1027
|
+
_unescape(string)
|
1002
1028
|
end
|
1003
1029
|
|
1004
1030
|
# Escape a column according to the specification at
|
1005
1031
|
# http://song.sourceforge.net/gff3.shtml.
|
1006
1032
|
def escape(string)
|
1007
|
-
|
1033
|
+
_escape(string, UNSAFE)
|
1008
1034
|
end
|
1009
1035
|
|
1010
1036
|
# Escape seqid column according to the specification at
|
1011
1037
|
# http://song.sourceforge.net/gff3.shtml.
|
1012
1038
|
def escape_seqid(string)
|
1013
|
-
|
1039
|
+
_escape(string, UNSAFE_SEQID)
|
1014
1040
|
end
|
1015
1041
|
|
1016
1042
|
# Escape attribute according to the specification at
|
@@ -1019,7 +1045,7 @@ module Bio
|
|
1019
1045
|
# are escaped: ",=;".
|
1020
1046
|
# Returns the string corresponding to these characters escaped.
|
1021
1047
|
def escape_attribute(string)
|
1022
|
-
|
1048
|
+
_escape(string, UNSAFE_ATTRIBUTE)
|
1023
1049
|
end
|
1024
1050
|
end #module Escape
|
1025
1051
|
|
@@ -1028,6 +1054,7 @@ module Bio
|
|
1028
1054
|
# Stores meta-data "##sequence-region seqid start end".
|
1029
1055
|
class SequenceRegion
|
1030
1056
|
include Escape
|
1057
|
+
extend Escape
|
1031
1058
|
|
1032
1059
|
# creates a new SequenceRegion object
|
1033
1060
|
def initialize(seqid, start, endpos)
|
@@ -1039,7 +1066,7 @@ module Bio
|
|
1039
1066
|
# parses the given string and returns a SequenceRegion object
|
1040
1067
|
def self.parse(str)
|
1041
1068
|
dummy, seqid, start, endpos =
|
1042
|
-
str.chomp.split(/\s+/, 4).collect { |x|
|
1069
|
+
str.chomp.split(/\s+/, 4).collect { |x| unescape(x) }
|
1043
1070
|
self.new(seqid, start, endpos)
|
1044
1071
|
end
|
1045
1072
|
|
@@ -1139,7 +1166,8 @@ module Bio
|
|
1139
1166
|
|
1140
1167
|
# Return the record as a GFF3 compatible string
|
1141
1168
|
def to_s
|
1142
|
-
cmnt = if @comment and
|
1169
|
+
cmnt = if defined?(@comment) and @comment and
|
1170
|
+
!@comment.to_s.strip.empty? then
|
1143
1171
|
@comment.gsub(/[\r\n]+/, ' ')
|
1144
1172
|
else
|
1145
1173
|
false
|
@@ -1163,6 +1191,7 @@ module Bio
|
|
1163
1191
|
# data of "Target" attribute.
|
1164
1192
|
class Target
|
1165
1193
|
include GFF3::Escape
|
1194
|
+
extend GFF3::Escape
|
1166
1195
|
|
1167
1196
|
# Creates a new Target object.
|
1168
1197
|
def initialize(target_id, start, endpos, strand = nil)
|
@@ -1190,7 +1219,7 @@ module Bio
|
|
1190
1219
|
#
|
1191
1220
|
def self.parse(str)
|
1192
1221
|
target_id, start, endpos, strand =
|
1193
|
-
str.split(/ +/, 4).collect { |x|
|
1222
|
+
str.split(/ +/, 4).collect { |x| unescape(x) }
|
1194
1223
|
self.new(target_id, start, endpos, strand)
|
1195
1224
|
end
|
1196
1225
|
|
@@ -1332,15 +1361,15 @@ module Bio
|
|
1332
1361
|
|
1333
1362
|
# rest of data_ref
|
1334
1363
|
len = 0
|
1335
|
-
data_ref.each do |
|
1336
|
-
len +=
|
1364
|
+
data_ref.each do |r|
|
1365
|
+
len += r.length if r.code == :M
|
1337
1366
|
end
|
1338
1367
|
data.push Code.new(:D, len) if len > 0
|
1339
1368
|
|
1340
1369
|
# rest of data_tgt
|
1341
1370
|
len = 0
|
1342
|
-
data_tgt.each do |
|
1343
|
-
len +=
|
1371
|
+
data_tgt.each do |t|
|
1372
|
+
len += t.length if t.code == :M
|
1344
1373
|
end
|
1345
1374
|
data.push Code.new(:I, len) if len > 0
|
1346
1375
|
|