bio 1.4.2 → 1.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +66 -0
- data/ChangeLog +989 -4524
- data/KNOWN_ISSUES.rdoc +67 -2
- data/README.rdoc +89 -23
- data/README_DEV.rdoc +93 -2
- data/RELEASE_NOTES.rdoc +167 -95
- data/Rakefile +199 -7
- data/bioruby.gemspec +27 -12
- data/bioruby.gemspec.erb +6 -3
- data/doc/ChangeLog-before-1.4.2 +5013 -0
- data/doc/RELEASE_NOTES-1.4.2.rdoc +132 -0
- data/doc/Tutorial.rd +21 -3
- data/doc/Tutorial.rd.html +20 -12
- data/etc/bioinformatics/seqdatabase.ini +13 -196
- data/gemfiles/Gemfile.travis-jruby1.8 +7 -0
- data/gemfiles/Gemfile.travis-jruby1.9 +10 -0
- data/gemfiles/Gemfile.travis-ruby1.8 +7 -0
- data/gemfiles/Gemfile.travis-ruby1.9 +10 -0
- data/gemfiles/modify-Gemfile.rb +28 -0
- data/gemfiles/prepare-gemspec.rb +25 -0
- data/lib/bio/alignment.rb +1 -1
- data/lib/bio/appl/bl2seq/report.rb +3 -3
- data/lib/bio/appl/blast/ddbj.rb +0 -3
- data/lib/bio/appl/blast/format0.rb +4 -22
- data/lib/bio/appl/blast/genomenet.rb +33 -16
- data/lib/bio/appl/blast/ncbioptions.rb +8 -3
- data/lib/bio/appl/blast/remote.rb +6 -5
- data/lib/bio/appl/blast/report.rb +10 -6
- data/lib/bio/appl/blast/rpsblast.rb +3 -2
- data/lib/bio/appl/blast/wublast.rb +3 -3
- data/lib/bio/command.rb +118 -36
- data/lib/bio/data/na.rb +1 -1
- data/lib/bio/db/embl/embl.rb +74 -0
- data/lib/bio/db/embl/format_embl.rb +0 -4
- data/lib/bio/db/fasta.rb +57 -45
- data/lib/bio/db/fasta/defline.rb +1 -1
- data/lib/bio/db/fasta/format_fasta.rb +0 -4
- data/lib/bio/db/fasta/format_qual.rb +0 -5
- data/lib/bio/db/fastq/format_fastq.rb +0 -1
- data/lib/bio/db/genbank/format_genbank.rb +0 -4
- data/lib/bio/db/gff.rb +41 -12
- data/lib/bio/db/kegg/genes.rb +3 -3
- data/lib/bio/db/kegg/kgml.rb +465 -64
- data/lib/bio/db/newick.rb +0 -244
- data/lib/bio/db/pdb.rb +1 -4
- data/lib/bio/db/pdb/atom.rb +3 -2
- data/lib/bio/db/pdb/chain.rb +2 -3
- data/lib/bio/db/pdb/chemicalcomponent.rb +3 -2
- data/lib/bio/db/pdb/model.rb +2 -2
- data/lib/bio/db/pdb/pdb.rb +2 -1
- data/lib/bio/db/pdb/residue.rb +2 -2
- data/lib/bio/db/pdb/utils.rb +7 -4
- data/lib/bio/db/phyloxml/phyloxml_parser.rb +52 -5
- data/lib/bio/feature.rb +2 -3
- data/lib/bio/io/flatfile/autodetection.rb +1 -1
- data/lib/bio/io/flatfile/buffer.rb +84 -0
- data/lib/bio/sequence.rb +6 -4
- data/lib/bio/sequence/aa.rb +3 -5
- data/lib/bio/sequence/adapter.rb +6 -6
- data/lib/bio/sequence/common.rb +3 -3
- data/lib/bio/sequence/compat.rb +2 -7
- data/lib/bio/sequence/dblink.rb +6 -5
- data/lib/bio/sequence/format.rb +0 -6
- data/lib/bio/sequence/format_raw.rb +0 -4
- data/lib/bio/sequence/generic.rb +3 -4
- data/lib/bio/sequence/na.rb +4 -6
- data/lib/bio/sequence/quality_score.rb +2 -0
- data/lib/bio/sequence/sequence_masker.rb +3 -0
- data/lib/bio/shell/core.rb +1 -0
- data/lib/bio/tree.rb +1 -2
- data/lib/bio/tree/output.rb +264 -0
- data/lib/bio/util/restriction_enzyme.rb +1 -3
- data/lib/bio/util/restriction_enzyme/analysis.rb +8 -5
- data/lib/bio/util/restriction_enzyme/analysis_basic.rb +4 -3
- data/lib/bio/util/restriction_enzyme/cut_symbol.rb +3 -2
- data/lib/bio/util/restriction_enzyme/dense_int_array.rb +3 -0
- data/lib/bio/util/restriction_enzyme/double_stranded.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/aligned_strands.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/cut_location_pair.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/cut_location_pair_in_enzyme_notation.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/cut_locations.rb +3 -4
- data/lib/bio/util/restriction_enzyme/double_stranded/cut_locations_in_enzyme_notation.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/cut_range.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/cut_ranges.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/horizontal_cut_range.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/sequence_range.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/sequence_range/calculated_cuts.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/sequence_range/fragment.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/sequence_range/fragments.rb +3 -4
- data/lib/bio/util/restriction_enzyme/range/vertical_cut_range.rb +3 -4
- data/lib/bio/util/restriction_enzyme/single_strand.rb +3 -3
- data/lib/bio/util/restriction_enzyme/single_strand/cut_locations_in_enzyme_notation.rb +3 -4
- data/lib/bio/util/restriction_enzyme/single_strand_complement.rb +3 -4
- data/lib/bio/util/restriction_enzyme/sorted_num_array.rb +3 -0
- data/lib/bio/util/restriction_enzyme/string_formatting.rb +3 -4
- data/lib/bio/version.rb +11 -2
- data/sample/seqdatabase.ini +210 -0
- data/test/bioruby_test_helper.rb +37 -12
- data/test/data/KEGG/test.kgml +37 -0
- data/test/data/command/echoarg2.bat +0 -0
- data/test/data/command/echoarg2.sh +4 -0
- data/test/functional/bio/test_command.rb +58 -28
- data/test/{functional → network}/bio/appl/blast/test_remote.rb +0 -0
- data/test/{functional → network}/bio/appl/test_blast.rb +0 -0
- data/test/{functional → network}/bio/appl/test_pts1.rb +0 -0
- data/test/{functional → network}/bio/io/test_ddbjrest.rb +0 -0
- data/test/{functional → network}/bio/io/test_ensembl.rb +0 -0
- data/test/{functional → network}/bio/io/test_pubmed.rb +0 -0
- data/test/{functional → network}/bio/io/test_soapwsdl.rb +0 -0
- data/test/{functional → network}/bio/io/test_togows.rb +0 -0
- data/test/network/bio/test_command.rb +35 -0
- data/test/runner.rb +16 -6
- data/test/unit/bio/appl/blast/test_report.rb +119 -0
- data/test/unit/bio/appl/blast/test_rpsblast.rb +1 -0
- data/test/unit/bio/data/test_na.rb +1 -1
- data/test/unit/bio/db/embl/test_embl.rb +2 -7
- data/test/unit/bio/db/embl/test_embl_rel89.rb +2 -7
- data/test/unit/bio/db/fasta/test_defline.rb +1 -1
- data/test/unit/bio/db/genbank/test_genpept.rb +1 -1
- data/test/unit/bio/db/kegg/test_drug.rb +1 -1
- data/test/unit/bio/db/kegg/test_genome.rb +1 -1
- data/test/unit/bio/db/kegg/test_glycan.rb +1 -1
- data/test/unit/bio/db/kegg/test_kgml.rb +1022 -0
- data/test/unit/bio/db/sanger_chromatogram/test_abif.rb +2 -1
- data/test/unit/bio/db/sanger_chromatogram/test_scf.rb +4 -2
- data/test/unit/bio/db/test_newick.rb +2 -0
- data/test/unit/bio/db/test_phyloxml.rb +54 -2
- data/test/unit/bio/db/test_phyloxml_writer.rb +15 -9
- data/test/unit/bio/db/test_soft.rb +1 -1
- data/test/unit/bio/io/flatfile/test_autodetection.rb +6 -0
- data/test/unit/bio/io/flatfile/test_buffer.rb +141 -0
- data/test/unit/bio/sequence/test_common.rb +36 -4
- data/test/unit/bio/sequence/test_na.rb +1 -1
- data/test/unit/bio/test_command.rb +9 -4
- data/test/unit/bio/test_sequence.rb +2 -2
- data/test/unit/bio/test_tree.rb +11 -11
- data/test/unit/bio/util/test_restriction_enzyme.rb +1 -1
- metadata +1428 -655
- data/rdoc.zsh +0 -8
data/lib/bio/data/na.rb
CHANGED
data/lib/bio/db/embl/embl.rb
CHANGED
@@ -196,53 +196,127 @@ class EMBL < EMBLDB
|
|
196
196
|
|
197
197
|
|
198
198
|
|
199
|
+
#--
|
199
200
|
##
|
200
201
|
# DE Line; description (>=1)
|
201
202
|
#
|
203
|
+
#++
|
202
204
|
|
203
205
|
|
206
|
+
#--
|
204
207
|
##
|
205
208
|
# KW Line; keyword (>=1)
|
206
209
|
# KW [Keyword;]+
|
207
210
|
#
|
208
211
|
# Bio::EMBLDB#kw -> Array
|
209
212
|
# #keywords -> Array
|
213
|
+
#++
|
210
214
|
|
211
215
|
|
216
|
+
#--
|
212
217
|
##
|
213
218
|
# OS Line; organism species (>=1)
|
214
219
|
# OS Genus species (name)
|
215
220
|
# "OS Trifolium repens (white clover)"
|
216
221
|
#
|
217
222
|
# Bio::EMBLDB#os -> Array
|
223
|
+
#++
|
218
224
|
|
225
|
+
# returns contents in the OS line.
|
226
|
+
# * Bio::EMBL#os -> Array of <OS Hash>
|
227
|
+
# where <OS Hash> is:
|
228
|
+
# [{'name'=>'Human', 'os'=>'Homo sapiens'},
|
229
|
+
# {'name'=>'Rat', 'os'=>'Rattus norvegicus'}]
|
230
|
+
# * Bio::EMBL#os[0]['name'] => "Human"
|
231
|
+
# * Bio::EMBL#os[0] => {'name'=>"Human", 'os'=>'Homo sapiens'}
|
232
|
+
#--
|
233
|
+
# * Bio::EMBL#os(0) => "Homo sapiens (Human)"
|
234
|
+
#++
|
235
|
+
#
|
236
|
+
# OS Line; organism species (>=1)
|
237
|
+
# OS Trifolium repens (white clover)
|
238
|
+
#
|
239
|
+
# Typically, OS line shows "Genus species (name)" style:
|
240
|
+
# OS Genus species (name)
|
241
|
+
#
|
242
|
+
# Other examples:
|
243
|
+
# OS uncultured bacterium
|
244
|
+
# OS xxxxxx metagenome
|
245
|
+
# OS Cloning vector xxxxxxxx
|
246
|
+
# Complicated examples:
|
247
|
+
# OS Poeciliopsis gracilis (Poeciliopsis gracilis (Heckel, 1848))
|
248
|
+
# OS Etmopterus sp. B Last & Stevens, 1994 (bristled lanternshark)
|
249
|
+
# OS Galaxias sp. D (Allibone et al., 1996) (Pool Burn galaxias)
|
250
|
+
# OS Sicydiinae sp. 'Keith et al., 2010'
|
251
|
+
# OS Acanthopagrus sp. 'Jean & Lee, 2008'
|
252
|
+
# OS Gaussia princeps (T. Scott, 1894)
|
253
|
+
# OS Rana sp. 8 Hillis & Wilcox, 2005
|
254
|
+
# OS Contracaecum rudolphii C D'Amelio et al., 2007
|
255
|
+
# OS Partula sp. 'Mt. Marau, Tahiti'
|
256
|
+
# OS Leptocephalus sp. 'type II larva' (Smith, 1989)
|
257
|
+
# OS Tayloria grandis (D.G.Long) Goffinet & A.J.Shaw, 2002
|
258
|
+
# OS Non-A, non-B hepatitis virus
|
259
|
+
# OS Canidae (dog, coyote, wolf, fox)
|
260
|
+
# OS Salmonella enterica subsp. enterica serovar 4,[5],12:i:-
|
261
|
+
# OS Yersinia enterocolitica (type O:5,27)
|
262
|
+
# OS Influenza A virus (A/green-winged teal/OH/72/99(H6N1,4))
|
263
|
+
# OS Influenza A virus (A/Beijing/352/1989,(highgrowth reassortant NIB26)(H3N2))
|
264
|
+
# OS Recombinant Hepatitis C virus H77(5'UTR-NS2)/JFH1_V787A,Q1247L
|
265
|
+
#
|
266
|
+
def os(num = nil)
  # Parse the OS line only once; the result is cached in @data['OS'] as an
  # Array holding a single Hash: {'name' => <common name or nil>, 'os' => <organism>}.
  unless @data['OS']
    line = fetch('OS')
    # "Genus species (name)" style: the trailing parenthesized part becomes
    # 'name' and the text before it becomes 'os'. Anything that does not fit
    # this pattern is stored verbatim in 'os' with a nil 'name'.
    entry =
      if /([A-Z][a-z]* *[\w\d \:\'\+\-]+[\w\d]) *\(([\w\d ]+)\)\s*\z/ =~ line
        {'name' => $2, 'os' => $1}
      else
        {'name' => nil, 'os' => line}
      end
    @data['OS'] = [entry]
  end

  # Without an index, return the whole Array of OS hashes.
  return @data['OS'] unless num

  # With an index, return the entry formatted as a String,
  # EX. "Trifolium repens (white clover)".
  # (The previous code built a malformed string here and then discarded it,
  # so the Array was returned regardless of num.)
  selected = @data['OS'][num]
  return nil unless selected # no entry at that index

  # Entries without a common name are rendered as the organism text alone.
  selected['name'] ? "#{selected['os']} (#{selected['name']})" : selected['os']
end
|
219
284
|
|
285
|
+
|
286
|
+
#--
|
220
287
|
##
|
221
288
|
# OC Line; organism classification (>=1)
|
222
289
|
#
|
223
290
|
# Bio::EMBLDB#oc -> Array
|
291
|
+
#++
|
224
292
|
|
225
293
|
|
294
|
+
#--
|
226
295
|
##
|
227
296
|
# OG Line; organella (0 or 1/entry)
|
228
297
|
# ["Mitochondrion", "Chloroplast","Kinetoplast", "Cyanelle", "Plastid"]
|
229
298
|
# or a plasmid name (e.g. "Plasmid pBR322").
|
230
299
|
#
|
231
300
|
# Bio::EMBLDB#og -> String
|
301
|
+
#++
|
232
302
|
|
233
303
|
|
304
|
+
#--
|
234
305
|
##
|
235
306
|
# R Lines
|
236
307
|
# RN RC RP RX RA RT RL
|
237
308
|
#
|
238
309
|
# Bio::EMBLDB#ref
|
310
|
+
#++
|
239
311
|
|
240
312
|
|
313
|
+
#--
|
241
314
|
##
|
242
315
|
# DR Line; database cross-reference (>=0)
|
243
316
|
# "DR database_identifier; primary_identifier; secondary_identifier."
|
244
317
|
#
|
245
318
|
# Bio::EMBLDB#dr
|
319
|
+
#++
|
246
320
|
|
247
321
|
|
248
322
|
# returns feature table header (String) in the feature header (FH) line.
|
data/lib/bio/db/fasta.rb
CHANGED
@@ -35,7 +35,7 @@ module Bio
|
|
35
35
|
|
36
36
|
# Treats a FASTA formatted entry, such as:
|
37
37
|
#
|
38
|
-
# >id and/or some comments <==
|
38
|
+
# >id and/or some comments <== definition line
|
39
39
|
# ATGCATGCATGCATGCATGCATGCATGCATGCATGC <== sequence lines
|
40
40
|
# ATGCATGCATGCATGCATGCATGCATGCATGCATGC
|
41
41
|
# ATGCATGCATGC
|
@@ -45,52 +45,64 @@ module Bio
|
|
45
45
|
#
|
46
46
|
# === Examples
|
47
47
|
#
|
48
|
-
#
|
49
|
-
# >
|
50
|
-
#
|
51
|
-
#
|
52
|
-
#
|
53
|
-
#
|
54
|
-
#
|
55
|
-
# QWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES
|
56
|
-
# >sce:YBR274W CHK1; probable serine/threonine-protein kinase [EC:2.7.1.-] [SP:KB9S_YEAST]
|
57
|
-
# MSLSQVSPLPHIKDVVLGDTVGQGAFACVKNAHLQMDPSIILAVKFIHVP
|
58
|
-
# TCKKMGLSDKDITKEVVLQSKCSKHPNVLRLIDCNVSKEYMWIILEMADG
|
59
|
-
# GDLFDKIEPDVGVDSDVAQFYFQQLVSAINYLHVECGVAHRDIKPENILL
|
60
|
-
# DKNGNLKLADFGLASQFRRKDGTLRVSMDQRGSPPYMAPEVLYSEEGYYA
|
61
|
-
# DRTDIWSIGILLFVLLTGQTPWELPSLENEDFVFFIENDGNLNWGPWSKI
|
62
|
-
# EFTHLNLLRKILQPDPNKRVTLKALKLHPWVLRRASFSGDDGLCNDPELL
|
63
|
-
# AKKLFSHLKVSLSNENYLKFTQDTNSNNRYISTQPIGNELAELEHDSMHF
|
64
|
-
# QTVSNTQRAFTSYDSNTNYNSGTGMTQEAKWTQFISYDIAALQFHSDEND
|
65
|
-
# CNELVKRHLQFNPNKLTKFYTLQPMDVLLPILEKALNLSQIRVKPDLFAN
|
66
|
-
# FERLCELLGYDNVFPLIINIKTKSNGGYQLCGSISIIKIEEELKSVGFER
|
67
|
-
# KTGDPLEWRRLFKKISTICRDIILIPN
|
48
|
+
# fasta_string = <<END_OF_STRING
|
49
|
+
# >gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]
|
50
|
+
# MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI
|
51
|
+
# VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ
|
52
|
+
# NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP
|
53
|
+
# IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP
|
54
|
+
# INRISARRAAIHPYFQES
|
68
55
|
# END_OF_STRING
|
69
56
|
#
|
70
|
-
# f = Bio::FastaFormat.new(
|
71
|
-
#
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
75
|
-
#
|
76
|
-
#
|
77
|
-
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
81
|
-
#
|
82
|
-
#
|
83
|
-
#
|
84
|
-
#
|
85
|
-
#
|
86
|
-
#
|
87
|
-
#
|
88
|
-
#
|
89
|
-
#
|
90
|
-
#
|
91
|
-
#
|
92
|
-
#
|
93
|
-
#
|
57
|
+
# f = Bio::FastaFormat.new(fasta_string)
|
58
|
+
#
|
59
|
+
# f.entry #=> ">gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]\n"+
|
60
|
+
# # MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\n"+
|
61
|
+
# # VRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\n"+
|
62
|
+
# # NLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\n"+
|
63
|
+
# # IFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\n"+
|
64
|
+
# # INRISARRAAIHPYFQES"
|
65
|
+
#
|
66
|
+
# ==== Methods related to the name of the sequence
|
67
|
+
#
|
68
|
+
# A larger range of methods for dealing with Fasta definition lines can be found in FastaDefline, accessed through the FastaFormat#identifiers method.
|
69
|
+
#
|
70
|
+
# f.entry_id #=> "gi|398365175"
|
71
|
+
# f.definition #=> "gi|398365175|ref|NP_009718.3| Cdc28p [Saccharomyces cerevisiae S288c]"
|
72
|
+
# f.identifiers #=> Bio::FastaDefline instance
|
73
|
+
# f.accession #=> "NP_009718"
|
74
|
+
# f.accessions #=> ["NP_009718"]
|
75
|
+
# f.acc_version #=> "NP_009718.3"
|
76
|
+
# f.comment #=> nil
|
77
|
+
#
|
78
|
+
# ==== Methods related to the actual sequence
|
79
|
+
#
|
80
|
+
# f.seq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES"
|
81
|
+
# f.data #=> "\nMSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNI\nVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQ\nNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKP\nIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDP\nINRISARRAAIHPYFQES\n"
|
82
|
+
# f.length #=> 298
|
83
|
+
# f.aaseq #=> "MSGELANYKRLEKVGEGTYGVVYKALDLRPGQGQRVVALKKIRLESEDEGVPSTAIREISLLKELKDDNIVRLYDIVHSDAHKLYLVFEFLDLDLKRYMEGIPKDQPLGADIVKKFMMQLCKGIAYCHSHRILHRDLKPQNLLINKDGNLKLGDFGLARAFGVPLRAYTHEIVTLWYRAPEVLLGGKQYSTGVDTWSIGCIFAEMCNRKPIFSGDSEIDQIFKIFRVLGTPNEAIWPDIVYLPDFKPSFPQWRRKDLSQVVPSLDPRGIDLLDKLLAYDPINRISARRAAIHPYFQES"
|
84
|
+
# f.aaseq.composition #=> {"M"=>5, "S"=>15, "G"=>21, "E"=>16, "L"=>36, "A"=>17, "N"=>8, "Y"=>13, "K"=>22, "R"=>20, "V"=>18, "T"=>7, "D"=>23, "P"=>17, "Q"=>10, "I"=>23, "H"=>7, "F"=>12, "C"=>4, "W"=>4}
|
85
|
+
# f.aalen #=> 298
|
86
|
+
#
|
87
|
+
#
|
88
|
+
# === A less structured fasta entry
|
89
|
+
#
|
90
|
+
# f.entry #=> ">abc 123 456\nASDF"
|
91
|
+
#
|
92
|
+
# f.entry_id #=> "abc"
|
93
|
+
# f.definition #=> "abc 123 456"
|
94
|
+
# f.comment #=> nil
|
95
|
+
# f.accession #=> nil
|
96
|
+
# f.accessions #=> []
|
97
|
+
# f.acc_version #=> nil
|
98
|
+
#
|
99
|
+
# f.seq #=> "ASDF"
|
100
|
+
# f.data #=> "\nASDF\n"
|
101
|
+
# f.length #=> 4
|
102
|
+
# f.aaseq #=> "ASDF"
|
103
|
+
# f.aaseq.composition #=> {"A"=>1, "S"=>1, "D"=>1, "F"=>1}
|
104
|
+
# f.aalen #=> 4
|
105
|
+
#
|
94
106
|
#
|
95
107
|
# === References
|
96
108
|
#
|
data/lib/bio/db/fasta/defline.rb
CHANGED
@@ -120,7 +120,7 @@ module Bio
|
|
120
120
|
# ["gi", "2147182"], ["pir", nil, "I51898"], ["gi", "544724"],
|
121
121
|
# ["gb", "AAB29504.1", nil], ["Cavia"]]
|
122
122
|
#
|
123
|
-
# ===
|
123
|
+
# === References
|
124
124
|
#
|
125
125
|
# * Fasta format description (NCBI)
|
126
126
|
# http://www.ncbi.nlm.nih.gov/BLAST/fasta.shtml
|
data/lib/bio/db/gff.rb
CHANGED
@@ -454,7 +454,8 @@ module Bio
|
|
454
454
|
|
455
455
|
# Return the record as a GFF2 compatible string
|
456
456
|
def to_s
|
457
|
-
cmnt = if @comment and
|
457
|
+
cmnt = if defined?(@comment) and @comment and
|
458
|
+
!@comment.to_s.strip.empty? then
|
458
459
|
@comment.gsub(/[\r\n]+/, ' ')
|
459
460
|
else
|
460
461
|
false
|
@@ -996,21 +997,46 @@ module Bio
|
|
996
997
|
str.empty? ? '.' : str
|
997
998
|
end
|
998
999
|
|
1000
|
+
if URI.const_defined?(:Parser) then
|
1001
|
+
# (private) URI::Parser object for escape/unescape GFF3 columns
|
1002
|
+
URI_PARSER = URI::Parser.new
|
1003
|
+
|
1004
|
+
# (private) the same as URI::Parser#escape(str, unsafe)
|
1005
|
+
def _escape(str, unsafe)
|
1006
|
+
URI_PARSER.escape(str, unsafe)
|
1007
|
+
end
|
1008
|
+
|
1009
|
+
# (private) the same as URI::Parser#unescape(str)
|
1010
|
+
def _unescape(str)
|
1011
|
+
URI_PARSER.unescape(str)
|
1012
|
+
end
|
1013
|
+
else
|
1014
|
+
# (private) the same as URI.escape(str, unsafe)
|
1015
|
+
def _escape(str, unsafe)
|
1016
|
+
URI.escape(str, unsafe)
|
1017
|
+
end
|
1018
|
+
|
1019
|
+
# (private) the same as URI.unescape(str)
|
1020
|
+
def _unescape(str)
|
1021
|
+
URI.unescape(str)
|
1022
|
+
end
|
1023
|
+
end
|
1024
|
+
|
999
1025
|
# Return the string corresponding to these characters unescaped
|
1000
1026
|
def unescape(string)
|
1001
|
-
|
1027
|
+
_unescape(string)
|
1002
1028
|
end
|
1003
1029
|
|
1004
1030
|
# Escape a column according to the specification at
|
1005
1031
|
# http://song.sourceforge.net/gff3.shtml.
|
1006
1032
|
def escape(string)
|
1007
|
-
|
1033
|
+
_escape(string, UNSAFE)
|
1008
1034
|
end
|
1009
1035
|
|
1010
1036
|
# Escape seqid column according to the specification at
|
1011
1037
|
# http://song.sourceforge.net/gff3.shtml.
|
1012
1038
|
def escape_seqid(string)
|
1013
|
-
|
1039
|
+
_escape(string, UNSAFE_SEQID)
|
1014
1040
|
end
|
1015
1041
|
|
1016
1042
|
# Escape attribute according to the specification at
|
@@ -1019,7 +1045,7 @@ module Bio
|
|
1019
1045
|
# are escaped: ",=;".
|
1020
1046
|
# Returns the string corresponding to these characters escaped.
|
1021
1047
|
def escape_attribute(string)
|
1022
|
-
|
1048
|
+
_escape(string, UNSAFE_ATTRIBUTE)
|
1023
1049
|
end
|
1024
1050
|
end #module Escape
|
1025
1051
|
|
@@ -1028,6 +1054,7 @@ module Bio
|
|
1028
1054
|
# Stores meta-data "##sequence-region seqid start end".
|
1029
1055
|
class SequenceRegion
|
1030
1056
|
include Escape
|
1057
|
+
extend Escape
|
1031
1058
|
|
1032
1059
|
# creates a new SequenceRegion class
|
1033
1060
|
def initialize(seqid, start, endpos)
|
@@ -1039,7 +1066,7 @@ module Bio
|
|
1039
1066
|
# parses given string and returns SequenceRegion class
|
1040
1067
|
def self.parse(str)
|
1041
1068
|
dummy, seqid, start, endpos =
|
1042
|
-
str.chomp.split(/\s+/, 4).collect { |x|
|
1069
|
+
str.chomp.split(/\s+/, 4).collect { |x| unescape(x) }
|
1043
1070
|
self.new(seqid, start, endpos)
|
1044
1071
|
end
|
1045
1072
|
|
@@ -1139,7 +1166,8 @@ module Bio
|
|
1139
1166
|
|
1140
1167
|
# Return the record as a GFF3 compatible string
|
1141
1168
|
def to_s
|
1142
|
-
cmnt = if @comment and
|
1169
|
+
cmnt = if defined?(@comment) and @comment and
|
1170
|
+
!@comment.to_s.strip.empty? then
|
1143
1171
|
@comment.gsub(/[\r\n]+/, ' ')
|
1144
1172
|
else
|
1145
1173
|
false
|
@@ -1163,6 +1191,7 @@ module Bio
|
|
1163
1191
|
# data of "Target" attribute.
|
1164
1192
|
class Target
|
1165
1193
|
include GFF3::Escape
|
1194
|
+
extend GFF3::Escape
|
1166
1195
|
|
1167
1196
|
# Creates a new Target object.
|
1168
1197
|
def initialize(target_id, start, endpos, strand = nil)
|
@@ -1190,7 +1219,7 @@ module Bio
|
|
1190
1219
|
#
|
1191
1220
|
def self.parse(str)
|
1192
1221
|
target_id, start, endpos, strand =
|
1193
|
-
str.split(/ +/, 4).collect { |x|
|
1222
|
+
str.split(/ +/, 4).collect { |x| unescape(x) }
|
1194
1223
|
self.new(target_id, start, endpos, strand)
|
1195
1224
|
end
|
1196
1225
|
|
@@ -1332,15 +1361,15 @@ module Bio
|
|
1332
1361
|
|
1333
1362
|
# rest of data_ref
|
1334
1363
|
len = 0
|
1335
|
-
data_ref.each do |
|
1336
|
-
len +=
|
1364
|
+
data_ref.each do |r|
|
1365
|
+
len += r.length if r.code == :M
|
1337
1366
|
end
|
1338
1367
|
data.push Code.new(:D, len) if len > 0
|
1339
1368
|
|
1340
1369
|
# rest of data_tgt
|
1341
1370
|
len = 0
|
1342
|
-
data_tgt.each do |
|
1343
|
-
len +=
|
1371
|
+
data_tgt.each do |t|
|
1372
|
+
len += t.length if t.code == :M
|
1344
1373
|
end
|
1345
1374
|
data.push Code.new(:I, len) if len > 0
|
1346
1375
|
|