ms-fasta 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.0
1
+ 0.4.1
@@ -1,4 +1,5 @@
1
1
  require 'ms/fasta/archive'
2
+ require 'ms/fasta/header'
2
3
 
3
4
  module Ms
4
5
  module Fasta
@@ -20,61 +21,7 @@ module Ms
20
21
  end
21
22
  end
22
23
 
23
- # returns :ipi, :ncbi, or nil if can't be determined
24
- def self.filetype(file_or_io)
25
- ft = nil
26
- io =
27
- if file_or_io.is_a?(String)
28
- File.open(file_or_io)
29
- else
30
- init_pos = file_or_io.pos
31
- file_or_io.rewind
32
- file_or_io
33
- end
34
- io.each_line do |line|
35
- if line =~ /^>/
36
- ft = header_to_filetype(line[1..-1])
37
- break
38
- end
39
- end
40
-
41
- if file_or_io.is_a?(String)
42
- io.close
43
- else
44
- io.pos = init_pos
45
- end
46
- ft
47
- end
48
-
49
- # takes the header line (no leading >) and returns the kind of file
50
- def self.header_to_filetype(line)
51
- if line =~ /^IPI\:/
52
- :ipi
53
- elsif line =~ /^gi\|/
54
- :ncbi
55
- else
56
- nil
57
- end
58
- end
59
-
60
- # kind is :ipi or :ncbi or a String (the header)
61
- # gives the regular expression for parsing the header (no leading >)
62
- def self.id_regexp(kind)
63
- sym =
64
- if kind.is_a?(String)
65
- header_to_filetype(kind)
66
- else ; kind
67
- end
68
- case sym
69
- when :ipi
70
- /^IPI:(.*?)\|/o
71
- when :ncbi
72
- /^gi\|(.*?)\|/o
73
- else
74
- nil
75
- end
76
- end
77
-
24
+ extend Ms::Fasta::Header
78
25
  end
79
26
  end
80
27
 
@@ -0,0 +1,66 @@
1
+
2
+ module Ms
3
+ module Fasta
4
+ module Header
5
+ # scans for a header and returns the results of header_to_filetype
6
+ def filetype(file_or_io)
7
+ ft = nil
8
+ io =
9
+ if file_or_io.is_a?(String)
10
+ File.open(file_or_io)
11
+ else
12
+ init_pos = file_or_io.pos
13
+ file_or_io.rewind
14
+ file_or_io
15
+ end
16
+ io.each_line do |line|
17
+ if line =~ /^>/
18
+ ft = header_to_filetype(line[1..-1])
19
+ break
20
+ end
21
+ end
22
+
23
+ if file_or_io.is_a?(String)
24
+ io.close
25
+ else
26
+ io.pos = init_pos
27
+ end
28
+ ft
29
+ end
30
+
31
+ # takes the header line (no leading >) and returns the kind of file
32
+ def header_to_filetype(line)
33
+ if line =~ /^sp|tr\|/
34
+ :uniprot
35
+ elsif line =~ /^IPI\:/
36
+ :ipi
37
+ elsif line =~ /^gi\|/
38
+ :ncbi
39
+ else
40
+ nil
41
+ end
42
+ end
43
+
44
+ # kind is :uniprot, :ipi, :ncbi or a String (the header)
45
+ # gives the regular expression for parsing the header (no leading >)
46
+ def id_regexp(kind)
47
+ sym =
48
+ if kind.is_a?(String)
49
+ header_to_filetype(kind)
50
+ else ; kind
51
+ end
52
+ case sym
53
+ when :uniprot
54
+ /^[st][pr]\|(.*?)\|/o
55
+ when :ipi
56
+ /^IPI:(.*?)\|/o
57
+ when :ncbi
58
+ /^gi\|(.*?)\|/o
59
+ else
60
+ nil
61
+ end
62
+ end
63
+ extend self
64
+ end
65
+ end
66
+ end
@@ -2,7 +2,7 @@ require File.join(File.dirname(__FILE__), '../../spec_helper.rb')
2
2
  require 'ms/fasta/archive'
3
3
 
4
4
  class FastaAchiveSpec
5
- include Ms::Fasta
5
+ include
6
6
 
7
7
  describe 'fasta archive operations' do
8
8
 
@@ -19,7 +19,7 @@ PROTEIN
19
19
  it 'reindexes' do
20
20
  strio = StringIO.new(FASTA_0 + FASTA_1)
21
21
  begin
22
- a = Archive.new(strio)
22
+ a = Ms::Fasta::Archive.new(strio)
23
23
 
24
24
  a.length.is 0
25
25
  a.reindex
@@ -35,10 +35,10 @@ PROTEIN
35
35
 
36
36
  it 'properly converts the fasta string to an entry object' do
37
37
  begin
38
- a = Archive.new
38
+ a = Ms::Fasta::Archive.new
39
39
  e = a.str_to_entry(FASTA_0)
40
40
 
41
- e.isa Entry
41
+ e.isa Ms::Fasta::Entry
42
42
  e.header.is "gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]"
43
43
  e.sequence.is("LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV" + "GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX" + "IENY")
44
44
  ensure
@@ -2,7 +2,6 @@ require File.join(File.dirname(__FILE__), '../../spec_helper.rb')
2
2
  require 'ms/fasta/entry'
3
3
 
4
4
  class FastaEntryTest
5
- include Ms::Fasta
6
5
 
7
6
  describe 'basic Entry operations' do
8
7
  # Abbreviated FASTA entry from wikipedia (http://en.wikipedia.org/wiki/FASTA_format)
@@ -17,7 +16,7 @@ IENY
17
16
  #
18
17
 
19
18
  it 'parses an entry as per docs' do
20
- entry = Entry.parse %q{
19
+ entry = Ms::Fasta::Entry.parse %q{
21
20
  >gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
22
21
  LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
23
22
  EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
@@ -35,7 +34,7 @@ IENY
35
34
  #
36
35
 
37
36
  it 'parses header and sequence' do
38
- e = Entry.parse(FASTA_0)
37
+ e = Ms::Fasta::Entry.parse(FASTA_0)
39
38
  e.header.is "gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]"
40
39
  e.sequence.is(
41
40
  "LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV" +
@@ -44,7 +43,7 @@ IENY
44
43
 
45
44
  it 'raises error for entries that do not start with gt' do
46
45
 
47
- lambda { Entry.parse "\n#{FASTA_0}" }.should.raise(RuntimeError).message.should.equal("input should begin with '>'")
46
+ lambda { Ms::Fasta::Entry.parse "\n#{FASTA_0}" }.should.raise(RuntimeError).message.should.equal("input should begin with '>'")
48
47
  end
49
48
 
50
49
  #
@@ -52,11 +51,11 @@ IENY
52
51
  #
53
52
 
54
53
  def test_entry_initialization
55
- e = Entry.new
54
+ e = Ms::Fasta::Entry.new
56
55
  assert_equal("", e.header)
57
56
  assert_equal("", e.sequence)
58
57
 
59
- e = Entry.new "head", "SEQ"
58
+ e = Ms::Fasta::Entry.new "head", "SEQ"
60
59
  assert_equal("head", e.header)
61
60
  assert_equal("SEQ", e.sequence)
62
61
  end
@@ -66,20 +65,20 @@ IENY
66
65
  #
67
66
 
68
67
  def test_dump_formats_a_fasta_entry
69
- e = Entry.new
68
+ e = Ms::Fasta::Entry.new
70
69
  assert_equal(">\n", e.dump)
71
70
 
72
- e = Entry.new "head", "SEQ"
71
+ e = Ms::Fasta::Entry.new "head", "SEQ"
73
72
  assert_equal(">head\nSEQ\n", e.dump)
74
73
  end
75
74
 
76
75
  def test_dump_formats_output_with_desired_line_length
77
- e = Entry.new "header", "ABCDEFGH"
76
+ e = Ms::Fasta::Entry.new "header", "ABCDEFGH"
78
77
  assert_equal(">header\nABC\nDEF\nGH\n", e.dump("", :line_length => 3))
79
78
  end
80
79
 
81
80
  def test_dump_line_length_less_than_1_raises_error
82
- e = Entry.new
81
+ e = Ms::Fasta::Entry.new
83
82
  assert_raise(ArgumentError) { e.dump("", :line_length => 0) }
84
83
  end
85
84
  end
@@ -0,0 +1,63 @@
1
+ require File.join(File.dirname(__FILE__), '../../spec_helper.rb')
2
+
3
+ require 'ms/fasta'
4
+ require 'ms/fasta/header'
5
+
6
+ shared 'determining filetype based on the header' do
7
+
8
+ before do
9
+ @header_lines = {
10
+ :ipi => ['IPI:IPI00000005.1|SWISS-PROT:P01111|TREMBL:Q5U091|ENSEMBL:ENSP00000358548;ENSP00000385392|REFSEQ:NP_002515|VEGA:OTTHUMP00000013879 Tax_Id=9606 Gene_Symbol=NRAS GTPase NRas'],
11
+ :uniprot => ['sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3', 'tr|D3DSH8|D3DSH8_HUMAN HCG2036819, isoform CRA_a OS=Homo sapiens GN=hCG_2036819 PE=4 SV=1'],
12
+ :ncbi => ['gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli K12]'],
13
+ }
14
+ end
15
+
16
+ xit 'can return the filetype given a file or io object' do
17
+ # need to write this
18
+ end
19
+
20
+ it 'returns a filetype or nil given a header line' do
21
+ @header_lines.each do |k,array|
22
+ array.each do |v|
23
+ @klass.header_to_filetype(v).is k
24
+ end
25
+ end
26
+ end
27
+
28
+ it 'returns a regular expression that retrieves the ID from a header' do
29
+ # A basic example:
30
+ header = "tr|D3DSH8|D3DSH8_HUMAN HCG2036819, iso ..."
31
+ regexp1 = @klass.id_regexp(:uniprot)
32
+ regexp2 = @klass.id_regexp(header)
33
+ regexp1.is regexp2
34
+ header.match(regexp1)[1].is "D3DSH8"
35
+
36
+ # exhaustively test:
37
+ {:ipi => %w(IPI00000005.1), :uniprot => %w(P31946 D3DSH8), :ncbi => %w(16127999)}.each do |symbol,v|
38
+ @header_lines[symbol].each do |header|
39
+ to_equal = v.shift
40
+ # takes either a header line (no >)
41
+ [symbol, header].each do |query|
42
+ regexp = @klass.id_regexp(query)
43
+ header.match(regexp)[1].is to_equal
44
+ end
45
+ end
46
+ end
47
+ end
48
+
49
+ end
50
+
51
+ describe 'Ms::Fasta::Header' do
52
+ before do
53
+ @klass = Ms::Fasta::Header
54
+ end
55
+ behaves_like 'determining filetype based on the header'
56
+ end
57
+
58
+ describe 'Ms::Fasta' do
59
+ before do
60
+ @klass = Ms::Fasta
61
+ end
62
+ behaves_like 'determining filetype based on the header'
63
+ end
@@ -2,33 +2,29 @@ require File.dirname(__FILE__) + '/../../spec_helper'
2
2
 
3
3
  require 'ms/fasta/ipi'
4
4
 
5
- class IpiSpec
6
- include Ms::Fasta
5
+ describe 'basic IPI operations' do
7
6
 
8
- describe 'basic IPI operations' do
7
+ before do
8
+ @headers = ['IPI:IPI00000001.2|SWISS-PROT:O95793-1|TREMBL:A8K622;Q59F99|ENSEMBL:ENSP00000360922;ENSP00000379466|REFSEQ:NP_059347|H-INV:HIT000329496|VEGA:OTTHUMP00000031233 Tax_Id=9606 Gene_Symbol=STAU1 Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1',
9
+ 'IPI:IPI00000005.1|SWISS-PROT:P01111|TREMBL:Q5U091|ENSEMBL:ENSP00000358548;ENSP00000385392|REFSEQ:NP_002515|VEGA:OTTHUMP00000013879 Tax_Id=9606 Gene_Symbol=NRAS GTPase NRas']
9
10
 
10
- before do
11
- @headers = ['IPI:IPI00000001.2|SWISS-PROT:O95793-1|TREMBL:A8K622;Q59F99|ENSEMBL:ENSP00000360922;ENSP00000379466|REFSEQ:NP_059347|H-INV:HIT000329496|VEGA:OTTHUMP00000031233 Tax_Id=9606 Gene_Symbol=STAU1 Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1',
12
- 'IPI:IPI00000005.1|SWISS-PROT:P01111|TREMBL:Q5U091|ENSEMBL:ENSP00000358548;ENSP00000385392|REFSEQ:NP_002515|VEGA:OTTHUMP00000013879 Tax_Id=9606 Gene_Symbol=NRAS GTPase NRas']
13
-
14
- @answers = [{"Gene_Symbol"=>"STAU1", "VEGA"=>"OTTHUMP00000031233", "IPI"=>"IPI00000001.2", "H-INV"=>"HIT000329496", "REFSEQ"=>"NP_059347", "Tax_Id"=>"9606", "SWISS-PROT"=>"O95793-1", "ENSEMBL"=>"ENSP00000360922;ENSP00000379466", "TREMBL"=>"A8K622;Q59F99", "description"=>"Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1"},
15
- {"Gene_Symbol"=>"NRAS", "VEGA"=>"OTTHUMP00000013879", "IPI"=>"IPI00000005.1", "REFSEQ"=>"NP_002515", "Tax_Id"=>"9606", "SWISS-PROT"=>"P01111", "ENSEMBL"=>"ENSP00000358548;ENSP00000385392", "TREMBL"=>"Q5U091", "description"=>"GTPase NRas"}]
16
- end
11
+ @answers = [{"Gene_Symbol"=>"STAU1", "VEGA"=>"OTTHUMP00000031233", "IPI"=>"IPI00000001.2", "H-INV"=>"HIT000329496", "REFSEQ"=>"NP_059347", "Tax_Id"=>"9606", "SWISS-PROT"=>"O95793-1", "ENSEMBL"=>"ENSP00000360922;ENSP00000379466", "TREMBL"=>"A8K622;Q59F99", "description"=>"Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1"},
12
+ {"Gene_Symbol"=>"NRAS", "VEGA"=>"OTTHUMP00000013879", "IPI"=>"IPI00000005.1", "REFSEQ"=>"NP_002515", "Tax_Id"=>"9606", "SWISS-PROT"=>"P01111", "ENSEMBL"=>"ENSP00000358548;ENSP00000385392", "TREMBL"=>"Q5U091", "description"=>"GTPase NRas"}]
13
+ end
17
14
 
18
- it 'parses IPI headers' do
19
- # assumes that the leading '>' has been removed
20
- @headers.zip(@answers) do |header, answ|
21
- Ipi.parse(header).is answ
22
- end
15
+ it 'parses IPI headers' do
16
+ # assumes that the leading '>' has been removed
17
+ @headers.zip(@answers) do |header, answ|
18
+ Ms::Fasta::Ipi.parse(header).is answ
23
19
  end
20
+ end
24
21
 
25
- it 'can retrive the IPI ID' do
26
- answers = ["IPI00000001.2", "IPI00000005.1"]
22
+ it 'can retrive the IPI ID' do
23
+ answers = ["IPI00000001.2", "IPI00000005.1"]
27
24
 
28
- @headers.zip(answers) do |header, answ|
29
- Ipi.ipi(header).is answ
30
- end
25
+ @headers.zip(answers) do |header, answ|
26
+ Ms::Fasta::Ipi.ipi(header).is answ
31
27
  end
32
-
33
28
  end
29
+
34
30
  end
@@ -13,16 +13,17 @@ describe 'basic fasta operations' do
13
13
  header + "\n" + data
14
14
  end.join("\n")
15
15
  @data['carriage_returns_and_newlines'] = @data['newlines'].gsub("\n", "\r\n")
16
+ @files = {}
16
17
  @data.each do |k,v|
17
18
  file_key = k + '_file'
18
19
  filename = k + '.tmp'
19
- @data[file_key] = filename
20
+ @files[file_key] = filename
20
21
  File.open(filename, 'w') {|out| out.print v }
21
22
  end
22
23
  end
23
24
 
24
25
  after do
25
- @data.select {|k,v| k =~ /_file$/ }.each do |k,filename|
26
+ @files.select {|k,v| k =~ /_file$/ }.each do |k,filename|
26
27
  index = filename.sub('.tmp', '.index')
27
28
  [filename, index].each do |fn|
28
29
  File.unlink(fn) if File.exist? fn
@@ -43,7 +44,7 @@ describe 'basic fasta operations' do
43
44
 
44
45
  it 'can read a file' do
45
46
  %w(newlines_file carriage_returns_and_newlines_file).each do |file|
46
- Ms::Fasta.open(@data[file]) do |fasta|
47
+ Ms::Fasta.open(@files[file]) do |fasta|
47
48
  fasta_correct? fasta
48
49
  end
49
50
  end
@@ -51,7 +52,7 @@ describe 'basic fasta operations' do
51
52
 
52
53
  it 'can read an IO object' do
53
54
  %w(newlines_file carriage_returns_and_newlines_file).each do |file|
54
- File.open(@data[file]) do |io|
55
+ File.open(@files[file]) do |io|
55
56
  fasta = Ms::Fasta.new(io)
56
57
  fasta_correct? fasta
57
58
  end
@@ -67,7 +68,7 @@ describe 'basic fasta operations' do
67
68
 
68
69
  it 'iterates entries with foreach' do
69
70
  %w(newlines_file carriage_returns_and_newlines_file).each do |file|
70
- Ms::Fasta.foreach(@data[file]) do |entry|
71
+ Ms::Fasta.foreach(@files[file]) do |entry|
71
72
  entry.isa Ms::Fasta::Entry
72
73
  end
73
74
  end
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ms-fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 4
8
+ - 1
9
+ version: 0.4.1
5
10
  platform: ruby
6
11
  authors:
7
12
  - John T. Prince
@@ -9,19 +14,23 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2009-12-09 00:00:00 -07:00
17
+ date: 2010-06-28 00:00:00 -06:00
13
18
  default_executable:
14
19
  dependencies:
15
20
  - !ruby/object:Gem::Dependency
16
21
  name: spec-more
17
- type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
20
24
  requirements:
21
25
  - - ">="
22
26
  - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 1
30
+ - 0
23
31
  version: 1.1.0
24
- version:
32
+ type: :development
33
+ version_requirements: *id001
25
34
  description: provides programmatic access to fasta files
26
35
  email: jtprince@gmail.com
27
36
  executables: []
@@ -41,6 +50,7 @@ files:
41
50
  - lib/ms/fasta.rb
42
51
  - lib/ms/fasta/archive.rb
43
52
  - lib/ms/fasta/entry.rb
53
+ - lib/ms/fasta/header.rb
44
54
  - lib/ms/fasta/ipi.rb
45
55
  - lib/ms/fasta/ipi/dat.rb
46
56
  - lib/ms/load/fasta.rb
@@ -48,7 +58,7 @@ files:
48
58
  - lib/ms/select/fasta.rb
49
59
  - spec/ms/fasta/archive_spec.rb
50
60
  - spec/ms/fasta/entry_spec.rb
51
- - spec/ms/fasta/ipi/dat_spec.rb
61
+ - spec/ms/fasta/header_spec.rb
52
62
  - spec/ms/fasta/ipi_spec.rb
53
63
  - spec/ms/fasta_spec.rb
54
64
  - spec/spec_helper.rb
@@ -66,25 +76,27 @@ required_ruby_version: !ruby/object:Gem::Requirement
66
76
  requirements:
67
77
  - - ">="
68
78
  - !ruby/object:Gem::Version
79
+ segments:
80
+ - 0
69
81
  version: "0"
70
- version:
71
82
  required_rubygems_version: !ruby/object:Gem::Requirement
72
83
  requirements:
73
84
  - - ">="
74
85
  - !ruby/object:Gem::Version
86
+ segments:
87
+ - 0
75
88
  version: "0"
76
- version:
77
89
  requirements: []
78
90
 
79
91
  rubyforge_project: mspire
80
- rubygems_version: 1.3.5
92
+ rubygems_version: 1.3.6
81
93
  signing_key:
82
94
  specification_version: 3
83
95
  summary: An mspire library for working with fasta formatted files
84
96
  test_files:
85
97
  - spec/ms/fasta/archive_spec.rb
86
98
  - spec/ms/fasta/ipi_spec.rb
87
- - spec/ms/fasta/ipi/dat_spec.rb
99
+ - spec/ms/fasta/header_spec.rb
88
100
  - spec/ms/fasta/entry_spec.rb
89
101
  - spec/ms/fasta_spec.rb
90
102
  - spec/spec_helper.rb
@@ -1,7 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../../spec_helper'
2
-
3
- require 'ms/fasta/ipi/dat'
4
-
5
- class IpiDatSpec
6
-
7
- end