ms-fasta 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.0
1
+ 0.4.1
@@ -1,4 +1,5 @@
1
1
  require 'ms/fasta/archive'
2
+ require 'ms/fasta/header'
2
3
 
3
4
  module Ms
4
5
  module Fasta
@@ -20,61 +21,7 @@ module Ms
20
21
  end
21
22
  end
22
23
 
23
- # returns :ipi, :ncbi, or nil if can't be determined
24
- def self.filetype(file_or_io)
25
- ft = nil
26
- io =
27
- if file_or_io.is_a?(String)
28
- File.open(file_or_io)
29
- else
30
- init_pos = file_or_io.pos
31
- file_or_io.rewind
32
- file_or_io
33
- end
34
- io.each_line do |line|
35
- if line =~ /^>/
36
- ft = header_to_filetype(line[1..-1])
37
- break
38
- end
39
- end
40
-
41
- if file_or_io.is_a?(String)
42
- io.close
43
- else
44
- io.pos = init_pos
45
- end
46
- ft
47
- end
48
-
49
- # takes the header line (no leading >) and returns the kind of file
50
- def self.header_to_filetype(line)
51
- if line =~ /^IPI\:/
52
- :ipi
53
- elsif line =~ /^gi\|/
54
- :ncbi
55
- else
56
- nil
57
- end
58
- end
59
-
60
- # kind is :ipi or :ncbi or a String (the header)
61
- # gives the regular expression for parsing the header (no leading >)
62
- def self.id_regexp(kind)
63
- sym =
64
- if kind.is_a?(String)
65
- header_to_filetype(kind)
66
- else ; kind
67
- end
68
- case sym
69
- when :ipi
70
- /^IPI:(.*?)\|/o
71
- when :ncbi
72
- /^gi\|(.*?)\|/o
73
- else
74
- nil
75
- end
76
- end
77
-
24
+ extend Ms::Fasta::Header
78
25
  end
79
26
  end
80
27
 
@@ -0,0 +1,66 @@
1
+
2
module Ms
  module Fasta
    # Helpers for recognising which database a FASTA file came from, based on
    # the format of its header lines (the text following the leading '>').
    #
    # Recognised filetypes are :uniprot ("sp|..." / "tr|..."), :ipi ("IPI:...")
    # and :ncbi ("gi|..."). The module extends itself, so every method can be
    # called directly on Ms::Fasta::Header or on anything that extends it.
    module Header
      # Prefixes that identify each supported header format. The UniProt
      # alternation is grouped so the '^' anchor and the trailing '|' apply to
      # both "sp" and "tr" (an ungrouped /^sp|tr\|/ matches any line starting
      # with "sp" or containing "tr|" anywhere).
      UNIPROT_HEADER = /^(?:sp|tr)\|/
      IPI_HEADER     = /^IPI:/
      NCBI_HEADER    = /^gi\|/

      # Regular expressions whose first capture group is the entry ID, keyed
      # by filetype.
      ID_REGEXPS = {
        :uniprot => /^(?:sp|tr)\|(.*?)\|/,
        :ipi     => /^IPI:(.*?)\|/,
        :ncbi    => /^gi\|(.*?)\|/
      }.freeze

      # Scans for the first header line and returns the result of
      # header_to_filetype for it.
      #
      # file_or_io:: a String filename, or an IO-like object responding to
      #              pos, pos=, rewind and each_line.
      #
      # Returns :uniprot, :ipi, :ncbi, or nil when no header line is found or
      # the header format is not recognised.
      #
      # A file opened here is always closed, and an IO passed in is always
      # returned to its original position, even if scanning raises.
      def filetype(file_or_io)
        if file_or_io.is_a?(String)
          # block form guarantees the handle is closed on every exit path
          File.open(file_or_io) { |io| first_header_filetype(io) }
        else
          init_pos = file_or_io.pos
          begin
            file_or_io.rewind
            first_header_filetype(file_or_io)
          ensure
            file_or_io.pos = init_pos
          end
        end
      end

      # Takes the header line (no leading >) and returns the kind of file:
      # :uniprot, :ipi, :ncbi, or nil if it can't be determined.
      def header_to_filetype(line)
        if line =~ UNIPROT_HEADER
          :uniprot
        elsif line =~ IPI_HEADER
          :ipi
        elsif line =~ NCBI_HEADER
          :ncbi
        end
      end

      # kind is :uniprot, :ipi, :ncbi or a String (the header, no leading >).
      #
      # Returns the regular expression for parsing the header; its first
      # capture group is the entry ID. Returns nil for an unknown kind or an
      # unrecognised header.
      def id_regexp(kind)
        sym = kind.is_a?(String) ? header_to_filetype(kind) : kind
        ID_REGEXPS[sym]
      end

      private

      # Returns the filetype of the first line of +io+ that starts with '>',
      # or nil when there is no such line.
      def first_header_filetype(io)
        io.each_line do |line|
          # strip only the leading '>'; the rest of the line is passed through
          return header_to_filetype(line[1..-1]) if line =~ /^>/
        end
        nil
      end

      extend self
    end
  end
end
@@ -2,7 +2,7 @@ require File.join(File.dirname(__FILE__), '../../spec_helper.rb')
2
2
  require 'ms/fasta/archive'
3
3
 
4
4
  class FastaAchiveSpec
5
- include Ms::Fasta
5
+ include
6
6
 
7
7
  describe 'fasta archive operations' do
8
8
 
@@ -19,7 +19,7 @@ PROTEIN
19
19
  it 'reindexes' do
20
20
  strio = StringIO.new(FASTA_0 + FASTA_1)
21
21
  begin
22
- a = Archive.new(strio)
22
+ a = Ms::Fasta::Archive.new(strio)
23
23
 
24
24
  a.length.is 0
25
25
  a.reindex
@@ -35,10 +35,10 @@ PROTEIN
35
35
 
36
36
  it 'properly converts the fasta string to an entry object' do
37
37
  begin
38
- a = Archive.new
38
+ a = Ms::Fasta::Archive.new
39
39
  e = a.str_to_entry(FASTA_0)
40
40
 
41
- e.isa Entry
41
+ e.isa Ms::Fasta::Entry
42
42
  e.header.is "gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]"
43
43
  e.sequence.is("LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV" + "GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX" + "IENY")
44
44
  ensure
@@ -2,7 +2,6 @@ require File.join(File.dirname(__FILE__), '../../spec_helper.rb')
2
2
  require 'ms/fasta/entry'
3
3
 
4
4
  class FastaEntryTest
5
- include Ms::Fasta
6
5
 
7
6
  describe 'basic Entry operations' do
8
7
  # Abbreviated FASTA entry from wikipedia (http://en.wikipedia.org/wiki/FASTA_format)
@@ -17,7 +16,7 @@ IENY
17
16
  #
18
17
 
19
18
  it 'parses an entry as per docs' do
20
- entry = Entry.parse %q{
19
+ entry = Ms::Fasta::Entry.parse %q{
21
20
  >gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
22
21
  LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
23
22
  EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
@@ -35,7 +34,7 @@ IENY
35
34
  #
36
35
 
37
36
  it 'parses header and sequence' do
38
- e = Entry.parse(FASTA_0)
37
+ e = Ms::Fasta::Entry.parse(FASTA_0)
39
38
  e.header.is "gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]"
40
39
  e.sequence.is(
41
40
  "LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV" +
@@ -44,7 +43,7 @@ IENY
44
43
 
45
44
  it 'raises error for entries that do not start with gt' do
46
45
 
47
- lambda { Entry.parse "\n#{FASTA_0}" }.should.raise(RuntimeError).message.should.equal("input should begin with '>'")
46
+ lambda { Ms::Fasta::Entry.parse "\n#{FASTA_0}" }.should.raise(RuntimeError).message.should.equal("input should begin with '>'")
48
47
  end
49
48
 
50
49
  #
@@ -52,11 +51,11 @@ IENY
52
51
  #
53
52
 
54
53
  def test_entry_initialization
55
- e = Entry.new
54
+ e = Ms::Fasta::Entry.new
56
55
  assert_equal("", e.header)
57
56
  assert_equal("", e.sequence)
58
57
 
59
- e = Entry.new "head", "SEQ"
58
+ e = Ms::Fasta::Entry.new "head", "SEQ"
60
59
  assert_equal("head", e.header)
61
60
  assert_equal("SEQ", e.sequence)
62
61
  end
@@ -66,20 +65,20 @@ IENY
66
65
  #
67
66
 
68
67
  def test_dump_formats_a_fasta_entry
69
- e = Entry.new
68
+ e = Ms::Fasta::Entry.new
70
69
  assert_equal(">\n", e.dump)
71
70
 
72
- e = Entry.new "head", "SEQ"
71
+ e = Ms::Fasta::Entry.new "head", "SEQ"
73
72
  assert_equal(">head\nSEQ\n", e.dump)
74
73
  end
75
74
 
76
75
  def test_dump_formats_output_with_desired_line_length
77
- e = Entry.new "header", "ABCDEFGH"
76
+ e = Ms::Fasta::Entry.new "header", "ABCDEFGH"
78
77
  assert_equal(">header\nABC\nDEF\nGH\n", e.dump("", :line_length => 3))
79
78
  end
80
79
 
81
80
  def test_dump_line_length_less_than_1_raises_error
82
- e = Entry.new
81
+ e = Ms::Fasta::Entry.new
83
82
  assert_raise(ArgumentError) { e.dump("", :line_length => 0) }
84
83
  end
85
84
  end
@@ -0,0 +1,63 @@
1
+ require File.join(File.dirname(__FILE__), '../../spec_helper.rb')
2
+
3
+ require 'ms/fasta'
4
+ require 'ms/fasta/header'
5
+
6
+ shared 'determining filetype based on the header' do
7
+
8
+ before do
9
+ @header_lines = {
10
+ :ipi => ['IPI:IPI00000005.1|SWISS-PROT:P01111|TREMBL:Q5U091|ENSEMBL:ENSP00000358548;ENSP00000385392|REFSEQ:NP_002515|VEGA:OTTHUMP00000013879 Tax_Id=9606 Gene_Symbol=NRAS GTPase NRas'],
11
+ :uniprot => ['sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3', 'tr|D3DSH8|D3DSH8_HUMAN HCG2036819, isoform CRA_a OS=Homo sapiens GN=hCG_2036819 PE=4 SV=1'],
12
+ :ncbi => ['gi|16127999|ref|NP_414546.1| hypothetical protein b0005 [Escherichia coli K12]'],
13
+ }
14
+ end
15
+
16
+ xit 'can return the filetype given a file or io object' do
17
+ # need to write this
18
+ end
19
+
20
+ it 'returns a filetype or nil given a header line' do
21
+ @header_lines.each do |k,array|
22
+ array.each do |v|
23
+ @klass.header_to_filetype(v).is k
24
+ end
25
+ end
26
+ end
27
+
28
+ it 'returns a regular expression that retrieves the ID from a header' do
29
+ # A basic example:
30
+ header = "tr|D3DSH8|D3DSH8_HUMAN HCG2036819, iso ..."
31
+ regexp1 = @klass.id_regexp(:uniprot)
32
+ regexp2 = @klass.id_regexp(header)
33
+ regexp1.is regexp2
34
+ header.match(regexp1)[1].is "D3DSH8"
35
+
36
+ # exhaustively test:
37
+ {:ipi => %w(IPI00000005.1), :uniprot => %w(P31946 D3DSH8), :ncbi => %w(16127999)}.each do |symbol,v|
38
+ @header_lines[symbol].each do |header|
39
+ to_equal = v.shift
40
+ # id_regexp accepts either a filetype symbol or a header line (no leading >)
41
+ [symbol, header].each do |query|
42
+ regexp = @klass.id_regexp(query)
43
+ header.match(regexp)[1].is to_equal
44
+ end
45
+ end
46
+ end
47
+ end
48
+
49
+ end
50
+
51
+ describe 'Ms::Fasta::Header' do
52
+ before do
53
+ @klass = Ms::Fasta::Header
54
+ end
55
+ behaves_like 'determining filetype based on the header'
56
+ end
57
+
58
+ describe 'Ms::Fasta' do
59
+ before do
60
+ @klass = Ms::Fasta
61
+ end
62
+ behaves_like 'determining filetype based on the header'
63
+ end
@@ -2,33 +2,29 @@ require File.dirname(__FILE__) + '/../../spec_helper'
2
2
 
3
3
  require 'ms/fasta/ipi'
4
4
 
5
- class IpiSpec
6
- include Ms::Fasta
5
+ describe 'basic IPI operations' do
7
6
 
8
- describe 'basic IPI operations' do
7
+ before do
8
+ @headers = ['IPI:IPI00000001.2|SWISS-PROT:O95793-1|TREMBL:A8K622;Q59F99|ENSEMBL:ENSP00000360922;ENSP00000379466|REFSEQ:NP_059347|H-INV:HIT000329496|VEGA:OTTHUMP00000031233 Tax_Id=9606 Gene_Symbol=STAU1 Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1',
9
+ 'IPI:IPI00000005.1|SWISS-PROT:P01111|TREMBL:Q5U091|ENSEMBL:ENSP00000358548;ENSP00000385392|REFSEQ:NP_002515|VEGA:OTTHUMP00000013879 Tax_Id=9606 Gene_Symbol=NRAS GTPase NRas']
9
10
 
10
- before do
11
- @headers = ['IPI:IPI00000001.2|SWISS-PROT:O95793-1|TREMBL:A8K622;Q59F99|ENSEMBL:ENSP00000360922;ENSP00000379466|REFSEQ:NP_059347|H-INV:HIT000329496|VEGA:OTTHUMP00000031233 Tax_Id=9606 Gene_Symbol=STAU1 Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1',
12
- 'IPI:IPI00000005.1|SWISS-PROT:P01111|TREMBL:Q5U091|ENSEMBL:ENSP00000358548;ENSP00000385392|REFSEQ:NP_002515|VEGA:OTTHUMP00000013879 Tax_Id=9606 Gene_Symbol=NRAS GTPase NRas']
13
-
14
- @answers = [{"Gene_Symbol"=>"STAU1", "VEGA"=>"OTTHUMP00000031233", "IPI"=>"IPI00000001.2", "H-INV"=>"HIT000329496", "REFSEQ"=>"NP_059347", "Tax_Id"=>"9606", "SWISS-PROT"=>"O95793-1", "ENSEMBL"=>"ENSP00000360922;ENSP00000379466", "TREMBL"=>"A8K622;Q59F99", "description"=>"Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1"},
15
- {"Gene_Symbol"=>"NRAS", "VEGA"=>"OTTHUMP00000013879", "IPI"=>"IPI00000005.1", "REFSEQ"=>"NP_002515", "Tax_Id"=>"9606", "SWISS-PROT"=>"P01111", "ENSEMBL"=>"ENSP00000358548;ENSP00000385392", "TREMBL"=>"Q5U091", "description"=>"GTPase NRas"}]
16
- end
11
+ @answers = [{"Gene_Symbol"=>"STAU1", "VEGA"=>"OTTHUMP00000031233", "IPI"=>"IPI00000001.2", "H-INV"=>"HIT000329496", "REFSEQ"=>"NP_059347", "Tax_Id"=>"9606", "SWISS-PROT"=>"O95793-1", "ENSEMBL"=>"ENSP00000360922;ENSP00000379466", "TREMBL"=>"A8K622;Q59F99", "description"=>"Isoform Long of Double-stranded RNA-binding protein Staufen homolog 1"},
12
+ {"Gene_Symbol"=>"NRAS", "VEGA"=>"OTTHUMP00000013879", "IPI"=>"IPI00000005.1", "REFSEQ"=>"NP_002515", "Tax_Id"=>"9606", "SWISS-PROT"=>"P01111", "ENSEMBL"=>"ENSP00000358548;ENSP00000385392", "TREMBL"=>"Q5U091", "description"=>"GTPase NRas"}]
13
+ end
17
14
 
18
- it 'parses IPI headers' do
19
- # assumes that the leading '>' has been removed
20
- @headers.zip(@answers) do |header, answ|
21
- Ipi.parse(header).is answ
22
- end
15
+ it 'parses IPI headers' do
16
+ # assumes that the leading '>' has been removed
17
+ @headers.zip(@answers) do |header, answ|
18
+ Ms::Fasta::Ipi.parse(header).is answ
23
19
  end
20
+ end
24
21
 
25
- it 'can retrive the IPI ID' do
26
- answers = ["IPI00000001.2", "IPI00000005.1"]
22
+ it 'can retrive the IPI ID' do
23
+ answers = ["IPI00000001.2", "IPI00000005.1"]
27
24
 
28
- @headers.zip(answers) do |header, answ|
29
- Ipi.ipi(header).is answ
30
- end
25
+ @headers.zip(answers) do |header, answ|
26
+ Ms::Fasta::Ipi.ipi(header).is answ
31
27
  end
32
-
33
28
  end
29
+
34
30
  end
@@ -13,16 +13,17 @@ describe 'basic fasta operations' do
13
13
  header + "\n" + data
14
14
  end.join("\n")
15
15
  @data['carriage_returns_and_newlines'] = @data['newlines'].gsub("\n", "\r\n")
16
+ @files = {}
16
17
  @data.each do |k,v|
17
18
  file_key = k + '_file'
18
19
  filename = k + '.tmp'
19
- @data[file_key] = filename
20
+ @files[file_key] = filename
20
21
  File.open(filename, 'w') {|out| out.print v }
21
22
  end
22
23
  end
23
24
 
24
25
  after do
25
- @data.select {|k,v| k =~ /_file$/ }.each do |k,filename|
26
+ @files.select {|k,v| k =~ /_file$/ }.each do |k,filename|
26
27
  index = filename.sub('.tmp', '.index')
27
28
  [filename, index].each do |fn|
28
29
  File.unlink(fn) if File.exist? fn
@@ -43,7 +44,7 @@ describe 'basic fasta operations' do
43
44
 
44
45
  it 'can read a file' do
45
46
  %w(newlines_file carriage_returns_and_newlines_file).each do |file|
46
- Ms::Fasta.open(@data[file]) do |fasta|
47
+ Ms::Fasta.open(@files[file]) do |fasta|
47
48
  fasta_correct? fasta
48
49
  end
49
50
  end
@@ -51,7 +52,7 @@ describe 'basic fasta operations' do
51
52
 
52
53
  it 'can read an IO object' do
53
54
  %w(newlines_file carriage_returns_and_newlines_file).each do |file|
54
- File.open(@data[file]) do |io|
55
+ File.open(@files[file]) do |io|
55
56
  fasta = Ms::Fasta.new(io)
56
57
  fasta_correct? fasta
57
58
  end
@@ -67,7 +68,7 @@ describe 'basic fasta operations' do
67
68
 
68
69
  it 'iterates entries with foreach' do
69
70
  %w(newlines_file carriage_returns_and_newlines_file).each do |file|
70
- Ms::Fasta.foreach(@data[file]) do |entry|
71
+ Ms::Fasta.foreach(@files[file]) do |entry|
71
72
  entry.isa Ms::Fasta::Entry
72
73
  end
73
74
  end
metadata CHANGED
@@ -1,7 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ms-fasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 4
8
+ - 1
9
+ version: 0.4.1
5
10
  platform: ruby
6
11
  authors:
7
12
  - John T. Prince
@@ -9,19 +14,23 @@ autorequire:
9
14
  bindir: bin
10
15
  cert_chain: []
11
16
 
12
- date: 2009-12-09 00:00:00 -07:00
17
+ date: 2010-06-28 00:00:00 -06:00
13
18
  default_executable:
14
19
  dependencies:
15
20
  - !ruby/object:Gem::Dependency
16
21
  name: spec-more
17
- type: :development
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
20
24
  requirements:
21
25
  - - ">="
22
26
  - !ruby/object:Gem::Version
27
+ segments:
28
+ - 1
29
+ - 1
30
+ - 0
23
31
  version: 1.1.0
24
- version:
32
+ type: :development
33
+ version_requirements: *id001
25
34
  description: provides programmatic access to fasta files
26
35
  email: jtprince@gmail.com
27
36
  executables: []
@@ -41,6 +50,7 @@ files:
41
50
  - lib/ms/fasta.rb
42
51
  - lib/ms/fasta/archive.rb
43
52
  - lib/ms/fasta/entry.rb
53
+ - lib/ms/fasta/header.rb
44
54
  - lib/ms/fasta/ipi.rb
45
55
  - lib/ms/fasta/ipi/dat.rb
46
56
  - lib/ms/load/fasta.rb
@@ -48,7 +58,7 @@ files:
48
58
  - lib/ms/select/fasta.rb
49
59
  - spec/ms/fasta/archive_spec.rb
50
60
  - spec/ms/fasta/entry_spec.rb
51
- - spec/ms/fasta/ipi/dat_spec.rb
61
+ - spec/ms/fasta/header_spec.rb
52
62
  - spec/ms/fasta/ipi_spec.rb
53
63
  - spec/ms/fasta_spec.rb
54
64
  - spec/spec_helper.rb
@@ -66,25 +76,27 @@ required_ruby_version: !ruby/object:Gem::Requirement
66
76
  requirements:
67
77
  - - ">="
68
78
  - !ruby/object:Gem::Version
79
+ segments:
80
+ - 0
69
81
  version: "0"
70
- version:
71
82
  required_rubygems_version: !ruby/object:Gem::Requirement
72
83
  requirements:
73
84
  - - ">="
74
85
  - !ruby/object:Gem::Version
86
+ segments:
87
+ - 0
75
88
  version: "0"
76
- version:
77
89
  requirements: []
78
90
 
79
91
  rubyforge_project: mspire
80
- rubygems_version: 1.3.5
92
+ rubygems_version: 1.3.6
81
93
  signing_key:
82
94
  specification_version: 3
83
95
  summary: An mspire library for working with fasta formatted files
84
96
  test_files:
85
97
  - spec/ms/fasta/archive_spec.rb
86
98
  - spec/ms/fasta/ipi_spec.rb
87
- - spec/ms/fasta/ipi/dat_spec.rb
99
+ - spec/ms/fasta/header_spec.rb
88
100
  - spec/ms/fasta/entry_spec.rb
89
101
  - spec/ms/fasta_spec.rb
90
102
  - spec/spec_helper.rb
@@ -1,7 +0,0 @@
1
- require File.dirname(__FILE__) + '/../../../spec_helper'
2
-
3
- require 'ms/fasta/ipi/dat'
4
-
5
- class IpiDatSpec
6
-
7
- end