scaffolder 0.2.6 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
+ source "http://rubygems.org"
2
+
3
+ group :default do
4
+ gem "bio", "~> 1.4"
5
+ end
6
+
7
+ group :development do
8
+ gem "bundler", "~> 1.0"
9
+ gem "shoulda", "~> 2.11"
10
+ gem "mocha", "~> 0.9"
11
+ gem "yard", "~> 0.6"
12
+ gem "cucumber", "~> 0.9"
13
+ gem "jeweler", "~> 1.5"
14
+ gem "redgreen", "~> 1.2"
15
+ end
data/LICENSE CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2009 Michael Barton
1
+ Copyright (c) 2010 Michael Barton
2
2
 
3
3
  Permission is hereby granted, free of charge, to any person obtaining
4
4
  a copy of this software and associated documentation files (the
data/README.rdoc CHANGED
@@ -1,25 +1,37 @@
1
- = scaffolder
1
+ == Synopsis
2
2
 
3
- Description goes here.
3
+ A simple genome scaffolder API for merging sequence contigs to build a continuous
4
+ draft sequence. The draft sequence is constructed through specifying the order of
5
+ contigs in human-readable YAML files. Since the draft genome is specified by the
6
+ scaffold YAML it is easy to remove or manipulate existing sequences. In addition,
7
+ the scaffold file is easy to edit and is ideal for version control and
8
+ repeatability.
4
9
 
5
- == Note on Patches/Pull Requests
6
-
7
- * Fork the project.
8
- * Make your feature addition or bug fix.
9
- * Add tests for it. This is important so I don't break it in a
10
- future version unintentionally.
11
- * Commit, do not mess with rakefile, version, or history.
12
- (if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
13
- * Send me a pull request. Bonus points for topic branches.
10
+ == Feature List
14
11
 
15
- == Copyright
12
+ * Construct a draft sequence scaffold using human-readable and versionable
13
+ plain text files.
14
+ * A simple and extensible Ruby API to traverse the scaffold.
15
+
16
+ == Installing
17
+
18
+ Ruby and RubyGems are required to use scaffolder. Scaffolder is installed on
19
+ the command line using:
16
20
 
17
- Copyright (c) 2010 Michael Barton. See LICENSE for details.
21
+ $ gem install scaffolder
18
22
 
19
- == Notes
23
+ == Documentation
20
24
 
21
- Inserts processed are processed in reverse order according to end position. Last insert added up until first insert. Done to preserve insert coordinates.
25
+ See the Scaffolder class for getting started with Scaffolder.
22
26
 
23
- Overlapping inserts may cause unexpected behaviour
27
+ == Contact
28
+
29
+ Scaffolder was developed by Michael Barton (www.michaelbarton.me.uk). Pull
30
+ requests, patches and bug reports are welcome. The source code is available on
31
+ github[http://github.com/michaelbarton/scaffolder]. Bug reports and feature
32
+ requests may also be made there.
33
+
34
+ == Copyright
24
35
 
25
- Sequence reversed after inserts have been added.
36
+ Scaffolder © 2010 by Michael Barton. Scaffolder is licensed under the MIT
37
+ license. Please see the LICENSE document for more information.
data/Rakefile CHANGED
@@ -1,26 +1,25 @@
1
1
  require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
2
10
  require 'rake'
3
11
 
4
- begin
5
- require 'jeweler'
6
- Jeweler::Tasks.new do |gem|
7
- gem.name = "scaffolder"
8
- gem.summary = %Q{Scaffolder for genome sequence data}
9
- gem.description = %Q{Organise genome sequence data into scaffolds using YAML configuration files.}
10
- gem.email = "mail@michaelbarton.me.uk"
11
- gem.homepage = "http://github.com/michaelbarton/scaffolder"
12
- gem.authors = ["Michael Barton"]
13
- gem.add_dependency "bio", ">= 0"
14
- gem.add_development_dependency "rr", ">= 0.10.11"
15
- gem.add_development_dependency "shoulda", ">= 0"
16
- gem.add_development_dependency "redgreen", ">= 0"
17
- gem.add_development_dependency "yard", ">= 0"
18
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
19
- end
20
- Jeweler::GemcutterTasks.new
21
- rescue LoadError
22
- puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
12
+ require 'jeweler'
13
+ Jeweler::Tasks.new do |gem|
14
+ gem.name = "scaffolder"
15
+ gem.homepage = "http://www.michaelbarton.me.uk/scaffolder/"
16
+ gem.license = "MIT"
17
+ gem.summary = %Q{Genome scaffolding for human beings.}
18
+ gem.description = %Q{Organise sequence contigs into genome scaffolds using simple human-readable YAML files.}
19
+ gem.email = "mail@michaelbarton.me.uk"
20
+ gem.authors = ["Michael Barton"]
23
21
  end
22
+ Jeweler::RubygemsDotOrgTasks.new
24
23
 
25
24
  require 'rake/testtask'
26
25
  Rake::TestTask.new(:test) do |test|
@@ -29,28 +28,10 @@ Rake::TestTask.new(:test) do |test|
29
28
  test.verbose = true
30
29
  end
31
30
 
32
- begin
33
- require 'rcov/rcovtask'
34
- Rcov::RcovTask.new do |test|
35
- test.libs << 'test'
36
- test.pattern = 'test/**/test_*.rb'
37
- test.verbose = true
38
- end
39
- rescue LoadError
40
- task :rcov do
41
- abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
42
- end
43
- end
31
+ require 'cucumber/rake/task'
32
+ Cucumber::Rake::Task.new(:features)
44
33
 
45
- task :test => :check_dependencies
34
+ require 'yard'
35
+ YARD::Rake::YardocTask.new
46
36
 
47
37
  task :default => :test
48
-
49
- begin
50
- require 'yard'
51
- YARD::Rake::YardocTask.new
52
- rescue LoadError
53
- task :yardoc do
54
- abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
55
- end
56
- end
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.6
1
+ 0.4.0
data/cucumber.yml ADDED
@@ -0,0 +1,2 @@
1
+ ---
2
+ default: --format progress --color
@@ -0,0 +1,15 @@
1
+ Feature: The insert keyword
2
+ In order to close gaps in the scaffold
3
+ A user can use the insert keyword
4
+
5
+ Scenario: A scaffold with a single insert keyword
6
+ Given the scaffold file has the sequences:
7
+ | name | nucleotides |
8
+ | seq | ATGCCGCGTAA |
9
+ And the first scaffold sequence has the inserts:
10
+ | name | nucleotides | open | close |
11
+ | ins | AAAA | 4 | 6 |
12
+ When creating a scaffolder object
13
+ Then the scaffold should contain 1 sequence entries
14
+ Then the scaffold should contain 1 insert entries
15
+ And the scaffold sequence should be ATGAAAACGTAA
@@ -0,0 +1,20 @@
1
+ Feature: The sequence keyword
2
+ In order to place contigs in the scaffold
3
+ A user can use the sequence keyword
4
+
5
+ Scenario: A scaffold with a single sequence keyword
6
+ Given the scaffold file has the sequences:
7
+ | name | nucleotides |
8
+ | seq | ATGCC |
9
+ When creating a scaffolder object
10
+ Then the scaffold should contain 1 sequence entries
11
+ And the scaffold sequence should be ATGCC
12
+
13
+ Scenario: A scaffold with two sequence keywords
14
+ Given the scaffold file has the sequences:
15
+ | name | nucleotides |
16
+ | seq1 | ATGCC |
17
+ | seq2 | ATGCC |
18
+ When creating a scaffolder object
19
+ Then the scaffold should contain 2 sequence entries
20
+ And the scaffold sequence should be ATGCCATGCC
@@ -0,0 +1,48 @@
1
+ Given /^the scaffold file has the sequences:$/ do |sequences|
2
+ @entries ||= Array.new
3
+ @sequences ||= Array.new
4
+
5
+ sequences.hashes.each do |seq|
6
+ @entries << {'sequence' => {'source' => seq['name']}}
7
+ end
8
+ sequences.hashes.map do |seq|
9
+ @sequences << {:name => seq['name'], :sequence => seq['nucleotides']}
10
+ end
11
+ end
12
+
13
+ Given /^the first scaffold sequence has the inserts:$/ do |inserts|
14
+ sequence = @entries.detect{|s| s.keys.first == 'sequence' }
15
+ sequence['sequence']['inserts'] = (inserts.hashes.map do |insert|
16
+ i = {'source' => insert['name']}
17
+ i['open'] = insert['open'].to_i if insert['open']
18
+ i['close'] = insert['close'].to_i if insert['close']
19
+ i
20
+ end)
21
+ inserts.hashes.map do |insert|
22
+ @sequences << {:name => insert['name'], :sequence => insert['nucleotides']}
23
+ end
24
+ end
25
+
26
+ When /^creating a scaffolder object$/ do
27
+ @scf_file = write_scaffold_file(@entries)
28
+ @seq_file = write_sequence_file(@sequences)
29
+
30
+ @scaffold = Scaffolder.new(YAML.load(File.read(@scf_file)),@seq_file)
31
+ end
32
+
33
+ Then /^the scaffold should contain (.*) sequence entries$/ do |n|
34
+ @scaffold.select{|s| s.entry_type == :sequence}.length.should == n.to_i
35
+ end
36
+
37
+ Then /^the scaffold should contain (.*) insert entries$/ do |n|
38
+ @scaffold.select{|s| s.entry_type == :sequence}.inject(0) do |count,seq|
39
+ count =+ seq.inserts.length
40
+ end.should == n.to_i
41
+ end
42
+
43
+ And /^the scaffold sequence should be (.*)$/ do |sequence|
44
+ generated_sequence = @scaffold.inject(String.new) do |build,entry|
45
+ build << entry.sequence
46
+ end
47
+ generated_sequence.should == sequence
48
+ end
@@ -0,0 +1,30 @@
1
+ require 'bundler'
2
+ begin
3
+ Bundler.setup(:default, :development)
4
+ rescue Bundler::BundlerError => e
5
+ $stderr.puts e.message
6
+ $stderr.puts "Run `bundle install` to install missing gems"
7
+ exit e.status_code
8
+ end
9
+
10
+ require 'tempfile'
11
+
12
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
13
+ require 'scaffolder'
14
+
15
+ def write_sequence_file(*sequences)
16
+ file = Tempfile.new("sequence").path
17
+ File.open(file,'w') do |tmp|
18
+ sequences.flatten.each do |sequence|
19
+ seq = Bio::Sequence.new(sequence[:sequence])
20
+ tmp.print(seq.output(:fasta,:header => sequence[:name]))
21
+ end
22
+ end
23
+ file
24
+ end
25
+
26
+ def write_scaffold_file(scaffold)
27
+ file = Tempfile.new("scaffold").path
28
+ File.open(file,'w'){|tmp| tmp.print(YAML.dump(scaffold))}
29
+ file
30
+ end
@@ -0,0 +1,6 @@
1
+ # Mixin module to define standard errors for scaffolder.
2
+ #
3
+ module Scaffolder::Errors
4
+ exceptions = %w[ UnknownAttributeError CoordinateError UnknownSequenceError]
5
+ exceptions.each { |e| const_set(e, Class.new(StandardError)) }
6
+ end
@@ -0,0 +1,51 @@
1
+ # Inserts are used to add additional, usually smaller, sequences to larger sequences.
2
+ # The attributes in the sequence class are used to specify where the host
3
+ # sequence is 'opened' and 'closed' to add the insert. Either one of these two
4
+ # attributes may be omitted. Omitting the 'open' attribute will cause the
5
+ # insert open position to be calculated based on the close minus the sequence
6
+ # length. The reverse is true if the close position is omitted.
7
+ #
8
+ # @see Scaffolder::Region::Sequence Scaffolder::Region::Sequence for an
9
+ # example on adding inserts to a sequence.
10
+ class Scaffolder::Region::Insert < Scaffolder::Region
11
+
12
+ # Fasta identifier for the insert sequence
13
+ #
14
+ # @param [String]
15
+ # @return [String]
16
+ attribute :source
17
+
18
+ # Open position where insert is added. Default is close position minus the
19
+ # sequence length.
20
+ #
21
+ # @param [Integer]
22
+ # @return [Integer]
23
+ attribute :open,
24
+ :default => lambda{|s| s.close - s.sequence_hook.length - 1 }
25
+
26
+ # End position where insert is added. Default is open position plus the
27
+ # sequence length.
28
+ #
29
+ # @param [Integer]
30
+ # @return [Integer]
31
+ attribute :close,
32
+ :default => lambda{|s| s.open + s.sequence_hook.length - 1 }
33
+
34
+ # Insertion position as a Range
35
+ #
36
+ # @return [Range]
37
+ # @raise [CoordinateError] if both the open and close positions are nil.
38
+ def position
39
+ raise CoordinateError if @close.nil? && @open.nil?
40
+ open-1..close-1
41
+ end
42
+
43
+ # Inserts are comparable by close position.
44
+ #
45
+ # @return [Integer]
46
+ # @param [Scaffolder::Region::Insert]
47
+ def <=>(other)
48
+ self.close <=> other.close
49
+ end
50
+
51
+ end
@@ -0,0 +1,74 @@
1
+ # Class for inserting fasta sequence into the genome scaffold. The
2
+ # #raw_sequence method is also responsible for applying each of the sequence
3
+ # inserts to the original sequence. The following example specifies the
4
+ # insertion of a sequence identified by the fasta header 'sequence1'. The
5
+ # example also outlines an insert to be added to the sequence in the region
6
+ # between 3 and 10.
7
+ #
8
+ # - sequence:
9
+ # source: 'sequence1'
10
+ # inserts:
11
+ # -
12
+ # source: 'insert1'
13
+ # open: 3
14
+ # close: 10
15
+ class Scaffolder::Region::Sequence < Scaffolder::Region
16
+
17
+ # Fasta identifier for this sequence
18
+ #
19
+ # @param [String]
20
+ # @return [String]
21
+ attribute :source
22
+
23
+ # Array of inserts to add to this sequence. Each array entry may be either a
24
+ # Scaffolder::Region::Insert or a hash corresponding to the attributes of an
25
+ # Insert. In the case of the latter each hash is used to generate a new
26
+ # Scaffolder::Region::Insert instance.
27
+ #
28
+ # @return [Array] Array of Scaffolder::Region::Insert
29
+ # @param [Array] inserts Accepts an array of either
30
+ # Scaffolder::Region::Insert or a hash of insert keyword data.
31
+ def inserts(inserts=nil)
32
+ if inserts.nil?
33
+ @inserts || Array.new
34
+ else
35
+ @inserts = inserts.map do |insert|
36
+ if insert.instance_of? Insert
37
+ insert
38
+ else
39
+ Insert.generate(insert)
40
+ end
41
+ end
42
+ end
43
+ end
44
+
45
+ # attribute :inserts, :default => Array.new
46
+
47
+ # Adds each of the sequence inserts to the raw sequence. Updates the sequence
48
+ # length each time an insert is added to reflect the change.
49
+ #
50
+ # @return [String] original sequence with inserts added.
51
+ # @raise [CoordinateError] if any insert open position is greater than the
52
+ # length of the original sequence
53
+ # @raise [CoordinateError] if any insert close position is less than one
54
+ # @raise [CoordinateError] if any insert open position is greater than the
55
+ # close position
56
+ def sequence_hook
57
+ # Set the sequence stop position if not defined as the stop
58
+ # position is updated as each insert is added
59
+ @stop ||= raw_sequence.length
60
+
61
+ return inserts.sort.reverse.inject(raw_sequence) do |seq,insert|
62
+ raise CoordinateError if insert.open > raw_sequence.length
63
+ raise CoordinateError if insert.close < 1
64
+ raise CoordinateError if insert.open > insert.close
65
+
66
+ before_size = seq.length
67
+ seq[insert.position] = insert.sequence
68
+ diff = seq.length - before_size
69
+ stop(stop + diff)
70
+
71
+ seq
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,23 @@
1
+ # This class is used to insert unresolved sequence regions into the genome
2
+ # build. The unresolved region is filled with N characters. The example below
3
+ # will insert the characters 'NNNNN' into the genome build.
4
+ #
5
+ # - unresolved:
6
+ # length: 5
7
+ #
8
+ class Scaffolder::Region::Unresolved < Scaffolder::Region
9
+
10
+ # The length of the unresolved region
11
+ # @return [Integer]
12
+ # @param [Integer]
13
+ attribute :length
14
+
15
+ # Calculate unresolved region sequence
16
+ # @return [String] a string of Ns equal to length attribute
17
+ # @raise [CoordinateError] if the length attribute is nil
18
+ def sequence_hook
19
+ raise CoordinateError if length.nil?
20
+ 'N' * length
21
+ end
22
+
23
+ end
@@ -1 +1,139 @@
1
- Scaffolder::Region = Struct.new(:entry_type,:sequence)
1
+ require 'scaffolder'
2
+
3
+ class Scaffolder::Region
4
+ include Scaffolder::Errors
5
+
6
+ class << self
7
+ include Scaffolder::Errors
8
+
9
+ # @return [Scaffolder::Region] Returns subclassed instances of
10
+ # Scaffolder::Region by name
11
+ def [](type)
12
+ self.const_get(type.capitalize)
13
+ end
14
+
15
+ # Links the specification of values in the scaffold file to the assignment
16
+ # of instance variables.
17
+ #
18
+ # @param [Symbol] Define attributes for this type of scaffold
19
+ # region. Attributes are read from the scaffold file and stored as
20
+ # instance variables.
21
+ # @param [Hash] options Attribute options.
22
+ # @option options [Object,Proc] Default Specify a default value for this
23
+ # attribute if a value is not defined in the scaffold file.
24
+ # @example Simple specification
25
+ # class MyRegion < Scaffolder::Region
26
+ # attribute :value # "value" usable as a keyword in the scaffold file
27
+ # end
28
+ # @example Specification with a default value
29
+ # attribute :value, :default => 1
30
+ # @example Specification with where proc is evaluated for the default
31
+ # attribute :value, :default => lambda{ Time.now.to_s }
32
+ # @example Specification with proc where the region instance is available
33
+ # attribute :value, :default => lambda{|s| s.other_variable + 1 }
34
+ def attribute(name,options = {})
35
+ define_method(name) do |*arg|
36
+ var = "@#{name}"
37
+ default = options[:default]
38
+ unless arg.first # Is an argument is passed to the method?
39
+ value = instance_variable_get(var)
40
+ return value if value
41
+ return default.respond_to?(:call) ? default.call(self) : default
42
+ end
43
+ instance_variable_set(var,arg.first)
44
+ end
45
+ end
46
+
47
+ # Parse each key-value pair in the scaffold hash calling the corresponding
48
+ # attribute method for the key and passing the value as an argument.
49
+ #
50
+ # @param [Hash] region_data Key-Value pairs of the data required to define
51
+ # this scaffolder region.
52
+ # @return [Scaffolder::Region] Returns a region object where the
53
+ # instance variables have been assigned according to the region data
54
+ # hash.
55
+ # @raise [UnknownAttributeError] If a keyword in the scaffold file does not
56
+ # have a corresponding attribute in the class.
57
+ # @see Scaffolder::Region.attribute
58
+ def generate(region_data)
59
+ region = self.new
60
+ region_data.each_pair do |attribute,value|
61
+ begin
62
+ region.send(attribute.to_sym,value)
63
+ rescue NoMethodError => e
64
+ raise UnknownAttributeError.new(e)
65
+ end
66
+ end
67
+ region
68
+ end
69
+
70
+ end
71
+
72
+ # The raw sequence for this region.
73
+ #
74
+ # @param [String]
75
+ # @return [String]
76
+ attribute :raw_sequence
77
+
78
+ # Trim the start of sequence to this position. Default is 1.
79
+ #
80
+ # @param [Integer]
81
+ # @return [Integer]
82
+ attribute :start, :default => 1
83
+
84
+ # Trim the end of sequence to this position. Default is the sequence length.
85
+ #
86
+ # @param [Integer]
87
+ # @return [Integer]
88
+ attribute :stop, :default => lambda{|s| s.sequence_hook.length}
89
+
90
+
91
+ # Should the sequence be reverse complemented. Reverse complementation is
92
+ # performed after the start and end of the sequence has been trimmed.
93
+ #
94
+ # @param [Boolean]
95
+ # @return [Boolean]
96
+ attribute :reverse
97
+
98
+ # Override this to manipulate the sequence before it's subsequenced, reverse
99
+ # complemented etc. by Scaffolder::Region#sequence.
100
+ #
101
+ # @return [String] The value of the raw_sequence attribute
102
+ # @see Scaffolder::Region#sequence
103
+ def sequence_hook
104
+ raw_sequence
105
+ end
106
+
107
+ # The name of the class. Useful for selecting specific region types.
108
+ #
109
+ # @return [Symbol]
110
+ def entry_type
111
+ self.class.name.split('::').last.downcase.to_sym
112
+ end
113
+
114
+ # Returns the value of the Scaffolder::Region#raw_sequence after
115
+ # subsequencing and reverse complementation (if specified in the
116
+ # scaffold file).
117
+ #
118
+ # @return [String] Sequence after all modifications
119
+ # @raise [CoordinateError] if the start position is less than 1.
120
+ # @raise [CoordinateError] if the stop position is greater than the sequence
121
+ # length.
122
+ # @raise [CoordinateError] if the start position is greater than the stop
123
+ # position.
124
+ def sequence
125
+ seq = sequence_hook
126
+
127
+ raise CoordinateError.new if start < 1
128
+ raise CoordinateError.new if stop > seq.length
129
+ raise CoordinateError.new if start > stop
130
+
131
+ seq = seq[(start-1)..(stop-1)]
132
+ seq = Bio::Sequence::NA.new(seq).reverse_complement if reverse
133
+ seq.to_s.upcase
134
+ end
135
+
136
+ require 'scaffolder/region/unresolved'
137
+ require 'scaffolder/region/insert'
138
+ require 'scaffolder/region/sequence'
139
+ end