bio-jaspar 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +13 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +20 -0
- data/README.md +147 -0
- data/README.rdoc +126 -0
- data/Rakefile +45 -0
- data/lib/bio-jaspar.rb +15 -0
- data/lib/bio-jaspar/jaspar.rb +432 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-jaspar.rb +265 -0
- metadata +171 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 835fa6794d4fa377301992ede3fc0fc43d0013c0
|
4
|
+
data.tar.gz: 98b718e4a029fec3389078213fd7ab5613f89c38
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 29c60f81959d0880c2b690df72537953816cd5b571c635c835b8207d699cd372d429e01c7f17c69e02c146bc12679940dcb0f1b0fe5663e9a951fb78389a4ae5
|
7
|
+
data.tar.gz: a69f3fca22da7ccbc8c1e2138c809102828a208e3118f944c110986d5491ef7e1b2e33c5dc09d46547e5a2333b7161e50a942f251a32209bb79a9b8f7d8bee87
|
data/.document
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
|
7
|
+
# - rbx-19mode
|
8
|
+
# - 1.8.7
|
9
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
10
|
+
# - rbx-18mode
|
11
|
+
|
12
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
13
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Fetch gems over TLS; a plain-http source can be tampered with in transit.
source "https://rubygems.org"

# Runtime dependencies of the gem.
# Example:
#   gem "activesupport", ">= 2.3.5"
gem "bio", ">= 1.4.2"
gem "mysql2", "~> 0.3.19"

# Development dependencies: everything needed to run rake, tests,
# features, etc.
group :development do
  gem "shoulda", ">= 0"
  gem "rake", "~> 0.9.3"
  gem "rdoc", "~> 3.12"
  gem 'test-unit'
  gem "jeweler", "~> 2.0.1", :git => "https://github.com/technicalpickles/jeweler.git"
  gem "bundler", ">= 1.0.21"
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2015 Wasserman Lab
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
# bio-jaspar
|
2
|
+
|
3
|
+
## Tools for JASPAR motif analysis
|
4
|
+
|
5
|
+
This gem provides methods for:
|
6
|
+
|
7
|
+
1. Reading and writing sequence motifs in JASPAR format
|
8
|
+
2. Accessing a JASPAR5 formatted database
|
9
|
+
3. Comparing, searching, and analyzing motifs in sequences
|
10
|
+
|
11
|
+
<sup>*</sup> **Note:** The JASPAR motif analysis tools consist of several modules that are directly imported from the Bio.motifs package in BioPython. Namely, those modules/submodules are: Bio.motifs, Bio.motifs.matrix, Bio.motifs.thresholds, Bio.motifs.jaspar. The functionality of this gem will be identical to the aforementioned modules/submodules.
|
12
|
+
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
```sh
|
17
|
+
gem install bio-jaspar
|
18
|
+
```
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
|
22
|
+
### Loading the gem
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
require 'bio-jaspar'
|
26
|
+
```
|
27
|
+
|
28
|
+
### Loading a motif/motifs from a JASPAR database
|
29
|
+
|
30
|
+
A connection to the JASPAR database is made by creating a JASPAR5 instance.
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
# Substitute the database credentials!
|
34
|
+
db = Bio::Jaspar::JASPAR5.new(
|
35
|
+
:host => <db_host.org>,
|
36
|
+
:name => <db_name>,
|
37
|
+
:user => <db_user>,
|
38
|
+
:password => <db_password>
|
39
|
+
)
|
40
|
+
```
|
41
|
+
|
42
|
+
Now, a motif can be retrieved by the matrix_id
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
m = db.fetch_motif_by_id("MA0049")
|
46
|
+
puts m.to_s
|
47
|
+
```
|
48
|
+
|
49
|
+
Or multiple motifs can be retrieved by various criteria
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
motifs = db.fetch_motifs(
|
53
|
+
:collection => "CORE",
|
54
|
+
:tax_group => ["fungi", "vertebrate"],
|
55
|
+
:tf_class => "Helix-Turn-Helix",
|
56
|
+
:min_ic => 2
|
57
|
+
)
|
58
|
+
motifs.each { |m| puts m.to_s } # do something with each motif
|
59
|
+
```
|
60
|
+
|
61
|
+
### Motif analysis
|
62
|
+
|
63
|
+
Many methods are available for motif analysis. Here are some examples:
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
m = db.fetch_motif_by_id("MA0049")
|
67
|
+
|
68
|
+
# Consensus sequence
|
69
|
+
m.consensus # BioRuby Sequence object
|
70
|
+
puts m.consensus
|
71
|
+
|
72
|
+
# Anticonsensus sequence
|
73
|
+
m.anticonsensus # BioRuby Sequence object
|
74
|
+
puts m.anticonsensus
|
75
|
+
|
76
|
+
# Reverse complement motif
|
77
|
+
m.reverse_complement # Bio::Motifs::Motif object
|
78
|
+
|
79
|
+
# Pseudocounts
|
80
|
+
m.pseudocounts
|
81
|
+
|
82
|
+
# Background
|
83
|
+
m.background
|
84
|
+
|
85
|
+
# Position weight matrix
|
86
|
+
m.pwm
|
87
|
+
|
88
|
+
# Position specific scoring matrix
|
89
|
+
m.pssm
|
90
|
+
```
|
91
|
+
|
92
|
+
Matrix methods are also available. Here are some examples:
|
93
|
+
|
94
|
+
```ruby
|
95
|
+
m = db.fetch_motif_by_id("MA0049")
|
96
|
+
|
97
|
+
# Maximum possible score for the given motif
|
98
|
+
m.pssm.max
|
99
|
+
|
100
|
+
# Minimum possible score for the given motif
|
101
|
+
m.pssm.min
|
102
|
+
|
103
|
+
# Expected value of the motif score
|
104
|
+
m.pssm.mean
|
105
|
+
|
106
|
+
# Standard deviation of the given motif score
|
107
|
+
m.pssm.std
|
108
|
+
|
109
|
+
# Find hits with the PWM score above given threshold
|
110
|
+
m.pssm.search(Bio::Sequence.auto("ACCTGCCTAAAAAA"), threshold = 0.5)
|
111
|
+
```
|
112
|
+
|
113
|
+
### Read/write Jaspar file
|
114
|
+
|
115
|
+
Previously downloaded pfm, jaspar, and sites files can be read, and motifs can be written out in pfm or jaspar format, using the Jaspar module
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
# Read a pfm file
|
119
|
+
f = File.open("test.pfm", "r")
|
120
|
+
Bio::Jaspar.read(f, "pfm")
|
121
|
+
f.close
|
122
|
+
|
123
|
+
# Write motifs into a jaspar file
|
124
|
+
motifs = db.fetch_motifs(
|
125
|
+
:collection => "CORE",
|
126
|
+
:tax_group => ["fungi", "vertebrate"],
|
127
|
+
:tf_class => "Helix-Turn-Helix",
|
128
|
+
:min_ic => 2
|
129
|
+
)
|
130
|
+
File.open("test.jaspar", "w") do |f|
|
131
|
+
f.write(Bio::Jaspar.write(motifs, "jaspar"))
|
132
|
+
end
|
133
|
+
```
|
134
|
+
|
135
|
+
Please refer to the rdoc for full information on all available methods & classes.
|
136
|
+
|
137
|
+
## Project home page
|
138
|
+
|
139
|
+
For information on the source tree, documentation, examples, issues and
|
140
|
+
how to contribute, see
|
141
|
+
|
142
|
+
http://github.com/wassermanlab/jaspar-bioruby
|
143
|
+
|
144
|
+
## Copyright
|
145
|
+
|
146
|
+
See LICENSE.txt for further details.
|
147
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
= bio-jaspar
|
2
|
+
|
3
|
+
== Tools for JASPAR motif analysis
|
4
|
+
|
5
|
+
This gem provides methods for:
|
6
|
+
|
7
|
+
1. Reading and writing sequence motifs in JASPAR format
|
8
|
+
2. Accessing a JASPAR5 formatted database
|
9
|
+
3. Comparing, searching, and analyzing motifs in sequences
|
10
|
+
|
11
|
+
\* *Note:* The JASPAR motif analysis tools consist of several modules that are directly imported from the Bio.motifs package in BioPython. Namely, those modules/submodules are: Bio.motifs, Bio.motifs.matrix, Bio.motifs.thresholds, Bio.motifs.jaspar. The functionality of this gem will be identical to the aforementioned modules/submodules.
|
12
|
+
|
13
|
+
== Installation
|
14
|
+
|
15
|
+
gem install bio-jaspar
|
16
|
+
|
17
|
+
== Usage
|
18
|
+
|
19
|
+
=== Loading the gem
|
20
|
+
|
21
|
+
require 'bio-jaspar'
|
22
|
+
|
23
|
+
=== Loading a motif/motifs from a JASPAR database
|
24
|
+
|
25
|
+
A connection to the JASPAR database is made by creating a JASPAR5 instance.
|
26
|
+
|
27
|
+
# Substitute the database credentials!
|
28
|
+
db = Bio::Jaspar::JASPAR5.new(
|
29
|
+
:host => <db_host.org>,
|
30
|
+
:name => <db_name>,
|
31
|
+
:user => <db_user>,
|
32
|
+
:password => <db_password>
|
33
|
+
)
|
34
|
+
|
35
|
+
Now, a motif can be retrieved by the matrix_id
|
36
|
+
|
37
|
+
m = db.fetch_motif_by_id("MA0049")
|
38
|
+
puts m.to_s
|
39
|
+
|
40
|
+
Or multiple motifs can be retrieved by various criteria
|
41
|
+
|
42
|
+
motifs = db.fetch_motifs(
|
43
|
+
:collection => "CORE",
|
44
|
+
:tax_group => ["fungi", "vertebrate"],
|
45
|
+
:tf_class => "Helix-Turn-Helix",
|
46
|
+
:min_ic => 2
|
47
|
+
)
|
48
|
+
|
49
|
+
=== Motif analysis
|
50
|
+
|
51
|
+
Many methods are available for motif analysis. Here are some examples:
|
52
|
+
|
53
|
+
m = db.fetch_motif_by_id("MA0049")
|
54
|
+
|
55
|
+
# Consensus sequence
|
56
|
+
m.consensus # BioRuby Sequence object
|
57
|
+
puts m.consensus
|
58
|
+
|
59
|
+
# Anticonsensus sequence
|
60
|
+
m.anticonsensus # BioRuby Sequence object
|
61
|
+
puts m.anticonsensus
|
62
|
+
|
63
|
+
# Reverse complement motif
|
64
|
+
m.reverse_complement # Bio::Motifs::Motif object
|
65
|
+
|
66
|
+
# Pseudocounts
|
67
|
+
m.pseudocounts
|
68
|
+
|
69
|
+
# Background
|
70
|
+
m.background
|
71
|
+
|
72
|
+
# Position weight matrix
|
73
|
+
m.pwm
|
74
|
+
|
75
|
+
# Position specific scoring matrix
|
76
|
+
m.pssm
|
77
|
+
|
78
|
+
Matrix methods are also available. Here are some examples:
|
79
|
+
|
80
|
+
m = db.fetch_motif_by_id("MA0049")
|
81
|
+
|
82
|
+
# Maximum possible score for the given motif
|
83
|
+
m.pssm.max
|
84
|
+
|
85
|
+
# Minimum possible score for the given motif
|
86
|
+
m.pssm.min
|
87
|
+
|
88
|
+
# Expected value of the motif score
|
89
|
+
m.pssm.mean
|
90
|
+
|
91
|
+
# Standard deviation of the given motif score
|
92
|
+
m.pssm.std
|
93
|
+
|
94
|
+
# Find hits with the PWM score above given threshold
|
95
|
+
m.pssm.search(Bio::Sequence.auto("ACCTGCCTAAAAAA"), threshold = 0.5)
|
96
|
+
|
97
|
+
=== Read/write Jaspar file
|
98
|
+
|
99
|
+
Previously downloaded pfm, jaspar, and sites files can be read, and motifs can be written out in pfm or jaspar format, using the Jaspar module
|
100
|
+
|
101
|
+
# Read a pfm file
|
102
|
+
f = File.open("test.pfm", "r")
|
103
|
+
Bio::Jaspar.read(f, "pfm")
|
104
|
+
f.close
|
105
|
+
|
106
|
+
# Write motifs into a jaspar file
|
107
|
+
motifs = db.fetch_motifs(
|
108
|
+
:collection => "CORE",
|
109
|
+
:tax_group => ["fungi", "vertebrate"],
|
110
|
+
:tf_class => "Helix-Turn-Helix",
|
111
|
+
:min_ic => 2
|
112
|
+
)
|
113
|
+
File.open("test.jaspar", "w") do |f|
|
114
|
+
f.write(Bio::Jaspar.write(motifs, "jaspar"))
|
115
|
+
end
|
116
|
+
|
117
|
+
== Project home page
|
118
|
+
|
119
|
+
For information on the source tree, documentation, issues and how to contribute, see
|
120
|
+
|
121
|
+
http://github.com/wassermanlab/jaspar-bioruby
|
122
|
+
|
123
|
+
== Copyright
|
124
|
+
|
125
|
+
See LICENSE.txt for further details.
|
126
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8

# Build, test and documentation tasks for the bio-jaspar gem.

require 'rubygems'
require 'bundler'

# Activate the gems listed in the Gemfile; abort with Bundler's own status
# code (and a hint) when some of them are missing.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

# Gem packaging / release tasks.
require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "bio-jaspar"
  gem.homepage = "http://github.com/wassermanlab/jaspar-bioruby"
  gem.license = "MIT"
  gem.summary = %Q{Tools for JASPAR motifs in BioRuby}
  gem.description = %Q{Basic tools for parsing, searching, and comparing JASPAR motifs; Based on Bio.motifs module in Biopython}
  gem.authors = ["Jessica Lee", "Wasserman Lab"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb file.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

task :default => :test

# `rake rdoc` builds the API documentation into ./rdoc.
require 'rdoc/task'
RDoc::Task.new do |rdoc|
  # Strip the trailing newline so it does not leak into the page title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-jaspar #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
  rdoc.main = "README.rdoc"
end
|
data/lib/bio-jaspar.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-jaspar/motifs.rb'
|
12
|
+
require 'bio-jaspar/matrix.rb'
|
13
|
+
require 'bio-jaspar/thresholds.rb'
|
14
|
+
require 'bio-jaspar/jaspar.rb'
|
15
|
+
require 'bio-jaspar/db.rb'
|
@@ -0,0 +1,432 @@
|
|
1
|
+
#--
|
2
|
+
# = bio-jaspar/jaspar.rb
|
3
|
+
#
|
4
|
+
# Copyright:: (C) 2015-2015 Wasserman Lab
|
5
|
+
# License:: MIT License (see LICENSE.txt)
|
6
|
+
#
|
7
|
+
# JASPAR 2014 module
|
8
|
+
#
|
9
|
+
# A direct import of Bio.motifs.jaspar module in Biopython
|
10
|
+
#++
|
11
|
+
|
12
|
+
require 'bio'
|
13
|
+
|
14
|
+
module Bio # :nodoc:
|
15
|
+
# == JASPAR 2014 module
|
16
|
+
#
|
17
|
+
# Provides read access to a JASPAR5 formatted database.
|
18
|
+
#
|
19
|
+
# This module is a direct import of Bio.motifs.jaspar module in Biopython.
|
20
|
+
# The following document contains excerpts from Bio.motifs.jaspar module
|
21
|
+
# in Biopython.
|
22
|
+
module Jaspar
|
23
|
+
|
24
|
+
# Unambiguous DNA bases
|
25
|
+
DNA = Bio::Motifs::Alphabet.new.IUPAC_unambiguous_dna
|
26
|
+
|
27
|
+
# JASPAR OUTPUT specific DNA bases
|
28
|
+
JASPAR_ORDERED_DNA_LETTERS = ["A","C","G","T"] # Jaspar requires specific order for printouts
|
29
|
+
|
30
|
+
# A subclass of Bio::Motifs::Motif used to represent a JASPAR profile.
#
# Additional metadata is stored when available. Which fields are filled
# depends on the source of the JASPAR motif (a 'pfm' format file, a 'jaspar'
# format file or a JASPAR database); fields that were not supplied are +nil+.
#
# <i>A direct import of Bio.motifs.jaspar module in Biopython</i>
class Motif < Bio::Motifs::Motif
  attr_accessor :matrix_id, :collection, :tf_class, :tf_family, :species,
                :tax_group, :acc, :data_type, :medline, :pazar_id, :comment

  # Construct a JASPAR Motif instance.
  #
  # * +matrix_id+ - JASPAR matrix ID such as "MA0047.2" (may be +nil+)
  # * +name+      - transcription factor name (may be +nil+)
  # * +opts+      - optional hash; +:alphabet+ (defaults to DNA),
  #   +:instances+ and +:counts+ are handed to the superclass constructor,
  #   the remaining keys (+:collection+, +:tf_class+, +:tf_family+,
  #   +:species+, +:tax_group+, +:acc+, +:data_type+, +:medline+,
  #   +:pazar_id+, +:comment+) are stored as metadata and default to +nil+.
  def initialize(matrix_id, name, opts = {})
    # Only :alphabet has a non-nil default; a missing key reads as nil anyway.
    opts = { :alphabet => DNA }.merge(opts)

    super(opts[:alphabet], opts[:instances], opts[:counts])

    @name = name
    @matrix_id = matrix_id
    @collection = opts[:collection]
    @tf_class = opts[:tf_class]
    @tf_family = opts[:tf_family]
    @species = opts[:species]
    @tax_group = opts[:tax_group]
    @acc = opts[:acc]
    @data_type = opts[:data_type]
    @medline = opts[:medline]
    @pazar_id = opts[:pazar_id]
    @comment = opts[:comment]
  end

  # Return the JASPAR base matrix ID (the part of matrix_id before the ".").
  def base_id
    Jaspar.split_jaspar_id(@matrix_id).first
  end

  # Return the JASPAR matrix version (the part of matrix_id after the "."),
  # or +nil+ when the ID carries no version.
  def version
    Jaspar.split_jaspar_id(@matrix_id).last
  end

  # Return a string representation of the JASPAR profile.
  #
  # The TF name and matrix ID are always printed; every other metadata field
  # is printed only when it is set, one tab-separated "label<TAB>value" line
  # each, followed by the count matrix.
  def to_s
    optional_fields = [
      ["Collection", @collection],
      ["TF class", @tf_class],
      ["TF family", @tf_family],
      # Array() lets a single species given as a String print as well.
      ["Species", @species && Array(@species).join(",")],
      ["Taxonomic group", @tax_group],
      ["Accession", @acc],
      ["Data type used", @data_type],
      ["Medline", @medline],
      ["PAZAR ID", @pazar_id],
      ["Comments", @comment]
    ]

    text = "TF name\t#{@name}\nMatrix ID\t#{@matrix_id}\n"
    optional_fields.each do |label, value|
      text << "#{label}\t#{value}\n" if value
    end
    text << "Matrix:\n#{counts}\n\n"
  end

  # Return the hash key corresponding to the JASPAR profile.
  #
  # Note: We assume the unicity of matrix IDs
  def hash
    @matrix_id.hash
  end

  # Compare two JASPAR motifs for equality. Two motifs are equal if their
  # matrix_ids match; an object without a matrix_id is never equal.
  def ==(other)
    other.respond_to?(:matrix_id) && @matrix_id == other.matrix_id
  end

  # Hash lookups and Array#uniq use eql? together with #hash, so keep it in
  # step with ==.
  alias_method :eql?, :==

end
|
155
|
+
|
156
|
+
# Represent a list of JASPAR motifs.
#
# <i>A direct import of Bio.motifs.jaspar module in Biopython</i>
#
# ==== Attributes
#
# * +version+ - The JASPAR version used
class Record < Array
  # The JASPAR version used; +nil+ until a caller assigns it.
  attr_accessor :version

  # Construct an empty record instance
  def initialize
    super()
    @version = nil
  end

  # Return a string of all JASPAR motifs in the list, separated by newlines
  def to_s
    map(&:to_s).join("\n")
  end

  # Return the list of matrices as a hash (ruby equivalent of dict) keyed by
  # matrix_id. When several motifs share an ID the last one wins.
  def to_h
    each_with_object({}) { |motif, by_id| by_id[motif.matrix_id] = motif }
  end
end
|
185
|
+
|
186
|
+
# Read motif(s) from a file in one of several different JASPAR formats.
#
# * +handle+ - line source handed on to the format-specific reader
# * +format+ - "pfm", "sites" or "jaspar" (matched case-insensitively)
#
# Returns the record produced by the matching reader and raises
# ArgumentError for any other format.
def Jaspar.read(handle, format)
  format = format.downcase

  case format
  when "pfm"    then _read_pfm(handle)
  when "sites"  then _read_sites(handle)
  when "jaspar" then _read_jaspar(handle)
  else
    raise ArgumentError, "Unknown JASPAR format #{format}"
  end
end
|
206
|
+
|
207
|
+
# Return the representation of motifs in "pfm" or "jaspar" format.
#
# * +motifs+ - list of motifs; each must respond to +counts+ (and to
#   +matrix_id+ and +name+ for the "jaspar" format)
# * +format+ - "pfm" or "jaspar", matched case-insensitively like
#   Jaspar.read
#
# "pfm" can hold a single matrix, so only the first motif is written (an
# empty list yields an empty string). "jaspar" writes every motif, each
# introduced by a ">matrix_id name" header line.
#
# Returns the formatted text as a String and raises ArgumentError for an
# unknown format.
def Jaspar.write(motifs, format)
  # One matrix row: every count as "%6.2f", columns separated by a space.
  format_row = lambda { |values| values.map { |value| "%6.2f" % value }.join(" ") }
  lines = []

  case format.to_s.downcase
  when "pfm"
    motif = motifs[0]
    unless motif.nil?
      counts = motif.counts
      JASPAR_ORDERED_DNA_LETTERS.each do |letter|
        lines << "#{format_row.call(counts[letter])}\n"
      end
    end
  when "jaspar"
    motifs.each do |m|
      counts = m.counts
      lines << ">#{m.matrix_id} #{m.name}\n"

      JASPAR_ORDERED_DNA_LETTERS.each do |letter|
        lines << "#{letter} [#{format_row.call(counts[letter])}]\n"
      end
    end
  else
    raise ArgumentError, "Unknown JASPAR format #{format}"
  end

  lines.join("")
end
|
238
|
+
|
239
|
+
# Return pseudocounts of a given JASPAR motif.
#
# Each letter's pseudocount is the square root of the average column total
# of the count matrix, multiplied by that letter's share of the background.
# When the motif has no background, every letter gets an equal share.
#
# * +motif+ - must respond to +alphabet+ (with +letters+), +background+,
#   +length+ and +counts+
#
# Returns a Hash mapping each alphabet letter to its pseudocount.
def Jaspar.calculate_pseudocounts(motif)
  letters = motif.alphabet.letters
  counts = motif.counts

  # Column sums are not required to be equal, so average over all columns.
  grand_total = 0
  (0...motif.length).each do |i|
    grand_total += letters.map { |letter| counts[letter][i].to_f }.inject(:+)
  end
  sq_nb_instances = Math.sqrt(grand_total / motif.length)

  background = motif.background
  weights = if background
              Hash[background] # copy, so the motif's own background is untouched
            else
              Hash[letters.sort.map { |l| [l, 1.0] }]
            end

  # Force float arithmetic: integer weights would otherwise be truncated to
  # 0 by integer division.
  weight_sum = weights.values.inject(:+).to_f

  pseudocounts = {}
  letters.each do |letter|
    pseudocounts[letter] = sq_nb_instances * (weights[letter] / weight_sum)
  end

  pseudocounts
end
|
268
|
+
|
269
|
+
# Utility function to split a JASPAR matrix ID into its components.
#
# Components are base ID and version, both Strings, e.g. 'MA0047.2' is
# returned as ['MA0047', '2']. An ID that does not consist of exactly two
# dot-separated parts is returned whole with a +nil+ version, and a +nil+ ID
# (as carried by motifs built without a matrix ID) gives [nil, nil].
def Jaspar.split_jaspar_id(id)
  return [nil, nil] if id.nil?

  parts = id.split(".")
  return [parts[0], parts[1]] if parts.length == 2

  [id, nil]
end
|
288
|
+
|
289
|
+
# Private methods
|
290
|
+
private
|
291
|
+
|
292
|
+
# Read the motif from a JASPAR .pfm file (PRIVATE).
#
# The first four lines of +handle+ are taken as the count rows for A, C, G
# and T, in that order. A row may start with its own letter as a label,
# which is dropped. Returns a Record holding the single motif; raises
# ArgumentError when fewer than four lines are available.
def Jaspar._read_pfm(handle)
  counts = {}

  JASPAR_ORDERED_DNA_LETTERS.zip(handle).each do |letter, line|
    # zip pads with nil once the input runs out of lines.
    if line.nil?
      raise ArgumentError, "Incomplete pfm input: no row for nucleotide #{letter}"
    end

    words = line.split
    words = words[1..-1] if words[0] == letter
    counts[letter] = words.map(&:to_f)
  end

  motif = Motif.new(nil, nil, :alphabet => DNA, :counts => counts)
  motif.mask = "*" * motif.length

  record = Record.new
  record << motif
  record
end
|
313
|
+
|
314
|
+
# Read the motif from JASPAR .sites file (PRIVATE).
#
# The input is consumed as (header, sequence) line pairs. Reading stops at
# the first pair whose header does not start with ">". From each sequence
# line only the characters that equal their own upcase are kept and turned
# into a sequence via Bio::Sequence.auto.
#
# Returns a Record holding the single motif built from those instances;
# raises ArgumentError when a header has no sequence line after it.
def Jaspar._read_sites(handle)
  sites = []

  # each_slice pairs the lines through the handle's own #each, so any
  # enumerable line source works, not only ones with a shared read cursor.
  handle.each_slice(2) do |header, sequence_line|
    break unless header.start_with?(">")

    if sequence_line.nil?
      raise ArgumentError, "Incomplete sites input: no sequence after #{header.strip}"
    end

    site = sequence_line.strip.each_char.select { |c| c == c.upcase }.join
    sites << Bio::Sequence.auto(site)
  end

  instances = Bio::Motifs::Instances.new(sites, DNA)
  motif = Motif.new(nil, nil, :alphabet => DNA, :instances => instances)
  motif.mask = "*" * motif.length

  record = Record.new
  record << motif
  record
end
|
345
|
+
|
346
|
+
# Read motifs from a JASPAR formatted file (PRIVATE).
#
# Format is one or more records of the form, e.g.::
#
# - JASPAR 2010 matrix_only format::
#
#     >MA0001.1 AGL3
#     A  [ 0  3 79 40 66 48 65 11 65  0 ]
#     C  [94 75  4  3  1  2  5  2  3  3 ]
#     G  [ 1  0  3  4  1  0  5  3 28 88 ]
#     T  [ 2 19 11 50 29 47 22 81  1  6 ]
#
# - JASPAR 2010-2014 PFMs format::
#
#     >MA0001.1 AGL3
#     0  3 79 40 66 48 65 11 65  0
#     94 75  4  3  1  2  5  2  3  3
#     1  0  3  4  1  0  5  3 28 88
#     2 19 11 50 29 47 22 81  1  6
#
# A header sets the ID and name for the rows that follow (the name falls
# back to the ID when absent). Labelled rows are stored under their own
# letter, bare rows are assigned to A, C, G, T by position. Every fourth
# row completes a motif, which is appended to the returned Record.
def Jaspar._read_jaspar(handle)
  header_re = /^>\s*(\S+)(\s+(\S+))?/
  labelled_row_re = /\s*([ACGT])\s*\[\s*(.*)\s*\]/
  bare_row_re = /\s*(.+)\s*/
  bases = ["A","C","G","T"]

  record = Record.new
  matrix_id = nil
  tf_name = nil
  rows = {}
  rows_seen = 0

  handle.each do |raw_line|
    text = raw_line.strip

    if (header = text.match(header_re))
      matrix_id = header[1]
      tf_name = header[3] || matrix_id
      next
    end

    if (row = text.match(labelled_row_re))
      rows[row[1]] = row[2].split.map(&:to_f)
    elsif (row = text.match(bare_row_re))
      rows[bases[rows_seen]] = row[1].split.map(&:to_f)
    else
      next # blank line: nothing to record
    end

    rows_seen += 1
    next unless rows_seen == 4

    # Four rows make a complete matrix: emit it and start afresh.
    record << Motif.new(matrix_id,
                        tf_name,
                        :alphabet => DNA,
                        :counts => rows)
    matrix_id = nil
    tf_name = nil
    rows = {}
    rows_seen = 0
  end

  record
end
|
428
|
+
|
429
|
+
private_class_method :_read_pfm, :_read_sites, :_read_jaspar
|
430
|
+
|
431
|
+
end
|
432
|
+
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'bio-jaspar'
|
16
|
+
|
17
|
+
class Test::Unit::TestCase
|
18
|
+
end
|
@@ -0,0 +1,265 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'bio-jaspar'
|
3
|
+
require 'bio'
|
4
|
+
|
5
|
+
class TestBioJaspar < Test::Unit::TestCase
|
6
|
+
context 'JASPAR module' do
|
7
|
+
should "correctly read jaspar formatted file" do
|
8
|
+
f = File.open('test/data/jaspar-test.jaspar', "r")
|
9
|
+
motifs = Bio::Motifs.parse(f, "jaspar")
|
10
|
+
f.close
|
11
|
+
|
12
|
+
# Test first motif in the set
|
13
|
+
corr_motifs_beg_counts = {
|
14
|
+
"A" => [3.0, 21.0, 25.0, 0.0, 0.0, 24.0, 1.0, 0.0],
|
15
|
+
"C" => [13.0, 1.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0],
|
16
|
+
"T" => [5.0, 3.0, 0.0, 25.0, 20.0, 0.0, 24.0, 23.0],
|
17
|
+
"G" => [4.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0]
|
18
|
+
}
|
19
|
+
assert_equal corr_motifs_beg_counts, motifs[0].counts
|
20
|
+
assert_equal 8, motifs[0].length
|
21
|
+
assert_equal "HAT5", motifs[0].name
|
22
|
+
assert_equal "MA0008.1", motifs[0].matrix_id
|
23
|
+
|
24
|
+
# Test the last motif in the set
|
25
|
+
corr_motifs_end_counts = {
|
26
|
+
"A" => [4.0, 5.0, 5.0, 3.0, 0.0, 0.0, 25.0, 26.0, 0.0, 0.0, 26.0, 0.0, 0.0, 17.0, 0.0, 5.0, 2.0, 0.0, 0.0],
|
27
|
+
"C" => [2.0, 3.0, 4.0, 8.0, 1.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 25.0, 1.0, 6.0, 3.0, 5.0],
|
28
|
+
"T" => [2.0, 3.0, 5.0, 5.0, 0.0, 20.0, 0.0, 0.0, 26.0, 0.0, 0.0, 26.0, 23.0, 0.0, 0.0, 7.0, 4.0, 3.0, 0.0],
|
29
|
+
"G" => [1.0, 3.0, 2.0, 2.0, 25.0, 0.0, 1.0, 0.0, 0.0, 26.0, 0.0, 0.0, 3.0, 9.0, 1.0, 4.0, 0.0, 4.0, 3.0]
|
30
|
+
}
|
31
|
+
assert_equal corr_motifs_end_counts, motifs[-1].counts
|
32
|
+
assert_equal 19, motifs[-1].length
|
33
|
+
assert_equal "ATHB9", motifs[-1].name
|
34
|
+
assert_equal "MA0573.1", motifs[-1].matrix_id
|
35
|
+
end
|
36
|
+
|
37
|
+
should "correctly read pfm formatted file" do
|
38
|
+
f = File.open('test/data/jaspar-test.pfm', "r")
|
39
|
+
motif = Bio::Motifs.parse(f, "pfm")
|
40
|
+
f.close
|
41
|
+
|
42
|
+
corr_counts = {
|
43
|
+
"A" => [3.0, 21.0, 25.0, 0.0, 0.0, 24.0, 1.0, 0.0],
|
44
|
+
"C" => [13.0, 1.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0],
|
45
|
+
"T" => [5.0, 3.0, 0.0, 25.0, 20.0, 0.0, 24.0, 23.0],
|
46
|
+
"G" => [4.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0]
|
47
|
+
}
|
48
|
+
assert_equal 1, motif.length # should only read one motif
|
49
|
+
assert_equal 8, motif[0].length
|
50
|
+
assert_equal corr_counts, motif[0].counts
|
51
|
+
end
|
52
|
+
|
53
|
+
should "correctly read sites formatted file" do
|
54
|
+
f = File.open('test/data/jaspar-test.sites', "r")
|
55
|
+
motif = Bio::Motifs.parse(f, "sites")
|
56
|
+
f.close
|
57
|
+
|
58
|
+
corr_counts = {
|
59
|
+
"A" => [15, 4, 41, 36, 7, 19, 3],
|
60
|
+
"C" => [11, 35, 1, 2, 29, 14, 22],
|
61
|
+
"T" => [7, 2, 0, 1, 1, 3, 3],
|
62
|
+
"G" => [10, 2, 1, 4, 6, 7, 15]
|
63
|
+
}
|
64
|
+
assert_equal 7, motif[0].length
|
65
|
+
assert_equal corr_counts, motif[0].counts
|
66
|
+
|
67
|
+
# Check the first and last sequence motifs
|
68
|
+
assert_equal "ccaaccc", motif[0].instances[0].to_s
|
69
|
+
assert_equal "gtatctc", motif[0].instances[-1].to_s
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
# The reader tests above exercise parsing on its own; the contexts below
# instead share the motifs loaded here from the JASPAR fixture file.
setup do
  # Block form of File.open closes the fixture even if parsing raises;
  # previously the handle was only closed on the success path.
  @motifs = File.open('test/data/jaspar-test.jaspar', "r") do |f|
    Bio::Motifs.parse(f, "jaspar")
  end
  # First parsed motif, used by most single-motif assertions.
  @motif = @motifs.first
end
|
81
|
+
|
82
|
+
context 'JASPAR module' do
  # Serialising the first three parsed motifs must reproduce this exact text.
  should "correctly convert Motifs into jaspar formatted string" do
    expected_text = ">MA0008.1 HAT5\nA [ 3.00 21.00 25.00 0.00 0.00 24.00 1.00 0.00]\nC [ 13.00 1.00 0.00 0.00 5.00 0.00 0.00 0.00]\nG [ 4.00 0.00 0.00 0.00 0.00 1.00 0.00 2.00]\nT [ 5.00 3.00 0.00 25.00 20.00 0.00 24.00 23.00]\n>MA0027.1 En1\nA [ 4.00 5.00 3.00 0.00 4.00 3.00 3.00 2.00 1.00 1.00 1.00]\nC [ 1.00 2.00 0.00 0.00 0.00 0.00 0.00 1.00 3.00 4.00 6.00]\nG [ 2.00 2.00 7.00 2.00 3.00 7.00 0.00 4.00 3.00 1.00 1.00]\nT [ 3.00 1.00 0.00 8.00 3.00 0.00 7.00 3.00 3.00 4.00 2.00]\n>MA0046.1 HNF1A\nA [ 5.00 1.00 1.00 1.00 20.00 16.00 1.00 8.00 14.00 2.00 0.00 13.00 8.00 5.00]\nC [ 0.00 0.00 0.00 0.00 0.00 2.00 0.00 2.00 0.00 0.00 4.00 1.00 8.00 13.00]\nG [ 14.00 20.00 0.00 0.00 0.00 1.00 0.00 4.00 1.00 0.00 0.00 3.00 3.00 0.00]\nT [ 2.00 0.00 20.00 20.00 1.00 2.00 20.00 7.00 6.00 19.00 17.00 4.00 2.00 3.00]\n"
    assert_equal expected_text, Bio::Jaspar.write(@motifs[0, 3], "jaspar")
  end

  # The whole motif collection is passed in; the expected output holds a
  # single four-row matrix.
  should "correctly convert Motifs into pfm formatted string" do
    expected_text = " 3.00 21.00 25.00 0.00 0.00 24.00 1.00 0.00\n 13.00 1.00 0.00 0.00 5.00 0.00 0.00 0.00\n 4.00 0.00 0.00 0.00 0.00 1.00 0.00 2.00\n 5.00 3.00 0.00 25.00 20.00 0.00 24.00 23.00\n"
    assert_equal expected_text, Bio::Jaspar.write(@motifs, "pfm")
  end

  # Pseudocounts computed for the first motif are expected to be uniform.
  should "correctly calculate pseudocounts" do
    expected_pseudocounts = {"A" => 1.25, "C" => 1.25, "T" => 1.25, "G" => 1.25}
    assert_equal expected_pseudocounts, Bio::Jaspar.calculate_pseudocounts(@motifs[0])
  end
end
|
101
|
+
|
102
|
+
context 'JASPAR Motif class' do
  # All tests in this context inspect @motif, the first motif of the fixture.

  should "return a correct length" do
    observed_length = @motif.length
    assert_equal 8, observed_length
  end

  should "return a correct consensus sequence" do
    expected_seq = Bio::Sequence.auto("CAATTATT").to_s
    assert_equal expected_seq, @motif.consensus.to_s
  end

  should "return a correct anticonsensus sequence" do
    expected_seq = Bio::Sequence.auto("AGGGGTGA").to_s
    assert_equal expected_seq, @motif.anticonsensus.to_s
  end

  should "return a correct degenerate consensus" do
    expected_seq = Bio::Sequence.auto("CAATTATT").to_s
    assert_equal expected_seq, @motif.degenerate_consensus.to_s
  end

  should "return a correct reverse complement" do
    revcomp = @motif.reverse_complement
    # Expected counts of the reverse-complemented motif, per nucleotide.
    expected_counts = {
      "A" => [23.0, 24.0, 0.0, 20.0, 25.0, 0.0, 3.0, 5.0],
      "C" => [2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 4.0],
      "T" => [0.0, 1.0, 24.0, 0.0, 0.0, 25.0, 21.0, 3.0],
      "G" => [0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 1.0, 13.0]
    }
    assert_equal 0.13, revcomp.counts.gc_content
    assert_equal 8, revcomp.length
    assert_equal expected_counts, revcomp.counts
  end

  should "return a correct mask" do
    # One mask entry per position, all set to 1.
    assert_equal Array.new(8, 1), @motif.mask
  end

  should "return correct pseudocounts" do
    expected_pseudocounts = {"A" => 0.0, "C" => 0.0, "T" => 0.0, "G" => 0.0}
    assert_equal expected_pseudocounts, @motif.pseudocounts
  end

  should "return a correct background" do
    expected_background = {"A" => 0.25, "C" => 0.25, "T" => 0.25, "G" => 0.25}
    assert_equal expected_background, @motif.background
  end

  should "return a correct pwm" do
    expected_pwm = {
      "A" => [0.12, 0.84, 1.0, 0.0, 0.0, 0.96, 0.04, 0.0],
      "C" => [0.52, 0.04, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0],
      "T" => [0.2, 0.12, 0.0, 1.0, 0.8, 0.0, 0.96, 0.92],
      "G" => [0.16, 0.0, 0.0, 0.0, 0.0, 0.04, 0.0, 0.08]
    }
    assert_equal expected_pwm, @motif.pwm
  end

  should "return a correct pssm" do
    # Shorthand for the many negative-infinity entries in the expected matrix.
    neg_inf = -Float::INFINITY
    expected_pssm = {
      "A" => [-1.0588936890535685, 1.7484612330040357, 2.0, neg_inf, neg_inf, 1.9411063109464317, -2.643856189774725, neg_inf],
      "C" => [1.0565835283663676, -2.643856189774725, neg_inf, neg_inf, -0.3219280948873623, neg_inf, neg_inf, neg_inf],
      "T" => [-0.3219280948873623, -1.0588936890535685, neg_inf, 2.0, 1.6780719051126378, neg_inf, 1.9411063109464317, 1.8797057662822885],
      "G" => [-0.6438561897747247, neg_inf, neg_inf, neg_inf, neg_inf, -2.643856189774725, neg_inf, -1.6438561897747248]
    }
    assert_equal expected_pssm, @motif.pssm
  end

  should "correctly format motif in jaspar format" do
    expected_text = ">MA0008.1 HAT5\nA [ 3.00 21.00 25.00 0.00 0.00 24.00 1.00 0.00]\nC [ 13.00 1.00 0.00 0.00 5.00 0.00 0.00 0.00]\nG [ 4.00 0.00 0.00 0.00 0.00 1.00 0.00 2.00]\nT [ 5.00 3.00 0.00 25.00 20.00 0.00 24.00 23.00]\n"
    assert_equal expected_text, @motif.format("jaspar")
  end

  should "correctly format motif in pfm format" do
    expected_text = " 3.00 21.00 25.00 0.00 0.00 24.00 1.00 0.00\n 13.00 1.00 0.00 0.00 5.00 0.00 0.00 0.00\n 4.00 0.00 0.00 0.00 0.00 1.00 0.00 2.00\n 5.00 3.00 0.00 25.00 20.00 0.00 24.00 23.00\n"
    assert_equal expected_text, @motif.format("pfm")
  end

end
|
177
|
+
|
178
|
+
context "matrix" do
  # Absolute tolerance for finite floating-point expectations. Exact float
  # equality is brittle across interpreters and platforms; infinities and
  # whole-array comparisons below are deliberately still compared exactly.
  float_tolerance = 1e-12

  setup do
    @motif2 = @motifs[1]
    # Score distribution of the PSSM of the motif at index 15.
    @non_inf_dist = @motifs[15].pssm.distribution
  end

  should "correctly return maximum possible score" do
    assert_in_delta 14.245035054658192, @motif.pssm.max, float_tolerance
  end

  should "correctly return the minimum possible score" do
    # Compared exactly: (-inf) - (-inf) is NaN, so a delta check cannot pass.
    # Parenthesised to avoid Ruby's "ambiguous first argument" warning.
    assert_equal(-Float::INFINITY, @motif.pssm.min)
  end

  should "correctly refuse fraction gc content calculation on pssm" do
    assert_raise do
      @motif.pssm.gc_content
    end
  end

  should "correctly calculate the mean" do
    assert_in_delta 11.882147864914165, @motif.pssm.mean, float_tolerance
  end

  should "correctly calculate the std" do
    assert_in_delta 2.315187013634166, @motif.pssm.std, float_tolerance
  end

  should "correctly calculates the PWM score for the given sequence" do
    corr_res = [-Float::INFINITY, -Float::INFINITY, -Float::INFINITY, 4.7579989]
    # Finite scores are truncated (floored) to 7 decimal places before the
    # comparison; infinite scores are passed through unchanged.
    res = @motif.pssm.calculate(Bio::Sequence.auto("AGTTAATTAAG")).map do |a|
      a.infinite? ? a : (a * (10 ** 7)).floor / (10.0 ** 7)
    end
    assert_equal corr_res, res
  end

  should "correctly search and return the position of the hits with PWM higher than threshold" do
    corr_hits = [[3, 4.7579989]]
    # Each hit is a (position, score) pair; scores are truncated to 7 decimals.
    hits = @motif.pssm.search(Bio::Sequence.auto("AGTTAATTAAG")).map do |a, b|
      [a, (b * 10 ** 7).floor / (10.0 ** 7)]
    end
    assert_equal corr_hits, hits
  end

  should "correctly compare sequences using Pearson's correlation" do
    corr_pearson = [0.024879199790793116, -10]
    assert_equal corr_pearson, @motif.pssm.dist_pearson(@motif2.pssm)
  end

  should "correctly generate a distribution for non-infinite pssms" do
    # Negative literals are parenthesised/positioned so Ruby does not emit the
    # "ambiguous first argument" warning.
    assert_in_delta(-54.665224748002345, @non_inf_dist.min_score, float_tolerance)
    assert_equal 15000, @non_inf_dist.n_points
    assert_in_delta 77.64489601267111, @non_inf_dist.interval, float_tolerance
    assert_in_delta 0.005176671512278893, @non_inf_dist.step, float_tolerance

    # First and last 100 entries of the motif density (ten values per row).
    corr_md_beg_100 = [
      3.2600762748340303e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      3.4153180022070797e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.61062211083769e-26, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.782556497068055e-26,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      4.075095343542538e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 4.269147502758849e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 4.410691430657807e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.5132776385471104e-26,
      0.0, 0.0, 0.0, 0.0, 4.620724355927226e-26, 0.0, 0.0, 0.0, 0.0, 0.0
    ]
    corr_md_end_100 = [
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006362547198123186, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007703187540387484
    ]
    assert_equal corr_md_beg_100, @non_inf_dist.mo_density[0, 100]
    assert_equal corr_md_end_100, @non_inf_dist.mo_density[@non_inf_dist.mo_density.length-100, 100]

    # First and last 100 entries of the background density (ten values per row).
    corr_bd_beg_100 = [
      9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10,
      0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0
    ]
    corr_bd_end_100 = [
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10
    ]
    assert_equal corr_bd_beg_100, @non_inf_dist.bg_density[0, 100]
    assert_equal corr_bd_end_100, @non_inf_dist.bg_density[@non_inf_dist.bg_density.length-100, 100]
  end

  should "correctly calculate the threshold for false positive rate" do
    assert_in_delta(-11.00517721344216, @non_inf_dist.threshold_fpr(0.1), float_tolerance)
  end

  should "correctly calculate the threshold for false negative rate" do
    assert_in_delta 8.655821190193073, @non_inf_dist.threshold_fnr(0.1), float_tolerance
  end

  should "correctly calculate the balanced threshold" do
    assert_in_delta 0.3058500408872149, @non_inf_dist.threshold_balanced(), float_tolerance
  end

  should "correctly calculate the patser threshold" do
    assert_in_delta 11.435693792286841, @non_inf_dist.threshold_patser(), float_tolerance
  end

end
|
265
|
+
end
|
metadata
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bio-jaspar
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jessica Lee
|
8
|
+
- Wasserman Lab
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2015-09-30 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bio
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ">="
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: 1.4.2
|
21
|
+
type: :runtime
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: 1.4.2
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: mysql2
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: 0.3.19
|
35
|
+
type: :runtime
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: 0.3.19
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: shoulda
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
type: :development
|
50
|
+
prerelease: false
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: rake
|
58
|
+
requirement: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - "~>"
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 0.9.3
|
63
|
+
type: :development
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.9.3
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: rdoc
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '3.12'
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - "~>"
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '3.12'
|
84
|
+
- !ruby/object:Gem::Dependency
|
85
|
+
name: test-unit
|
86
|
+
requirement: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '0'
|
91
|
+
type: :development
|
92
|
+
prerelease: false
|
93
|
+
version_requirements: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: jeweler
|
100
|
+
requirement: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - "~>"
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: 2.0.1
|
105
|
+
type: :development
|
106
|
+
prerelease: false
|
107
|
+
version_requirements: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - "~>"
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: 2.0.1
|
112
|
+
- !ruby/object:Gem::Dependency
|
113
|
+
name: bundler
|
114
|
+
requirement: !ruby/object:Gem::Requirement
|
115
|
+
requirements:
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: 1.0.21
|
119
|
+
type: :development
|
120
|
+
prerelease: false
|
121
|
+
version_requirements: !ruby/object:Gem::Requirement
|
122
|
+
requirements:
|
123
|
+
- - ">="
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: 1.0.21
|
126
|
+
description: Basic tools for parsing, searching, and comparing JASPAR motifs; Based
|
127
|
+
on Bio.motifs module in Biopython
|
128
|
+
email:
|
129
|
+
executables: []
|
130
|
+
extensions: []
|
131
|
+
extra_rdoc_files:
|
132
|
+
- LICENSE.txt
|
133
|
+
- README.md
|
134
|
+
- README.rdoc
|
135
|
+
files:
|
136
|
+
- ".document"
|
137
|
+
- ".travis.yml"
|
138
|
+
- Gemfile
|
139
|
+
- LICENSE.txt
|
140
|
+
- README.md
|
141
|
+
- README.rdoc
|
142
|
+
- Rakefile
|
143
|
+
- lib/bio-jaspar.rb
|
144
|
+
- lib/bio-jaspar/jaspar.rb
|
145
|
+
- test/helper.rb
|
146
|
+
- test/test_bio-jaspar.rb
|
147
|
+
homepage: http://github.com/wassermanlab/jaspar-bioruby
|
148
|
+
licenses:
|
149
|
+
- MIT
|
150
|
+
metadata: {}
|
151
|
+
post_install_message:
|
152
|
+
rdoc_options: []
|
153
|
+
require_paths:
|
154
|
+
- lib
|
155
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
160
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
161
|
+
requirements:
|
162
|
+
- - ">="
|
163
|
+
- !ruby/object:Gem::Version
|
164
|
+
version: '0'
|
165
|
+
requirements: []
|
166
|
+
rubyforge_project:
|
167
|
+
rubygems_version: 2.4.6
|
168
|
+
signing_key:
|
169
|
+
specification_version: 4
|
170
|
+
summary: Tools for JASPAR motifs in BioRuby
|
171
|
+
test_files: []
|