bio-jaspar 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +13 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +20 -0
- data/README.md +147 -0
- data/README.rdoc +126 -0
- data/Rakefile +45 -0
- data/lib/bio-jaspar.rb +15 -0
- data/lib/bio-jaspar/jaspar.rb +432 -0
- data/test/helper.rb +18 -0
- data/test/test_bio-jaspar.rb +265 -0
- metadata +171 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 835fa6794d4fa377301992ede3fc0fc43d0013c0
|
4
|
+
data.tar.gz: 98b718e4a029fec3389078213fd7ab5613f89c38
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 29c60f81959d0880c2b690df72537953816cd5b571c635c835b8207d699cd372d429e01c7f17c69e02c146bc12679940dcb0f1b0fe5663e9a951fb78389a4ae5
|
7
|
+
data.tar.gz: a69f3fca22da7ccbc8c1e2138c809102828a208e3118f944c110986d5491ef7e1b2e33c5dc09d46547e5a2333b7161e50a942f251a32209bb79a9b8f7d8bee87
|
data/.document
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
|
7
|
+
# - rbx-19mode
|
8
|
+
# - 1.8.7
|
9
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
10
|
+
# - rbx-18mode
|
11
|
+
|
12
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
13
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# Fetch gems over HTTPS rather than plain HTTP so downloads are encrypted in transit.
source "https://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
gem "bio", ">= 1.4.2"
|
6
|
+
gem "mysql2", "~> 0.3.19"
|
7
|
+
|
8
|
+
# Add dependencies to develop your gem here.
|
9
|
+
# Include everything needed to run rake, tests, features, etc.
|
10
|
+
group :development do
|
11
|
+
gem "shoulda", ">= 0"
|
12
|
+
gem "rake", "~> 0.9.3"
|
13
|
+
gem "rdoc", "~> 3.12"
|
14
|
+
gem 'test-unit'
|
15
|
+
gem "jeweler", "~> 2.0.1", :git => "https://github.com/technicalpickles/jeweler.git"
|
16
|
+
gem "bundler", ">= 1.0.21"
|
17
|
+
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2015 Wasserman Lab
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
# bio-jaspar
|
2
|
+
|
3
|
+
## Tools for JASPAR motif analysis
|
4
|
+
|
5
|
+
This gem provides methods for:
|
6
|
+
|
7
|
+
1. Reading and writing sequence motifs in JASPAR format
|
8
|
+
2. Accessing a JASPAR5 formatted database
|
9
|
+
3. Comparing, searching, and analyzing motifs in sequences
|
10
|
+
|
11
|
+
<sup>*</sup> **Note:** The JASPAR motif analysis tools consist of several modules that are directly imported from the Bio.motifs package in BioPython. Namely, those modules/submodules are: Bio.motifs, Bio.motifs.matrix, Bio.motifs.thresholds, Bio.motifs.jaspar. The functionality of this gem will be identical to the aforementioned modules/submodules.
|
12
|
+
|
13
|
+
|
14
|
+
## Installation
|
15
|
+
|
16
|
+
```sh
|
17
|
+
gem install bio-jaspar
|
18
|
+
```
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
|
22
|
+
### Loading the gem
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
require 'bio-jaspar'
|
26
|
+
```
|
27
|
+
|
28
|
+
### Loading a motif/motifs from a JASPAR database
|
29
|
+
|
30
|
+
A connection to the JASPAR database is made by creating a JASPAR5 instance.
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
# Substitute the database credentials!
|
34
|
+
db = Bio::Jaspar::JASPAR5.new(
|
35
|
+
:host => <db_host.org>,
|
36
|
+
:name => <db_name>,
|
37
|
+
:user => <db_user>,
|
38
|
+
:password => <db_password>
|
39
|
+
)
|
40
|
+
```
|
41
|
+
|
42
|
+
Now, a motif can be retrieved by the matrix_id
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
m = db.fetch_motif_by_id("MA0049")
|
46
|
+
puts m.to_s
|
47
|
+
```
|
48
|
+
|
49
|
+
Or multiple motifs can be retrieved by various criteria
|
50
|
+
|
51
|
+
```ruby
|
52
|
+
motifs = db.fetch_motifs(
|
53
|
+
:collection => "CORE",
|
54
|
+
:tax_group => ["fungi", "vertebrate"],
|
55
|
+
:tf_class => "Helix-Turn-Helix",
|
56
|
+
:min_ic => 2
|
57
|
+
)
|
58
|
+
motifs.each { |m| puts m.name } # do something with each motif
|
59
|
+
```
|
60
|
+
|
61
|
+
### Motif analysis
|
62
|
+
|
63
|
+
Many methods are available for motif analysis. Here are some examples:
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
m = db.fetch_motif_by_id("MA0049")
|
67
|
+
|
68
|
+
# Consensus sequence
|
69
|
+
m.consensus # BioRuby Sequence object
|
70
|
+
puts m.consensus
|
71
|
+
|
72
|
+
# Anticonsensus sequence
|
73
|
+
m.anticonsensus # BioRuby Sequence object
|
74
|
+
puts m.anticonsensus
|
75
|
+
|
76
|
+
# Reverse complement motif
|
77
|
+
m.reverse_complement # Bio::Motifs::Motif object
|
78
|
+
|
79
|
+
# Pseudocounts
|
80
|
+
m.pseudocounts
|
81
|
+
|
82
|
+
# Background
|
83
|
+
m.background
|
84
|
+
|
85
|
+
# Position weight matrix
|
86
|
+
m.pwm
|
87
|
+
|
88
|
+
# Position specific scoring matrix
|
89
|
+
m.pssm
|
90
|
+
```
|
91
|
+
|
92
|
+
Matrix methods are also available. Here are some examples:
|
93
|
+
|
94
|
+
```ruby
|
95
|
+
m = db.fetch_motif_by_id("MA0049")
|
96
|
+
|
97
|
+
# Maximum possible score for the given motif
|
98
|
+
m.pssm.max
|
99
|
+
|
100
|
+
# Minimum possible score for the given motif
|
101
|
+
m.pssm.min
|
102
|
+
|
103
|
+
# Expected value of the motif score
|
104
|
+
m.pssm.mean
|
105
|
+
|
106
|
+
# Standard deviation of the given motif score
|
107
|
+
m.pssm.std
|
108
|
+
|
109
|
+
# Find hits with the PWM score above given threshold
|
110
|
+
m.pssm.search(Bio::Sequence.auto("ACCTGCCTAAAAAA"), threshold = 0.5)
|
111
|
+
```
|
112
|
+
|
113
|
+
### Read/write Jaspar file
|
114
|
+
|
115
|
+
Previously downloaded pfm, jaspar, and sites files can be read, and motifs can be written in pfm or jaspar format, using the Jaspar module.
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
# Read a pfm file
|
119
|
+
f = File.open("test.pfm", "r")
|
120
|
+
Bio::Jaspar.read(f, "pfm")
|
121
|
+
f.close
|
122
|
+
|
123
|
+
# Write motifs into a jaspar file
|
124
|
+
motifs = db.fetch_motifs(
|
125
|
+
:collection => "CORE",
|
126
|
+
:tax_group => ["fungi", "vertebrate"],
|
127
|
+
:tf_class => "Helix-Turn-Helix",
|
128
|
+
:min_ic => 2
|
129
|
+
)
|
130
|
+
File.open("test.jaspar", "w") do |f|
|
131
|
+
f.write(Bio::Jaspar.write(motifs, "jaspar"))
|
132
|
+
end
|
133
|
+
```
|
134
|
+
|
135
|
+
Please refer to the rdoc for full information on all available methods & classes.
|
136
|
+
|
137
|
+
## Project home page
|
138
|
+
|
139
|
+
For information on the source tree, documentation, examples, issues and
|
140
|
+
how to contribute, see
|
141
|
+
|
142
|
+
http://github.com/wassermanlab/jaspar-bioruby
|
143
|
+
|
144
|
+
## Copyright
|
145
|
+
|
146
|
+
See LICENSE.txt for further details.
|
147
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
= bio-jaspar
|
2
|
+
|
3
|
+
== Tools for JASPAR motif analysis
|
4
|
+
|
5
|
+
This gem provides methods for:
|
6
|
+
|
7
|
+
1. Reading and writing sequence motifs in JASPAR format
|
8
|
+
2. Accessing a JASPAR5 formatted database
|
9
|
+
3. Comparing, searching, and analyzing motifs in sequences
|
10
|
+
|
11
|
+
\* *Note:* The JASPAR motif analysis tools consist of several modules that are directly imported from the Bio.motifs package in BioPython. Namely, those modules/submodules are: Bio.motifs, Bio.motifs.matrix, Bio.motifs.thresholds, Bio.motifs.jaspar. The functionality of this gem will be identical to the aforementioned modules/submodules.
|
12
|
+
|
13
|
+
== Installation
|
14
|
+
|
15
|
+
gem install bio-jaspar
|
16
|
+
|
17
|
+
== Usage
|
18
|
+
|
19
|
+
=== Loading the gem
|
20
|
+
|
21
|
+
require 'bio-jaspar'
|
22
|
+
|
23
|
+
=== Loading a motif/motifs from a JASPAR database
|
24
|
+
|
25
|
+
A connection to the JASPAR database is made by creating a JASPAR5 instance.
|
26
|
+
|
27
|
+
# Substitute the database credentials!
|
28
|
+
db = Bio::Jaspar::JASPAR5.new(
|
29
|
+
:host => <db_host.org>,
|
30
|
+
:name => <db_name>,
|
31
|
+
:user => <db_user>,
|
32
|
+
:password => <db_password>
|
33
|
+
)
|
34
|
+
|
35
|
+
Now, a motif can be retrieved by the matrix_id
|
36
|
+
|
37
|
+
m = db.fetch_motif_by_id("MA0049")
|
38
|
+
puts m.to_s
|
39
|
+
|
40
|
+
Or multiple motifs can be retrieved by various criteria
|
41
|
+
|
42
|
+
motifs = db.fetch_motifs(
|
43
|
+
:collection => "CORE",
|
44
|
+
:tax_group => ["fungi", "vertebrate"],
|
45
|
+
:tf_class => "Helix-Turn-Helix",
|
46
|
+
:min_ic => 2
|
47
|
+
)
|
48
|
+
|
49
|
+
=== Motif analysis
|
50
|
+
|
51
|
+
Many methods are available for motif analysis. Here are some examples:
|
52
|
+
|
53
|
+
m = db.fetch_motif_by_id("MA0049")
|
54
|
+
|
55
|
+
# Consensus sequence
|
56
|
+
m.consensus # BioRuby Sequence object
|
57
|
+
puts m.consensus
|
58
|
+
|
59
|
+
# Anticonsensus sequence
|
60
|
+
m.anticonsensus # BioRuby Sequence object
|
61
|
+
puts m.anticonsensus
|
62
|
+
|
63
|
+
# Reverse complement motif
|
64
|
+
m.reverse_complement # Bio::Motifs::Motif object
|
65
|
+
|
66
|
+
# Pseudocounts
|
67
|
+
m.pseudocounts
|
68
|
+
|
69
|
+
# Background
|
70
|
+
m.background
|
71
|
+
|
72
|
+
# Position weight matrix
|
73
|
+
m.pwm
|
74
|
+
|
75
|
+
# Position specific scoring matrix
|
76
|
+
m.pssm
|
77
|
+
|
78
|
+
Matrix methods are also available. Here are some examples:
|
79
|
+
|
80
|
+
m = db.fetch_motif_by_id("MA0049")
|
81
|
+
|
82
|
+
# Maximum possible score for the given motif
|
83
|
+
m.pssm.max
|
84
|
+
|
85
|
+
# Minimum possible score for the given motif
|
86
|
+
m.pssm.min
|
87
|
+
|
88
|
+
# Expected value of the motif score
|
89
|
+
m.pssm.mean
|
90
|
+
|
91
|
+
# Standard deviation of the given motif score
|
92
|
+
m.pssm.std
|
93
|
+
|
94
|
+
# Find hits with the PWM score above given threshold
|
95
|
+
m.pssm.search(Bio::Sequence.auto("ACCTGCCTAAAAAA"), threshold = 0.5)
|
96
|
+
|
97
|
+
=== Read/write Jaspar file
|
98
|
+
|
99
|
+
Previously downloaded pfm, jaspar, and sites files can be read, and motifs can be written in pfm or jaspar format, using the Jaspar module.
|
100
|
+
|
101
|
+
# Read a pfm file
|
102
|
+
f = File.open("test.pfm", "r")
|
103
|
+
Bio::Jaspar.read(f, "pfm")
|
104
|
+
f.close
|
105
|
+
|
106
|
+
# Write motifs into a jaspar file
|
107
|
+
motifs = db.fetch_motifs(
|
108
|
+
:collection => "CORE",
|
109
|
+
:tax_group => ["fungi", "vertebrate"],
|
110
|
+
:tf_class => "Helix-Turn-Helix",
|
111
|
+
:min_ic => 2
|
112
|
+
)
|
113
|
+
File.open("test.jaspar", "w") do |f|
|
114
|
+
f.write(Bio::Jaspar.write(motifs, "jaspar"))
|
115
|
+
end
|
116
|
+
|
117
|
+
== Project home page
|
118
|
+
|
119
|
+
For information on the source tree, documentation, issues and how to contribute, see
|
120
|
+
|
121
|
+
http://github.com/wassermanlab/jaspar-bioruby
|
122
|
+
|
123
|
+
== Copyright
|
124
|
+
|
125
|
+
See LICENSE.txt for further details.
|
126
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
# gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
|
17
|
+
gem.name = "bio-jaspar"
|
18
|
+
gem.homepage = "http://github.com/wassermanlab/jaspar-bioruby"
|
19
|
+
gem.license = "MIT"
|
20
|
+
gem.summary = %Q{Tools for JASPAR motifs in BioRuby}
|
21
|
+
gem.description = %Q{Basic tools for parsing, searching, and comparing JASPAR motifs; Based on Bio.motifs module in Biopython}
|
22
|
+
gem.authors = ["Jessica Lee", "Wasserman Lab"]
|
23
|
+
# dependencies defined in Gemfile
|
24
|
+
end
|
25
|
+
Jeweler::RubygemsDotOrgTasks.new
|
26
|
+
|
27
|
+
require 'rake/testtask'
|
28
|
+
Rake::TestTask.new(:test) do |test|
|
29
|
+
test.libs << 'lib' << 'test'
|
30
|
+
test.pattern = 'test/**/test_*.rb'
|
31
|
+
test.verbose = true
|
32
|
+
end
|
33
|
+
|
34
|
+
task :default => :test
|
35
|
+
|
36
|
+
require 'rdoc/task'
|
37
|
+
Rake::RDocTask.new do |rdoc|
|
38
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
39
|
+
|
40
|
+
rdoc.rdoc_dir = 'rdoc'
|
41
|
+
rdoc.title = "bio-jaspar #{version}"
|
42
|
+
rdoc.rdoc_files.include('README*')
|
43
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
44
|
+
rdoc.main = "README.rdoc"
|
45
|
+
end
|
data/lib/bio-jaspar.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-jaspar/motifs.rb'
|
12
|
+
require 'bio-jaspar/matrix.rb'
|
13
|
+
require 'bio-jaspar/thresholds.rb'
|
14
|
+
require 'bio-jaspar/jaspar.rb'
|
15
|
+
require 'bio-jaspar/db.rb'
|
@@ -0,0 +1,432 @@
|
|
1
|
+
#--
|
2
|
+
# = bio-jaspar/jaspar.rb
|
3
|
+
#
|
4
|
+
# Copyright:: (C) 2015-2015 Wasserman Lab
|
5
|
+
# License:: Ruby License
|
6
|
+
#
|
7
|
+
# JASPAR 2014 module
|
8
|
+
#
|
9
|
+
# A direct import of Bio.motifs.jaspar module in Biopython
|
10
|
+
#++
|
11
|
+
|
12
|
+
require 'bio'
|
13
|
+
|
14
|
+
module Bio # :nodoc:
|
15
|
+
# == JASPAR 2014 module
|
16
|
+
#
|
17
|
+
# Provides read access to a JASPAR5 formatted database.
|
18
|
+
#
|
19
|
+
# This module is a direct import of Bio.motifs.jaspar module in Biopython.
|
20
|
+
# The following document contains excerpts from Bio.motifs.jaspar module
|
21
|
+
# in Biopython.
|
22
|
+
module Jaspar
|
23
|
+
|
24
|
+
# Unambiguous DNA bases
|
25
|
+
DNA = Bio::Motifs::Alphabet.new.IUPAC_unambiguous_dna
|
26
|
+
|
27
|
+
# JASPAR OUTPUT specific DNA bases
|
28
|
+
JASPAR_ORDERED_DNA_LETTERS = ["A","C","G","T"] # Jaspar requires specific order for printouts
|
29
|
+
|
30
|
+
# A subclass of Bio::Motifs::Motif used to represent a JASPAR profile.
#
# Additional metadata is stored when available. Which fields are filled in
# depends on the source of the JASPAR motif (a 'pfm' format file, a 'jaspar'
# format file or a JASPAR database).
#
# <i>A direct import of Bio.motifs.jaspar module in Biopython</i>
class Motif < Bio::Motifs::Motif
  attr_accessor :matrix_id, :collection, :tf_class, :tf_family, :species,
                :tax_group, :acc, :data_type, :medline, :pazar_id, :comment

  # Optional metadata fields paired with the label #to_s prints for each,
  # listed in output order.
  METADATA_LABELS = [
    [:collection, "Collection"],
    [:tf_class,   "TF class"],
    [:tf_family,  "TF family"],
    [:species,    "Species"],
    [:tax_group,  "Taxonomic group"],
    [:acc,        "Accession"],
    [:data_type,  "Data type used"],
    [:medline,    "Medline"],
    [:pazar_id,   "PAZAR ID"],
    [:comment,    "Comments"]
  ].freeze

  # Construct a JASPAR Motif instance.
  #
  # ==== Arguments
  #
  # * +matrix_id+ - JASPAR matrix ID (e.g. "MA0008.1"); nil for motifs read
  #   from 'pfm' or 'sites' files
  # * +name+ - motif name
  # * +opts+ - optional hash. +:alphabet+ (defaults to DNA), +:instances+ and
  #   +:counts+ are passed to the superclass constructor; every other
  #   recognised key (+:collection+, +:tf_class+, +:tf_family+, +:species+,
  #   +:tax_group+, +:acc+, +:data_type+, +:medline+, +:pazar_id+,
  #   +:comment+) is stored as metadata and defaults to nil
  def initialize(matrix_id, name, opts = {})
    alphabet = opts.fetch(:alphabet, DNA)
    super(alphabet, opts[:instances], opts[:counts])

    @name = name
    @matrix_id = matrix_id
    METADATA_LABELS.each do |field, _label|
      instance_variable_set("@#{field}", opts[field])
    end
  end

  # Return the JASPAR base matrix ID, or nil when the motif has no matrix ID.
  def base_id
    @matrix_id && Jaspar.split_jaspar_id(@matrix_id).first
  end

  # Return the JASPAR matrix version, or nil when the motif has no matrix ID
  # or the ID carries no version suffix.
  def version
    @matrix_id && Jaspar.split_jaspar_id(@matrix_id).last
  end

  # Return a string representation of the JASPAR profile.
  #
  # Only metadata fields that are filled in are printed.
  def to_s
    text = "TF name\t#{@name}\n"
    text << "Matrix ID\t#{@matrix_id}\n"

    METADATA_LABELS.each do |field, label|
      value = instance_variable_get("@#{field}")
      next unless value

      # Species is normally a list; a plain string is printed as-is instead
      # of failing on a missing #join.
      value = value.join(",") if field == :species && value.respond_to?(:join)
      text << "#{label}\t#{value}\n"
    end

    text << "Matrix:\n#{counts}\n\n"
  end

  # Return the hash key corresponding to the JASPAR profile.
  #
  # Note: matrix IDs are assumed to be unique.
  def hash
    @matrix_id.hash
  end

  # Compare two JASPAR motifs for equality. Two motifs are equal if their
  # matrix_ids match; anything that does not expose a matrix_id is unequal.
  def ==(other)
    other.respond_to?(:matrix_id) && @matrix_id == other.matrix_id
  end

  # Hash lookups use eql?, so it has to agree with #hash and #== for motifs
  # to behave as hash keys identified by their matrix ID.
  alias_method :eql?, :==
end
|
155
|
+
|
156
|
+
# Represent a list of JASPAR motifs.
#
# <i>A direct import of Bio.motifs.jaspar module in Biopython</i>
#
# ==== Attributes
#
# * +version+ - The JASPAR version used
class Record < Array
  # The JASPAR version used; nil until a caller assigns it.
  attr_accessor :version

  # Construct an empty record instance
  def initialize
    super()
    @version = nil
  end

  # Return a string of all JASPAR motifs in the list, separated by newlines
  def to_s
    map(&:to_s).join("\n")
  end

  # Return the motifs as a hash keyed by matrix_id (ruby equivalent of a
  # Python dict). Later motifs overwrite earlier ones with the same ID.
  def to_h
    each_with_object({}) { |motif, by_id| by_id[motif.matrix_id] = motif }
  end
end
|
185
|
+
|
186
|
+
# Read motif(s) from a file in one of several different JASPAR formats.
#
# ==== Arguments
#
# * +handle+ - object the lines are read from (e.g. an open File)
# * +format+ - "pfm", "sites" or "jaspar"; case-insensitive, and a Symbol
#   such as +:pfm+ is accepted as well
#
# Returns the record of PFM(s) produced by the reader for that format.
# Raises ArgumentError for any other format.
def self.read(handle, format)
  format = format.to_s.downcase

  case format
  when "pfm"    then _read_pfm(handle)
  when "sites"  then _read_sites(handle)
  when "jaspar" then _read_jaspar(handle)
  else
    raise ArgumentError, "Unknown JASPAR format #{format}"
  end
end
|
206
|
+
|
207
|
+
# Return the representation of motifs in "pfm" or "jaspar" format.
#
# ==== Arguments
#
# * +motifs+ - list of motifs; each must respond to +counts+ (and to
#   +matrix_id+ and +name+ for the "jaspar" format)
# * +format+ - "pfm" or "jaspar"; case-insensitive, matching Jaspar.read
#
# The "pfm" format can hold a single matrix, so only the first motif is
# written. An empty list yields an empty string in both formats.
#
# Returns the formatted text as a String. Raises ArgumentError for any
# other format.
def self.write(motifs, format)
  # Render one motif's counts as [letter, "v1 v2 ..."] pairs, in the letter
  # order JASPAR requires for printouts.
  formatted_rows = lambda do |motif|
    table = motif.counts
    JASPAR_ORDERED_DNA_LETTERS.map do |letter|
      [letter, table[letter].map { |value| "%6.2f" % value }.join(" ")]
    end
  end

  case format.to_s.downcase
  when "pfm"
    motif = motifs[0]
    return "" if motif.nil?

    formatted_rows.call(motif).map { |_letter, row| "#{row}\n" }.join
  when "jaspar"
    chunks = motifs.map do |m|
      rows = formatted_rows.call(m).map { |letter, row| "#{letter} [#{row}]\n" }
      ">#{m.matrix_id} #{m.name}\n" + rows.join
    end
    chunks.join
  else
    raise ArgumentError, "Unknown JASPAR format #{format}"
  end
end
|
238
|
+
|
239
|
+
# Return pseudocounts of a given JASPAR motif.
#
# The pseudocount of a letter is the square root of the average total count
# per motif position, multiplied by that letter's normalised background
# frequency. When the motif has no background, every letter of its alphabet
# is weighted equally.
#
# ==== Arguments
#
# * +motif+ - object responding to +alphabet+ (with +letters+), +background+,
#   +counts+ and +length+
#
# Returns a hash mapping each alphabet letter to its pseudocount (Float).
# The motif's own background is not modified.
def self.calculate_pseudocounts(motif)
  letters = motif.alphabet.letters

  total = 0
  (0...motif.length).each do |position|
    total += letters.map { |letter| motif.counts[letter][position].to_f }.inject(:+)
  end
  sq_nb_instances = Math.sqrt(total / motif.length)

  background = motif.background
  weights = if background
              Hash[background] # copy, so the caller's background stays untouched
            else
              Hash[letters.map { |letter| [letter, 1.0] }]
            end

  # Force float division: Integer weights such as 1 would otherwise normalise
  # to 0 and wipe out every pseudocount.
  weight_sum = weights.values.inject(:+).to_f

  Hash[letters.map { |letter| [letter, sq_nb_instances * (weights[letter] / weight_sum)] }]
end
|
268
|
+
|
269
|
+
# Utility function to split a JASPAR matrix ID into its components.
#
# Components are the base ID and the version, both as strings, e.g.
# 'MA0047.2' is returned as ['MA0047', '2'].
#
# An ID that does not consist of exactly two dot-separated parts is returned
# whole as the base ID with a nil version. A nil ID (as carried by motifs
# read from 'pfm' or 'sites' files) yields [nil, nil].
def self.split_jaspar_id(id)
  return [nil, nil] if id.nil?

  parts = id.split(".")
  parts.length == 2 ? parts : [id, nil]
end
|
288
|
+
|
289
|
+
# Private methods
|
290
|
+
private
|
291
|
+
|
292
|
+
# Read the motif from a JASPAR .pfm file (PRIVATE).
#
# The first four lines of +handle+ are taken as the count rows for A, C, G
# and T, in that order. A row may optionally start with its own letter,
# which is dropped before the counts are parsed.
#
# Returns a Record holding the single motif (created without matrix ID or
# name). Raises ArgumentError when the input has fewer than four lines.
def self._read_pfm(handle)
  counts = {}

  JASPAR_ORDERED_DNA_LETTERS.zip(handle).each do |letter, line|
    # zip pads with nil once the input runs out of lines
    if line.nil?
      raise ArgumentError, "Incomplete PFM: no count row found for #{letter}"
    end

    words = line.split
    words.shift if words.first == letter
    counts[letter] = words.map(&:to_f)
  end

  motif = Motif.new(nil, nil, :alphabet => DNA, :counts => counts)
  motif.mask = "*" * motif.length

  record = Record.new
  record << motif
  record
end
|
313
|
+
|
314
|
+
# Read the motif from JASPAR .sites file (PRIVATE).
#
# The input is consumed as (header, sequence) line pairs. Reading stops at
# the first pair whose header does not start with ">". Of each sequence line
# only the characters that equal their own upper-case form are kept.
#
# Lines are paired with each_slice rather than a second enumerator over the
# same handle, so pairing does not depend on a shared read position and any
# object with #each (File, StringIO, Array of lines) works.
#
# Returns a Record holding the single motif built from the collected
# instances. Raises ArgumentError when a header has no sequence line after it.
def self._read_sites(handle)
  sequences = []

  handle.to_enum.each_slice(2) do |header, sequence|
    break unless header.start_with?(">")

    if sequence.nil?
      raise ArgumentError, "Missing sequence line after header #{header.strip}"
    end

    site = sequence.strip.each_char.select { |c| c == c.upcase }.join
    sequences << Bio::Sequence.auto(site)
  end

  instances = Bio::Motifs::Instances.new(sequences, DNA)
  motif = Motif.new(nil, nil, :alphabet => DNA, :instances => instances)
  motif.mask = "*" * motif.length

  record = Record.new
  record << motif
  record
end
|
345
|
+
|
346
|
+
# Read motifs from a JASPAR formatted file (PRIVATE).
#
# Format is one or more records of the form, e.g.::
#
# - JASPAR 2010 matrix_only format::
#
# >MA0001.1 AGL3
# A [ 0 3 79 40 66 48 65 11 65 0 ]
# C [94 75 4 3 1 2 5 2 3 3 ]
# G [ 1 0 3 4 1 0 5 3 28 88 ]
# T [ 2 19 11 50 29 47 22 81 1 6 ]
#
# - JASPAR 2010-2014 PFMs format::
#
# >MA0001.1 AGL3
# 0 3 79 40 66 48 65 11 65 0
# 94 75 4 3 1 2 5 2 3 3
# 1 0 3 4 1 0 5 3 28 88
# 2 19 11 50 29 47 22 81 1 6
#
# A header line supplies the matrix ID and, when present, the name (the ID
# doubles as the name otherwise). Bracketed rows carry their own letter;
# bare rows are assigned to A, C, G, T in the order they appear. A motif is
# emitted after every fourth row. Blank lines are ignored.
#
# A header also discards any rows collected since the last complete motif,
# so a truncated record cannot leak its counts into the next one.
#
# Returns a Record with one motif per complete record.
def self._read_jaspar(handle)
  head_pat = /^>\s*(\S+)(\s+(\S+))?/
  row_pat_long = /\s*([ACGT])\s*\[\s*(.*)\s*\]/
  row_pat_short = /\s*(.+)\s*/
  nucleotides = ["A", "C", "G", "T"]

  record = Record.new
  identifier = nil
  name = nil
  counts = {}
  row_count = 0

  handle.each do |line|
    line = line.strip

    if (head = head_pat.match(line))
      identifier = head[1]
      name = head[3] || identifier
      counts = {}
      row_count = 0
      next
    end

    if (row = row_pat_long.match(line))
      letter = row[1]
      values = row[2]
    elsif (row = row_pat_short.match(line))
      letter = nucleotides[row_count]
      values = row[1]
    else
      next # blank line
    end

    counts[letter] = values.split.map(&:to_f)
    row_count += 1
    next unless row_count == 4

    record << Motif.new(identifier, name, :alphabet => DNA, :counts => counts)
    identifier = nil
    name = nil
    counts = {}
    row_count = 0
  end

  record
end
|
428
|
+
|
429
|
+
private_class_method :_read_pfm, :_read_sites, :_read_jaspar
|
430
|
+
|
431
|
+
end
|
432
|
+
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'bundler'
|
3
|
+
begin
|
4
|
+
Bundler.setup(:default, :development)
|
5
|
+
rescue Bundler::BundlerError => e
|
6
|
+
$stderr.puts e.message
|
7
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
8
|
+
exit e.status_code
|
9
|
+
end
|
10
|
+
require 'test/unit'
|
11
|
+
require 'shoulda'
|
12
|
+
|
13
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
14
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
15
|
+
require 'bio-jaspar'
|
16
|
+
|
17
|
+
class Test::Unit::TestCase
|
18
|
+
end
|
@@ -0,0 +1,265 @@
|
|
1
|
+
require 'helper'
|
2
|
+
require 'bio-jaspar'
|
3
|
+
require 'bio'
|
4
|
+
|
5
|
+
class TestBioJaspar < Test::Unit::TestCase
|
6
|
+
context 'JASPAR module' do
|
7
|
+
should "correctly read jaspar formatted file" do
|
8
|
+
f = File.open('test/data/jaspar-test.jaspar', "r")
|
9
|
+
motifs = Bio::Motifs.parse(f, "jaspar")
|
10
|
+
f.close
|
11
|
+
|
12
|
+
# Test first motif in the set
|
13
|
+
corr_motifs_beg_counts = {
|
14
|
+
"A" => [3.0, 21.0, 25.0, 0.0, 0.0, 24.0, 1.0, 0.0],
|
15
|
+
"C" => [13.0, 1.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0],
|
16
|
+
"T" => [5.0, 3.0, 0.0, 25.0, 20.0, 0.0, 24.0, 23.0],
|
17
|
+
"G" => [4.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0]
|
18
|
+
}
|
19
|
+
assert_equal corr_motifs_beg_counts, motifs[0].counts
|
20
|
+
assert_equal 8, motifs[0].length
|
21
|
+
assert_equal "HAT5", motifs[0].name
|
22
|
+
assert_equal "MA0008.1", motifs[0].matrix_id
|
23
|
+
|
24
|
+
# Test the last motif in the set
|
25
|
+
corr_motifs_end_counts = {
|
26
|
+
"A" => [4.0, 5.0, 5.0, 3.0, 0.0, 0.0, 25.0, 26.0, 0.0, 0.0, 26.0, 0.0, 0.0, 17.0, 0.0, 5.0, 2.0, 0.0, 0.0],
|
27
|
+
"C" => [2.0, 3.0, 4.0, 8.0, 1.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 25.0, 1.0, 6.0, 3.0, 5.0],
|
28
|
+
"T" => [2.0, 3.0, 5.0, 5.0, 0.0, 20.0, 0.0, 0.0, 26.0, 0.0, 0.0, 26.0, 23.0, 0.0, 0.0, 7.0, 4.0, 3.0, 0.0],
|
29
|
+
"G" => [1.0, 3.0, 2.0, 2.0, 25.0, 0.0, 1.0, 0.0, 0.0, 26.0, 0.0, 0.0, 3.0, 9.0, 1.0, 4.0, 0.0, 4.0, 3.0]
|
30
|
+
}
|
31
|
+
assert_equal corr_motifs_end_counts, motifs[-1].counts
|
32
|
+
assert_equal 19, motifs[-1].length
|
33
|
+
assert_equal "ATHB9", motifs[-1].name
|
34
|
+
assert_equal "MA0573.1", motifs[-1].matrix_id
|
35
|
+
end
|
36
|
+
|
37
|
+
should "correctly read pfm formatted file" do
|
38
|
+
f = File.open('test/data/jaspar-test.pfm', "r")
|
39
|
+
motif = Bio::Motifs.parse(f, "pfm")
|
40
|
+
f.close
|
41
|
+
|
42
|
+
corr_counts = {
|
43
|
+
"A" => [3.0, 21.0, 25.0, 0.0, 0.0, 24.0, 1.0, 0.0],
|
44
|
+
"C" => [13.0, 1.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0],
|
45
|
+
"T" => [5.0, 3.0, 0.0, 25.0, 20.0, 0.0, 24.0, 23.0],
|
46
|
+
"G" => [4.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0]
|
47
|
+
}
|
48
|
+
assert_equal 1, motif.length # should only read one motif
|
49
|
+
assert_equal 8, motif[0].length
|
50
|
+
assert_equal corr_counts, motif[0].counts
|
51
|
+
end
|
52
|
+
|
53
|
+
should "correctly read sites formatted file" do
|
54
|
+
f = File.open('test/data/jaspar-test.sites', "r")
|
55
|
+
motif = Bio::Motifs.parse(f, "sites")
|
56
|
+
f.close
|
57
|
+
|
58
|
+
corr_counts = {
|
59
|
+
"A" => [15, 4, 41, 36, 7, 19, 3],
|
60
|
+
"C" => [11, 35, 1, 2, 29, 14, 22],
|
61
|
+
"T" => [7, 2, 0, 1, 1, 3, 3],
|
62
|
+
"G" => [10, 2, 1, 4, 6, 7, 15]
|
63
|
+
}
|
64
|
+
assert_equal 7, motif[0].length
|
65
|
+
assert_equal corr_counts, motif[0].counts
|
66
|
+
|
67
|
+
# Check the first and last sequence motifs
|
68
|
+
assert_equal "ccaaccc", motif[0].instances[0].to_s
|
69
|
+
assert_equal "gtatctc", motif[0].instances[-1].to_s
|
70
|
+
end
|
71
|
+
|
72
|
+
end
|
73
|
+
|
74
|
+
# Once the read tests pass, load the JASPAR fixture and set up shared state:
# @motifs holds everything parsed from the file, @motif is its first entry.
setup do
  # Block form closes the file handle even if parsing raises. Both
  # assignments stay inside the block so the file is still open while
  # they run, exactly as before.
  File.open('test/data/jaspar-test.jaspar', "r") do |f|
    @motifs = Bio::Motifs.parse(f, "jaspar")
    @motif = @motifs.first
  end
end
|
81
|
+
|
82
|
+
# Module-level helpers of Bio::Jaspar: serialising motif collections and
# computing pseudocounts.
context 'JASPAR module' do
  # The first three motifs written in "jaspar" format must match the
  # expected multi-record text exactly.
  should "correctly convert Motifs into jaspar formatted string" do
    expected = ">MA0008.1 HAT5\nA [ 3.00 21.00 25.00 0.00 0.00 24.00 1.00 0.00]\nC [ 13.00 1.00 0.00 0.00 5.00 0.00 0.00 0.00]\nG [ 4.00 0.00 0.00 0.00 0.00 1.00 0.00 2.00]\nT [ 5.00 3.00 0.00 25.00 20.00 0.00 24.00 23.00]\n>MA0027.1 En1\nA [ 4.00 5.00 3.00 0.00 4.00 3.00 3.00 2.00 1.00 1.00 1.00]\nC [ 1.00 2.00 0.00 0.00 0.00 0.00 0.00 1.00 3.00 4.00 6.00]\nG [ 2.00 2.00 7.00 2.00 3.00 7.00 0.00 4.00 3.00 1.00 1.00]\nT [ 3.00 1.00 0.00 8.00 3.00 0.00 7.00 3.00 3.00 4.00 2.00]\n>MA0046.1 HNF1A\nA [ 5.00 1.00 1.00 1.00 20.00 16.00 1.00 8.00 14.00 2.00 0.00 13.00 8.00 5.00]\nC [ 0.00 0.00 0.00 0.00 0.00 2.00 0.00 2.00 0.00 0.00 4.00 1.00 8.00 13.00]\nG [ 14.00 20.00 0.00 0.00 0.00 1.00 0.00 4.00 1.00 0.00 0.00 3.00 3.00 0.00]\nT [ 2.00 0.00 20.00 20.00 1.00 2.00 20.00 7.00 6.00 19.00 17.00 4.00 2.00 3.00]\n"
    assert_equal(expected, Bio::Jaspar.write(@motifs[0, 3], "jaspar"))
  end

  # Writing the whole collection in "pfm" format is expected to produce
  # the single matrix below.
  should "correctly convert Motifs into pfm formatted string" do
    expected = " 3.00 21.00 25.00 0.00 0.00 24.00 1.00 0.00\n 13.00 1.00 0.00 0.00 5.00 0.00 0.00 0.00\n 4.00 0.00 0.00 0.00 0.00 1.00 0.00 2.00\n 5.00 3.00 0.00 25.00 20.00 0.00 24.00 23.00\n"
    assert_equal(expected, Bio::Jaspar.write(@motifs, "pfm"))
  end

  # Pseudocounts computed for the first motif are expected to be 1.25 for
  # every nucleotide.
  should "correctly calculate pseudocounts" do
    expected = {"A" => 1.25, "C" => 1.25, "T" => 1.25, "G" => 1.25}
    assert_equal(expected, Bio::Jaspar.calculate_pseudocounts(@motifs[0]))
  end
end
|
101
|
+
|
102
|
+
# Instance-level behaviour of a single motif (@motif, the first entry
# loaded in setup): derived sequences, matrices and text formatting.
context 'JASPAR Motif class' do
  should "return a correct length" do
    assert_equal(8, @motif.length)
  end

  should "return a correct consensus sequence" do
    expected = Bio::Sequence.auto("CAATTATT").to_s
    assert_equal(expected, @motif.consensus.to_s)
  end

  should "return a correct anticonsensus sequence" do
    expected = Bio::Sequence.auto("AGGGGTGA").to_s
    assert_equal(expected, @motif.anticonsensus.to_s)
  end

  should "return a correct degenerate consensus" do
    expected = Bio::Sequence.auto("CAATTATT").to_s
    assert_equal(expected, @motif.degenerate_consensus.to_s)
  end

  # The reverse complement keeps the width and has the count matrix below.
  should "return a correct reverse complement" do
    expected_counts = {
      "A" => [23.0, 24.0, 0.0, 20.0, 25.0, 0.0, 3.0, 5.0],
      "C" => [2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 4.0],
      "T" => [0.0, 1.0, 24.0, 0.0, 0.0, 25.0, 21.0, 3.0],
      "G" => [0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 1.0, 13.0]
    }
    revcomp = @motif.reverse_complement
    assert_equal(0.13, revcomp.counts.gc_content)
    assert_equal(8, revcomp.length)
    assert_equal(expected_counts, revcomp.counts)
  end

  should "return a correct mask" do
    assert_equal(Array.new(8, 1), @motif.mask)
  end

  should "return correct pseudocounts" do
    expected = {"A" => 0.0, "C" => 0.0, "T" => 0.0, "G" => 0.0}
    assert_equal(expected, @motif.pseudocounts)
  end

  should "return a correct background" do
    expected = {"A" => 0.25, "C" => 0.25, "T" => 0.25, "G" => 0.25}
    assert_equal(expected, @motif.background)
  end

  should "return a correct pwm" do
    expected = {
      "A" => [0.12, 0.84, 1.0, 0.0, 0.0, 0.96, 0.04, 0.0],
      "C" => [0.52, 0.04, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0],
      "T" => [0.2, 0.12, 0.0, 1.0, 0.8, 0.0, 0.96, 0.92],
      "G" => [0.16, 0.0, 0.0, 0.0, 0.0, 0.04, 0.0, 0.08]
    }
    assert_equal(expected, @motif.pwm)
  end

  # Expected PSSM entries that are negative infinity are written through a
  # short local alias to keep the rows readable.
  should "return a correct pssm" do
    ninf = -Float::INFINITY
    expected = {
      "A" => [-1.0588936890535685, 1.7484612330040357, 2.0, ninf, ninf, 1.9411063109464317, -2.643856189774725, ninf],
      "C" => [1.0565835283663676, -2.643856189774725, ninf, ninf, -0.3219280948873623, ninf, ninf, ninf],
      "T" => [-0.3219280948873623, -1.0588936890535685, ninf, 2.0, 1.6780719051126378, ninf, 1.9411063109464317, 1.8797057662822885],
      "G" => [-0.6438561897747247, ninf, ninf, ninf, ninf, -2.643856189774725, ninf, -1.6438561897747248]
    }
    assert_equal(expected, @motif.pssm)
  end

  should "correctly format motif in jaspar format" do
    expected = ">MA0008.1 HAT5\nA [ 3.00 21.00 25.00 0.00 0.00 24.00 1.00 0.00]\nC [ 13.00 1.00 0.00 0.00 5.00 0.00 0.00 0.00]\nG [ 4.00 0.00 0.00 0.00 0.00 1.00 0.00 2.00]\nT [ 5.00 3.00 0.00 25.00 20.00 0.00 24.00 23.00]\n"
    assert_equal(expected, @motif.format("jaspar"))
  end

  should "correctly format motif in pfm format" do
    expected = " 3.00 21.00 25.00 0.00 0.00 24.00 1.00 0.00\n 13.00 1.00 0.00 0.00 5.00 0.00 0.00 0.00\n 4.00 0.00 0.00 0.00 0.00 1.00 0.00 2.00\n 5.00 3.00 0.00 25.00 20.00 0.00 24.00 23.00\n"
    assert_equal(expected, @motif.format("pfm"))
  end

end
|
177
|
+
|
178
|
+
# Exercises the PSSM object returned by Motif#pssm (extreme scores, summary
# statistics, sequence scoring, comparison) and the score distribution
# obtained from a PSSM.
context "matrix" do
  # @motif2 is the second parsed motif, used for the pairwise comparison.
  # @non_inf_dist is the distribution of the PSSM of the motif at index 15
  # (presumably chosen because that PSSM has no infinite entries — TODO
  # confirm against the fixture).
  setup do
    @motif2 = @motifs[1]
    @non_inf_dist = @motifs[15].pssm.distribution
  end

  should "correctly return maximum possible score" do
    assert_equal 14.245035054658192, @motif.pssm.max
  end

  should "correctly return the minimum possible score" do
    # Parentheses avoid Ruby's "ambiguous first argument" warning for a
    # leading minus.
    assert_equal(-Float::INFINITY, @motif.pssm.min)
  end

  should "correctly refuse fraction gc content calculation on pssm" do
    assert_raise do
      @motif.pssm.gc_content
    end
  end

  should "correctly calculate the mean" do
    assert_equal 11.882147864914165, @motif.pssm.mean
  end

  should "correctly calculate the std" do
    assert_equal 2.315187013634166, @motif.pssm.std
  end

  should "correctly calculates the PWM score for the given sequence" do
    corr_res = [-Float::INFINITY, -Float::INFINITY, -Float::INFINITY, 4.7579989]
    # Finite scores are truncated to 7 decimals before comparison; infinite
    # scores are passed through unchanged.
    res = @motif.pssm.calculate(Bio::Sequence.auto("AGTTAATTAAG")).map { |score|
      if score.infinite?
        score
      else
        (score * (10 ** 7)).floor / (10.0 ** 7)
      end
    }
    assert_equal corr_res, res
  end

  should "correctly search and return the position of the hits with PWM higher than threshold" do
    corr_hits = [[3, 4.7579989]]
    # Each hit is a [position, score] pair; the score is truncated to 7
    # decimals before comparison.
    hits = @motif.pssm.search(Bio::Sequence.auto("AGTTAATTAAG")).map { |pos, score|
      [pos, (score * (10 ** 7)).floor / (10.0 ** 7)]
    }
    assert_equal corr_hits, hits
  end

  should "correctly compare sequences using Pearson's correlation" do
    corr_pearson = [0.024879199790793116, -10]
    assert_equal corr_pearson, @motif.pssm.dist_pearson(@motif2.pssm)
  end

  # Checks the distribution's scalar parameters, then the first and last
  # 100 entries of both the motif and background densities.
  should "correctly generate a distribution for non-infinite pssms" do
    assert_equal(-54.665224748002345, @non_inf_dist.min_score)
    assert_equal 15000, @non_inf_dist.n_points
    assert_equal 77.64489601267111, @non_inf_dist.interval
    assert_equal 0.005176671512278893, @non_inf_dist.step

    corr_md_beg_100 = [3.2600762748340303e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.4153180022070797e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.61062211083769e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.782556497068055e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.075095343542538e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.269147502758849e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.410691430657807e-26, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.5132776385471104e-26, 0.0, 0.0, 0.0, 0.0, 4.620724355927226e-26, 0.0, 0.0, 0.0, 0.0, 0.0]
    corr_md_end_100 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.006362547198123186, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.007703187540387484]
    assert_equal corr_md_beg_100, @non_inf_dist.mo_density[0, 100]
    assert_equal corr_md_end_100, @non_inf_dist.mo_density[@non_inf_dist.mo_density.length-100, 100]

    corr_bd_beg_100 = [9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0]
    corr_bd_end_100 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.313225746154785e-10]
    assert_equal corr_bd_beg_100, @non_inf_dist.bg_density[0, 100]
    assert_equal corr_bd_end_100, @non_inf_dist.bg_density[@non_inf_dist.bg_density.length-100, 100]
  end

  should "correctly calculate the threshold for false positive rate" do
    assert_equal(-11.00517721344216, @non_inf_dist.threshold_fpr(0.1))
  end

  should "correctly calculate the threshold for false negative rate" do
    assert_equal 8.655821190193073, @non_inf_dist.threshold_fnr(0.1)
  end

  should "correctly calculate the balanced threshold" do
    assert_equal 0.3058500408872149, @non_inf_dist.threshold_balanced()
  end

  should "correctly calculate the patser threshold" do
    assert_equal 11.435693792286841, @non_inf_dist.threshold_patser()
  end

end
|
265
|
+
end
|
metadata
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bio-jaspar
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jessica Lee
|
8
|
+
- Wasserman Lab
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2015-09-30 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: bio
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
requirements:
|
18
|
+
- - ">="
|
19
|
+
- !ruby/object:Gem::Version
|
20
|
+
version: 1.4.2
|
21
|
+
type: :runtime
|
22
|
+
prerelease: false
|
23
|
+
version_requirements: !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
version: 1.4.2
|
28
|
+
- !ruby/object:Gem::Dependency
|
29
|
+
name: mysql2
|
30
|
+
requirement: !ruby/object:Gem::Requirement
|
31
|
+
requirements:
|
32
|
+
- - "~>"
|
33
|
+
- !ruby/object:Gem::Version
|
34
|
+
version: 0.3.19
|
35
|
+
type: :runtime
|
36
|
+
prerelease: false
|
37
|
+
version_requirements: !ruby/object:Gem::Requirement
|
38
|
+
requirements:
|
39
|
+
- - "~>"
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: 0.3.19
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: shoulda
|
44
|
+
requirement: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
type: :development
|
50
|
+
prerelease: false
|
51
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
- !ruby/object:Gem::Dependency
|
57
|
+
name: rake
|
58
|
+
requirement: !ruby/object:Gem::Requirement
|
59
|
+
requirements:
|
60
|
+
- - "~>"
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 0.9.3
|
63
|
+
type: :development
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 0.9.3
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: rdoc
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '3.12'
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - "~>"
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '3.12'
|
84
|
+
- !ruby/object:Gem::Dependency
|
85
|
+
name: test-unit
|
86
|
+
requirement: !ruby/object:Gem::Requirement
|
87
|
+
requirements:
|
88
|
+
- - ">="
|
89
|
+
- !ruby/object:Gem::Version
|
90
|
+
version: '0'
|
91
|
+
type: :development
|
92
|
+
prerelease: false
|
93
|
+
version_requirements: !ruby/object:Gem::Requirement
|
94
|
+
requirements:
|
95
|
+
- - ">="
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: jeweler
|
100
|
+
requirement: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - "~>"
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: 2.0.1
|
105
|
+
type: :development
|
106
|
+
prerelease: false
|
107
|
+
version_requirements: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - "~>"
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: 2.0.1
|
112
|
+
- !ruby/object:Gem::Dependency
|
113
|
+
name: bundler
|
114
|
+
requirement: !ruby/object:Gem::Requirement
|
115
|
+
requirements:
|
116
|
+
- - ">="
|
117
|
+
- !ruby/object:Gem::Version
|
118
|
+
version: 1.0.21
|
119
|
+
type: :development
|
120
|
+
prerelease: false
|
121
|
+
version_requirements: !ruby/object:Gem::Requirement
|
122
|
+
requirements:
|
123
|
+
- - ">="
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: 1.0.21
|
126
|
+
description: Basic tools for parsing, searching, and comparing JASPAR motifs; Based
|
127
|
+
on Bio.motifs module in Biopython
|
128
|
+
email:
|
129
|
+
executables: []
|
130
|
+
extensions: []
|
131
|
+
extra_rdoc_files:
|
132
|
+
- LICENSE.txt
|
133
|
+
- README.md
|
134
|
+
- README.rdoc
|
135
|
+
files:
|
136
|
+
- ".document"
|
137
|
+
- ".travis.yml"
|
138
|
+
- Gemfile
|
139
|
+
- LICENSE.txt
|
140
|
+
- README.md
|
141
|
+
- README.rdoc
|
142
|
+
- Rakefile
|
143
|
+
- lib/bio-jaspar.rb
|
144
|
+
- lib/bio-jaspar/jaspar.rb
|
145
|
+
- test/helper.rb
|
146
|
+
- test/test_bio-jaspar.rb
|
147
|
+
homepage: http://github.com/wassermanlab/jaspar-bioruby
|
148
|
+
licenses:
|
149
|
+
- MIT
|
150
|
+
metadata: {}
|
151
|
+
post_install_message:
|
152
|
+
rdoc_options: []
|
153
|
+
require_paths:
|
154
|
+
- lib
|
155
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
160
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
161
|
+
requirements:
|
162
|
+
- - ">="
|
163
|
+
- !ruby/object:Gem::Version
|
164
|
+
version: '0'
|
165
|
+
requirements: []
|
166
|
+
rubyforge_project:
|
167
|
+
rubygems_version: 2.4.6
|
168
|
+
signing_key:
|
169
|
+
specification_version: 4
|
170
|
+
summary: Tools for JASPAR motifs in BioRuby
|
171
|
+
test_files: []
|