bioinform 0.1.8 → 0.1.9
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/Gemfile +12 -0
- data/Guardfile +9 -0
- data/README.md +7 -1
- data/TODO.txt +8 -0
- data/bioinform.gemspec +7 -5
- data/lib/bioinform.rb +1 -0
- data/lib/bioinform/cli.rb +12 -3
- data/lib/bioinform/cli/convert_motif.rb +108 -0
- data/lib/bioinform/cli/merge_into_collection.rb +6 -2
- data/lib/bioinform/cli/pcm2pwm.rb +1 -1
- data/lib/bioinform/cli/split_motifs.rb +1 -1
- data/lib/bioinform/conversion_algorithms/pcm2ppm_converter.rb +19 -0
- data/lib/bioinform/conversion_algorithms/pcm2pwm_converter.rb +20 -0
- data/lib/bioinform/conversion_algorithms/pcm2pwm_mara_converter.rb +0 -0
- data/lib/bioinform/conversion_algorithms/ppm2pcm_converter.rb +0 -0
- data/lib/bioinform/conversion_algorithms/ppm2pwm_converter.rb +0 -0
- data/lib/bioinform/data_models/collection.rb +21 -35
- data/lib/bioinform/data_models/motif.rb +56 -0
- data/lib/bioinform/data_models/pcm.rb +4 -8
- data/lib/bioinform/data_models/pm.rb +19 -48
- data/lib/bioinform/data_models/pwm.rb +16 -0
- data/lib/bioinform/formatters.rb +2 -0
- data/lib/bioinform/formatters/raw_formatter.rb +41 -0
- data/lib/bioinform/formatters/transfac_formatter.rb +39 -0
- data/lib/bioinform/parsers.rb +2 -1
- data/lib/bioinform/parsers/jaspar_parser.rb +35 -0
- data/lib/bioinform/parsers/string_parser.rb +1 -1
- data/lib/bioinform/parsers/trivial_parser.rb +2 -1
- data/lib/bioinform/parsers/yaml_parser.rb +1 -1
- data/lib/bioinform/support.rb +2 -1
- data/lib/bioinform/support/parameters.rb +27 -18
- data/lib/bioinform/support/strip_doc.rb +9 -0
- data/lib/bioinform/version.rb +1 -1
- data/spec/cli/convert_motif_spec.rb +107 -0
- data/spec/cli/data/merge_into_collection/collection.yaml.result +186 -183
- data/spec/cli/data/merge_into_collection/collection_pwm.yaml.result +186 -183
- data/spec/cli/data/split_motifs/collection.yaml +184 -193
- data/spec/cli/shared_examples/convert_motif/motif_list_empty.rb +18 -0
- data/spec/cli/shared_examples/convert_motif/several_motifs_specified.rb +14 -0
- data/spec/cli/shared_examples/convert_motif/single_motif_specified.rb +50 -0
- data/spec/cli/shared_examples/convert_motif/yield_help_string.rb +5 -0
- data/spec/cli/shared_examples/convert_motif/yield_motif_conversion_error.rb +4 -0
- data/spec/data_models/collection_spec.rb +36 -34
- data/spec/data_models/motif_spec.rb +224 -0
- data/spec/data_models/pcm_spec.rb +28 -17
- data/spec/data_models/pm_spec.rb +83 -121
- data/spec/data_models/pwm_spec.rb +38 -0
- data/spec/fabricators/collection_fabricator.rb +2 -2
- data/spec/fabricators/motif_fabricator.rb +33 -0
- data/spec/fabricators/motif_formats_fabricator.rb +125 -0
- data/spec/fabricators/pcm_fabricator.rb +25 -0
- data/spec/fabricators/pm_fabricator.rb +10 -1
- data/spec/fabricators/ppm_fabricator.rb +14 -0
- data/spec/fabricators/pwm_fabricator.rb +16 -0
- data/spec/parsers/trivial_parser_spec.rb +12 -12
- data/spec/parsers/yaml_parser_spec.rb +11 -11
- data/spec/spec_helper.rb +19 -49
- data/spec/spec_helper_source.rb +59 -0
- metadata +78 -7
data/.gitignore
CHANGED
data/Gemfile
CHANGED
@@ -2,3 +2,15 @@ source 'https://rubygems.org'
|
|
2
2
|
|
3
3
|
# Specify your gem's dependencies in bioinform.gemspec
|
4
4
|
gemspec
|
5
|
+
|
6
|
+
|
7
|
+
group :development do
|
8
|
+
# gem 'win32console'
|
9
|
+
gem 'rspec', '>= 2.0'
|
10
|
+
gem 'fabrication', '~> 2.5.0'
|
11
|
+
gem 'rspec-given', '>= 2.0.0'
|
12
|
+
gem 'spork', '>= 0.9.2'
|
13
|
+
gem 'fakefs', '~> 0.4.2'
|
14
|
+
gem 'wdm', :require => false
|
15
|
+
gem 'guard-rspec', '>=2.1.0'
|
16
|
+
end
|
data/Guardfile
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
# A sample Guardfile
|
2
|
+
# More info at https://github.com/guard/guard#readme
|
3
|
+
|
4
|
+
guard 'rspec', :cli => "--drb" do
|
5
|
+
# watch(%r{^spec/.+_spec\.rb$})
|
6
|
+
# watch(%r{^lib/bioinform/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
|
7
|
+
# watch('spec/spec_helper.rb') { "spec" }
|
8
|
+
watch(%r{^spec/.+\.rb$}) { 'spec' }
|
9
|
+
end
|
data/README.md
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
# Bioinform
|
2
|
-
|
3
2
|
Bioinform is a bunch of classes extracted from daily bioinformatics work. These classes are an attempt to encapsulate loading (parsing) logic for positional matrices in different formats, along with common transformations. It also includes several core-class extensions which are particularly useful on Enumerables.
|
4
3
|
|
4
|
+
Bioinform is in its development phase, and the API is changing very quickly. Each version is tested and consistent, but there is no guarantee that code which works with your version will work with future versions. However, the latest version of bioinform is always consistent with the latest version of macroape, and the CLI tools built on top of the libraries change their interface much less often, so you can use them while treating the library as a black box that lets you do some useful things.
|
5
|
+
|
5
6
|
## Installation
|
6
7
|
|
7
8
|
Add this line to your application's Gemfile:
|
@@ -20,6 +21,11 @@ Or install it yourself as:
|
|
20
21
|
|
21
22
|
Usage is under construction. I don't recommend using this gem for a while: the syntax is about to change to something simpler and more concise. But stay tuned.
|
22
23
|
|
24
|
+
### Command-line applications
|
25
|
+
* pcm2pwm
|
26
|
+
* split_motifs
|
27
|
+
* merge_into_collection
|
28
|
+
|
23
29
|
## Contributing
|
24
30
|
|
25
31
|
1. Fork it
|
data/TODO.txt
CHANGED
@@ -1,8 +1,15 @@
|
|
1
|
+
A Collection contains Motifs; each Motif can contain any of the following: pcm, pwm, ppm.
|
2
|
+
Name, background, tags and any other parameters should be removed from PM class to be placed in Motif
|
3
|
+
|
4
|
+
|
5
|
+
|
1
6
|
ToDo:
|
2
7
|
how to make PM#equal? and PM#hash so that using PMs in Sets wouldn't destroy comparability of Sets and two sets with the same PMs(but different objects) would be equal. (also using pm as a hash-key)
|
3
8
|
Make specs and fix code in such a way that Parser.split_on_motifs and so on returned consistent result. E.g. Parser.parse! raised an error on multiple times invocation
|
4
9
|
|
5
10
|
refactor CLI::SplitMotifs in place where it splits collection file and choose real data models or makes PM
|
11
|
+
|
12
|
+
Make Collection a convenient way to store both the pwm and the pcm for a single motif (maybe both should be in the motif's parameters?). Also add methods like sort! that can change the collection's inner structure without working with the @collection variable directly. For example collection.sort!{|a,b| a.length<=>b.length} (here sort yields only the motif, whereas now it yields both the motif and its infos, which is inconvenient)
|
6
13
|
|
7
14
|
Make parser exceptions print the text where parsing broke (the line being processed +- the 2 nearest lines, plus the command and line numbers)
|
8
15
|
Prevent the parser from going into an infinite loop
|
@@ -23,6 +30,7 @@ Decide:
|
|
23
30
|
-- Make PCM#valid? and PPM#valid? more specific. This shouldn't destroy functionality to load arbitrary data as matrix, but only in force mode (I don't understand yet where should it be: in a constructor or where? And which validation-"severity" levels should be? Strong validation - size-only-validation - size-and-type-validation - no validation ??? or may be options: valid_strictness: 'strict', 'usual', 'strict_with_name' ??? It should be considered)
|
24
31
|
-- PM#to_pcm and friends have unintuitive behavior. E.g. pm.to_pcm.to_pwm != pm.to_pwm. In the first, the matrix is treated as a pcm and then converted, while in the second the matrix is treated as a pwm from the start
|
25
32
|
-- Should parser be reloadable or not? May be delete #reset_scanner?
|
33
|
+
-- Should Collection have infos for each motif if it already has parameters? (see also the discussion above about Collection#sort! and so on)
|
26
34
|
|
27
35
|
Specs
|
28
36
|
-- PWM#probabilities, #score_variance, #gauss_estimation
|
data/bioinform.gemspec
CHANGED
@@ -14,10 +14,12 @@ Gem::Specification.new do |gem|
|
|
14
14
|
gem.name = "bioinform"
|
15
15
|
gem.require_paths = ["lib"]
|
16
16
|
gem.version = Bioinform::VERSION
|
17
|
-
|
17
|
+
|
18
18
|
gem.add_dependency('activesupport', '>= 3.0.0')
|
19
|
-
gem.add_dependency('docopt', '
|
20
|
-
|
21
|
-
gem.add_development_dependency
|
22
|
-
gem.add_development_dependency
|
19
|
+
gem.add_dependency('docopt', '= 0.5.0')
|
20
|
+
|
21
|
+
gem.add_development_dependency('fakefs', '~> 0.4.2')
|
22
|
+
gem.add_development_dependency('fabrication', '~> 2.5.0')
|
23
|
+
gem.add_development_dependency('rspec', '>= 2.0')
|
24
|
+
gem.add_development_dependency('rspec-given', '>= 2.0.0')
|
23
25
|
end
|
data/lib/bioinform.rb
CHANGED
data/lib/bioinform/cli.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
|
+
require_relative 'support'
|
2
|
+
|
1
3
|
module Bioinform
|
2
4
|
module CLI
|
3
5
|
module Helpers
|
6
|
+
def name_wo_extension(filename)
|
7
|
+
File.join(File.dirname(filename), basename_wo_extension(filename))
|
8
|
+
end
|
4
9
|
def basename_wo_extension(filename)
|
5
10
|
File.basename(filename, File.extname(filename))
|
6
11
|
end
|
@@ -10,12 +15,16 @@ module Bioinform
|
|
10
15
|
def set_folder(folder, filename)
|
11
16
|
File.join(folder, filename)
|
12
17
|
end
|
13
|
-
def
|
18
|
+
def basename_changed_extension(filename, extension)
|
14
19
|
set_extension(basename_wo_extension(filename), extension)
|
15
20
|
end
|
16
21
|
def change_folder_and_extension(input_filename, extension, folder)
|
17
|
-
set_folder(folder,
|
22
|
+
set_folder(folder, basename_changed_extension(input_filename, extension))
|
18
23
|
end
|
19
24
|
end
|
20
25
|
end
|
21
|
-
end
|
26
|
+
end
|
27
|
+
|
28
|
+
require_relative 'cli/merge_into_collection'
|
29
|
+
require_relative 'cli/pcm2pwm'
|
30
|
+
require_relative 'cli/split_motifs'
|
@@ -0,0 +1,108 @@
|
|
1
|
+
require_relative '../../bioinform'
|
2
|
+
require 'optparse'
|
3
|
+
|
4
|
+
require 'logger'
|
5
|
+
$logger = Logger.new('convert_motif.log')
|
6
|
+
|
7
|
+
module Bioinform
|
8
|
+
module CLI
|
9
|
+
class ConvertMotif
|
10
|
+
|
11
|
+
def arguments
|
12
|
+
@arguments ||= []
|
13
|
+
end
|
14
|
+
def options
|
15
|
+
@options ||= {}
|
16
|
+
end
|
17
|
+
|
18
|
+
def main(argv)
|
19
|
+
parse!(argv, filename_format: './{name}.{ext}')
|
20
|
+
motif_files = arguments
|
21
|
+
motif_files += $stdin.read.shellsplit unless $stdin.tty?
|
22
|
+
if motif_files.empty?
|
23
|
+
puts option_parser.help()
|
24
|
+
return
|
25
|
+
end
|
26
|
+
|
27
|
+
output_motifs = []
|
28
|
+
motifs = motif_files.map do |filename|
|
29
|
+
case options[:model_from]
|
30
|
+
when 'pwm'
|
31
|
+
PWM.new(File.read(filename))
|
32
|
+
when 'pcm'
|
33
|
+
PCM.new(File.read(filename))
|
34
|
+
when 'ppm'
|
35
|
+
PPM.new(File.read(filename))
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
motifs.each do |motif|
|
40
|
+
begin
|
41
|
+
case options[:model_to]
|
42
|
+
when 'pwm'
|
43
|
+
output_motifs << motif.to_pwm
|
44
|
+
when 'pcm'
|
45
|
+
output_motifs << motif.to_pcm
|
46
|
+
when 'ppm'
|
47
|
+
output_motifs << motif.to_ppm
|
48
|
+
end
|
49
|
+
rescue
|
50
|
+
$stderr.puts "One can't convert from #{options[:model_from]} data-model to #{options[:model_to]} data-model"
|
51
|
+
raise
|
52
|
+
end
|
53
|
+
end
|
54
|
+
puts output_motifs.join("\n\n")
|
55
|
+
rescue
|
56
|
+
$stderr.puts "Error! Conversion wasn't performed"
|
57
|
+
end
|
58
|
+
|
59
|
+
def option_parser
|
60
|
+
@option_parser ||= OptionParser.new do |cli|
|
61
|
+
banner = <<-BANNER
|
62
|
+
Usage:
|
63
|
+
convert_motif [options] <motif-files>...
|
64
|
+
ls | convert_motif [options]
|
65
|
+
|
66
|
+
convert_motif - tool for converting motifs from different input formats
|
67
|
+
to different output formats.
|
68
|
+
It can change both formatting style and motif models.
|
69
|
+
Resulting model is sent to stdout (this can be overriden with --save option).
|
70
|
+
BANNER
|
71
|
+
|
72
|
+
cli.summary_indent = ''
|
73
|
+
cli.banner = strip_doc(banner)
|
74
|
+
cli.separator ""
|
75
|
+
cli.separator "Options:"
|
76
|
+
cli.on('--parser PARSER', 'Parser for input motif.'){|parser| options[:parser] = parser}
|
77
|
+
cli.on('--formatter FORMATTER', 'Formatter for output motif.'){|formatter| options[:formatter] = formatter}
|
78
|
+
cli.on('--from MODEL_OF_INPUT', 'Specify motif model of input.',
|
79
|
+
'It can be overriden by --parser option',
|
80
|
+
'(when parser implies certain input model)'){|model_from| options[:model_from] = model_from}
|
81
|
+
cli.on('--to MODEL_OF_OUTPUT', 'Specify motif model to convert to.',
|
82
|
+
'It can be overriden by --formatter option',
|
83
|
+
'(when formatter implies certain output model)'){|model_to| options[:model_to] = model_to}
|
84
|
+
cli.on('--algorithm ALGORITHM', 'Conversion algorithm to transform model.'){|conversion_algorithm| options[:conversion_algorithm] = conversion_algorithm}
|
85
|
+
cli.on('--save [FILENAME_FORMAT]', 'Save resulting motifs to according files.',
|
86
|
+
'filename format by default is ' + options[:filename_format],
|
87
|
+
'one can specify output folder here',
|
88
|
+
'{name} is a placeholder for motif name',
|
89
|
+
'{ext} is a placeholder for motif model') {|filename_format| options[:filename_format] = filename_format if filename_format}
|
90
|
+
cli.on('--[no-]force', 'Overwrite existing files.'){|force| options[:force] = force}
|
91
|
+
cli.on('--[no-]silent', 'Suppress error messages and notifications.'){|silent| options[:silent] = silent}
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
def parse!(argv, default_options = {})
|
96
|
+
@options = default_options.dup
|
97
|
+
option_parser.parse!(argv)
|
98
|
+
@arguments = argv
|
99
|
+
end
|
100
|
+
|
101
|
+
|
102
|
+
def self.main(argv)
|
103
|
+
self.new.main(argv)
|
104
|
+
end
|
105
|
+
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
@@ -13,7 +13,7 @@ module Bioinform
|
|
13
13
|
It takes motif files or (entire collections) and creates a collection consisting of them all. By default motifs are treated simply as matrices(PM), but you can (possibly should) specify data model. Output file by default are in YAML-format but it's possible to create plain text file. YAML collections are useful if you want to provide additional information for motifs in collection with another tool, plain text is more readable by humans.
|
14
14
|
|
15
15
|
Usage:
|
16
|
-
|
16
|
+
merge_into_collection [options] [<pm-files>...]
|
17
17
|
|
18
18
|
Options:
|
19
19
|
-h --help Show this screen.
|
@@ -62,7 +62,11 @@ module Bioinform
|
|
62
62
|
end
|
63
63
|
|
64
64
|
if plain_text
|
65
|
-
File.open(output_file, 'w')
|
65
|
+
File.open(output_file, 'w') do |f|
|
66
|
+
collection.each(options['--data-model'].downcase) do |pm|
|
67
|
+
f.puts(pm.to_s + "\n\n")
|
68
|
+
end
|
69
|
+
end
|
66
70
|
else
|
67
71
|
File.open(output_file, 'w'){|f| YAML.dump(collection, f) }
|
68
72
|
end
|
@@ -13,7 +13,7 @@ module Bioinform
|
|
13
13
|
When filelist is empty, it's obtained from STDIN. One can use it: `ls -b pcm_folder/*.pcm | pcm2pwm` (ls -b option escape spaces in filenames)
|
14
14
|
|
15
15
|
Usage:
|
16
|
-
|
16
|
+
pcm2pwm [options] [<pcm-files>...]
|
17
17
|
|
18
18
|
Options:
|
19
19
|
-h --help Show this screen.
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Bioinform
|
2
|
+
module ConversionAlgorithms
|
3
|
+
module PCM2PPMConverter
|
4
|
+
|
5
|
+
# parameters hash is ignored
|
6
|
+
def self.convert(pcm, parameters = {})
|
7
|
+
matrix = pcm.each_position.map do |pos|
|
8
|
+
pos.map do |el|
|
9
|
+
el.to_f / pcm.count
|
10
|
+
end
|
11
|
+
end
|
12
|
+
PPM.new(pcm.get_parameters.merge(matrix: matrix))
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Bioinform
|
2
|
+
module ConversionAlgorithms
|
3
|
+
module PCM2PWMConverter
|
4
|
+
def self.convert(pcm, parameters = {})
|
5
|
+
default_parameters = {pseudocount: Math.log(pcm.count),
|
6
|
+
probability: (pcm.probability || [0.25, 0.25, 0.25, 0.25])
|
7
|
+
}
|
8
|
+
parameters = default_parameters.merge(parameters)
|
9
|
+
probability = parameters[:probability]
|
10
|
+
pseudocount = parameters[:pseudocount]
|
11
|
+
matrix = pcm.each_position.map do |pos|
|
12
|
+
pos.each_index.map do |index|
|
13
|
+
Math.log((pos[index] + probability[index] * pseudocount) / (probability[index]*(pcm.count + pseudocount)) )
|
14
|
+
end
|
15
|
+
end
|
16
|
+
PWM.new(pcm.get_parameters.merge(matrix: matrix))
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
File without changes
|
File without changes
|
File without changes
|
@@ -1,21 +1,22 @@
|
|
1
1
|
require 'ostruct'
|
2
|
+
require_relative 'motif'
|
2
3
|
|
3
4
|
module Bioinform
|
4
5
|
class Collection
|
5
|
-
|
6
|
+
attr_accessor :container
|
6
7
|
|
7
8
|
include Parameters
|
8
9
|
make_parameters :name
|
9
10
|
|
10
11
|
# The collection name acts as a tag for each motif in the collection. But a motif can be included in several collections, and so can have several tags
|
11
12
|
def initialize(parameters = {})
|
12
|
-
@
|
13
|
+
@container = []
|
13
14
|
@parameters = OpenStruct.new(parameters)
|
14
15
|
yield @parameters if block_given?
|
15
16
|
end
|
16
17
|
|
17
18
|
def size
|
18
|
-
|
19
|
+
container.size
|
19
20
|
end
|
20
21
|
|
21
22
|
def to_s(with_name = true)
|
@@ -28,18 +29,19 @@ module Bioinform
|
|
28
29
|
|
29
30
|
def +(other)
|
30
31
|
result = self.class.new
|
31
|
-
each do |
|
32
|
-
result.
|
32
|
+
container.each do |motif|
|
33
|
+
result.container << motif
|
33
34
|
end
|
34
|
-
other.each do |
|
35
|
-
result.
|
35
|
+
other.container.each do |motif|
|
36
|
+
result.container << motif
|
36
37
|
end
|
37
38
|
result
|
38
39
|
end
|
39
40
|
|
40
41
|
def add_pm(pm, info)
|
41
|
-
pm.mark(self)
|
42
|
-
|
42
|
+
# pm.mark(self)
|
43
|
+
container << Motif.new(info.marshal_dump.merge(pm: pm))
|
44
|
+
#### What if pm already is a Motif
|
43
45
|
self
|
44
46
|
end
|
45
47
|
|
@@ -47,40 +49,24 @@ module Bioinform
|
|
47
49
|
add_pm(pm, OpenStruct.new)
|
48
50
|
end
|
49
51
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
else
|
54
|
-
Enumerator.new(self, :each)
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def each_pm
|
52
|
+
# collection.each{|motif| ... }
|
53
|
+
# collection.each(:pwm, :threshold){|pwm,threshold| }
|
54
|
+
def each(*args)
|
59
55
|
if block_given?
|
60
|
-
|
56
|
+
if args.empty?
|
57
|
+
container.each{|motif| yield motif}
|
58
|
+
else
|
59
|
+
container.each{|motif| yield( *args.map{|arg| motif.parameters.send(arg)} ) }
|
60
|
+
end
|
61
61
|
else
|
62
|
-
Enumerator.new(self, :
|
62
|
+
Enumerator.new(self, :each, *args)
|
63
63
|
end
|
64
64
|
end
|
65
65
|
|
66
66
|
include Enumerable
|
67
67
|
|
68
|
-
%w[pcm ppm pwm].each do |data_model|
|
69
|
-
method_name = "each_#{data_model}".to_sym #
|
70
|
-
converter_method = "to_#{data_model}".to_sym #
|
71
|
-
define_method method_name do |&block| # define_method :each_pcm do |&block|
|
72
|
-
if block # if block
|
73
|
-
each do |pm, infos| # each do |pm, infos|
|
74
|
-
block.call pm.send(converter_method) # block.call pm.send(:to_pcm)
|
75
|
-
end # end
|
76
|
-
else # else
|
77
|
-
Enumerator.new(self, method_name) # Enumerator.new(self, :each_pcm)
|
78
|
-
end # end
|
79
|
-
end # end
|
80
|
-
end
|
81
|
-
|
82
68
|
def ==(other)
|
83
|
-
(
|
69
|
+
(parameters == other.parameters) && (container == other.container)
|
84
70
|
rescue
|
85
71
|
false
|
86
72
|
end
|