bio-pangenome 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.travis.yml +13 -0
- data/Gemfile +15 -0
- data/LICENSE.txt +20 -0
- data/README.md +47 -0
- data/README.rdoc +48 -0
- data/Rakefile +52 -0
- data/VERSION +1 -0
- data/bin/pangenome_blast_flanking.rb +114 -0
- data/lib/bio-pangenome.rb +12 -0
- data/lib/bio-pangenome/pangenome.rb +201 -0
- data/test/helper.rb +34 -0
- data/test/test_bio-pangenome.rb +7 -0
- metadata +157 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 67ff78e3e817086eb54e9f3c2a93bad8dfec84733d2e6db2a1ccb716543ae3cb
|
4
|
+
data.tar.gz: b9720ab3108dba2864a2f94762597c3cb89fd0837e5853d6f08215abe9f8fce8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b1643aecb2e252b8c1cc4fe3328d9b296d9ac0f63987544b954320a720346b7175234d6a83c9322baac25f057edc171e530e11181714016aeca73d0da791a650
|
7
|
+
data.tar.gz: d34eafed8d0080ecfbdc68dad5a39366fb18fec2666337e69a00f656f9c2d187b7b9aad062eb3197c24f881194343ce87b4730caafb51d729a80b77722883bd3
|
data/.document
ADDED
data/.travis.yml
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 1.9.2
|
4
|
+
- 1.9.3
|
5
|
+
- jruby-19mode # JRuby in 1.9 mode
|
6
|
+
|
7
|
+
# - rbx-19mode
|
8
|
+
# - 1.8.7
|
9
|
+
# - jruby-18mode # JRuby in 1.8 mode
|
10
|
+
# - rbx-18mode
|
11
|
+
|
12
|
+
# uncomment this line if your project needs to run something other than `rake`:
|
13
|
+
# script: bundle exec rspec spec
|
data/Gemfile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
source "https://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
gem "bio"
|
6
|
+
gem "bio-blastxmlparser"
|
7
|
+
# Add dependencies to develop your gem here.
|
8
|
+
# Include everything needed to run rake, tests, features, etc.
|
9
|
+
group :development do
|
10
|
+
gem "shoulda", ">= 0"
|
11
|
+
gem "rdoc", "~> 3.12"
|
12
|
+
gem "simplecov", ">= 0"
|
13
|
+
gem "juwelier"
|
14
|
+
gem "bundler", ">= 1.0.21"
|
15
|
+
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2019 homonecloco
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# bio-pangenome
|
2
|
+
|
3
|
+
[![Build Status](https://secure.travis-ci.org/homonecloco/bioruby-pangenome.png)](http://travis-ci.org/homonecloco/bioruby-pangenome)
|
4
|
+
|
5
|
+
Tools to find similarity between pangenomes.
|
6
|
+
|
7
|
+
Note: this software is under active development!
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
```sh
|
12
|
+
gem install bio-pangenome
|
13
|
+
```
|
14
|
+
|
15
|
+
## Usage
|
16
|
+
|
17
|
+
```ruby
|
18
|
+
require 'bio-pangenome'
|
19
|
+
```
|
20
|
+
|
21
|
+
The API doc is online. For more code examples see the test files in
|
22
|
+
the source tree.
|
23
|
+
|
24
|
+
## Project home page
|
25
|
+
|
26
|
+
For information on the source tree, documentation, examples, issues and
|
27
|
+
how to contribute, see
|
28
|
+
|
29
|
+
http://github.com/homonecloco/bioruby-pangenome
|
30
|
+
|
31
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
32
|
+
|
33
|
+
## Cite
|
34
|
+
|
35
|
+
If you use this software, please cite one of
|
36
|
+
|
37
|
+
* [BioRuby: bioinformatics software for the Ruby programming language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
38
|
+
* [Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
39
|
+
|
40
|
+
## Biogems.info
|
41
|
+
|
42
|
+
This Biogem is published at [biogems.info](http://biogems.info/index.html#bio-pangenome)
|
43
|
+
|
44
|
+
## Copyright
|
45
|
+
|
46
|
+
Copyright (c) 2019 homonecloco. See LICENSE.txt for further details.
|
47
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
= bio-pangenome
|
2
|
+
|
3
|
+
{<img
|
4
|
+
src="https://secure.travis-ci.org/homonecloco/bioruby-pangenome.png"
|
5
|
+
/>}[http://travis-ci.org/#!/homonecloco/bioruby-pangenome]
|
6
|
+
|
7
|
+
Tools to find similarity between pangenomes.
|
8
|
+
|
9
|
+
Note: this software is under active development!
|
10
|
+
|
11
|
+
== Installation
|
12
|
+
|
13
|
+
gem install bio-pangenome
|
14
|
+
|
15
|
+
== Usage
|
16
|
+
|
17
|
+
== Developers
|
18
|
+
|
19
|
+
To use the library
|
20
|
+
|
21
|
+
require 'bio-pangenome'
|
22
|
+
|
23
|
+
The API doc is online. For more code examples see also the test files in
|
24
|
+
the source tree.
|
25
|
+
|
26
|
+
== Project home page
|
27
|
+
|
28
|
+
For information on the source tree, documentation, issues and how to contribute, see
|
29
|
+
|
30
|
+
http://github.com/homonecloco/bioruby-pangenome
|
31
|
+
|
32
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
33
|
+
|
34
|
+
== Cite
|
35
|
+
|
36
|
+
If you use this software, please cite one of
|
37
|
+
|
38
|
+
* {BioRuby: bioinformatics software for the Ruby programming language}[http://dx.doi.org/10.1093/bioinformatics/btq475]
|
39
|
+
* {Biogem: an effective tool-based approach for scaling up open source software development in bioinformatics}[http://dx.doi.org/10.1093/bioinformatics/bts080]
|
40
|
+
|
41
|
+
== Biogems.info
|
42
|
+
|
43
|
+
This Biogem is published at http://biogems.info/index.html#bio-pangenome
|
44
|
+
|
45
|
+
== Copyright
|
46
|
+
|
47
|
+
Copyright (c) 2019 homonecloco. See LICENSE.txt for further details.
|
48
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# encoding: utf-8

# Build file for the bio-pangenome gem: wires up Bundler, the Juwelier
# packaging/release tasks, the unit tests, coverage and RDoc generation.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'juwelier'
Juwelier::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://guides.rubygems.org/specification-reference/ for more options
  gem.name = "bio-pangenome"
  gem.homepage = "http://github.com/Uauy-Lab/bioruby-pangenome"
  gem.license = "MIT"
  gem.summary = %Q{Scripts to analyse pangenomes.}
  gem.description = %Q{Tools to find similarity between pangenomes.}
  gem.email = "ricardo.ramirez-gonzalez@jic.ac.uk"
  gem.authors = ["Ricardo H. Ramirez-Gonzalez"]
  # dependencies defined in Gemfile
end

Juwelier::RubygemsDotOrgTasks.new

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

desc "Code coverage detail"
task :simplecov do
  # test/helper.rb only starts SimpleCov when COVERAGE is set.
  ENV['COVERAGE'] = "true"
  Rake::Task['test'].execute
end

task :default => :test

require 'rdoc/task'
# RDoc::Task is the class provided by 'rdoc/task'; Rake::RDocTask is only a
# backwards-compatibility alias for it.
RDoc::Task.new do |rdoc|
  # Strip the trailing newline so it does not end up inside the page title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-pangenome #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.1
|
@@ -0,0 +1,114 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# BioRuby bio-pangenome Plugin BioPangenome
# Author:: homonecloco
# Copyright:: 2019
#
# Command line driver: loads a list of genes and a list of lines (varieties),
# finds the projected genes and their flanking-region coordinates, loads the
# corresponding sequences and runs pairwise BLAST between the varieties,
# writing one tab separated table.

USAGE = "pangenome_blast_flanking.rb [options]"

gempath = File.dirname(File.dirname(__FILE__))
$: << File.join(gempath, 'lib')

VERSION_FILENAME = File.join(gempath, 'VERSION')
# File.read closes the handle; File.new(...).read left it open.
version = File.read(VERSION_FILENAME).chomp

# print banner
print "pangenome_blast_flanking #{version} by Ricardo H. Ramirez-Gonzalez 2019\n"

# Running without arguments is allowed (every option except the base path and
# output has a default); the usage line is only a reminder.
puts USAGE if ARGV.empty?

path = gempath + '/lib/bio-pangenome.rb'
require path
#require 'bio-pangenome'
require 'optparse'
require 'tmpdir'

options = {
  :transcript_mapping => "sorted_filtered_mapping.csv.gz",
  :lines => "lines.txt",
  :genes => "genes.txt",
  :no_windows => 0,
  :window => 0,
  :distance => 2000
}
opts = OptionParser.new do |o|
  o.banner = "Usage: #{File.basename($0)} [options]"

  # -t and -g take an OPTIONAL argument: when the flag is given bare, arg is
  # nil and the default must be kept rather than overwritten.
  o.on('-t', '--transcript_mapping [sorted_filtered_mapping.csv.gz]', 'File with the mappings across transcriptomes') do |arg|
    options[:transcript_mapping] = arg if arg
  end

  o.on('-g', '--genes [genes.txt]', 'File with the list of genes') do |arg|
    options[:genes] = arg if arg
  end

  o.on('-n', '--no_windows INT', "Number of chunks to divide the genes list. 0 to not split") do |arg|
    options[:no_windows] = arg.to_i
  end

  o.on('-w', "--window INT", "Current window to run") do |arg|
    options[:window] = arg.to_i
  end

  o.on('-d', "--distance DIST", "Name of the distance set. Use 'cds' to align cds. default 2000") do |arg|
    options[:distance] = arg
  end

  o.on("-b", "--basepath PATH", "Folder with the sequences and mapping positions across genomes") do |arg|
    options[:basepath] = arg
  end

  o.on("-o", "--output PATH", "Folder with the output. If there are chunks, they will be used") do |arg|
    options[:output] = arg
  end

  o.on("-l", "--lines PATH", "File containing the lines to be analysed") do |arg|
    options[:lines] = arg
  end

  o.separator ""
  o.on_tail('-h', '--help', 'display this help and exit') do
    options[:show_help] = true
  end
end

begin
  opts.parse!(ARGV)
rescue OptionParser::ParseError => e
  # Report bad options with the help text instead of a stack trace.
  $stderr.puts e.message
  $stderr.puts opts
  exit 1
end

# --help only recorded a flag; act on it before any file is touched.
if options[:show_help]
  puts opts
  exit
end

genes = BioPangenome.load_genes(options[:genes], window: options[:window], no_windows: options[:no_windows])
puts "Genes count: #{genes.size}"

lines = BioPangenome.load_lines(options[:lines])

projected_genes = BioPangenome.load_projected_genes(options[:transcript_mapping], genes: genes)

variety_coordinates = BioPangenome.load_mapping_hash(
  varieties: lines,
  genes: projected_genes,
  prefix: options[:basepath],
  distance: options[:distance]
)

seqs = BioPangenome.load_sequences_from_hash(
  coordinates: variety_coordinates,
  prefix: options[:basepath],
  distance: options[:distance],
  projected_genes: projected_genes
)

# When the gene list is split in windows each run writes its own file.
output = options[:output].to_s
output = output + "_" + options[:window].to_s if options[:no_windows] > 0

Dir.mktmpdir do |temp_dir|
  puts "Aligning in #{temp_dir}"
  BioPangenome.align_gene_groups(
    seqs: seqs,
    distance: options[:distance],
    output: output,
    tmp_folder: temp_dir
  )
end
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-pangenome/pangenome.rb'
|
12
|
+
|
@@ -0,0 +1,201 @@
|
|
1
|
+
|
2
|
+
require 'csv'
require 'fileutils'
require 'set'
require 'zlib'

require 'bio'
require 'bio-blastxmlparser'
|
7
|
+
|
8
|
+
# Helpers to compare gene regions across the lines (varieties) of a pangenome:
# parsing gene/transcript identifiers, loading the flanking-region coordinates
# and sequences of selected genes, and running pairwise BLAST between lines.
module BioPangenome
  # Parsed gene/transcript identifier. +count_int+ is +count+ as an Integer.
  Transcript = Struct.new(:id, :gene, :chromosome, :version, :count, :transcript, :confidence, :count_int, :isoform)
  # A named region around a gene, optionally carrying its sequence.
  GeneFlankingRegion = Struct.new(:transcript, :gene, :ann, :region, :id, :flank_length, :sequence, :line)

  # Identifier patterns, hoisted so they are compiled once.
  TRANSCRIPT_REGEX = /TraesCS(?<chr>[[:alnum:]]{1,2})(?<ver>[[:digit:]]{2})G(?<count>[[:digit:]]+)(?<conf>[[:upper:]]*)/
  EI_TRANSCRIPT_REGEX = /Traes(?<chr>[[:upper:]]{3}_scaffold_[[:digit:]]*)_(?<ver>[[:digit:]]{2})G(?<count>[[:digit:]]+)(?<conf>[[:upper:]]*)/
  PGSB_TRANSCRIPT_REGEX = /Traes(?<variety>[[:upper:]]{3})(?<chr>[[:alnum:]]{1,2})(?<ver>[[:digit:]]{2})G(?<count>[[:digit:]]+)(?<conf>[[:upper:]]*)/
  SEQUENCE_NAME_REGEX = /(?<transcript>[[:alnum:]].+)_(?<ann>.+)_(?<flank_length>[[:digit:]]+bp)/

  # Column names of the table written by align_gene_groups.
  ALIGNMENT_HEADER = [ "transcript" , "query", "subject" , "var_query", "var_subject", "aln_type", "length" , "pident" , "Ns_query", "Ns_subject", "Ns_total", "Flanking" ].freeze

  # Parses an identifier such as "TraesCS6A01G123400.1".
  # The part before the first "." is the gene, the part after it the isoform.
  # Raises RuntimeError when the gene part does not match.
  def self.parseTranscript name
    arr = name.split(".")
    match = TRANSCRIPT_REGEX.match(arr[0])
    raise "Unable to parse: #{name}" unless match
    Transcript.new(name, arr[0], match[:chr], match[:ver], match[:count], arr[1], match[:conf], match[:count].to_i, arr[1])
  end

  # Parses a scaffold based identifier such as
  # "TraesCAD_scaffold_000012_01G000100.1". The scaffold name is stored,
  # downcased, in the chromosome field.
  # Raises RuntimeError when the gene part does not match.
  def self.parseEITranscript name
    arr = name.split(".")
    match = EI_TRANSCRIPT_REGEX.match(arr[0])
    raise "Unable to parse: #{name}" unless match
    Transcript.new(name, arr[0], match[:chr].downcase, match[:ver], match[:count], arr[1], match[:conf], match[:count].to_i, arr[1])
  end

  # Parses an identifier carrying a three letter variety code, such as
  # "TraesCAD6A01G123400.1".
  # NOTE(review): unlike the other two parsers, the variety code is stored in
  # the +transcript+ field (the isoform still goes to +isoform+); this is kept
  # as it was - confirm it is intended.
  # Raises RuntimeError when the gene part does not match.
  def self.parsePGSBTranscript name
    arr = name.split(".")
    match = PGSB_TRANSCRIPT_REGEX.match(arr[0])
    raise "Unable to parse: #{name}" unless match
    Transcript.new(name, arr[0], match[:chr], match[:ver], match[:count], match[:variety], match[:conf], match[:count].to_i, arr[1])
  end

  # Parses a sequence name of the form "<transcript>_<annotation>_<N>bp" and
  # returns a GeneFlankingRegion for +region+ (sequence and line left nil).
  # Raises RuntimeError when +name+ does not have that form, consistently with
  # the transcript parsers (it used to fail with NoMethodError on nil).
  def self.parseSequenceName region, name
    match = SEQUENCE_NAME_REGEX.match(name)
    raise "Unable to parse: #{name}" unless match
    gene = match[:transcript].split(".")[0]
    GeneFlankingRegion.new(match[:transcript], gene, match[:ann], region, name, match[:flank_length], nil, nil)
  end

  # Reads, for every variety, the tab separated "<region>\t<sequence name>"
  # map file and keeps the regions whose transcript is in +transcripts+ or
  # whose gene is in +genes+ (anything responding to include?; for a Hash the
  # keys are tested).
  #
  # Returns { variety => { region => GeneFlankingRegion } }.
  # Raises ArgumentError when a line cannot be parsed. It used to +throw+ a
  # String, which surfaces as UncaughtThrowError, a subclass of ArgumentError,
  # so existing rescues keep working.
  #
  # NOTE(review): the path is built as "#{prefix}#{distance}bp/..." (no
  # separator after the prefix) while load_sequences_from_hash uses
  # "#{prefix}/#{distance}bp/..."; and the default suffixes differ. Left
  # untouched because the real file layout is not visible from here.
  def self.load_mapping_hash(varieties:[], transcripts:[], genes:[], distance: 1000, prefix: "../flanking/releasePGSBV1/", suffix: ".RefSeqv1.1")
    wanted_transcripts = membership_index(transcripts)
    wanted_genes = membership_index(genes)
    ret = Hash.new { |h, k| h[k] = Hash.new }
    varieties.each do |v|
      path = "#{prefix}#{distance}bp/#{v}_#{distance}bp_#{suffix}.reg.map"
      $stderr.puts path
      File.foreach(path) do |line|
        line.chomp!
        arr = line.split("\t")
        begin
          parsed = parseSequenceName(arr[0], arr[1])
        rescue StandardError => e
          raise ArgumentError, "Unable to parse #{line} (#{v}) [#{e}]"
        end
        next unless wanted_transcripts.include?(parsed.transcript) || wanted_genes.include?(parsed.gene)
        ret[v][parsed.region] = parsed
      end
    end
    ret
  end

  # Runs +program+ (blastn by default) with +path_a+ as query and +path_b+ as
  # subject, writing XML (-outfmt 5) to +out_path+.
  #
  # Returns [alignment length, percent identity] of the longest HSP, or
  # [0, 0.0] when there is no hit.
  # Raises RuntimeError when the command cannot be run or exits with an error;
  # otherwise a stale +out_path+ from a previous pair would be parsed.
  def self.blast_pair_fast(path_a, path_b, out_path, program: "blastn")
    # Argument list form: no shell is involved, so paths containing spaces or
    # shell metacharacters are passed through untouched.
    cmd = [program, "-query", path_a, "-subject", path_b, "-task", program, "-out", out_path, "-outfmt", "5"].map(&:to_s)
    raise "BLAST command failed: #{cmd.join(' ')}" unless system(*cmd)

    max_length = 0
    max_pident = 0.0
    Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum.each do |iter|
      iter.each do |hit|
        hit.each do |hsp|
          next unless hsp.align_len > max_length
          max_length = hsp.align_len
          max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
        end
      end
    end
    [max_length, max_pident]
  end

  # Loads, for each variety in +coordinates+ (the result of
  # load_mapping_hash), the gzipped FASTA with its flanking sequences and
  # attaches the sequence, with leading/trailing Ns removed, to every region
  # whose name is a FASTA definition.
  #
  # Regions are grouped under the "gene" column of the +projected_genes+ row
  # for their gene; a gene without a row is grouped under its own name (this
  # used to fail with NoMethodError, also with the default +projected_genes+).
  # Only the first region found per group and variety is kept.
  #
  # Returns { base gene => { variety => GeneFlankingRegion } }.
  def self.load_sequences_from_hash(coordinates:{}, prefix: "../flanking/filtered/", suffix: "RefSeqv1.1", distance: 1000, projected_genes: {})
    ret = Hash.new { |h, k| h[k] = Hash.new }
    coordinates.each_pair do |variety, coords|
      path = "#{prefix}/#{distance}bp/#{variety}_#{distance}bp_#{suffix}.fa.gz"
      puts "Loading: #{path}"
      each_gz_fasta_entry(path) do |entry|
        seq_name = coords[entry.definition]
        next unless seq_name
        seq_name.sequence = trim_flanking_ns(entry.seq)
        projection = projected_genes[seq_name.gene]
        base_gene = projection ? projection["gene"] : seq_name.gene
        ret[base_gene][variety] ||= seq_name
      end
    end
    ret
  end

  # BLASTs, for every transcript in +seqs+ ({ transcript => { variety =>
  # GeneFlankingRegion } }), each unordered pair of varieties whose sequences
  # are both non-empty, and writes one row per pair to
  # "#{output}_#{distance}bp.tab" (columns: ALIGNMENT_HEADER).
  #
  # The sequences are written to "<tmp_folder>/<variety>.fa" and the BLAST
  # output to "<tmp_folder>/out.blast"; both are overwritten as it goes.
  # Returns nil.
  def self.align_gene_groups( seqs:{}, tmp_folder:"/Volumes/PanGenome/GeneRegions/201910_v2_v3/tmp", output:"../pairwise_blast_oct_2019/varieties_6A_identites", distance: 0 )
    out_tmp = "#{tmp_folder}/out.blast"
    FileUtils.mkdir_p(tmp_folder)
    # Block form so the table is flushed and closed even if BLAST fails.
    File.open("#{output}_#{distance}bp.tab", "w") do |out|
      out.puts ALIGNMENT_HEADER.join("\t")
      seqs.each_pair do |transcript, transcript_seqs|
        vars = transcript_seqs.keys
        fasta_paths = {}
        ns = {}
        vars.each do |v|
          s = transcript_seqs[v]
          fasta_paths[v] = tmp_folder + "/" + v + ".fa"
          File.open(fasta_paths[v], 'w') { |f| f.write(">#{s.id}\n#{s.sequence}") }
          ns[v] = s.sequence.count('Nn')
        end
        # combination(2) yields each pair once, first member earlier in
        # +vars+: the same pairs, in the same order, as the nested loops with
        # a "done" list that this replaces.
        vars.combination(2) do |v1, v2|
          s1 = transcript_seqs[v1]
          s2 = transcript_seqs[v2]
          next unless s1.sequence.length > 0 && s2.sequence.length > 0
          row = [transcript, s1.id, s2.id, v1, v2, "#{v1}->#{v2}"]
          row.concat(blast_pair_fast(fasta_paths[v1], fasta_paths[v2], out_tmp))
          row << ns[v1] << ns[v2] << (ns[v1] + ns[v2]) << distance
          out.puts row.join("\t")
        end
      end
    end
    nil
  end

  # Loads the CDS of the selected genes from "<prefix>/<variety><suffix>"
  # (gzipped FASTA) for every variety. +genes+ maps the part of the FASTA
  # definition before the first "." to a row whose "gene" column names the
  # group. Leading/trailing Ns are removed and only the first sequence per
  # group and variety is kept.
  #
  # Returns { gene => { variety => GeneFlankingRegion } }.
  def self.load_cds_sequences( varieties:[], genes:{}, prefix: "../flanking/filtered/", suffix: ".cds.fa.gz", set_id: "cds" )
    ret = Hash.new { |h, k| h[k] = Hash.new }
    varieties.each do |variety|
      path = "#{prefix}/#{variety}#{suffix}"
      each_gz_fasta_entry(path) do |entry|
        row = genes[entry.definition.split(".")[0]]
        next unless row
        seq_name = GeneFlankingRegion.new(entry.definition, row["gene"], "", "", entry.definition, set_id, nil, variety)
        seq_name.sequence = trim_flanking_ns(entry.seq)
        ret[seq_name.gene][variety] ||= seq_name
      end
    end
    ret
  end

  # Reads the gzipped CSV +transcript_mapping+ (with a header row) and returns
  # { projected_gene => row } for the rows whose "gene" column is in +genes+.
  def self.load_projected_genes(transcript_mapping, genes:[])
    wanted = membership_index(genes)
    projected_genes = {}
    Zlib::GzipReader.open(transcript_mapping) do |gzip|
      CSV.new(gzip, headers: true).each do |row|
        projected_genes[row["projected_gene"]] = row if wanted.include?(row["gene"])
      end
    end
    projected_genes
  end

  # Reads one identifier per line from +filename+ and returns the part before
  # the first "." of each (nil for an empty line).
  #
  # With +no_windows+ > 0 only one chunk is returned: the list is cut in
  # +no_windows+ chunks of genes.size / no_windows entries and +window+
  # (0-based) selects one. The last window also takes the remainder, which
  # used to be dropped; a window past the end yields [] instead of nil.
  def self.load_genes(filename, window: 0, no_windows: 0)
    # chomp, not chomp!: chomp! returns nil when there is nothing to remove
    # (e.g. a last line without newline), which made the chained call crash.
    genes = File.readlines(filename).map { |t| t.chomp.split(".")[0] }
    return genes unless no_windows > 0

    puts "'loading window #{window} of #{no_windows}'"
    window_size = genes.size / no_windows
    start = window * window_size
    length = window == no_windows - 1 ? genes.size - start : window_size
    genes[start, length] || []
  end

  # Reads one line (variety) name per line from +filename+, without trailing
  # whitespace.
  def self.load_lines(filename)
    # rstrip also removes the line terminator; the previous chomp!.rstrip
    # crashed on a last line without newline.
    File.readlines(filename).map(&:rstrip)
  end

  # Arrays are turned into a Set so the per-row include? checks are O(1);
  # anything else (Hash, Set, ...) is used as given.
  def self.membership_index(collection)
    collection.is_a?(Array) ? collection.to_set : collection
  end
  private_class_method :membership_index

  # Yields every entry of the gzipped FASTA file at +path+, closing the file
  # when done (also on error).
  def self.each_gz_fasta_entry(path)
    Zlib::GzipReader.open(path) do |io|
      Bio::FlatFile.open(Bio::FastaFormat, io) do |fasta_file|
        fasta_file.each { |entry| yield entry }
      end
    end
  end
  private_class_method :each_gz_fasta_entry

  # Removes, in place, the runs of uppercase N at the end and at the start of
  # +seq+, and returns it.
  def self.trim_flanking_ns(seq)
    seq.gsub!(/N*$/, '')
    seq.gsub!(/^N*/, '')
    seq
  end
  private_class_method :trim_flanking_ns
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# Shared set-up for the unit tests: optional coverage, Bundler, the test
# framework and the library load path.
require 'simplecov'

SimpleCov.configure do
  # Start from an empty filter list, then apply the stock 'test_frameworks'
  # profile. `filters` is SimpleCov's own accessor, so no monkey patch of
  # SimpleCov::Configuration is needed; load_profile is the current name of
  # the deprecated load_adapter.
  filters.clear
  load_profile 'test_frameworks'
end

# Coverage is only collected when COVERAGE is set (see `rake simplecov`).
ENV["COVERAGE"] && SimpleCov.start do
  add_filter "/.rvm/"
end
require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'test/unit'
require 'shoulda'

$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'bio-pangenome'

# Reopened so helpers shared by every test case have a place to live.
class Test::Unit::TestCase
end
|
metadata
ADDED
@@ -0,0 +1,157 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bio-pangenome
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ricardo H. Ramirez-Gonzalez
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-11-27 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bio
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bio-blastxmlparser
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: shoulda
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rdoc
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3.12'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.12'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: simplecov
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: juwelier
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: bundler
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 1.0.21
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 1.0.21
|
111
|
+
description: Tools to find similarity between pangenomes.
|
112
|
+
email: ricardo.ramirez-gonzalez@jic.ac.uk
|
113
|
+
executables:
|
114
|
+
- pangenome_blast_flanking.rb
|
115
|
+
extensions: []
|
116
|
+
extra_rdoc_files:
|
117
|
+
- LICENSE.txt
|
118
|
+
- README.md
|
119
|
+
- README.rdoc
|
120
|
+
files:
|
121
|
+
- ".document"
|
122
|
+
- ".travis.yml"
|
123
|
+
- Gemfile
|
124
|
+
- LICENSE.txt
|
125
|
+
- README.md
|
126
|
+
- README.rdoc
|
127
|
+
- Rakefile
|
128
|
+
- VERSION
|
129
|
+
- bin/pangenome_blast_flanking.rb
|
130
|
+
- lib/bio-pangenome.rb
|
131
|
+
- lib/bio-pangenome/pangenome.rb
|
132
|
+
- test/helper.rb
|
133
|
+
- test/test_bio-pangenome.rb
|
134
|
+
homepage: http://github.com/Uauy-Lab/bioruby-pangenome
|
135
|
+
licenses:
|
136
|
+
- MIT
|
137
|
+
metadata: {}
|
138
|
+
post_install_message:
|
139
|
+
rdoc_options: []
|
140
|
+
require_paths:
|
141
|
+
- lib
|
142
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
143
|
+
requirements:
|
144
|
+
- - ">="
|
145
|
+
- !ruby/object:Gem::Version
|
146
|
+
version: '0'
|
147
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - ">="
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0'
|
152
|
+
requirements: []
|
153
|
+
rubygems_version: 3.0.6
|
154
|
+
signing_key:
|
155
|
+
specification_version: 4
|
156
|
+
summary: Scripts to analyse pangenomes.
|
157
|
+
test_files: []
|