bio-pangenome 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/VERSION +1 -1
- data/bin/pangenome_blast_flanking.rb +32 -13
- data/bin/pangenome_gene_bed_files.rb +125 -0
- data/lib/bio-pangenome.rb +2 -1
- data/lib/bio-pangenome/MultipleGFF3.rb +77 -0
- data/lib/bio-pangenome/gff3_extensions.rb +121 -0
- metadata +8 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8d1d196a63e35bb86d33ba74ed1465454ffa9d028f57f0603ff90f8708acc352
|
4
|
+
data.tar.gz: 5f00000897b7de9361eba0781d02b49da78dc3d9e0af75fc572283173826224e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6f8eb3a8f1b10706cd193dc720b3138460ddd4ceb9c5cc1878779159da8064ec20975a032583f872effc19ac24ea1deab6adbd97186f7789a11fccd6d441f126
|
7
|
+
data.tar.gz: 11e2512016f4b5d36ee1c19f88f47dafe82f6e1995c673d39ca5efcfc836fc2cca4f2a03550fcbb0a0b8ee7ef02c1a52e859b4c4111664dfb0870b2197c009b9
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.2
|
@@ -4,6 +4,13 @@
|
|
4
4
|
# Author:: homonecloco
|
5
5
|
# Copyright:: 2019
|
6
6
|
|
7
|
+
class String
  # Reports whether the whole string can be converted by Kernel#Integer.
  #
  # Returns true when Integer(self) succeeds and false when it rejects the
  # text (for example "", "1.5" or "cds"). Only the conversion errors are
  # rescued, so unrelated failures are no longer silently turned into false.
  def integer?
    Integer(self)
    true
  rescue ArgumentError, TypeError, EncodingError
    false
  end
end
|
12
|
+
|
13
|
+
|
7
14
|
USAGE = "panggenome_blast_flanking.rb [options]"
|
8
15
|
|
9
16
|
gempath = File.dirname(File.dirname(__FILE__))
|
@@ -83,19 +90,31 @@ lines = BioPangenome.load_lines(options[:lines])
|
|
83
90
|
|
84
91
|
projected_genes = BioPangenome.load_projected_genes options[:transcript_mapping], genes: genes
|
85
92
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
93
|
+
|
94
|
+
seqs = nil
|
95
|
+
|
96
|
+
if options[:distance].integer?
|
97
|
+
variety_coordinates = BioPangenome.load_mapping_hash(
|
98
|
+
varieties:lines,
|
99
|
+
genes: projected_genes ,
|
100
|
+
prefix: options[:basepath],
|
101
|
+
distance: options[:distance]
|
102
|
+
)
|
103
|
+
seqs = BioPangenome.load_sequences_from_hash(
|
104
|
+
coordinates:variety_coordinates,
|
105
|
+
prefix: options[:basepath],
|
106
|
+
distance: options[:distance],
|
107
|
+
projected_genes: projected_genes
|
108
|
+
)
|
109
|
+
else
|
110
|
+
seqs = BioPangenome.load_cds_sequences( varieties: lines,
|
111
|
+
prefix: "#{options[:basepath]}/#{options[:distance]}/",
|
112
|
+
suffix: ".#{options[:distance]}.fa.gz",
|
113
|
+
set_id: options[:distance],
|
114
|
+
genes:projected_genes )
|
115
|
+
end
|
116
|
+
|
117
|
+
puts "Loaded squence set for #{seqs.size} genes"
|
99
118
|
|
100
119
|
output = options[:output].to_s
|
101
120
|
output = output + "_" + options[:window].to_s if options[:no_windows] > 0
|
@@ -0,0 +1,125 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# BioRuby bio-pangenome Plugin BioPangenome
# Author:: homonecloco
# Copyright:: 2019
#
# For every line listed in the --lines file, loads "<gff_path>/<line>.gff.gz"
# and writes one BED file per requested flanking distance next to it.

USAGE = "pangenome_gene_bed_files.rb [options]"

gempath = File.dirname(File.dirname(__FILE__))
$: << File.join(gempath,'lib')

VERSION_FILENAME=File.join(gempath,'VERSION')
# File.read opens and closes the handle; File.new(...).read left it open.
version = File.read(VERSION_FILENAME).chomp

# print banner
print "pangenome_gene_bed_files #{version} by Ricardo H. Ramirez-Gonzalez 2020\n"

# No arguments is not an error: the defaults below are used.
if ARGV.size == 0
  print USAGE
end

path = gempath + '/lib/bio-pangenome.rb'
require path
#require 'bio-pangenome'
require 'optparse'
require 'tmpdir'

require 'csv'
require 'zlib'
require 'bio'
require 'bio-svgenes'


options = {
  lines: "lines.txt",
  distances: [0,1000,2000,5000],
  path: "./gff/",
  is_gz: true
}

opts = OptionParser.new do |o|

  o.on("-l", "--lines PATH", "File containing the lines to be analysed") do |arg|
    options[:lines] = arg
  end

  o.on("-d", "--distances 0,1000,2000", "Comma-separated flanking distances to be analysed") do |arg|
    options[:distances] = arg.split(",").map { |e| e.to_i }
  end

  o.on("-p", "--gff_path DIR", "The directory where the gff files are") do |arg|
    options[:path ] = arg
  end

  o.separator ""
  o.on_tail('-h', '--help', 'display this help and exit') do
    options[:show_help] = true
  end

end

opts.parse!(ARGV)

# Honour -h/--help: the flag used to be recorded but never acted upon, so the
# script went on to read the lines file instead of showing the help.
if options[:show_help]
  puts opts
  exit
end

puts options.inspect

lines = File.foreach(options[:lines]).map { |line| line.chomp }
distances = options[:distances]
puts lines



gffs = MultipleGFFs.new(folder: options[:path], lines:lines, suffix:".gff.gz",
  is_gz:options[:is_gz] )

# One BED file per line and distance, written into the gff directory.
distances.each do |d|
  gffs.bedAround(distance: d, prefix: options[:path], suffix: ".bed" )
end
|
78
|
+
|
79
|
+
|
80
|
+
# Converts the BED file of each line into a region list and a region map.
#
# For every entry +k+ of +lines+ the input "#{prefix}#{k}_#{distance}bp_#{suffix_in}"
# is read and two files are written:
# * "#{prefix}#{k}_#{distance}bp_#{suffix}"      one "name:first-last" region per row
# * "#{prefix}#{k}_#{distance}bp_#{suffix}.map"  the region, a tab, and BED column 4
#
# The input path is printed to stdout before it is processed. Returns +lines+.
def bedAroundToRegions(lines:[], distance: 1000, prefix: "../flanking/filtered/", suffix: ".RefSeqv1.1.reg" , suffix_in: ".RefSeqv1.1.bed" )
  lines.each do |k|
    stem = "#{prefix}#{k}_#{distance}bp_"
    bed_path = "#{stem}#{suffix_in}"
    reg_path = "#{stem}#{suffix}"
    map_path = "#{stem}#{suffix}.map"
    puts bed_path
    # Block form closes both outputs even when reading the BED file fails.
    File.open(reg_path, "w") do |out|
      File.open(map_path, "w") do |out2|
        File.foreach(bed_path) do |line|
          # chomp (not chomp!): chomp! returns nil for a final line without a
          # trailing newline, which made the following split raise.
          arr = line.chomp.split("\t")
          # Columns are used as given; the strand (column 6) is not consulted.
          reg = "#{arr[0]}:#{arr[1]}-#{arr[2]}"
          out.puts reg
          out2.puts [reg, arr[3]].join("\t")
        end
      end
    end
  end
end
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
# Turn the ".bed" file of every line and distance into its ".reg" region list
# (and ".reg.map" file) in the same directory.
region_files = { lines: lines, prefix: options[:path], suffix_in: ".bed", suffix: ".reg" }
distances.each { |flank| bedAroundToRegions(distance: flank, **region_files) }
|
120
|
+
|
121
|
+
|
122
|
+
|
123
|
+
|
124
|
+
|
125
|
+
|
data/lib/bio-pangenome.rb
CHANGED
@@ -0,0 +1,77 @@
|
|
1
|
+
# A set of GFF3 readers, one per line name, all located in the same folder.
class MultipleGFFs
  # Hash mapping each line name to its GFF3 object.
  attr_reader :lines_gffs

  # Creates a GFF3 reader for "#{folder}/#{line}#{suffix}" for every entry of
  # +lines+. +is_gz+ is forwarded to GFF3.new unchanged.
  def initialize(folder: "../mapping/", lines:[], suffix:".SM1.cds.sorted.gff", is_gz:false )
    @folder = folder
    @lines = lines
    @suffix = suffix
    @lines_gffs = {}
    @lines.each do |l|
      @lines_gffs[l] = GFF3.new(file: "#{folder}/#{l}#{suffix}", is_gz: is_gz)
    end
  end

  # Yields each line name together with its GFF3 object. Returns an
  # Enumerator when no block is given, like the GFF3 iterators do.
  def each_gff
    return enum_for(:each_gff) unless block_given?
    @lines_gffs.each_pair { |k, v| yield k, v }
  end

  # Writes, for every line, the output of GFF3#bedAroundGene to
  # "#{prefix}#{line}_#{distance}bp_#{suffix}". Each path is printed to stdout.
  def bedAround(distance: 1000, prefix: "../flanking/releasePGSBV1/", suffix: ".RefSeqv1.1.bed" )
    each_gff do |k, v|
      path = "#{prefix}#{k}_#{distance}bp_#{suffix}"
      puts path
      # Block form closes the file even if bedAroundGene raises.
      File.open(path, "w") do |out|
        v.bedAroundGene(distance: distance, out: out)
      end
    end
  end

  # Returns an Array with one Hash per mRNA record of every line: position,
  # the alignment attributes read from the record, and the CDS statistics
  # reported by GFF3#mrna_info for that record id.
  def summary
    ret = []
    each_gff do |k, v|
      v.each_mrna do |record|
        tmp = {
          line:           k,
          id:             record.get_attribute("Name"),
          chr:            record.seqid,
          start:          record.start,
          end:            record.end,
          strand:         record.strand,
          genomic_length: record.end - record.start,
          coverage:       record.get_attribute("coverage"),
          identity:       record.get_attribute("identity"),
          matches:        record.get_attribute("matches"),
          mismatches:     record.get_attribute("mismatches"),
          indels:         record.get_attribute("indels"),
          unknowns:       record.get_attribute("unknowns")
        }
        mrna_stats = v.mrna_info(record.id)
        tmp[:cds_count] = mrna_stats.cds_count
        tmp[:cds_max_gap] = mrna_stats.cds_max_gap
        ret << tmp
      end
    end
    ret
  end

  # Builds a Bio::Graphics::Page with one generic track per line, holding a
  # MiniFeature for every element returned by GFF3#cds_to_print(mrna). With
  # +positions+ the feature id is the CDS offset_start, otherwise nil.
  #
  # NOTE(review): +out+ is never used and the page is neither written nor
  # returned (the method returns the result of each_gff) - confirm what the
  # intended output is before relying on this method.
  def to_svg(mrna: "Sm1_CDS.mrna1", positions: false, out: nil)
    p = Bio::Graphics::Page.new(width: 800,
      height: 1000,
      number_of_intervals:10,
      background_color: "white"
    )
    each_gff do |k, v|
      generic_track = p.add_track(:glyph => :generic,
        :name => k,
        :label => true )
      v.cds_to_print(mrna).each do |cds|
        f_id = positions ? cds.offset_start : nil
        feature = Bio::Graphics::MiniFeature.new(start: cds.start,
          end: cds.end,
          fill_color: cds.color,
          id: f_id)
        generic_track.add(feature)
      end
    end
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
require 'zlib'

require 'bio-gff3'
|
2
|
+
|
3
|
+
module Bio::GFFbrowser::FastLineParser
|
4
|
+
module_function :parse_line_fast
|
5
|
+
end
|
6
|
+
|
7
|
+
MrnaStats = Struct.new(:cds_count, :cds_max_gap)
|
8
|
+
|
9
|
+
# Streaming reader for a single (optionally gzip-compressed) GFF3 file.
# Nothing is cached except the per-mRNA CDS statistics; every iterator
# re-reads the file from the start.
class GFF3
  # One CDS prepared for drawing: start/end/color of the glyph, the reference
  # coordinates of the record, and its distance from the first CDS start.
  CDS_feature = Struct.new(:start, :end, :color, :ref_chr,:ref_start, :ref_end, :offset_start)

  # file::  path of the GFF3 file.
  # is_gz:: when true the file is read through Zlib::GzipReader.
  def initialize(file: "", is_gz: true)
    @file = file
    @is_gz = is_gz
  end

  # Yields a Bio::GFFbrowser::FastLineRecord for every feature line. Blank
  # lines and lines starting with "#" are skipped; reading stops at "##FASTA".
  # Returns an Enumerator when no block is given.
  #
  # A line that cannot be parsed is reported on stderr and the original error
  # is re-raised. The file handle is closed when iteration ends, however it ends.
  def each
    return enum_for(:each) unless block_given?
    io = @is_gz ? Zlib::GzipReader.open(@file) : File.open(@file)
    begin
      parser = Bio::GFFbrowser::FastLineParser
      io.each_line do |line|
        line.encode!('UTF-8', 'UTF-8', :invalid => :replace)
        line.strip!
        break if line == '##FASTA'
        next if line.empty? || line.start_with?('#')
        record = begin
          Bio::GFFbrowser::FastLineRecord.new(parser.parse_line_fast(line))
        rescue StandardError => e
          $stderr.puts "Unable to parse '#{line}'\n#{e}"
          # Re-raise the original error; `throw e` produced an UncaughtThrowError.
          raise
        end
        yield record
      end
    ensure
      io.close
    end
  end

  # Yields the records whose feature is "gene".
  def each_gene
    return enum_for(:each_gene) unless block_given?
    each do |record|
      yield record if record.feature == "gene"
    end
  end

  # Yields the records whose feature is "mRNA".
  def each_mrna
    return enum_for(:each_mrna) unless block_given?
    each do |record|
      yield record if record.feature == "mRNA"
    end
  end

  # Yields the records whose feature is "CDS".
  def each_cds
    # The enumerator must replay each_cds; it used to be built from :each_mrna.
    return enum_for(:each_cds) unless block_given?
    each do |record|
      yield record if record.feature == "CDS"
    end
  end

  # Fills @mrna_stats (keyed by the CDS "Parent" attribute) on first use:
  # the number of CDS records per parent and the largest gap between two
  # consecutive CDS records of the same parent, in file order. Returns nil.
  def calculate_mrna_stats
    return if @mrna_stats
    @mrna_stats = Hash.new { |h, k| h[k] = MrnaStats.new(0, 0) }
    last_mrna = ""
    last_record = nil
    each_cds do |record|
      parent = record.get_attribute "Parent"
      mrna = @mrna_stats[parent]
      mrna.cds_count += 1
      if last_mrna == parent
        distance = record.start - last_record.end
        mrna.cds_max_gap = distance if distance > mrna.cds_max_gap
      end
      last_record = record
      last_mrna = parent
    end
    nil
  end

  # Returns the MrnaStats of +id+; an id without CDS records gets a fresh
  # MrnaStats with both counters at 0.
  def mrna_info(id)
    calculate_mrna_stats
    @mrna_stats[id]
  end

  # Writes one tab-separated BED row per gene to +out+: seqid, start minus
  # +distance+ (never below 1), end plus +distance+,
  # "<id>_<source>_<distance>bp", "." and the strand.
  def bedAroundGene(distance:1000, out:$stdout)
    each_gene do |record|
      start = [record.start - distance, 1].max
      reg_end = record.end + distance
      out.puts [record.seqid, start, reg_end, "#{record.id}_#{record.source}_#{distance}bp", ".", record.strand].join("\t")
    end
  end

  # Returns an Array of CDS_feature, one per CDS record of the file. Glyph
  # start/end are the 2nd and 3rd space-separated fields of the "Target"
  # attribute; colors cycle through +colors+.
  #
  # NOTE(review): +mrna+ and +cannonical_exons+ are accepted but not used, so
  # the CDS records of every mRNA in the file are returned - confirm whether
  # filtering by Parent was intended. A CDS without a "Target" attribute will
  # make this method raise.
  def cds_to_print(mrna,cannonical_exons:[], colors:["#a6cee3", "#1f78b4", "#b2df8a" , "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"])
    cds_features = []
    i = 0
    # 0 doubles as "not set yet": the first CDS start becomes the origin.
    offset_start = 0
    each_cds do |record|
      arr = record.get_attribute("Target").split(" ")
      col = colors[i % colors.size]
      offset_start = record.start if offset_start == 0
      cds_features << CDS_feature.new(arr[1].to_i, arr[2].to_i, col,
        record.seqid, record.start, record.end, record.start - offset_start)
      i += 1
    end
    cds_features
  end

end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: bio-pangenome
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ricardo H. Ramirez-Gonzalez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-04-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bio
|
@@ -112,6 +112,7 @@ description: Tools to find similarity between pangenomes.
|
|
112
112
|
email: ricardo.ramirez-gonzalez@jic.ac.uk
|
113
113
|
executables:
|
114
114
|
- pangenome_blast_flanking.rb
|
115
|
+
- pangenome_gene_bed_files.rb
|
115
116
|
extensions: []
|
116
117
|
extra_rdoc_files:
|
117
118
|
- LICENSE.txt
|
@@ -127,7 +128,10 @@ files:
|
|
127
128
|
- Rakefile
|
128
129
|
- VERSION
|
129
130
|
- bin/pangenome_blast_flanking.rb
|
131
|
+
- bin/pangenome_gene_bed_files.rb
|
130
132
|
- lib/bio-pangenome.rb
|
133
|
+
- lib/bio-pangenome/MultipleGFF3.rb
|
134
|
+
- lib/bio-pangenome/gff3_extensions.rb
|
131
135
|
- lib/bio-pangenome/pangenome.rb
|
132
136
|
- test/helper.rb
|
133
137
|
- test/test_bio-pangenome.rb
|
@@ -150,7 +154,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
150
154
|
- !ruby/object:Gem::Version
|
151
155
|
version: '0'
|
152
156
|
requirements: []
|
153
|
-
|
157
|
+
rubyforge_project:
|
158
|
+
rubygems_version: 2.7.7
|
154
159
|
signing_key:
|
155
160
|
specification_version: 4
|
156
161
|
summary: Scripts to analyse pangenomes.
|