bio-pangenome 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 67ff78e3e817086eb54e9f3c2a93bad8dfec84733d2e6db2a1ccb716543ae3cb
4
- data.tar.gz: b9720ab3108dba2864a2f94762597c3cb89fd0837e5853d6f08215abe9f8fce8
3
+ metadata.gz: 8d1d196a63e35bb86d33ba74ed1465454ffa9d028f57f0603ff90f8708acc352
4
+ data.tar.gz: 5f00000897b7de9361eba0781d02b49da78dc3d9e0af75fc572283173826224e
5
5
  SHA512:
6
- metadata.gz: b1643aecb2e252b8c1cc4fe3328d9b296d9ac0f63987544b954320a720346b7175234d6a83c9322baac25f057edc171e530e11181714016aeca73d0da791a650
7
- data.tar.gz: d34eafed8d0080ecfbdc68dad5a39366fb18fec2666337e69a00f656f9c2d187b7b9aad062eb3197c24f881194343ce87b4730caafb51d729a80b77722883bd3
6
+ metadata.gz: 6f8eb3a8f1b10706cd193dc720b3138460ddd4ceb9c5cc1878779159da8064ec20975a032583f872effc19ac24ea1deab6adbd97186f7789a11fccd6d441f126
7
+ data.tar.gz: 11e2512016f4b5d36ee1c19f88f47dafe82f6e1995c673d39ca5efcfc836fc2cca4f2a03550fcbb0a0b8ee7ef02c1a52e859b4c4111664dfb0870b2197c009b9
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.1.2
@@ -4,6 +4,13 @@
4
4
  # Author:: homonecloco
5
5
  # Copyright:: 2019
6
6
 
7
# Core extension used by this script to decide whether a command-line value
# can be read as a whole number.
class String
  # Reports whether Kernel#Integer can convert the entire string.
  #
  # The accepted syntax is exactly that of Kernel#Integer, so anything it
  # rejects (empty strings, words, decimals such as "12.5") yields false.
  #
  # @return [Boolean] true when Integer(self) succeeds, false otherwise.
  def integer?
    Integer(self)
    true
  rescue ArgumentError
    # Kernel#Integer reports an unparsable string with ArgumentError. Only
    # that case means "not an integer"; anything else is left to propagate
    # instead of being silently turned into false.
    false
  end
end
12
+
13
+
7
14
  USAGE = "panggenome_blast_flanking.rb [options]"
8
15
 
9
16
  gempath = File.dirname(File.dirname(__FILE__))
@@ -83,19 +90,31 @@ lines = BioPangenome.load_lines(options[:lines])
83
90
 
84
91
  projected_genes = BioPangenome.load_projected_genes options[:transcript_mapping], genes: genes
85
92
 
86
- variety_coordinates = BioPangenome.load_mapping_hash(
87
- varieties:lines,
88
- genes: projected_genes ,
89
- prefix: options[:basepath],
90
- distance: options[:distance]
91
- )
92
-
93
- seqs = BioPangenome.load_sequences_from_hash(
94
- coordinates:variety_coordinates,
95
- prefix: options[:basepath],
96
- distance: options[:distance],
97
- projected_genes: projected_genes
98
- )
93
+
94
+ seqs = nil
95
+
96
+ if options[:distance].integer?
97
+ variety_coordinates = BioPangenome.load_mapping_hash(
98
+ varieties:lines,
99
+ genes: projected_genes ,
100
+ prefix: options[:basepath],
101
+ distance: options[:distance]
102
+ )
103
+ seqs = BioPangenome.load_sequences_from_hash(
104
+ coordinates:variety_coordinates,
105
+ prefix: options[:basepath],
106
+ distance: options[:distance],
107
+ projected_genes: projected_genes
108
+ )
109
+ else
110
+ seqs = BioPangenome.load_cds_sequences( varieties: lines,
111
+ prefix: "#{options[:basepath]}/#{options[:distance]}/",
112
+ suffix: ".#{options[:distance]}.fa.gz",
113
+ set_id: options[:distance],
114
+ genes:projected_genes )
115
+ end
116
+
117
+ puts "Loaded squence set for #{seqs.size} genes"
99
118
 
100
119
  output = options[:output].to_s
101
120
  output = output + "_" + options[:window].to_s if options[:no_windows] > 0
@@ -0,0 +1,125 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # BioRuby bio-pangenome Plugin BioPangenome
4
+ # Author:: homonecloco
5
+ # Copyright:: 2019
6
+
7
USAGE = "pangenome_gene_bed_files.rb [options]"

# The gem root is two directories above this executable; its lib/ folder is
# put on the load path so the library can be required from a checkout.
gempath = File.dirname(File.dirname(__FILE__))
$: << File.join(gempath,'lib')

VERSION_FILENAME=File.join(gempath,'VERSION')
# File.read closes the handle itself (File.new(...).read left it open).
version = File.read(VERSION_FILENAME).chomp

# print banner
print "pangenome_gene_bed_files #{version} by Ricardo H. Ramirez-Gonzalez 2020\n"

# With no arguments the usage line is shown and the script still runs with
# the defaults declared below.
puts USAGE if ARGV.empty?

path = gempath + '/lib/bio-pangenome.rb'
require path
#require 'bio-pangenome'
require 'optparse'
require 'tmpdir'

require 'csv'
require 'zlib'
require 'bio'
require 'bio-svgenes'

# Defaults used for any option not given on the command line.
options = {
  lines: "lines.txt",
  distances: [0,1000,2000,5000],
  path: "./gff/",
  is_gz: true
}

opts = OptionParser.new do |o|
  o.banner = "Usage: #{USAGE}"

  o.on("-l", "--lines PATH", "File containing the lines to be analysed") do |arg|
    options[:lines] = arg
  end

  o.on("-d", "--distances 0,1000,2000", "Comma-separated list of distances to be analysed") do |arg|
    options[:distances] = arg.split(",").map(&:to_i)
  end

  o.on("-p", "--gff_path DIR", "The directory where the gff files are") do |arg|
    options[:path] = arg
  end

  o.separator ""
  o.on_tail('-h', '--help', 'display this help and exit') do
    options[:show_help] = true
  end
end

opts.parse!(ARGV)

# Honour -h/--help: the flag was recorded by the parser but never acted upon.
if options[:show_help]
  puts opts
  exit
end

puts options.inspect

# One line name per row of the lines file.
lines = File.foreach(options[:lines]).map(&:chomp)
distances = options[:distances]
puts lines

gffs = MultipleGFFs.new(folder: options[:path], lines:lines, suffix:".gff.gz",
                        is_gz:options[:is_gz] )

# First pass: write one BED file per line and per flanking distance, next to
# the GFF files.
distances.each do |d|
  gffs.bedAround(distance: d, prefix: options[:path], suffix: ".bed" )
end
78
+
79
+
80
# Converts the BED file of every line into a region list and a region/name map.
#
# For each entry +k+ of +lines+ the input is read from
# "<prefix><k>_<distance>bp_<suffix_in>" and two files are written:
# * "<prefix><k>_<distance>bp_<suffix>"     - one "name:first-last" region per row
# * "<prefix><k>_<distance>bp_<suffix>.map" - the region, a tab, and BED column 4
#
# The region is built from BED columns 1-3 as they appear; the strand column
# is not consulted. The input path is echoed to stdout before it is processed.
#
# @param lines [Array<String>] line names used to build the file names
# @param distance [Integer, String] value embedded in the file names
# @param prefix [String] path prefix shared by input and output files
# @param suffix [String] suffix of the region file (the map adds ".map")
# @param suffix_in [String] suffix of the BED input file
# @return [Array<String>] +lines+, as returned by Array#each
# @raise [SystemCallError] if an input file cannot be read or an output file
#   cannot be created
def bedAroundToRegions(lines:[], distance: 1000, prefix: "../flanking/filtered/", suffix: ".RefSeqv1.1.reg" , suffix_in: ".RefSeqv1.1.bed" )
  lines.each do |k|
    stem = "#{prefix}#{k}_#{distance}bp_"
    bed_path = "#{stem}#{suffix_in}"
    reg_path = "#{stem}#{suffix}"
    map_path = "#{reg_path}.map"
    puts bed_path
    # Block form guarantees both outputs are closed even if reading fails.
    File.open(reg_path, "w") do |out|
      File.open(map_path, "w") do |out2|
        File.foreach(bed_path) do |line|
          # chomp, not chomp!: chomp! returns nil when the final row has no
          # trailing newline, which used to abort with NoMethodError.
          arr = line.chomp.split("\t")
          reg = "#{arr[0]}:#{arr[1]}-#{arr[2]}"
          out.puts reg
          out2.puts [reg, arr[3]].join("\t")
        end
      end
    end
  end
end
110
+
111
+
112
+
113
# Second pass: for every flanking distance, convert the ".bed" files found
# under options[:path] into ".reg" region files.
distances.each do |flank|
  bedAroundToRegions(
    lines: lines,
    distance: flank,
    prefix: options[:path],
    suffix_in: ".bed",
    suffix: ".reg"
  )
end
120
+
121
+
122
+
123
+
124
+
125
+
data/lib/bio-pangenome.rb CHANGED
@@ -9,4 +9,5 @@
9
9
  # In this file only require other files. Avoid other source code.
10
10
 
11
11
  require 'bio-pangenome/pangenome.rb'
12
-
12
+ require 'bio-pangenome/gff3_extensions.rb'
13
+ require 'bio-pangenome/MultipleGFF3.rb'
@@ -0,0 +1,77 @@
1
# A set of GFF3 annotation files, one per line name, that can be processed
# together (BED export, per-mRNA summary table, SVG drawing).
class MultipleGFFs
  # @return [Hash] line name => GFF3 object, in the order the lines were given
  attr_reader :lines_gffs

  # Creates one GFF3 reader per line; the file of line +l+ is expected at
  # "<folder>/<l><suffix>".
  #
  # @param folder [String] directory holding the GFF files
  # @param lines [Array<String>] line names
  # @param suffix [String] file name suffix appended to each line name
  # @param is_gz [Boolean] forwarded to GFF3.new
  def initialize(folder: "../mapping/", lines:[], suffix:".SM1.cds.sorted.gff", is_gz:false )
    @folder = folder
    @lines = lines
    @suffix = suffix
    @lines_gffs = {}
    @lines.each do |l|
      @lines_gffs[l] = GFF3.new(file: "#{folder}/#{l}#{suffix}", is_gz: is_gz)
    end
  end

  # Yields every (line name, GFF3) pair. Without a block an Enumerator is
  # returned, matching the each_* iterators of GFF3.
  def each_gff
    return enum_for(:each_gff) unless block_given?

    @lines_gffs.each_pair { |k, v| yield k, v }
  end

  # Writes, for every line, the output of GFF3#bedAroundGene to
  # "<prefix><line>_<distance>bp_<suffix>". Each path is echoed to stdout.
  #
  # @param distance [Integer] forwarded to GFF3#bedAroundGene
  # @param prefix [String] path prefix of the output files
  # @param suffix [String] suffix of the output files
  def bedAround(distance: 1000, prefix: "../flanking/releasePGSBV1/", suffix: ".RefSeqv1.1.bed" )
    each_gff do |k, v|
      path = "#{prefix}#{k}_#{distance}bp_#{suffix}"
      puts path
      # Block form closes the file even when bedAroundGene raises.
      File.open(path, "w") { |out| v.bedAroundGene(distance: distance, out: out) }
    end
  end

  # Builds one row per mRNA record of every line.
  #
  # @return [Array<Hash>] rows holding the line name, the record coordinates,
  #   several GFF attributes read with get_attribute, and the CDS statistics
  #   returned by GFF3#mrna_info
  def summary
    ret = []
    each_gff do |k, v|
      v.each_mrna do |record|
        mrna_stats = v.mrna_info(record.id)
        ret << {
          line: k,
          id: record.get_attribute("Name"),
          chr: record.seqid,
          start: record.start,
          end: record.end,
          strand: record.strand,
          genomic_length: record.end - record.start,
          coverage: record.get_attribute("coverage"),
          identity: record.get_attribute("identity"),
          matches: record.get_attribute("matches"),
          mismatches: record.get_attribute("mismatches"),
          indels: record.get_attribute("indels"),
          unknowns: record.get_attribute("unknowns"),
          cds_count: mrna_stats.cds_count,
          cds_max_gap: mrna_stats.cds_max_gap
        }
      end
    end
    ret
  end

  # Draws one generic track per line with the CDS features returned by
  # GFF3#cds_to_print.
  #
  # Previously the page was built and then discarded and +out+ was ignored;
  # the page is now returned and, when +out+ is given, written to it.
  #
  # @param mrna [String] forwarded to GFF3#cds_to_print
  # @param positions [Boolean] when true each feature id is the CDS
  #   offset_start, otherwise features carry no id
  # @param out [String, nil] destination of the SVG; nothing is written when nil
  # @return [Bio::Graphics::Page] the populated page
  def to_svg(mrna: "Sm1_CDS.mrna1", positions: false, out: nil)
    page = Bio::Graphics::Page.new(width: 800,
                                   height: 1000,
                                   number_of_intervals: 10,
                                   background_color: "white")
    each_gff do |k, v|
      generic_track = page.add_track(:glyph => :generic,
                                     :name => k,
                                     :label => true)
      v.cds_to_print(mrna).each do |cds|
        f_id = positions ? cds.offset_start : nil
        feature = Bio::Graphics::MiniFeature.new(start: cds.start,
                                                 end: cds.end,
                                                 fill_color: cds.color,
                                                 id: f_id)
        generic_track.add(feature)
      end
    end
    # NOTE(review): relies on Bio::Graphics::Page#write(file) from
    # bio-svgenes - confirm against the installed gem version.
    page.write(out) if out
    page
  end
end
@@ -0,0 +1,121 @@
1
+ require 'bio-gff3'
2
+
3
+ module Bio::GFFbrowser::FastLineParser
4
+ module_function :parse_line_fast
5
+ end
6
+
7
# Per-mRNA CDS statistics: number of CDS records and the largest gap found
# between two consecutive CDS records of the same parent.
MrnaStats = Struct.new(:cds_count, :cds_max_gap)

# Streaming reader for a single (optionally gzip-compressed) GFF3 file.
# The file is re-read from disk on every iteration; nothing but the mRNA
# statistics is cached.
class GFF3
  # One CDS prepared for drawing, see #cds_to_print.
  CDS_feature = Struct.new(:start, :end, :color, :ref_chr,:ref_start, :ref_end, :offset_start)

  # @param file [String] path of the GFF3 file
  # @param is_gz [Boolean] true when the file is gzip-compressed
  def initialize(file: "", is_gz: true)
    @file = file
    @is_gz = is_gz
  end

  # Yields every record of the file as a Bio::GFFbrowser::FastLineRecord.
  #
  # Blank lines and lines starting with '#' are skipped and reading stops at
  # the '##FASTA' marker. A line that cannot be parsed is reported on stderr
  # and the original error is re-raised.
  #
  # @return [Enumerator] when called without a block
  def each
    return enum_for(:each) unless block_given?

    # Both open forms hand back an IO-like object that is closed in +ensure+,
    # so early termination or an error no longer leaks the handle.
    io = @is_gz ? Zlib::GzipReader.open(@file) : File.open(@file)
    begin
      parser = Bio::GFFbrowser::FastLineParser
      io.each_line do |line|
        # NOTE(review): meant to replace invalid bytes - confirm this has an
        # effect when source and target encodings are both UTF-8.
        line.encode!('UTF-8', 'UTF-8', :invalid => :replace)
        line.strip!
        break if line == '##FASTA'
        next if line.empty? || line.start_with?('#')

        begin
          record = Bio::GFFbrowser::FastLineRecord.new(parser.parse_line_fast(line))
        rescue StandardError => e
          $stderr.puts "Unable to parse '#{line}'\n#{e}"
          # Re-raise the parse error itself; `throw e` raised an unrelated
          # UncaughtThrowError instead.
          raise
        end
        # Yield outside the rescue so errors raised by the caller's block are
        # not reported as parse failures.
        yield record
      end
    ensure
      io.close
    end
  end

  # Yields only the records whose feature is "gene".
  def each_gene(&block)
    return enum_for(:each_gene) unless block_given?

    each_feature("gene", &block)
  end

  # Yields only the records whose feature is "mRNA".
  def each_mrna(&block)
    return enum_for(:each_mrna) unless block_given?

    each_feature("mRNA", &block)
  end

  # Yields only the records whose feature is "CDS".
  def each_cds(&block)
    # Was enum_for(:each_mrna): the block-less form enumerated mRNA records.
    return enum_for(:each_cds) unless block_given?

    each_feature("CDS", &block)
  end

  # Fills the per-mRNA statistics cache on first use; later calls are no-ops.
  #
  # CDS records are grouped by their "Parent" attribute. The gap is measured
  # between a CDS and the record read immediately before it, and only when
  # both share the same parent.
  #
  # @return [nil]
  def calculate_mrna_stats
    return if @mrna_stats

    @mrna_stats = Hash.new { |h, k| h[k] = MrnaStats.new(0, 0) }
    last_mrna = ""
    last_record = nil
    each_cds do |record|
      parent = record.get_attribute "Parent"
      mrna = @mrna_stats[parent]
      mrna.cds_count += 1
      if last_mrna == parent
        distance = record.start - last_record.end
        mrna.cds_max_gap = distance if distance > mrna.cds_max_gap
      end
      last_record = record
      last_mrna = parent
    end
    nil
  end

  # @param id [String] value of the "Parent" attribute the CDS records carry
  # @return [MrnaStats] statistics for +id+; an id without CDS records gets a
  #   zeroed entry
  def mrna_info(id)
    calculate_mrna_stats
    @mrna_stats[id]
  end

  # Writes one tab-separated BED row per gene, widened by +distance+ on both
  # sides. The start is clamped to 1; the end is not clamped.
  #
  # @param distance [Integer] number of bases added on each side
  # @param out [#puts] destination of the rows
  def bedAroundGene(distance:1000, out:$stdout)
    each_gene do |record|
      start = [record.start - distance, 1].max
      reg_end = record.end + distance
      name = "#{record.id}_#{record.source}_#{distance}bp"
      out.puts [record.seqid, start, reg_end, name, ".", record.strand].join("\t")
    end
  end

  # Turns every CDS record of the file into a CDS_feature for drawing.
  #
  # Start and end are the second and third whitespace-separated fields of the
  # record's "Target" attribute. Colours are taken from +colors+ in turn and
  # wrap around. offset_start is the record start relative to the first CDS.
  #
  # NOTE(review): +mrna+ and +cannonical_exons+ are accepted but not used, so
  # CDS records are not filtered by parent - confirm whether that is intended.
  #
  # @param mrna [String] currently unused
  # @param cannonical_exons [Array] currently unused
  # @param colors [Array<String>] fill colours, cycled per CDS
  # @return [Array<CDS_feature>]
  def cds_to_print(mrna,cannonical_exons:[], colors:["#a6cee3", "#1f78b4", "#b2df8a" , "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"])
    cds_features = []
    offset_start = 0
    each_cds do |record|
      target = record.get_attribute "Target"
      arr = target.split(" ")
      # The number of features collected so far doubles as the colour index.
      col = colors[cds_features.size % colors.size]
      offset_start = record.start if offset_start == 0
      cds_features << CDS_feature.new(arr[1].to_i, arr[2].to_i, col,
                                      record.seqid, record.start, record.end,
                                      record.start - offset_start)
    end
    cds_features
  end

  private

  # Yields the records whose feature column equals +type+.
  def each_feature(type)
    each { |record| yield record if record.feature == type }
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-pangenome
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ricardo H. Ramirez-Gonzalez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-27 00:00:00.000000000 Z
11
+ date: 2020-04-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -112,6 +112,7 @@ description: Tools to find similarity between pangenomes.
112
112
  email: ricardo.ramirez-gonzalez@jic.ac.uk
113
113
  executables:
114
114
  - pangenome_blast_flanking.rb
115
+ - pangenome_gene_bed_files.rb
115
116
  extensions: []
116
117
  extra_rdoc_files:
117
118
  - LICENSE.txt
@@ -127,7 +128,10 @@ files:
127
128
  - Rakefile
128
129
  - VERSION
129
130
  - bin/pangenome_blast_flanking.rb
131
+ - bin/pangenome_gene_bed_files.rb
130
132
  - lib/bio-pangenome.rb
133
+ - lib/bio-pangenome/MultipleGFF3.rb
134
+ - lib/bio-pangenome/gff3_extensions.rb
131
135
  - lib/bio-pangenome/pangenome.rb
132
136
  - test/helper.rb
133
137
  - test/test_bio-pangenome.rb
@@ -150,7 +154,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
150
154
  - !ruby/object:Gem::Version
151
155
  version: '0'
152
156
  requirements: []
153
- rubygems_version: 3.0.6
157
+ rubyforge_project:
158
+ rubygems_version: 2.7.7
154
159
  signing_key:
155
160
  specification_version: 4
156
161
  summary: Scripts to analyse pangenomes.