bio-pangenome 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 67ff78e3e817086eb54e9f3c2a93bad8dfec84733d2e6db2a1ccb716543ae3cb
4
- data.tar.gz: b9720ab3108dba2864a2f94762597c3cb89fd0837e5853d6f08215abe9f8fce8
3
+ metadata.gz: 8d1d196a63e35bb86d33ba74ed1465454ffa9d028f57f0603ff90f8708acc352
4
+ data.tar.gz: 5f00000897b7de9361eba0781d02b49da78dc3d9e0af75fc572283173826224e
5
5
  SHA512:
6
- metadata.gz: b1643aecb2e252b8c1cc4fe3328d9b296d9ac0f63987544b954320a720346b7175234d6a83c9322baac25f057edc171e530e11181714016aeca73d0da791a650
7
- data.tar.gz: d34eafed8d0080ecfbdc68dad5a39366fb18fec2666337e69a00f656f9c2d187b7b9aad062eb3197c24f881194343ce87b4730caafb51d729a80b77722883bd3
6
+ metadata.gz: 6f8eb3a8f1b10706cd193dc720b3138460ddd4ceb9c5cc1878779159da8064ec20975a032583f872effc19ac24ea1deab6adbd97186f7789a11fccd6d441f126
7
+ data.tar.gz: 11e2512016f4b5d36ee1c19f88f47dafe82f6e1995c673d39ca5efcfc836fc2cca4f2a03550fcbb0a0b8ee7ef02c1a52e859b4c4111664dfb0870b2197c009b9
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.1
1
+ 0.1.2
@@ -4,6 +4,13 @@
4
4
  # Author:: homonecloco
5
5
  # Copyright:: 2019
6
6
 
7
# Core extension used by the command line script to decide whether the
# --distance argument is a number (flanking distance in bp) or the name of
# a pre-computed sequence set.
class String
  # Reports whether the whole string is accepted by Kernel#Integer.
  #
  # Kernel#Integer also accepts surrounding whitespace, a sign, underscores
  # between digits and radix prefixes such as "0x1A", so those count as integers.
  #
  # @return [Boolean] true when Integer(self) succeeds, false otherwise
  def integer?
    Integer(self)
    true
  rescue ArgumentError
    # Only the conversion failure is expected here; anything else should surface.
    false
  end
end
12
+
13
+
7
14
  USAGE = "panggenome_blast_flanking.rb [options]"
8
15
 
9
16
  gempath = File.dirname(File.dirname(__FILE__))
@@ -83,19 +90,31 @@ lines = BioPangenome.load_lines(options[:lines])
83
90
 
84
91
  projected_genes = BioPangenome.load_projected_genes options[:transcript_mapping], genes: genes
85
92
 
86
- variety_coordinates = BioPangenome.load_mapping_hash(
87
- varieties:lines,
88
- genes: projected_genes ,
89
- prefix: options[:basepath],
90
- distance: options[:distance]
91
- )
92
-
93
- seqs = BioPangenome.load_sequences_from_hash(
94
- coordinates:variety_coordinates,
95
- prefix: options[:basepath],
96
- distance: options[:distance],
97
- projected_genes: projected_genes
98
- )
93
+
94
+ seqs = nil
95
+
96
+ if options[:distance].integer?
97
+ variety_coordinates = BioPangenome.load_mapping_hash(
98
+ varieties:lines,
99
+ genes: projected_genes ,
100
+ prefix: options[:basepath],
101
+ distance: options[:distance]
102
+ )
103
+ seqs = BioPangenome.load_sequences_from_hash(
104
+ coordinates:variety_coordinates,
105
+ prefix: options[:basepath],
106
+ distance: options[:distance],
107
+ projected_genes: projected_genes
108
+ )
109
+ else
110
+ seqs = BioPangenome.load_cds_sequences( varieties: lines,
111
+ prefix: "#{options[:basepath]}/#{options[:distance]}/",
112
+ suffix: ".#{options[:distance]}.fa.gz",
113
+ set_id: options[:distance],
114
+ genes:projected_genes )
115
+ end
116
+
117
+ puts "Loaded squence set for #{seqs.size} genes"
99
118
 
100
119
  output = options[:output].to_s
101
120
  output = output + "_" + options[:window].to_s if options[:no_windows] > 0
@@ -0,0 +1,125 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # BioRuby bio-pangenome Plugin BioPangenome
4
+ # Author:: homonecloco
5
+ # Copyright:: 2019
6
+
7
USAGE = "pangenome_gene_bed_files.rb [options]"

# The gem root is the parent of the directory holding this script.
gempath = File.dirname(File.dirname(__FILE__))
$: << File.join(gempath,'lib')

VERSION_FILENAME=File.join(gempath,'VERSION')
# File.read opens, reads and closes the file, so no handle is left open.
version = File.read(VERSION_FILENAME).chomp

# print banner
print "pangenome_gene_bed_files #{version} by Ricardo H. Ramirez-Gonzalez 2020\n"

# With no arguments the usage line is shown and the script continues with
# the default options declared below.
if ARGV.size == 0
  print USAGE
end

path = gempath + '/lib/bio-pangenome.rb'
require path
#require 'bio-pangenome'
require 'optparse'
require 'tmpdir'

require 'csv'
require 'zlib'
require 'bio'
require 'bio-svgenes'


# Defaults used for every option that is not given on the command line.
options = {
  lines: "lines.txt",
  distances: [0,1000,2000,5000],
  path: "./gff/",
  is_gz: true
}

opts = OptionParser.new do |o|

  o.on("-l", "--lines PATH", "File containing the lines to be analysed") do |arg|
    options[:lines] = arg
  end

  o.on("-d", "--distances 0,1000,2000", "Comma separated list of distances (bp) around each gene to be analysed") do |arg|
    options[:distances] = arg.split(",").map { |e| e.to_i }
  end

  o.on("-p", "--gff_path DIR", "The directory where the gff files are") do |arg|
    options[:path ] = arg
  end

  o.separator ""
  o.on_tail('-h', '--help', 'display this help and exit') do
    options[:show_help] = true
  end

end

opts.parse!(ARGV)

# The custom -h/--help handler only records the request, so the help text
# has to be printed here before any work starts.
if options[:show_help]
  puts opts
  exit
end

puts options.inspect

# One line name per row of the lines file.
lines = File.foreach(options[:lines]).map { |line| line.chomp }
distances = options[:distances]
puts lines



gffs = MultipleGFFs.new(folder: options[:path], lines:lines, suffix:".gff.gz",
  is_gz:options[:is_gz] )

# Write one BED file per line and distance next to the GFF files.
distances.each do |d|
  gffs.bedAround(distance: d, prefix: options[:path], suffix: ".bed" )
end
78
+
79
+
80
# Converts the per-line BED files into region files.
#
# For every entry +k+ of +lines+ the file
# "#{prefix}#{k}_#{distance}bp_#{suffix_in}" is read and two files are written:
# * "#{prefix}#{k}_#{distance}bp_#{suffix}"     - one "name:first-last" region per row
# * "#{prefix}#{k}_#{distance}bp_#{suffix}.map" - the region and the 4th BED column, tab separated
#
# @param lines [Array<String>] names used to build the file names
# @param distance [Integer] distance that is part of the file names
# @param prefix [String] text prepended to every path (directory with trailing separator)
# @param suffix [String] ending of the region files to write
# @param suffix_in [String] ending of the BED files to read
# @return [Array<String>] the +lines+ argument
# @raise [Errno::ENOENT] when an input BED file does not exist
def bedAroundToRegions(lines:[], distance: 1000, prefix: "../flanking/filtered/", suffix: ".RefSeqv1.1.reg" , suffix_in: ".RefSeqv1.1.bed" )
  lines.each do |k|
    base     = "#{prefix}#{k}_#{distance}bp_"
    bed_path = "#{base}#{suffix_in}"
    reg_path = "#{base}#{suffix}"
    map_path = "#{reg_path}.map"
    puts bed_path
    # Block form guarantees both outputs are closed even if reading fails.
    File.open(reg_path, "w") do |reg_out|
      File.open(map_path, "w") do |map_out|
        File.foreach(bed_path) do |line|
          # chomp (not chomp!) because chomp! returns nil when the last row
          # has no trailing newline.
          fields = line.chomp.split("\t")
          next if fields.empty? # skip blank rows instead of emitting ":-"
          name, first, last, label = fields
          reg = "#{name}:#{first}-#{last}"
          reg_out.puts reg
          map_out.puts [reg, label].join("\t")
        end
      end
    end
  end
end
110
+
111
+
112
+
113
# Turn the BED file written for every requested distance into its
# region (.reg) and region map (.reg.map) companions.
distances.each do |distance|
  bedAroundToRegions(
    lines: lines,
    distance: distance,
    prefix: options[:path],
    suffix_in: ".bed",
    suffix: ".reg"
  )
end
120
+
121
+
122
+
123
+
124
+
125
+
data/lib/bio-pangenome.rb CHANGED
@@ -9,4 +9,5 @@
9
9
  # In this file only require other files. Avoid other source code.
10
10
 
11
11
  require 'bio-pangenome/pangenome.rb'
12
-
12
+ require 'bio-pangenome/gff3_extensions.rb'
13
+ require 'bio-pangenome/MultipleGFF3.rb'
@@ -0,0 +1,77 @@
1
+ class MultipleGFFs
2
+ attr_reader :lines_gffs
3
+
4
+ def initialize(folder: "../mapping/", lines:[], suffix:".SM1.cds.sorted.gff", is_gz:false )
5
+ @folder = folder
6
+ @lines = lines
7
+ @suffix = suffix
8
+ @lines_gffs = Hash.new
9
+ @lines.each do |l|
10
+ path ="#{folder}/#{l}#{suffix}"
11
+ @lines_gffs[l] = GFF3.new(file: path, is_gz: is_gz)
12
+ end
13
+ end
14
+
15
+ def each_gff
16
+ @lines_gffs.each_pair{|k,v| yield k, v }
17
+ end
18
+
19
+ def bedAround(distance: 1000, prefix: "../flanking/releasePGSBV1/", suffix: ".RefSeqv1.1.bed" )
20
+ each_gff do |k, v|
21
+ path="#{prefix}#{k}_#{distance}bp_#{suffix}"
22
+ puts path
23
+ out=File.open(path, "w")
24
+ v.bedAroundGene(distance:distance, out:out)
25
+ out.close
26
+ end
27
+ end
28
+
29
+ def summary
30
+ ret = []
31
+ each_gff do |k,v|
32
+ v.each_mrna do |record|
33
+ tmp = {}
34
+ tmp[:line] = k
35
+ tmp[:id] = record.get_attribute "Name"
36
+ tmp[:chr] = record.seqid
37
+ tmp[:start] = record.start
38
+ tmp[:end] = record.end
39
+ tmp[:strand] = record.strand
40
+ tmp[:genomic_length] = record.end - record.start
41
+ tmp[:coverage] = record.get_attribute "coverage"
42
+ tmp[:identity] = record.get_attribute "identity"
43
+ tmp[:matches] = record.get_attribute "matches"
44
+ tmp[:mismatches] = record.get_attribute "mismatches"
45
+ tmp[:indels] = record.get_attribute "indels"
46
+ tmp[:unknowns] = record.get_attribute "unknowns"
47
+ mrna_stats = @lines_gffs[k].mrna_info(record.id)
48
+ tmp[:cds_count] = mrna_stats.cds_count
49
+ tmp[:cds_max_gap] = mrna_stats.cds_max_gap
50
+ ret << tmp
51
+ end
52
+ end
53
+ ret
54
+ end
55
+
56
+ def to_svg(mrna: "Sm1_CDS.mrna1", positions: false, out: nil)
57
+ p = Bio::Graphics::Page.new(width: 800,
58
+ height: 1000,
59
+ number_of_intervals:10,
60
+ background_color: "white"
61
+ )
62
+ each_gff do |k,v|
63
+ generic_track = p.add_track(:glyph => :generic,
64
+ :name => k,
65
+ :label => true )
66
+ v.cds_to_print(mrna).each do |cds|
67
+
68
+ f_id = positions ? cds.offset_start : nil
69
+ feature = Bio::Graphics::MiniFeature.new(start: cds.start,
70
+ end: cds.end,
71
+ fill_color: cds.color,
72
+ id: f_id)
73
+ generic_track.add(feature)
74
+ end
75
+ end
76
+ end
77
+ end
@@ -0,0 +1,121 @@
1
require 'bio-gff3'
require 'zlib'
2
+
3
# Expose parse_line_fast as a module-level function so it can be called
# directly on the FastLineParser module.
Bio::GFFbrowser::FastLineParser.module_eval do
  module_function :parse_line_fast
end
6
+
7
# Per-mRNA counters: number of CDS records and the largest gap between
# two consecutive CDS records of the same parent.
MrnaStats = Struct.new(:cds_count, :cds_max_gap)

# Streaming reader for a (optionally gzip compressed) GFF3 file.
class GFF3
  CDS_feature = Struct.new(:start, :end, :color, :ref_chr,:ref_start, :ref_end, :offset_start)

  # @param file [String] path of the GFF3 file
  # @param is_gz [Boolean] true when the file is gzip compressed
  def initialize(file: "", is_gz: true)
    @file = file
    @is_gz = is_gz
  end

  # Yields one Bio::GFFbrowser::FastLineRecord per feature row. Blank rows
  # and rows starting with '#' are skipped, and reading stops at '##FASTA'.
  #
  # @return [Enumerator] when no block is given
  # @raise [StandardError] the parser error, re-raised after it is reported on $stderr
  def each
    return enum_for(:each) unless block_given?
    parser = Bio::GFFbrowser::FastLineParser
    open_io do |io|
      io.each_line do |line|
        line.encode!('UTF-8', 'UTF-8', :invalid => :replace)
        line.strip!
        break if line == '##FASTA'
        next if line.empty? || line.start_with?('#')
        begin
          record = Bio::GFFbrowser::FastLineRecord.new(parser.parse_line_fast(line))
        rescue StandardError => e
          $stderr.puts "Unable to parse '#{line}'\n#{e}"
          # raise (not throw) keeps the original exception class and backtrace.
          raise
        end
        # Yield outside the rescue so errors from the caller's block are not
        # reported as parse failures.
        yield record
      end
    end
  end

  # Yields only the records whose feature is "gene".
  def each_gene
    return enum_for(:each_gene) unless block_given?
    self.each do |record|
      next unless record.feature == "gene"
      yield record
    end
  end

  # Yields only the records whose feature is "mRNA".
  def each_mrna
    return enum_for(:each_mrna) unless block_given?
    self.each do |record|
      next unless record.feature == "mRNA"
      yield record
    end
  end

  # Yields only the records whose feature is "CDS".
  def each_cds
    # The enumerator must wrap each_cds itself, not each_mrna.
    return enum_for(:each_cds) unless block_given?
    self.each do |record|
      next unless record.feature == "CDS"
      yield record
    end
  end

  # Fills @mrna_stats (keyed by the "Parent" attribute of each CDS) on the
  # first call; later calls return immediately.
  #
  # The gap is measured against the previous CDS in file order, and only
  # when that CDS has the same parent.
  def calculate_mrna_stats
    return if @mrna_stats
    @mrna_stats = Hash.new {|h,k| h[k] = MrnaStats.new(0,0) }
    last_mrna = ""
    last_record = nil
    each_cds do |record|
      parent = record.get_attribute "Parent"
      mrna = @mrna_stats[parent]
      mrna.cds_count += 1
      if last_mrna == parent
        distance = record.start - last_record.end
        mrna.cds_max_gap = distance if distance > mrna.cds_max_gap
      end
      last_record = record
      last_mrna = parent
    end
    return
  end

  # @param id [String] parent id as found in the CDS "Parent" attribute
  # @return [MrnaStats] stats of that parent; an unknown id gets a new (0, 0) entry
  def mrna_info(id)
    calculate_mrna_stats
    @mrna_stats[id]
  end

  # Writes one tab separated row per gene: seqid, start - distance (never
  # below 1), end + distance, "#{id}_#{source}_#{distance}bp", "." and strand.
  #
  # @param distance [Integer] number of positions added on each side of the gene
  # @param out [IO] destination responding to #puts
  def bedAroundGene(distance:1000, out:$stdout)
    each_gene do |record|
      start = record.start-distance
      start = 1 if start < 1
      reg_end=record.end + distance
      out.puts [record.seqid, start, reg_end, "#{record.id}_#{record.source}_#{distance}bp", ".", record.strand].join "\t"
    end
  end


  # Builds one CDS_feature per CDS record of the file. The start and end
  # come from the 2nd and 3rd tokens of the "Target" attribute, the color
  # cycles through +colors+, and offset_start is measured from the start of
  # the first CDS read.
  #
  # NOTE(review): +mrna+ and +cannonical_exons+ are accepted but not used,
  # so every CDS in the file is returned - confirm whether filtering by
  # parent was intended.
  #
  # @return [Array<CDS_feature>]
  def cds_to_print(mrna,cannonical_exons:[], colors:["#a6cee3", "#1f78b4", "#b2df8a" , "#33a02c", "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"])

    cds_features = []
    i = 0
    offset_start=0
    each_cds do |record|
      target = record.get_attribute "Target"
      arr = target.split(" ")
      col = colors[i % colors.size ]
      start = arr[1].to_i
      ends = arr[2].to_i
      offset_start = record.start if offset_start == 0
      tmp = CDS_feature.new(start, ends, col,
        record.seqid, record.start,record.end, record.start - offset_start )
      cds_features << tmp
      i += 1
    end
    cds_features
  end

  private

  # Opens @file (through gzip when @is_gz), yields the reader and closes it
  # when the block finishes or raises.
  def open_io(&block)
    if @is_gz
      Zlib::GzipReader.open(@file, &block)
    else
      File.open(@file, &block)
    end
  end

end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bio-pangenome
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ricardo H. Ramirez-Gonzalez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-11-27 00:00:00.000000000 Z
11
+ date: 2020-04-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bio
@@ -112,6 +112,7 @@ description: Tools to find similarity between pangenomes.
112
112
  email: ricardo.ramirez-gonzalez@jic.ac.uk
113
113
  executables:
114
114
  - pangenome_blast_flanking.rb
115
+ - pangenome_gene_bed_files.rb
115
116
  extensions: []
116
117
  extra_rdoc_files:
117
118
  - LICENSE.txt
@@ -127,7 +128,10 @@ files:
127
128
  - Rakefile
128
129
  - VERSION
129
130
  - bin/pangenome_blast_flanking.rb
131
+ - bin/pangenome_gene_bed_files.rb
130
132
  - lib/bio-pangenome.rb
133
+ - lib/bio-pangenome/MultipleGFF3.rb
134
+ - lib/bio-pangenome/gff3_extensions.rb
131
135
  - lib/bio-pangenome/pangenome.rb
132
136
  - test/helper.rb
133
137
  - test/test_bio-pangenome.rb
@@ -150,7 +154,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
150
154
  - !ruby/object:Gem::Version
151
155
  version: '0'
152
156
  requirements: []
153
- rubygems_version: 3.0.6
157
+ rubyforge_project:
158
+ rubygems_version: 2.7.7
154
159
  signing_key:
155
160
  specification_version: 4
156
161
  summary: Scripts to analyse pangenomes.