RubyGems - bio-pangenome - Versions diffs - 0.1.1 → 0.1.2 - Mend

bio-pangenome 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/VERSION +1 -1
data/bin/pangenome_blast_flanking.rb +32 -13
data/bin/pangenome_gene_bed_files.rb +125 -0
data/lib/bio-pangenome.rb +2 -1
data/lib/bio-pangenome/MultipleGFF3.rb +77 -0
data/lib/bio-pangenome/gff3_extensions.rb +121 -0
metadata +8 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 67ff78e3e817086eb54e9f3c2a93bad8dfec84733d2e6db2a1ccb716543ae3cb
-  data.tar.gz: b9720ab3108dba2864a2f94762597c3cb89fd0837e5853d6f08215abe9f8fce8
+  metadata.gz: 8d1d196a63e35bb86d33ba74ed1465454ffa9d028f57f0603ff90f8708acc352
+  data.tar.gz: 5f00000897b7de9361eba0781d02b49da78dc3d9e0af75fc572283173826224e
 SHA512:
-  metadata.gz: b1643aecb2e252b8c1cc4fe3328d9b296d9ac0f63987544b954320a720346b7175234d6a83c9322baac25f057edc171e530e11181714016aeca73d0da791a650
-  data.tar.gz: d34eafed8d0080ecfbdc68dad5a39366fb18fec2666337e69a00f656f9c2d187b7b9aad062eb3197c24f881194343ce87b4730caafb51d729a80b77722883bd3
+  metadata.gz: 6f8eb3a8f1b10706cd193dc720b3138460ddd4ceb9c5cc1878779159da8064ec20975a032583f872effc19ac24ea1deab6adbd97186f7789a11fccd6d441f126
+  data.tar.gz: 11e2512016f4b5d36ee1c19f88f47dafe82f6e1995c673d39ca5efcfc836fc2cca4f2a03550fcbb0a0b8ee7ef02c1a52e859b4c4111664dfb0870b2197c009b9

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.1
1	+ 0.1.2

data/bin/pangenome_blast_flanking.rb CHANGED Viewed

@@ -4,6 +4,13 @@
 # Author:: homonecloco
 # Copyright:: 2019
+# Reopens String to add an integer? predicate; the script uses it further down
+# to decide whether the distance option is a number.
+class String
+    # Returns true when Kernel#Integer accepts the whole string and false when
+    # it raises.
+    def integer?
+        begin
+            parsed = Integer(self)
+        rescue StandardError
+            return false
+        end
+        !parsed.nil?
+    end
+end
 USAGE = "panggenome_blast_flanking.rb [options]"
 gempath = File.dirname(File.dirname(__FILE__))
@@ -83,19 +90,31 @@ lines = BioPangenome.load_lines(options[:lines])
 projected_genes = BioPangenome.load_projected_genes options[:transcript_mapping], genes: genes
-variety_coordinates = BioPangenome.load_mapping_hash(
-  varieties:lines,
-  genes: projected_genes ,
-  prefix: options[:basepath],
-  distance: options[:distance]
-  )
-seqs = BioPangenome.load_sequences_from_hash(
-  coordinates:variety_coordinates,
-  prefix: options[:basepath],
-  distance: options[:distance],
-  projected_genes: projected_genes
-  )
+seqs = nil
+if options[:distance].integer?
+  variety_coordinates = BioPangenome.load_mapping_hash(
+    varieties:lines,
+    genes: projected_genes ,
+    prefix: options[:basepath],
+    distance: options[:distance]
+    )
+  seqs = BioPangenome.load_sequences_from_hash(
+    coordinates:variety_coordinates,
+    prefix: options[:basepath],
+    distance: options[:distance],
+    projected_genes: projected_genes
+    )
+else
+  seqs = BioPangenome.load_cds_sequences( varieties: lines,
+    prefix: "#{options[:basepath]}/#{options[:distance]}/",
+    suffix: ".#{options[:distance]}.fa.gz",
+    set_id: options[:distance],
+    genes:projected_genes )
+end
+puts "Loaded squence set for #{seqs.size} genes"
 output = options[:output].to_s
 output = output + "_" + options[:window].to_s if options[:no_windows] > 0

data/bin/pangenome_gene_bed_files.rb ADDED Viewed

@@ -0,0 +1,125 @@
+#!/usr/bin/env ruby
+#
+# BioRuby bio-pangenome Plugin BioPangenome
+# Author:: homonecloco
+# Copyright:: 2019
+USAGE = "pangenome_gene_bed_files.rb [options]"
+gempath = File.dirname(File.dirname(__FILE__))
+$: << File.join(gempath,'lib')
+VERSION_FILENAME=File.join(gempath,'VERSION')
+# File.read closes the file for us; File.new(...).read left the handle open.
+version = File.read(VERSION_FILENAME).chomp
+# print banner
+print "pangenome_gene_bed_files #{version} by Ricardo H. Ramirez-Gonzalez 2020\n"
+# Without arguments the usage line is shown and the run continues with the
+# defaults declared below.
+if ARGV.size == 0
+  puts USAGE
+end
+path = gempath + '/lib/bio-pangenome.rb'
+require path
+#require 'bio-pangenome'
+require 'optparse'
+require 'tmpdir'
+require 'csv'
+require 'zlib'
+require 'bio'
+require 'bio-svgenes'
+# Defaults; the -l, -d and -p switches override the first three.
+options = {
+    lines:     "lines.txt",
+    distances: [0,1000,2000,5000],
+    path: "./gff/",
+    is_gz: true
+}
+opts = OptionParser.new do |o|
+    o.banner = "Usage: #{USAGE}"
+    o.on("-l", "--lines PATH", "File containing the lines to be analysed") do |arg|
+        options[:lines] = arg
+    end
+    # The argument is a comma-separated list, not a file.
+    o.on("-d", "--distances 0,1000,2000", "Comma-separated distances (in bp) to be analysed") do |arg|
+        options[:distances] = arg.split(",").map { |e|  e.to_i }
+    end
+    o.on("-p", "--gff_path DIR", "The directory where the gff files are") do |arg|
+        options[:path ] = arg
+    end
+    o.separator ""
+    o.on_tail('-h', '--help', 'display this help and exit') do
+        options[:show_help] = true
+    end
+end
+opts.parse!(ARGV)
+# -h/--help used to set the flag and then run the whole analysis anyway.
+if options[:show_help]
+    puts opts
+    exit
+end
+puts options.inspect
+# One line name per row of the lines file.
+lines = File.foreach(options[:lines]).map { |line| line.chomp }
+distances = options[:distances]
+puts lines
+gffs = MultipleGFFs.new(folder: options[:path], lines:lines, suffix:".gff.gz",
+    is_gz:options[:is_gz] )
+# Write one BED file per line and distance into the GFF directory.
+distances.each do |d|
+    gffs.bedAround(distance: d, prefix: options[:path], suffix: ".bed" )
+end
+ # Turns the BED file of every line into a list of regions.
+ #
+ # For each name in +lines+ this reads "<prefix><line>_<distance>bp_<suffix_in>"
+ # (tab separated: columns 1-3 are sequence name, start and end, column 4 is the
+ # feature name) and writes two files:
+ # * "<prefix><line>_<distance>bp_<suffix>": one "name:start-end" region per row
+ # * the same path plus ".map": the region and the column-4 name, tab separated
+ # Coordinates are copied as they are; the strand column is not consulted.
+ # Returns +lines+.
+ def bedAroundToRegions(lines:[], distance: 1000, prefix: "../flanking/filtered/", suffix: ".RefSeqv1.1.reg" , suffix_in: ".RefSeqv1.1.bed" )
+     lines.each do |k|
+         path="#{prefix}#{k}_#{distance}bp_#{suffix_in}"
+         path2="#{prefix}#{k}_#{distance}bp_#{suffix}"
+         path3="#{path2}.map"
+         puts path
+         # Block form closes both outputs even when reading the BED file fails.
+         File.open(path2, "w") do |out|
+             File.open(path3, "w") do |out2|
+                 File.foreach(path) do |line|
+                     # Skip blank rows instead of emitting an empty ":-" region.
+                     next if line.strip.empty?
+                     # chomp, not chomp!: the bang form returns nil when the last
+                     # row has no trailing newline and the split then fails.
+                     arr = line.chomp.split "\t"
+                     name, first, last = arr
+                     reg = "#{name}:#{first}-#{last}"
+                     out.puts reg
+                     out2.puts [reg, arr[3]].join "\t"
+                 end
+             end
+         end
+     end
+ end
+ # Build the ".reg" and ".reg.map" files for every requested distance.
+ region_suffixes = { suffix_in: ".bed", suffix: ".reg" }
+ distances.each do |bp|
+     bedAroundToRegions(lines: lines, distance: bp, prefix: options[:path], **region_suffixes)
+ end

data/lib/bio-pangenome.rb CHANGED Viewed

@@ -9,4 +9,5 @@
 # In this file only require other files. Avoid other source code.
 require 'bio-pangenome/pangenome.rb'
+require 'bio-pangenome/gff3_extensions.rb'
+require	'bio-pangenome/MultipleGFF3.rb'

data/lib/bio-pangenome/MultipleGFF3.rb ADDED Viewed

@@ -0,0 +1,77 @@
+# Holds one GFF3 reader per line so the same query can be run across all of them.
+class MultipleGFFs
+    # Hash of line name => GFF3 reader.
+    attr_reader :lines_gffs
+    # folder:: directory with the GFF files
+    # lines::  line names; each file is looked up at "<folder>/<line><suffix>"
+    # suffix:: file name ending shared by every GFF file
+    # is_gz::  handed to GFF3.new unchanged
+    def initialize(folder: "../mapping/", lines:[], suffix:".SM1.cds.sorted.gff", is_gz:false )
+        @folder = folder
+        @lines = lines
+        @suffix = suffix
+        @lines_gffs = Hash.new
+        @lines.each do |l|
+            path ="#{folder}/#{l}#{suffix}"
+            @lines_gffs[l] = GFF3.new(file: path, is_gz: is_gz)
+        end
+    end
+    # Yields every line name together with its GFF3 reader.
+    def each_gff
+        @lines_gffs.each_pair{|k,v| yield k, v }
+    end
+    # Writes, for every line, the file "<prefix><line>_<distance>bp_<suffix>"
+    # with the output of GFF3#bedAroundGene for that distance.
+    def bedAround(distance: 1000, prefix: "../flanking/releasePGSBV1/", suffix: ".RefSeqv1.1.bed" )
+        each_gff do |k, v|
+            path="#{prefix}#{k}_#{distance}bp_#{suffix}"
+            puts path
+            # Block form closes the file even if bedAroundGene raises.
+            File.open(path, "w") do |out|
+                v.bedAroundGene(distance:distance, out:out)
+            end
+        end
+    end
+    # Returns an array with one hash per mRNA record of every line: position,
+    # the alignment attributes stored on the record and the CDS statistics
+    # reported by GFF3#mrna_info.
+    def summary
+        ret = []
+        each_gff do |k,v|
+            v.each_mrna do |record|
+                row = {
+                    line: k,
+                    id: record.get_attribute("Name"),
+                    chr: record.seqid,
+                    start: record.start,
+                    end: record.end,
+                    strand: record.strand,
+                    genomic_length: record.end - record.start,
+                    coverage: record.get_attribute("coverage"),
+                    identity: record.get_attribute("identity"),
+                    matches: record.get_attribute("matches"),
+                    mismatches: record.get_attribute("mismatches"),
+                    indels: record.get_attribute("indels"),
+                    unknowns: record.get_attribute("unknowns")
+                }
+                mrna_stats = v.mrna_info(record.id)
+                row[:cds_count]   = mrna_stats.cds_count
+                row[:cds_max_gap] = mrna_stats.cds_max_gap
+                ret << row
+            end
+        end
+        ret
+    end
+    # Builds a Bio::Graphics::Page with one generic track per line, filled with
+    # the features returned by GFF3#cds_to_print(mrna), and returns the page.
+    # The page used to be built and then discarded, so callers had nothing to render.
+    # positions:: when true each feature is labelled with its offset_start
+    # NOTE(review): +out+ is accepted but not used; callers have to write the
+    # returned page themselves.
+    def to_svg(mrna: "Sm1_CDS.mrna1", positions: false, out: nil)
+        # Named +page+ rather than +p+ so Kernel#p is not shadowed.
+        page = Bio::Graphics::Page.new(width: 800,
+         height: 1000,
+         number_of_intervals:10,
+         background_color: "white"
+         )
+        each_gff do |k,v|
+            generic_track = page.add_track(:glyph => :generic,
+                :name => k,
+                :label => true  )
+            v.cds_to_print(mrna).each do |cds|
+                f_id = positions ? cds.offset_start : nil
+                feature = Bio::Graphics::MiniFeature.new(start: cds.start,
+                    end: cds.end,
+                    fill_color: cds.color,
+                    id: f_id)
+                generic_track.add(feature)
+            end
+        end
+        page
+    end
+end

data/lib/bio-pangenome/gff3_extensions.rb ADDED Viewed

@@ -0,0 +1,121 @@
+require 'bio-gff3'
+# Expose parse_line_fast as a module function so it can be called directly on
+# the module (GFF3#each does this) without mixing the module into a class.
+module Bio::GFFbrowser::FastLineParser
+	module_function :parse_line_fast
+end
+# Per-mRNA CDS statistics: the number of CDS records and the largest gap found
+# between consecutive CDS records that share the same parent.
+MrnaStats = Struct.new(:cds_count, :cds_max_gap)
+# Streaming reader for a plain or gzip-compressed GFF3 file. The file is read
+# again on every iteration; only the per-mRNA statistics are cached.
+class GFF3
+	# One CDS prepared for drawing: start/end taken from the Target attribute,
+	# a fill colour, and the position of the record on the reference.
+	CDS_feature = Struct.new(:start, :end, :color, :ref_chr,:ref_start, :ref_end, :offset_start)
+	# file::  path to the GFF3 file
+	# is_gz:: true when the file has to be read through Zlib::GzipReader
+	def initialize(file: "", is_gz: true)
+		@file = file
+		@is_gz = is_gz
+	end
+	# Yields one Bio::GFFbrowser::FastLineRecord per feature line. Blank lines
+	# and lines starting with "#" are skipped and reading stops at "##FASTA".
+	# Returns an Enumerator when no block is given.
+	# A line that cannot be parsed is reported on stderr and the original error
+	# is re-raised (the previous `throw e` surfaced as UncaughtThrowError).
+	# The file is closed when iteration ends, including on break or error.
+	# NOTE(review): Zlib is used but not required in this file - confirm that
+	# bio-gff3 or the calling script loads it.
+	def each
+		return enum_for(:each) unless block_given?
+		parser = Bio::GFFbrowser::FastLineParser
+		open_file do |io|
+			io.each_line do |line|
+				line.encode!('UTF-8', 'UTF-8', :invalid => :replace)
+				line.strip!
+				break if line == '##FASTA'
+				next if line.empty? || line.start_with?('#')
+				# Only parsing is guarded, so errors raised by the caller's
+				# block are no longer reported as parse failures.
+				begin
+					record = Bio::GFFbrowser::FastLineRecord.new(parser.parse_line_fast(line))
+				rescue StandardError => e
+					$stderr.puts "Unable to parse '#{line}'\n#{e}"
+					raise
+				end
+				yield record
+			end
+		end
+	end
+	# Yields every record whose feature is "gene".
+	def each_gene(&block)
+		return enum_for(:each_gene) unless block_given?
+		each_feature("gene", &block)
+	end
+	# Yields every record whose feature is "mRNA".
+	def each_mrna(&block)
+		return enum_for(:each_mrna) unless block_given?
+		each_feature("mRNA", &block)
+	end
+	# Yields every record whose feature is "CDS". Without a block the
+	# Enumerator now iterates CDS records; it used to wrap each_mrna.
+	def each_cds(&block)
+		return enum_for(:each_cds) unless block_given?
+		each_feature("CDS", &block)
+	end
+	# Fills the per-mRNA statistics cache on first use; later calls are no-ops.
+	# CDS records are grouped by their "Parent" attribute. The gap is measured
+	# between a CDS and the record read just before it when both share a parent.
+	# NOTE(review): assumes the CDS records of one mRNA are contiguous and
+	# sorted by start - confirm against the input files.
+	def calculate_mrna_stats
+		return if @mrna_stats
+		@mrna_stats = Hash.new {|h,k| h[k] = MrnaStats.new(0,0) }
+		last_mrna = ""
+		last_record = nil
+		each_cds do |record|
+			parent = record.get_attribute "Parent"
+			mrna = @mrna_stats[parent]
+			mrna.cds_count += 1
+			if last_mrna == parent
+				distance =  record.start - last_record.end
+				mrna.cds_max_gap = distance if distance > mrna.cds_max_gap
+			end
+			last_record = record
+			last_mrna   = parent
+		end
+		return
+	end
+	# Returns the MrnaStats for +id+; an id without CDS records gets a fresh
+	# MrnaStats with both counters at 0.
+	def mrna_info(id)
+		calculate_mrna_stats
+		@mrna_stats[id]
+	end
+	# Writes one tab-separated row per gene to +out+: seqid, start - distance
+	# (never below 1), end + distance, "<id>_<source>_<distance>bp", "." and strand.
+	def bedAroundGene(distance:1000, out:$stdout)
+		each_gene do |record|
+			start = record.start-distance
+			start = 1 if start < 1
+			reg_end=record.end + distance
+			out.puts [record.seqid, start, reg_end, "#{record.id}_#{record.source}_#{distance}bp", ".", record.strand].join "\t"
+		end
+	end
+	# Returns an array of CDS_feature, one per CDS record in the file. Start and
+	# end come from the 2nd and 3rd fields of the "Target" attribute, colours
+	# cycle through +colors+, and offset_start is the distance from the start of
+	# the first CDS read.
+	# NOTE(review): +mrna+ and +cannonical_exons+ are accepted but not used, so
+	# the CDS of every mRNA in the file are returned - confirm this is intended.
+	def cds_to_print(mrna,cannonical_exons:[], colors:["#a6cee3", "#1f78b4", "#b2df8a" , "#33a02c", "#fb9a99",  "#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"])
+		cds_features = []
+		offset_start=0
+		each_cds do |record|
+			target = record.get_attribute "Target"
+			arr = target.split(" ")
+			# The number of features collected so far doubles as the colour index.
+			col = colors[cds_features.size % colors.size ]
+			offset_start = record.start  if offset_start == 0
+			cds_features << CDS_feature.new(arr[1].to_i, arr[2].to_i, col,
+				record.seqid, record.start,record.end, record.start - offset_start )
+		end
+		cds_features
+	end
+	private
+	# Opens the file (through gzip when @is_gz) and yields the IO; the block
+	# forms guarantee the handle is closed afterwards.
+	def open_file
+		if @is_gz
+			Zlib::GzipReader.open(@file) { |gz| yield gz }
+		else
+			File.open(@file) { |plain| yield plain }
+		end
+	end
+	# Yields the records whose feature column equals +type+.
+	def each_feature(type)
+		each do |record|
+			yield record if record.feature == type
+		end
+	end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bio-pangenome
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Ricardo H. Ramirez-Gonzalez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2019-11-27 00:00:00.000000000 Z
+date: 2020-04-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bio
@@ -112,6 +112,7 @@ description: Tools to find similarity between pangenomes.
 email: ricardo.ramirez-gonzalez@jic.ac.uk
 executables:
 - pangenome_blast_flanking.rb
+- pangenome_gene_bed_files.rb
 extensions: []
 extra_rdoc_files:
 - LICENSE.txt
@@ -127,7 +128,10 @@ files:
 - Rakefile
 - VERSION
 - bin/pangenome_blast_flanking.rb
+- bin/pangenome_gene_bed_files.rb
 - lib/bio-pangenome.rb
+- lib/bio-pangenome/MultipleGFF3.rb
+- lib/bio-pangenome/gff3_extensions.rb
 - lib/bio-pangenome/pangenome.rb
 - test/helper.rb
 - test/test_bio-pangenome.rb
@@ -150,7 +154,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.6
+rubyforge_project:
+rubygems_version: 2.7.7
 signing_key:
 specification_version: 4
 summary: Scripts to analyse pangenomes.