RubyGems - mapp2g - Versions diffs - 0.1.3 → 0.1.4 - Mend

mapp2g 0.1.3 → 0.1.4

Files changed (7) hide show

checksums.yaml +4 -4
data/exe/mapp2g +29 -1
data/lib/mapp2g/version.rb +1 -1
data/scripts/add_annotation_from_uniprot_fasta_to_gff.rb +117 -0
data/scripts/mapp2g-exonerate_gff2_to_jbgff3.v2.rb +37 -0
data/scripts/mapp2g-exonerate_gff2_to_jbgff3.v3.rb +70 -0
metadata +9 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: '029a95186aca02659011cc06f0cdc67f85aa7c8fbda820c06e504a772158cd0a'
-  data.tar.gz: bbaac99c22269dfbba7980ee2ee7c90c7f6bc7f0874b57cd397c1f6d803f953e
+  metadata.gz: 80ed37b4687cdd64f5a292593e2b15addadbabe49223c79835b530e94ac23be6
+  data.tar.gz: 37a038d19321a88e10f44261a3027bda85c40c29d529b710bb93ed760ada585e
 SHA512:
-  metadata.gz: 3f41a7fad9ef596f1af730dfd235895c1f04365f5672d2ead4a6ae1979738ee3ba601707c440d48708e75e6f0d6b49944e6ba2861bfeac1ca004ceebba48783e
-  data.tar.gz: ec1c3ecd64349b0b880a35d4148066ce4a4c5888f789c8223b0fe93d089c254d11341f293e9e535e4c29b2e29508489eb945f29c72756b52ac4f0b64f6b90590
+  metadata.gz: 79f3e7022e8532c1bb9fc0f333927042a4eead1a40d1ecbcb9408d9391a1f21e162830132c395411aac7065076fa14eed694f6ee5b6824a3edc1c193a99ff9ab
+  data.tar.gz: b6c93c34a6576ea1e57fdc2eecdb04e767a595b6b6a037885ab3a89ebf46f36272b6017f55441788ca1cb89dd80cbe273d6a7049f067445b48d8464a486402b2

data/exe/mapp2g CHANGED Viewed

@@ -5,6 +5,7 @@ require 'bio'
 require 'tempfile'
 require 'optparse'
+### Parse options
 opt = OptionParser.new
 OPTS = {}
@@ -30,7 +31,6 @@ begin
   opt.parse!(ARGV)
 rescue => e
   puts "ERROR: #{e}\nSee #{opt}"
   exit
@@ -44,6 +44,34 @@ outdir = (OPTS[:o] || "mapp2g_out_#{$$}")
 #p [query, genome, outdir]
+### Check environment
+def command?(name)  # Checks that NAME resolves via which; raises RuntimeError on failure and returns nil on success (not a boolean).
+  `which #{name}`
+  unless $?.success?  # $? is the exit status of the backtick call on the previous line.
+    raise "#{name} command not found"
+  end
+end
+begin  # Abort early if a required external command is missing.
+  command?("exonerate")
+  command?("blastn")
+rescue => e
+  puts "ERROR: #{e}"
+  exit(1)  # Non-zero status signals the failure to the caller.
+end
+begin  # Abort early if the .nsq and .nos files expected next to the genome file are missing.
+  unless File.exist?("#{genome}.nsq") && File.exist?("#{genome}.nos")
+    raise "genome is not indexed. Please run 'makeblastdb -in #{genome} -dbtype nucl -parse_seqids'"
+  end
+rescue => e
+  puts "ERROR: #{e}"
+  exit(1)  # Same exit path as the command check above.
+end
+### Main
 Dir.mkdir(outdir)
 Bio::FlatFile.open(Bio::FastaFormat, query).each_with_index do |fas, i|

data/lib/mapp2g/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Mapp2g
-  VERSION = "0.1.3"
+  VERSION = "0.1.4"
 end

data/scripts/add_annotation_from_uniprot_fasta_to_gff.rb ADDED Viewed

@@ -0,0 +1,117 @@
+#===
+# add_annotation_from_uniprot_fasta_to_gff.rb
+#
+# This script adds annotation from UniProt FASTA file to GFF file.
+#
+# Usage: ruby add_annotation_from_uniprot_fasta_to_gff.rb <uniprot_proteome_reference_fasta> <gff>
+#
+# Example: ruby add_annotation_from_uniprot_fasta_to_gff.rb UP000001593_45351.fasta mygff.gff3 > mygff_with_annotation.gff3
+#
+require 'uri'
+# https://www.uniprot.org/help/fasta-headers
+#
+# UniProtKB
+#
+# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
+#
+# Where:
+#
+#     db is 'sp' for UniProtKB/Swiss-Prot and 'tr' for UniProtKB/TrEMBL.
+#     UniqueIdentifier is the primary accession number of the UniProtKB entry.
+#     EntryName is the entry name of the UniProtKB entry.
+#     ProteinName is the recommended name of the UniProtKB entry as annotated in the RecName field. For UniProtKB/TrEMBL entries without a RecName field, the SubName field is used. In case of multiple SubNames, the first one is used. The 'precursor' attribute is excluded, 'Fragment' is included with the name if applicable.
+#     OrganismName is the scientific name of the organism of the UniProtKB entry.
+#     OrganismIdentifier is the unique identifier of the source organism, assigned by the NCBI.
+#     GeneName is the first gene name of the UniProtKB entry. If there is no gene name, OrderedLocusName or ORFname, the GN field is not listed.
+#     ProteinExistence is the numerical value describing the evidence for the existence of the protein.
+#     SequenceVersion is the version number of the sequence.
+# >db|UniqueIdentifier|EntryName ProteinName OS=OrganismName OX=OrganismIdentifier [GN=GeneName ]PE=ProteinExistence SV=SequenceVersion
+fastaf = ARGV[0]  # Path to the UniProt FASTA file (first command-line argument).
+gfff = ARGV[1]  # Path to the GFF file to annotate (second command-line argument).
+data = Hash.new  # Parsed header fields, keyed by the second pipe-separated header field (the UniqueIdentifier).
+module Escape  # Escaping helpers for GFF text; the patterns follow the BioRuby file referenced below.
+  # ref: https://github.com/bioruby/bioruby/blob/master/lib/bio/db/gff.rb
+  # unsafe characters to be escaped for normal columns
+  UNSAFE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><;=,]/n
+  # unsafe characters to be escaped for seqid columns
+  # and target_id of the "Target" attribute
+  UNSAFE_SEQID = /[^-a-zA-Z0-9.:^*$@!+_?|]/n
+  # unsafe characters to be escaped for attribute columns
+  UNSAFE_ATTRIBUTE = /[^-_.!~*'()a-zA-Z\d\/?:@+$\[\] "\x80-\xfd><]/n
+  URI_PARSER = URI::Parser.new  # Single parser instance reused by escape_attribute.
+  def self.escape_attribute(str)  # Returns STR with every character matching UNSAFE_ATTRIBUTE percent-escaped.
+    URI_PARSER.escape(str, UNSAFE_ATTRIBUTE)
+  end
+end
+File.open(fastaf).each do |l|  # Reads the FASTA line by line; NOTE(review): the file handle is never explicitly closed.
+  if /^>/.match(l)  # Only header lines are parsed; sequence lines are ignored.
+#    puts l
+    m = /^>(\S+)\|(\S+)\|(\S+)\s+(.+)\s+OS\=(.+)\s+OX\=(\d+)\s+(GN\=(.+)\s+)?PE\=(\d+)\s+SV\=(\d+)/.match(l)
+    db = m[1]  # NOTE(review): m is nil when a header does not fit the pattern above; this line would then raise NoMethodError.
+    id = m[2]
+    entry_name = m[3]
+    protein_name = m[4]
+    organism_name = m[5]
+    organism_id = m[6]
+    gene_name = m[8]  # Group 7 is the optional GN wrapper; group 8 is the gene name itself (nil when GN is absent).
+    prot_exis = m[9]
+    seq_ver = m[10]
+    h = {
+      :db => db,
+      :id => id,
+      :entry_name => entry_name,
+      :protein_name => protein_name,
+      :organism_name => organism_name,
+      :organism_id => organism_id,
+      :gene_name => gene_name,
+      :prot_exis => prot_exis,
+      :seq_ver => seq_ver
+    }
+    data[id] = h  # A later header with the same id overwrites the earlier entry.
+#    p [db, id, entry_name, protein_name, organism_name, organism_id]
+  end
+end
+File.open(gfff).each do |l|  # Copies the GFF to stdout, appending attributes to rows whose type is match.
+  a = l.chomp.split(/\t/)
+  col_type = a[2]  # Third tab-separated column.
+  col_attr = a[8]  # Ninth tab-separated column.
+  if col_type == "match"
+    id = /ID\=(.+)/.match(col_attr)[1]  # Greedy capture: everything after ID= up to the end of the column becomes the lookup key.
+#    p id
+    d = data[id]  # NOTE(review): nil when this id was not seen in the FASTA; the d[...] reads below would then raise NoMethodError.
+    h = Hash.new
+    dbxref = Hash.new
+    h['Name'] = Escape.escape_attribute(d[:protein_name])
+    dbxref['EMBL'] = d[:id]  # NOTE(review): the value stored under EMBL is the accession parsed from the UniProt header; confirm the label is intended.
+    dbxref['Uniprot'] = d[:entry_name]
+    dbxref['tax'] = d[:organism_id]
+    h['Dbxref'] = dbxref.map{|k, v| "#{k}:#{Escape.escape_attribute(v)}"}.join(",")
+    if d[:gene_name]  # Alias is emitted only when the header carried a GN field.
+      h['Alias'] = [Escape.escape_attribute(d[:gene_name])].join(",")
+    end
+    new_attr_col = col_attr + ";" + h.map{|k, v| "#{k}=#{v}"}.join(";")
+    b = a.dup
+    b[8] = new_attr_col
+    puts b.join("\t")
+  else
+    puts l  # Every other line is echoed unchanged.
+  end
+end

data/scripts/mapp2g-exonerate_gff2_to_jbgff3.v2.rb ADDED Viewed

@@ -0,0 +1,37 @@
+gff_lines = []  # Converted lines, buffered and printed together at the end.
+i = 0  # Count of Query lines seen so far; drives the progress message.
+name = nil  # First token after Query: on the most recent such line.
+target = nil  # First token after Target: on the most recent such line.
+ARGF.each do |l|
+  if m = /\s+Query:\s/.match(l)
+    name = m.post_match.chomp.split[0]
+    i+=1
+    STDERR.puts "#{i} records processed" if i % 1000 == 0
+  elsif m = /\s+Target:\s/.match(l)
+    target = m.post_match.split[0]  # NOTE(review): interpolated into a regex below without Regexp.escape; while still nil, that regex matches every line.
+  elsif /^#{target}/.match(l) &&
+      (/\texonerate:est2genome\t/.match(l) || /\texonerate:protein2genome:local\t/.match(l)) &&
+      (/\tcds\t/.match(l) || /\texon\t/.match(l) || /\tgene\t/.match(l))
+#    puts l
+    a = l.chomp.split(/\t/)
+    b = Array.new(9)  # Pads short rows to nine columns with nil.
+    a.each_with_index{|x, i| b[i] = x}  # The block parameter i shadows the outer counter i.
+    if b[2] == "gene"
+      b[-1] = "ID=#{name}"
+      b[2] = "match"
+    elsif (b[2] == "cds" || b[2] == "exon")
+      b[-1] = "Parent=#{name}"
+      b[2] = "match_part"
+    else
+      raise  # Reached only if a line passed the filter above but its third column is none of gene, cds or exon.
+    end
+    gff_lines <<  b.join("\t")
+  end
+end
+puts gff_lines

data/scripts/mapp2g-exonerate_gff2_to_jbgff3.v3.rb ADDED Viewed

@@ -0,0 +1,70 @@
+#!/bin/env ruby
+exonerate_out = (ARGV[0] || "mapp2g_out_ClyHem/mapp2g_out_ClyHem_j609t87/57.exonerate.txt")  # Input path; falls back to a hard-coded sample path when no argument is given.
+query_name = nil  # First token after Query: on the most recent such line.
+target = nil  # First token after Target: on the most recent such line.
+cigar = nil  # Remainder of the most recent line that starts with cigar:
+gff2_lines = []  # Raw gene and exon rows collected for the second pass.
+#vulgar = nil
+File.open(exonerate_out, "r").each do |l|  # NOTE(review): query_name and cigar keep only the last values read; presumably the input holds a single alignment - confirm.
+  if m = /\s+Query:\s/.match(l)
+    query_name = m.post_match.chomp.split[0]
+  elsif m = /\s+Target:\s/.match(l)
+    target = m.post_match.split[0]  # NOTE(review): interpolated into a regex below without Regexp.escape.
+  elsif m = /^cigar:\s/.match(l)
+    cigar = m.post_match.chomp
+  elsif /^#{target}/.match(l) &&
+      (/\texonerate:est2genome\t/.match(l) || /\texonerate:protein2genome:local\t/.match(l)) &&
+      (/\texon\t/.match(l) || /\tgene\t/.match(l))
+    gff2_lines << l.chomp
+  end
+end
+#puts gff2_lines
+gff2_lines.each do |l|  # Second pass: rewrite each collected row and print it.
+  a = l.chomp.split(/\t/)
+  b = Array.new(9)  # Pads short rows to nine columns with nil.
+  a.each_with_index{|x, i| b[i] = x}
+  if b[2] == "gene"
+    b[2] = "match"
+    orig_attribute = b[8].split(";").map{|x| x.strip.split(/\s+/)}.to_h  # NOTE(review): to_h needs every item to split into exactly two tokens.
+#    p orig_attribute
+    c = cigar.split(/\s+/)  # c[1] and c[2] feed the Target attribute; c[9] onward is the operation list.
+    cigar_pairs = c[9..-1].join.scan(/[MDI]\d+/)  # Fuses each operation letter with its length, e.g. M followed by digits.
+    attribute = {'ID' => query_name,
+                 'Target' => [query_name, c[1].to_i + 1, c[2]].join(" "),
+                 'Gap'=> cigar_pairs.join(" "),
+                 'identity' => orig_attribute['identity'],
+                 'similarity' => orig_attribute['similarity']}
+    b[8] = attribute.map{|k, v| "#{k}=#{v}"}.join(";")
+  elsif b[2] == "exon"
+    b[2] = "match_part"
+    orig_attribute = b[8].split(";").map{|x| x.strip.split(/\s+/)}.to_h
+    attribute = {'Parent' => query_name,
+    'identity' => orig_attribute['identity'],
+    'similarity' => orig_attribute['similarity']}
+    b[8] = attribute.map{|k, v| "#{k}=#{v}"}.join(";")
+  else
+    raise  # Reached only if a collected row has a third column other than gene or exon.
+  end
+  puts b.join("\t")
+end
+# CIGAR format
+# The format starts with the same 9 fields as sugar output (see above), and is followed by a series of <operation, length> pairs where operation is one of match, insert or delete, and the length describes the number of times this operation is repeated.
+# 1 query_id: Query identifier
+# 2 query_start: Query position at alignment start
+# 3 query_end: Query position alignment end
+# 4 query_strand: Strand of query matched
+# 5 target_id|
+# 6 target_start| the same 4 fields
+# 7 target_end  | for the target sequence
+# 8 target_strand|
+# 9 score| The raw alignment score

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: mapp2g
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Shuji Shigenobu
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-02-09 00:00:00.000000000 Z
+date: 2023-07-09 00:00:00.000000000 Z
 dependencies: []
 description: mapp2g is a bioinformatics software, which map and align protein sequences
   (amino acid sequences) to genome references in a splicing-aware way. mapp2g alignment
@@ -30,7 +30,10 @@ files:
 - lib/mapp2g/mapper.rb
 - lib/mapp2g/version.rb
 - mapp2g.gemspec
+- scripts/add_annotation_from_uniprot_fasta_to_gff.rb
 - scripts/mapp2g-exonerate_gff2_to_jbgff3.rb
+- scripts/mapp2g-exonerate_gff2_to_jbgff3.v2.rb
+- scripts/mapp2g-exonerate_gff2_to_jbgff3.v3.rb
 - sig/mapp2g.rbs
 homepage: https://github.com/shujishigenobu/mapp2g
 licenses:
@@ -39,7 +42,7 @@ metadata:
   homepage_uri: https://github.com/shujishigenobu/mapp2g
   source_code_uri: https://github.com/shujishigenobu/mapp2g
   changelog_uri: https://github.com/shujishigenobu/mapp2g
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -54,8 +57,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.15
-signing_key:
+rubygems_version: 3.4.10
+signing_key:
 specification_version: 4
 summary: mapp2g is the tool to map protein sequences to genome references in a splicing-aware
   way.