RubyGems - ezgff - Versions diffs - 0.0.5 → 0.0.6 - Mend

ezgff 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/README.md +7 -0
data/exe/ezgff +2 -1
data/exe/ezgff_extract_gene_model_features_from_gff3.rb +155 -0
data/exe/ezgff_rm_fasta.rb +14 -0
data/lib/ezgff/gffsqlitedb.rb +9 -3
data/lib/ezgff/version.rb +1 -1
metadata +6 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 3dedc8f2f93b8b91983a4f330d7c8194be3bb6ba72c52ded51b11bcea2cc55eb
-  data.tar.gz: 7db8dbc168eb71e79576e7efcfc1da8441848c89b47a916e4e212de727aa1c46
+  metadata.gz: d4a07d4b65fd0a41b91f7627a19291d9740748a84f680c93e1307dfe1397c110
+  data.tar.gz: 3bd3d44155e7a92f81e899481706a280aca5b6f1636c67b550ba6244b39f6a8c
 SHA512:
-  metadata.gz: a75c6fd47cfdddf8a2984462be9f985e775a1e8ada520f9c17581573c6d3db3a3b4d00b6b94d06934ec2d0237bbd287b97203c057244700e73f6aa70bc36b701
-  data.tar.gz: ba88715c8a7855e2b005beb055cff0958f9d7146cb8ac560a3d6660b7daf34c0b5039d8765b19f868fc290828899603f4c9232104bb5440722942426839eba43
+  metadata.gz: d665269b3032ad82f9859d737cc31e4393b6f7275a2547b76faa8ea5499d7258e6a8651c211409115e767dd316b5b0ef616ffc45bdc09c131d3b036738ac26d1
+  data.tar.gz: bb76c6640fb0c410d5eb66413073a5cb97efdd6d9167c5a3b5e4ff601ba48233b545139230935fd15032a876b91e26811f306da00e6fd2a956d269c8fccd3b7a

data/README.md CHANGED Viewed

@@ -54,3 +54,10 @@ ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1  --with=
  |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] \
  |@csv'
 ```
+## FAQ
+### Can I use GFF2 or GTF for ezgff?
+Ans.
+No. ezgff takes GFF3 format only. GFF2/GTF can be converted to GFF3 easily. gffread and other tools can be used for conversion.

data/exe/ezgff CHANGED Viewed

@@ -132,6 +132,7 @@ module Ezgff
     desc "search DB QUERY", "search GFF record giving query"
     option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
+    option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
     def search(db, query)
       ezdb = db
       files = Dir["#{ezdb}/*.sqlite3"]
@@ -144,7 +145,7 @@ module Ezgff
         raise "Multiple sqlite3 files found"
       end
       sq3_db = GffDb.new(sq3_file)
-      results = sq3_db.search(query, 100)
+      results = sq3_db.search(query, 100, options[:type])
       case options[:format]
       when "json"
         h = Hash.new

data/exe/ezgff_extract_gene_model_features_from_gff3.rb ADDED Viewed

@@ -0,0 +1,155 @@
+#!/usr/bin/env ruby
+require_relative '../lib/ezgff'
+# require 'thor'
+include Ezgff
# Follow parent links upward, starting from the last element of +ary+.
#
# @param ary [Array] path walked so far; its last element is the node whose
#   parent is looked up next
# @param pdata [Hash] maps a node key to its parent's key; a nil or missing
#   entry ends the walk
# @return [Array] +ary+ itself when its last element has no parent, otherwise
#   a new flat array with every ancestor appended, nearest ancestor first
def path_to_root(ary, pdata)
  par = pdata[ary.last]
  return ary unless par

  # Recurse with pdata: a method body cannot see the script's top-level
  # locals, so referring to any of them here would raise NameError.
  path_to_root([ary, par].flatten, pdata)
end
# Extend every path downward until each one ends at a node without children.
#
# @param paths [Array<Array>] paths to extend; the last element of each path
#   is the node whose children are looked up
# @param cdata [Hash] maps a node key to the array of its children's keys
# @return [Array<Array>] one flat path per leaf reachable from the input
#   paths; +paths+ itself when no path can be extended
def descendant_paths(paths, cdata)
  # A node with no entry in cdata is treated as a leaf instead of crashing
  # on nil.
  children_of = ->(path) { cdata[path.last] || [] }
  return paths if paths.all? { |path| children_of.call(path).empty? }

  extended = paths.flat_map do |path|
    kids = children_of.call(path)
    # Paths already at a leaf are carried over unchanged.
    kids.empty? ? [path] : kids.map { |kid| [path, kid].flatten }
  end
  descendant_paths(extended, cdata)
end
# Extract every "gene" feature and all of its descendants from a GFF3 file.
#
# Usage: ezgff_extract_gene_model_features_from_gff3.rb IN.gff3 OUT.gff3
#
# Writes the selected records to OUT.gff3 in input line order, and writes the
# features whose Parent ID could not be resolved to
# OUT.gff3.ParentsNoteFound.txt (line number, parent ID, record).
gff_file = ARGV[0]
out_file = ARGV[1]
if gff_file.nil? || out_file.nil?
  abort "Usage: #{File.basename($PROGRAM_NAME)} IN.gff3 OUT.gff3"
end
outfile_not_found_parents = out_file + ".ParentsNoteFound.txt"

# Only records whose type (column 3) is listed here are loaded.
include_features = ["gene", "mRNA", "exon", "CDS", "five_prime_UTR", "three_prime_UTR", "ncRNA","polyA_site", "pre_miRNA", "pseudogene", "rRNA", "snRNA", "snoRNA", "tRNA"]
# Currently unused; kept for the disabled exclusion filter.
exclude_features = ["match", "match_part", "orthologous_to", "paralogous_to", "oligo", "sgRNA"]

data = {}   # key: line_num (int); value: gff_feature (Bio::GFF::GFF3::Record)
id2ln = {}  # key: ID; value: line_num (reference to key of data)
pdata = {}  # Hash to store parent relations
            # key: line_num (int); value: line_num of the parent (nil if no Parent)

STDERR.puts "#{Time.now} Loading data..."
# File.foreach closes the file when the loop finishes or breaks.
File.foreach(gff_file).with_index do |l, i|
  STDERR.print "#{i} lines loaded\r" if i % 100000 == 0
  ## stop at the FASTA seq section; this must be tested before the type
  ## filter, which would otherwise skip the directive line
  break if /^\#\#FASTA/.match(l)
  ## skip header section
  next if /^\#/.match(l)
  cols = l.chomp.split(/\t/)
  next unless include_features.include?(cols[2])

  gr = Bio::GFF::GFF3::Record.new(l.chomp)
  data[i] = gr
  id_found = gr.attributes.select { |attr| attr[0] == "ID" }
  if id_found.size > 1
    STDERR.puts gr.attributes
    raise "Multiple IDs found."
  end
  # Features without an ID cannot be referenced as a parent; do not index them.
  id2ln[id_found[0][1]] = i unless id_found.empty?
end
STDERR.puts "#{Time.now} Loading data: done."

STDERR.puts "parent-children relations are beeing analyzed..."
notfound = []
data.each do |i, gr|
  # Only the first Parent attribute is considered.
  parent_attr = gr.attributes.find { |attr| attr[0] == "Parent" }
  parent = parent_attr && parent_attr[1]
  if parent
    if id2ln.key?(parent)
      pdata[i] = id2ln[parent]
    else
      # The parent was not loaded; report it instead of linking.
      notfound << [i, parent]
    end
  else
    pdata[i] = nil
  end
end
STDERR.puts "parent database were created.\nchildren database is being created..."
require 'pp'

## build children data from parent data (pdata)
cdata = {}
## init cdata for every loaded feature, including those whose own parent
## was not found, so that any feature can receive children
data.each_key { |k| cdata[k] = [] }
pdata.each do |k, v|
  cdata[v] << k if v
end
STDERR.puts "children databse created."
STDERR.puts "parent-children databases were successfully created."

STDERR.puts "gene and descendant features are being extracted..."
passed_paths = []
data.keys.sort.each do |i|
  passed_paths << descendant_paths([[i]], cdata) if data[i].feature == "gene"
end

File.open(out_file, "w") do |o|
  # Flatten all paths to line numbers; sort + uniq restores input order
  # and drops features reached through more than one path.
  passed_paths.flatten.sort.uniq.each do |i|
    o.puts data[i]
  end
end
STDERR.puts "target features were successfully extracted."

File.open(outfile_not_found_parents, "w") do |o|
  notfound.each do |i, parent|
    o.puts [i, parent, data[i]].join("\t")
  end
end

data/exe/ezgff_rm_fasta.rb ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require_relative '../lib/ezgff'
+require 'thor'
+include Ezgff
# Copy a GFF file to standard output, stopping at the FASTA section.
#
# Usage: ezgff_rm_fasta.rb IN.gff
#
# Every line up to and including the "##FASTA" directive is printed; the
# sequence lines that follow it are not.
gff_file = ARGV[0]
abort "Usage: #{File.basename($PROGRAM_NAME)} IN.gff" if gff_file.nil?

# File.foreach closes the file when the loop finishes or breaks.
File.foreach(gff_file) do |l|
  puts l
  ## skip FASTA seq section
  break if /^\#\#FASTA/.match(l)
end

data/lib/ezgff/gffsqlitedb.rb CHANGED Viewed

@@ -163,7 +163,7 @@ module Ezgff
       end
       h2 = Hash.new
       h.each do |key, values|
-        if key == "Dbxref" || key == "Ontology_term"
+        if key == "Dbxref2" # dummy (not used currently)
           h3 = Hash.new
           values.each do |val|
             m = /(.+?):/.match(val)
@@ -172,6 +172,8 @@ module Ezgff
             h3.update({dbtag => dbval})
           end
           h2[key] = h3
+        elsif key == "Ontology_term" || key == "Alias" || key == "Dbxref"
+          h2[key] = values
         else
           h2[key] = values.join(",")
         end
@@ -225,8 +227,12 @@ module Ezgff
       end
     end
-    def search(query, num_limit=100)
-      sql = %Q{SELECT * FROM  gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%";}
+    def search(query, num_limit=100, type=nil)
+      sql = %Q{SELECT * FROM  gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%" }
+      if type
+        sql += %Q{ AND type=="#{type}"}
+      end
+      sql += %Q{ LIMIT #{num_limit} } ;
       STDERR.puts sql
       res = @db.execute(sql)
       res2 = res.map{|r| an = Annotation.new(@db); an.build_from_db_record(r); an}

data/lib/ezgff/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Ezgff
-  VERSION = "0.0.5"
+  VERSION = "0.0.6"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ezgff
 version: !ruby/object:Gem::Version
-  version: 0.0.5
+  version: 0.0.6
 platform: ruby
 authors:
 - Shuji Shigenobu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-07-22 00:00:00.000000000 Z
+date: 2021-08-01 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: sqlite3
@@ -72,6 +72,8 @@ email:
 - sshigenobu@gmail.com
 executables:
 - ezgff
+- ezgff_extract_gene_model_features_from_gff3.rb
+- ezgff_rm_fasta.rb
 extensions: []
 extra_rdoc_files: []
 files:
@@ -87,6 +89,8 @@ files:
 - dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
 - dev/gff_examples/apisum_part.gff3.gz
 - exe/ezgff
+- exe/ezgff_extract_gene_model_features_from_gff3.rb
+- exe/ezgff_rm_fasta.rb
 - ezgff.gemspec
 - lib/ezgff.rb
 - lib/ezgff/gffsqlitedb.rb