RubyGems - ezgff - Versions diffs - 0.0.2 → 0.0.6 - Mend

ezgff 0.0.2 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +4 -4
data/.gitignore +6 -0
data/README.md +40 -11
data/exe/ezgff +2 -1
data/exe/ezgff_extract_gene_model_features_from_gff3.rb +155 -0
data/exe/ezgff_rm_fasta.rb +14 -0
data/ezgff.gemspec +6 -0
data/lib/ezgff/gffsqlitedb.rb +23 -5
data/lib/ezgff/version.rb +1 -1
data/webapi/app/main.py +76 -0
data/webapi/app/run.py +5 -0
metadata +65 -5
data/dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz +0 -0
data/dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz +0 -0

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c5243c27b01a1b9c51ca19784b7fcce0cc86766de41270aa1a546370d1dcda9c
-  data.tar.gz: 06a3a2491430b10e33bae1e89c4a32d52feaab7a93425de47d6d3a0b15d6de23
+  metadata.gz: d4a07d4b65fd0a41b91f7627a19291d9740748a84f680c93e1307dfe1397c110
+  data.tar.gz: 3bd3d44155e7a92f81e899481706a280aca5b6f1636c67b550ba6244b39f6a8c
 SHA512:
-  metadata.gz: cfe588ee8d77d84540ade9b87cc3d3f6393e87cff783039959105a09a1fd9d6a94fd58aac1ee0964061112db6a5ba3dde4292bc9b4db8e2effee4b6f27bec703
-  data.tar.gz: cf1f4a39764e43456686fe7da5e7cae3cf1bbc2fba55b78ef9d4f52dd6ea232186a944b320a506bfc6499077e98d921ac6933fb774034d19323430724c1b5091
+  metadata.gz: d665269b3032ad82f9859d737cc31e4393b6f7275a2547b76faa8ea5499d7258e6a8651c211409115e767dd316b5b0ef616ffc45bdc09c131d3b036738ac26d1
+  data.tar.gz: bb76c6640fb0c410d5eb66413073a5cb97efdd6d9167c5a3b5e4ff601ba48233b545139230935fd15032a876b91e26811f306da00e6fd2a956d269c8fccd3b7a

data/.gitignore CHANGED Viewed

@@ -54,3 +54,9 @@ build-iPhoneSimulator/
 # Used by RuboCop. Remote config files pulled in from inherit_from directive.
 # .rubocop-https?--*
+# for python
+__pycache__/
+# for Mac
+.DS_Store

data/README.md CHANGED Viewed

@@ -1,34 +1,63 @@
-# ezgff_alpha
+# ezgff
+## What is ezgff?
+Utilities for GFF3, the genome annotation format. Useful to explore the gene model features.
-## What is ezgff_alpha
 ## Pre-requisites
-  * sqlite3
+  * Sqlite3
+  * Ruby
 ## Install
+```bash
+gem install ezgff
+```
 ## Quick start
-Build database from GFF3 file.
+ezgff provides the command line interface.
+You first need to build an ezgff database from the GFF3 file with the 'build' subcommand. Once the ezgff database is built, you can search and retrieve data from it with the 'search' and 'view' subcommands.
+### Build database from GFF3 file.
 ```bash
-ezgff build in.gff3
+ezgff build gff3_file
 ```
-Retrieve GFF3 reacod by ID.
+This command generates gff3_file.ezdb directory which is the ezgff database that will be specified when you use view and search subcommands.
-```bash
-ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors
+### Retrieve GFF3 record by ID.
+```
+ezgff view DB ID
 ```
 ```
-ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq
+ezgff view DB ID --with=ancestors
 ```
-examples to use jq
+GFF lines with the ID are displayed.
+Data can be formatted in JSON. Below are examples that work with jq.
 ```
-ezgff_alpha/bin/ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1  --with=ancestors --format=json  |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] |@csv'
+ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq
+```
+More complicated example
 ```
+ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1  --with=ancestors --format=json \
+ |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] \
+ |@csv'
+```
+## FAQ
+### Can I use GFF2 or GTF for ezgff?
+Ans.
+No. ezgff takes GFF3 format only. GFF2/GTF can be converted to GFF3 easily. gffread and other tools can be used for conversion.

data/exe/ezgff CHANGED Viewed

@@ -132,6 +132,7 @@ module Ezgff
     desc "search DB QUERY", "search GFF record giving query"
     option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
+    option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
     def search(db, query)
       ezdb = db
       files = Dir["#{ezdb}/*.sqlite3"]
@@ -144,7 +145,7 @@ module Ezgff
         raise "Multiple sqlite3 files found"
       end
       sq3_db = GffDb.new(sq3_file)
-      results = sq3_db.search(query, 100)
+      results = sq3_db.search(query, 100, options[:type])
       case options[:format]
       when "json"
         h = Hash.new

data/exe/ezgff_extract_gene_model_features_from_gff3.rb ADDED Viewed

@@ -0,0 +1,155 @@
+#!/usr/bin/env ruby
+require_relative '../lib/ezgff'
+# require 'thor'
+include Ezgff
+# Follow parent links upward, starting from the last element of +ary+.
+#
+# ary   - Array of line numbers; the walk continues from its last element.
+# pdata - Hash mapping a line number to the line number of its parent
+#         (nil or absent when there is no parent).
+#
+# Returns a flat Array: the given path followed by each ancestor in turn.
+def path_to_root(ary, pdata)
+  parent = pdata[ary.last]
+  return ary unless parent
+  # Recurse with the same lookup table that was passed in; script-level
+  # locals are not visible inside a method body.
+  path_to_root([ary, parent].flatten, pdata)
+end
+# Extend every path downwards until each one ends at a node without children.
+#
+# paths - Array of paths; each path is an Array of keys into cdata.
+# cdata - Hash mapping a key to the Array of keys of its direct children.
+#
+# Returns the Array of fully extended paths (one path per leaf reached).
+def descendant_paths(paths, cdata)
+  current = paths
+  until current.map { |path| cdata[path.last].size }.all? { |n| n == 0 }
+    current = current.flat_map do |path|
+      kids = cdata[path.last]
+      if kids.size > 0
+        # one new path per child of the current tail
+        kids.map { |kid| [path, kid].flatten }
+      else
+        # already at a leaf: keep the path unchanged
+        [path]
+      end
+    end
+  end
+  current
+end
+gff_file = ARGV[0]
+out_file = ARGV[1]
+# Both arguments are required; stop with a usage message instead of failing later on nil.
+abort "Usage: #{File.basename($0)} in.gff3 out.gff3" unless gff_file && out_file
+outfile_not_found_parents = out_file + ".ParentsNoteFound.txt"
+include_features = ["gene", "mRNA", "exon", "CDS", "five_prime_UTR", "three_prime_UTR", "ncRNA","polyA_site", "pre_miRNA", "pseudogene", "rRNA", "snRNA", "snoRNA", "tRNA"]
+# Not used at the moment: filtering is done with include_features only.
+exclude_features = ["match", "match_part", "orthologous_to", "paralogous_to", "oligo", "sgRNA"]
+data = {}   # key: line_num (int); value: gff_feature (Bio::GFF::GFF3::Record)
+id2ln = {}  # key: ID; value: line_num (reference to key of data)
+pdata = {}  # Hash to store parent relations
+            # key: line_num (int); value: line_num of the parent record (nil when the record has no Parent)
+STDERR.puts "#{Time.now} Loading data..."
+# Block form of File.open so the file is closed when loading ends (also on break or error).
+File.open(gff_file) do |f|
+  f.each_with_index do |l, i|
+    STDERR.print "#{i} lines loaded\r" if i % 100000 == 0
+    ## stop at the FASTA seq section
+    ## (checked before the feature filter, which would otherwise skip this line first)
+    break if /^\#\#FASTA/.match(l)
+    ## skip header section
+    next if /^\#/.match(l)
+    a = l.chomp.split(/\t/)
+#    next if exclude_features.include?(a[2])
+    next unless include_features.include?(a[2])
+    gr = Bio::GFF::GFF3::Record.new(l.chomp)
+    data[i] = gr
+    id_found = gr.attributes.select{|attr| attr[0] == "ID"}
+    # Compare the number of matches, not the Array itself.
+    if id_found.size > 1
+      STDERR.puts gr.attributes
+      raise "Multiple IDs found."
+    end
+    # Records without an ID stay in data but cannot be looked up as a parent.
+    id2ln[id_found[0][1]] = i if id_found.size == 1
+  end
+end
+STDERR.puts "#{Time.now} Loading data: done."
+STDERR.puts "parent-children relations are beeing analyzed..."
+notfound = []
+## Resolve the Parent attribute of each record to the line number of the parent record.
+## Records whose Parent ID is unknown are collected in notfound as [line_num, parent ID].
+data.each do |line_num, record|
+  parent_attr = record.attributes.find { |attr| attr[0] == "Parent" }
+  parent_id = parent_attr && parent_attr[1]
+  if !parent_id
+    pdata[line_num] = nil
+  elsif id2ln.key?(parent_id)
+    pdata[line_num] = id2ln[parent_id]
+  else
+    notfound << [line_num, parent_id]
+  end
+end
+STDERR.puts "parent database were created.\nchildren database is being created..."
+require 'pp'
+## build children data from parent data (pdata)
+## key: line_num (int); value: Array of line_nums of the direct children
+cdata = {}
+## Give every loaded record an entry, not only the keys of pdata: a record whose own
+## Parent was not found is absent from pdata, yet it can still be the parent of other
+## records or a starting point for descendant_paths, and must not resolve to nil.
+data.each_key { |k| cdata[k] = [] }
+pdata.each do |child, parent|
+  cdata[parent] << child if parent
+end
+STDERR.puts "children databse created."
+STDERR.puts "parent-children databases were successfully created."
+STDERR.puts "gene and descendant features are being extracted..."
+## For every gene (in line order), collect the paths from the gene down to its leaves.
+gene_line_nums = data.keys.sort.select { |ln| data[ln].feature == "gene" }
+passed_paths = gene_line_nums.map { |ln| descendant_paths([[ln]], cdata) }
+## Write each selected record once, in original line order.
+File.open(out_file, "w") do |o|
+  passed_paths.flatten.sort.uniq.each { |ln| o.puts  data[ln] }
+end
+STDERR.puts "target features were successfully extracted."
+## Report records whose Parent ID could not be resolved: line_num, parent ID, record.
+File.open(outfile_not_found_parents, "w") do |o|
+  notfound.each do |ln, parent_id|
+    o.puts [ln, parent_id, data[ln]].join("\t")
+  end
+end

data/exe/ezgff_rm_fasta.rb ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require_relative '../lib/ezgff'
+require 'thor'
+include Ezgff
+gff_file = ARGV[0]
+## Copy the GFF file to STDOUT line by line and stop at the FASTA seq section.
+## Block form of File.open so the file is closed when the copy ends.
+File.open(gff_file) do |f|
+  f.each_line do |l|
+    puts l
+    ## skip FASTA seq section
+    ## NOTE(review): the ##FASTA directive line itself is still printed before stopping - confirm this is intended.
+    break if /^\#\#FASTA/.match(l)
+  end
+end

data/ezgff.gemspec CHANGED Viewed

@@ -27,4 +27,10 @@ Gem::Specification.new do |spec|
   spec.bindir        = "exe"
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
+  spec.add_runtime_dependency "sqlite3"
+  spec.add_runtime_dependency "bio"
+  spec.add_runtime_dependency "thor"
+  spec.add_runtime_dependency "color_echo"
 end

data/lib/ezgff/gffsqlitedb.rb CHANGED Viewed

@@ -103,8 +103,20 @@ module Ezgff
           sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
-          values = [i, l.chomp, id, parent,
-            a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
+          values = [
+            i,       # line number
+            l.chomp, # raw record
+            id,      # ID
+            parent,  # parent ID
+            a[0],    # seqid
+            a[1],    # source
+            a[2],    # type
+            a[3],    # start
+            a[4],    # end
+            (a[5] == "." ? nil : a[5]),    # score
+            a[6],    # strand
+            (a[7] == "." ? nil : a[7]),    # phase
+            a[8],    # attributes
             attributes_as_json(l)]
           sq3_db.execute(sql, values)
         end
@@ -151,7 +163,7 @@ module Ezgff
       end
       h2 = Hash.new
       h.each do |key, values|
-        if key == "Dbxref" || key == "Ontology_term"
+        if key == "Dbxref2" # dummy (not used currently)
           h3 = Hash.new
           values.each do |val|
             m = /(.+?):/.match(val)
@@ -160,6 +172,8 @@ module Ezgff
             h3.update({dbtag => dbval})
           end
           h2[key] = h3
+        elsif key == "Ontology_term" || key == "Alias" || key == "Dbxref"
+          h2[key] = values
         else
           h2[key] = values.join(",")
         end
@@ -213,8 +227,12 @@ module Ezgff
       end
     end
-    def search(query, num_limit=100)
-      sql = %Q{SELECT * FROM  gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%";}
+    def search(query, num_limit=100, type=nil)
+      sql = %Q{SELECT * FROM  gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%" }
+      if type
+        sql += %Q{ AND type=="#{type}"}
+      end
+      sql += %Q{ LIMIT #{num_limit} } ;
       STDERR.puts sql
       res = @db.execute(sql)
       res2 = res.map{|r| an = Annotation.new(@db); an.build_from_db_record(r); an}

data/lib/ezgff/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Ezgff
-  VERSION = "0.0.2"
+  VERSION = "0.0.6"
 end

data/webapi/app/main.py ADDED Viewed

@@ -0,0 +1,76 @@
+from fastapi import FastAPI
+from fastapi import Query, Path
+from typing import Optional, List
+import subprocess
+import sys
+import json
+from pydantic import BaseModel, Field
+from enum import Enum
+from pydantic.errors import NoneIsNotAllowedError
+import uvicorn
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument('-d', '--db', required=True)
+parser.add_argument('-b', '--bind', default='0.0.0.0')
+parser.add_argument('-p', '--port', type=int, default=8000)
+args = parser.parse_args()
+print(args)
+ezdb = args.db
+#print(ezdb)
+#print(args)
+app = FastAPI()
+# Allowed values for the `w` ("with") query parameter of the /view endpoint.
+# The selected value is passed to the ezgff command line as the argument of `-w`.
+class OptWith(str, Enum):
+    none = "none"
+    parent = "parent"
+    children = "children"
+    ancestors = "ancestors"
+    descendants = "descendants"
+# Allowed values for the strand field of GffRecord (GFF3 column 7).
+class GffColumnStrand(str, Enum):
+    plus = "+"
+    minus = "-"
+    unstranded = "."
+    unknown = "?"
+# Pydantic model for a single GFF3 record: the GFF3 columns plus line_num, id and parent_id.
+class GffRecord(BaseModel):
+    seqid: str = Field(title="seqid", description="GFF3 column 1: sequence ID", example="NC_002528.1")
+    source: str = Field(title="source", description="GFF3 column 2: algorithm or operating procedure", example="Refseq")
+    type: str = Field(title="type", description="GFF3 column 3: the type of the feature (previously called the \"method\")")
+    start: int = Field(title="start", description="GFF3 column 4: the start coordinate of the feature. 1-based integer.")
+    end: int = Field(title="end", description="GFF3 column 5: the end coordinate of the feature. 1-based integer.")
+    score: Optional[float] = Field(None, description="GFF3 column 6: the score of the feature. A floating point number.")
+    strand: GffColumnStrand = Field(title="strand", description="GFF3 column 7: the strand of the feature. +, -, . (unstranded), ? (unknown) are allowed.")
+    phase: Optional[int] = Field(None, description="GFF3 column 8: phase for CDS. 0, 1, 2 are allowed.")
+    line_num: int = Field(description="Line number in the original GFF3 file. Required and Unique.")
+    id: Optional[str] = Field(None, description="ID")
+    parent_id: Optional[str] = Field(None, description="Parent ID")
+    attributes: Optional[dict] = Field(title="attributes", description="Gff3 column 9: attributes.")
+class GffRecords(BaseModel):
+    gff_records: List[GffRecord]
+# GET /view/{query}: run the ezgff command line tool for `query` and return its JSON output.
+#   query - path parameter handed to ezgff as the lookup key
+#   w     - which related records to include (see OptWith); defaults to "none"
+#   t     - optional type; when given it is forwarded to ezgff as `-t`
+# The parsed output is validated against the GffRecords response model.
+@app.get("/view/{query}", response_model=GffRecords)
+def view(
+    query: str = Path(..., example="NC_002528.1"),
+    w: OptWith = Query("none", description="with"),
+    t: Optional[str] = Query(None, description="type", example="gene")
+    ):
+    return json.loads(run_ezgff(query, w, t))
+def run_ezgff(query, w, t):
+    """Run `ezgff view` on the configured database and return its raw stdout.
+
+    query: ID passed to ezgff as the record to look up.
+    w: value for the `-w` option.
+    t: optional value for the `-t` option; omitted from the command when falsy.
+
+    Returns the captured standard output as bytes (expected to be JSON).
+    Raises subprocess.CalledProcessError when ezgff exits with a non-zero status.
+    """
+    # NOTE(review): `-t` is passed to the `view` subcommand - confirm that `view` accepts it.
+    cmd = ["ezgff", "view", ezdb, query, "-f", "json", "-w", w]
+    if t:
+        cmd.extend(["-t", t])
+    print(cmd)
+    # check=True: fail here with the exit status instead of handing empty or
+    # partial output to the JSON parser in the caller.
+    proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
+    res = proc.stdout
+    print(res)
+    return res
+if __name__ == "__main__":
+    uvicorn.run(app, host=args.bind, port=args.port)

data/webapi/app/run.py ADDED Viewed

@@ -0,0 +1,5 @@
+import uvicorn
+if __name__ == '__main__':
+    # Running [$ uvicorn run:app --reload] from the console also works.
+    # uvicorn expects an import string of the form "<module>:<attribute>".
+    # NOTE(review): main.py parses command line arguments at import time and requires --db - confirm how this launcher is meant to be invoked.
+    uvicorn.run(app="main:app")

metadata CHANGED Viewed

@@ -1,21 +1,79 @@
 --- !ruby/object:Gem::Specification
 name: ezgff
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.6
 platform: ruby
 authors:
 - Shuji Shigenobu
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2021-07-14 00:00:00.000000000 Z
-dependencies: []
+date: 2021-08-01 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: sqlite3
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bio
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: thor
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: color_echo
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: Utilities for GFF3, the genome annotation format. Useful to explore the
   gene model features.
 email:
 - sshigenobu@gmail.com
 executables:
 - ezgff
+- ezgff_extract_gene_model_features_from_gff3.rb
+- ezgff_rm_fasta.rb
 extensions: []
 extra_rdoc_files: []
 files:
@@ -27,16 +85,18 @@ files:
 - bin/build_gff_sqlitedb.rb
 - bin/build_gff_sqlitedb_keywords.rb
 - bin/build_gff_tabix.rb
-- dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz
 - dev/gff_examples/ApMT_NC_011594.gb.gff.gz
 - dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
 - dev/gff_examples/apisum_part.gff3.gz
-- dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz
 - exe/ezgff
+- exe/ezgff_extract_gene_model_features_from_gff3.rb
+- exe/ezgff_rm_fasta.rb
 - ezgff.gemspec
 - lib/ezgff.rb
 - lib/ezgff/gffsqlitedb.rb
 - lib/ezgff/version.rb
+- webapi/app/main.py
+- webapi/app/run.py
 homepage: https://github.com/shujishigenobu/ezgff_alpha
 licenses:
 - MIT

data/dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz DELETED Viewed

Binary file

data/dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz DELETED Viewed

Binary file