ezgff 0.0.2 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c5243c27b01a1b9c51ca19784b7fcce0cc86766de41270aa1a546370d1dcda9c
4
- data.tar.gz: 06a3a2491430b10e33bae1e89c4a32d52feaab7a93425de47d6d3a0b15d6de23
3
+ metadata.gz: d4a07d4b65fd0a41b91f7627a19291d9740748a84f680c93e1307dfe1397c110
4
+ data.tar.gz: 3bd3d44155e7a92f81e899481706a280aca5b6f1636c67b550ba6244b39f6a8c
5
5
  SHA512:
6
- metadata.gz: cfe588ee8d77d84540ade9b87cc3d3f6393e87cff783039959105a09a1fd9d6a94fd58aac1ee0964061112db6a5ba3dde4292bc9b4db8e2effee4b6f27bec703
7
- data.tar.gz: cf1f4a39764e43456686fe7da5e7cae3cf1bbc2fba55b78ef9d4f52dd6ea232186a944b320a506bfc6499077e98d921ac6933fb774034d19323430724c1b5091
6
+ metadata.gz: d665269b3032ad82f9859d737cc31e4393b6f7275a2547b76faa8ea5499d7258e6a8651c211409115e767dd316b5b0ef616ffc45bdc09c131d3b036738ac26d1
7
+ data.tar.gz: bb76c6640fb0c410d5eb66413073a5cb97efdd6d9167c5a3b5e4ff601ba48233b545139230935fd15032a876b91e26811f306da00e6fd2a956d269c8fccd3b7a
data/.gitignore CHANGED
@@ -54,3 +54,9 @@ build-iPhoneSimulator/
54
54
 
55
55
  # Used by RuboCop. Remote config files pulled in from inherit_from directive.
56
56
  # .rubocop-https?--*
57
+
58
+ # for python
59
+ __pycache__/
60
+
61
+ # for Mac
62
+ .DS_Store
data/README.md CHANGED
@@ -1,34 +1,63 @@
1
- # ezgff_alpha
1
+ # ezgff
2
+
3
+ ## What is ezgff?
4
+
5
+ Utilities for GFF3, the genome annotation format. Useful to explore the gene model features.
2
6
 
3
- ## What is ezgff_alpha
4
7
 
5
8
  ## Pre-requisites
6
9
 
7
- * sqlite3
10
+ * Sqlite3
11
+ * Ruby
8
12
 
9
13
  ## Install
10
14
 
15
+ ```bash
16
+ gem install ezgff
17
+ ```
11
18
 
12
19
  ## Quick start
13
20
 
14
- Build database from GFF3 file.
21
+ ezgff provides a command line interface.
22
+
23
+ You need to build an ezgff database from the GFF3 file first by using the 'build' subcommand. Once you have built the ezgff database, you can search and retrieve data from it by using the 'search' and 'view' subcommands.
24
+
25
+ ### Build database from GFF3 file.
15
26
 
16
27
  ```bash
17
- ezgff build in.gff3
28
+ ezgff build gff3_file
18
29
  ```
19
30
 
20
- Retrieve GFF3 reacod by ID.
31
+ This command generates the gff3_file.ezdb directory, which is the ezgff database that you specify when using the view and search subcommands.
21
32
 
22
- ```bash
23
- ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors
33
+ ### Retrieve GFF3 record by ID.
34
+
35
+ ```
36
+ ezgff view DB ID
24
37
  ```
25
38
 
26
39
  ```
27
- ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq
40
+ ezgff view DB ID --with=ancestors
28
41
  ```
29
42
 
30
- examples to use jq
43
+ GFF lines with the ID are displayed.
44
+
45
+ Data can be formatted in JSON. Below are examples to work with jq.
31
46
 
32
47
  ```
33
- ezgff_alpha/bin/ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] |@csv'
48
+ ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq
49
+ ```
50
+
51
+ More complicated example
34
52
  ```
53
+ ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=ancestors --format=json \
54
+ |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] \
55
+ |@csv'
56
+ ```
57
+
58
+ ## FAQ
59
+ ### Can I use GFF2 or GTF for ezgff?
60
+ Ans.
61
+
62
+ No. ezgff takes GFF3 format only. GFF2/GTF can be converted to GFF3 easily. gffread and other tools can be used for conversion.
63
+
data/exe/ezgff CHANGED
@@ -132,6 +132,7 @@ module Ezgff
132
132
 
133
133
  desc "search DB QUERY", "search GFF record giving query"
134
134
  option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
135
+ option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
135
136
  def search(db, query)
136
137
  ezdb = db
137
138
  files = Dir["#{ezdb}/*.sqlite3"]
@@ -144,7 +145,7 @@ module Ezgff
144
145
  raise "Multiple sqlite3 files found"
145
146
  end
146
147
  sq3_db = GffDb.new(sq3_file)
147
- results = sq3_db.search(query, 100)
148
+ results = sq3_db.search(query, 100, options[:type])
148
149
  case options[:format]
149
150
  when "json"
150
151
  h = Hash.new
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/ezgff'
4
+ # require 'thor'
5
+
6
+ include Ezgff
7
+
8
# Extends a path upward by repeatedly looking up the parent of its last element.
#
# @param ary [Array] path whose last element is the node to start climbing from
# @param pdata [Hash] maps a node to its parent; a nil (or absent) value marks a root
# @return [Array] +ary+ itself when its last element has no parent, otherwise a
#   new flat array consisting of +ary+ followed by each ancestor up to the root
def path_to_root(ary, pdata)
  path = ary
  # The lookup must always go through pdata (the parent table passed in);
  # top-level script locals are not visible inside a method body.
  while (parent = pdata[path.last])
    path = [path, parent].flatten
  end
  path
end
18
+
19
# Expands every path downward until each one ends at a node without children.
#
# On each pass, a path whose last node has children is replaced by one path per
# child (the original path plus that child); paths ending at a childless node
# are kept unchanged. Passes repeat until no path can be extended further.
#
# @param paths [Array<Array>] starting paths, e.g. [[root]]
# @param cdata [Hash] maps a node to the array of its children
# @return [Array<Array>] the fully expanded paths, in the order they were derived
def descendant_paths(paths, cdata)
  # A node with no entry in cdata is treated as having no children instead of
  # raising on nil.
  children_of = ->(node) { cdata[node] || [] }

  loop do
    return paths if paths.all? { |path| children_of.call(path.last).empty? }

    paths = paths.flat_map do |path|
      kids = children_of.call(path.last)
      if kids.empty?
        [path]
      else
        kids.map { |kid| [path, kid].flatten }
      end
    end
  end
end
43
+
44
+ gff_file = ARGV[0]
45
+ out_file = ARGV[1]
46
+ outfile_not_found_parents = out_file + ".ParentsNoteFound.txt"
47
+
48
+ include_features = ["gene", "mRNA", "exon", "CDS", "five_prime_UTR", "three_prime_UTR", "ncRNA","polyA_site", "pre_miRNA", "pseudogene", "rRNA", "snRNA", "snoRNA", "tRNA"]
49
+ exclude_features = ["match", "match_part", "orthologous_to", "paralogous_to", "oligo", "sgRNA"]
50
+
51
+ data = {} # key: line_num (int); value: gff_feature (Bio::GFF::GFF3::Record)
52
+ id2ln = {} # key: ID; value: line_num (reference to key of data)
53
+ pdata = {} # Hash to store parent relations
54
+ # key: line_num (int); value: ID
55
+
56
+ STDERR.puts "#{Time.now} Loading data..."
57
+ File.open(gff_file).each_with_index do |l, i|
58
+ STDERR.print "#{i} lines loaded\r" if i % 100000 == 0
59
+
60
+ # puts l
61
+ a = l.chomp.split(/\t/)
62
+ # next if exclude_features.include?(a[2])
63
+ next unless include_features.include?(a[2])
64
+ ## skip FASTA seq section
65
+ break if /^\#\#FASTA/.match(l)
66
+
67
+ ## skip header section
68
+ next if /^\#/.match(l)
69
+ gr = Bio::GFF::GFF3::Record.new(l.chomp)
70
+ data[i] = gr
71
+
72
+ id = nil
73
+ id_found = gr.attributes.select{|a| a[0] == "ID"}
74
+ if id_found.size == 1
75
+ id = id_found[0][1]
76
+ elsif id_found.size == 0
77
+ ## do nothing (id = nil)
78
+ elsif id_found > 1
79
+ STDERR.puts gr.attributes
80
+ raise "Multiple IDs found."
81
+ end
82
+ id2ln[id] = i
83
+ end
84
+
85
+ STDERR.puts "#{Time.now} Loading data: done."
86
+ STDERR.puts "parent-children relations are beeing analyzed..."
87
+
88
+
89
+ notfound = []
90
+ data.each do |i, v|
91
+ gr = v
92
+ parent = ((gr.attributes.select{|a| a[0] == "Parent"}[0]) || [])[1]
93
+ if parent
94
+ begin
95
+ pdata[i] = id2ln.fetch(parent)
96
+ rescue
97
+ notfound << [i, parent]
98
+ end
99
+ else
100
+ pdata[i] = nil
101
+ end
102
+ end
103
+ STDERR.puts "parent database were created.\nchildren database is being created..."
104
+
105
+ require 'pp'
106
+
107
+ #p data
108
+ #p id2ln
109
+ #pp pdata
110
+
111
+ ## build children data from pareint data (pdata)
112
+ cdata = Hash.new()
113
+ ## init cdata
114
+ pdata.each do |k, v|
115
+ unless cdata.has_key?(k)
116
+ cdata[k] = []
117
+ end
118
+ end
119
+ pdata.each do |k, v|
120
+ if v
121
+ parent = v
122
+ cdata[v] << k
123
+ end
124
+ end
125
+
126
+ #pp cdata
127
+ STDERR.puts "children databse created."
128
+ STDERR.puts "parent-children databases were successfully created."
129
+
130
+ STDERR.puts "gene and descendant features are being extracted..."
131
+
132
+ passed_paths = []
133
+ data.keys.sort.each do |i|
134
+ gr = data[i]
135
+ if gr.feature == "gene"
136
+ # puts gr
137
+ passed_paths << descendant_paths([[i]], cdata)
138
+ end
139
+ # break if i > 100
140
+ end
141
+
142
+ #p passed_paths
143
+ File.open(out_file, "w") do |o|
144
+ passed_paths.flatten.sort.uniq.each do |i|
145
+ o.puts data[i]
146
+ end
147
+ end
148
+
149
+ STDERR.puts "target features were successfully extracted."
150
+
151
+ File.open(outfile_not_found_parents, "w") do |o|
152
+ notfound.each do |i, parent|
153
+ o.puts [i, parent, data[i]].join("\t")
154
+ end
155
+ end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/ezgff'
4
+ require 'thor'
5
+
6
+ include Ezgff
7
+
8
+ gff_file = ARGV[0]
9
+
10
+ File.open(gff_file).each_with_index do |l, i|
11
+ puts l
12
+ ## skip FASTA seq section
13
+ break if /^\#\#FASTA/.match(l)
14
+ end
data/ezgff.gemspec CHANGED
@@ -27,4 +27,10 @@ Gem::Specification.new do |spec|
27
27
  spec.bindir = "exe"
28
28
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
29
  spec.require_paths = ["lib"]
30
+
31
+ spec.add_runtime_dependency "sqlite3"
32
+ spec.add_runtime_dependency "bio"
33
+ spec.add_runtime_dependency "thor"
34
+ spec.add_runtime_dependency "color_echo"
35
+
30
36
  end
@@ -103,8 +103,20 @@ module Ezgff
103
103
 
104
104
  sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
105
105
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
106
- values = [i, l.chomp, id, parent,
107
- a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
106
+ values = [
107
+ i, # line number
108
+ l.chomp, # raw record
109
+ id, # ID
110
+ parent, # parent ID
111
+ a[0], # seqid
112
+ a[1], # source
113
+ a[2], # type
114
+ a[3], # start
115
+ a[4], # end
116
+ (a[5] == "." ? nil : a[5]), # score
117
+ a[6], # strand
118
+ (a[7] == "." ? nil : a[7]), # phase
119
+ a[8], # attributes
108
120
  attributes_as_json(l)]
109
121
  sq3_db.execute(sql, values)
110
122
  end
@@ -151,7 +163,7 @@ module Ezgff
151
163
  end
152
164
  h2 = Hash.new
153
165
  h.each do |key, values|
154
- if key == "Dbxref" || key == "Ontology_term"
166
+ if key == "Dbxref2" # dummy (not used currently)
155
167
  h3 = Hash.new
156
168
  values.each do |val|
157
169
  m = /(.+?):/.match(val)
@@ -160,6 +172,8 @@ module Ezgff
160
172
  h3.update({dbtag => dbval})
161
173
  end
162
174
  h2[key] = h3
175
+ elsif key == "Ontology_term" || key == "Alias" || key == "Dbxref"
176
+ h2[key] = values
163
177
  else
164
178
  h2[key] = values.join(",")
165
179
  end
@@ -213,8 +227,12 @@ module Ezgff
213
227
  end
214
228
  end
215
229
 
216
- def search(query, num_limit=100)
217
- sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%";}
230
+ def search(query, num_limit=100, type=nil)
231
+ sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%" }
232
+ if type
233
+ sql += %Q{ AND type=="#{type}"}
234
+ end
235
+ sql += %Q{ LIMIT #{num_limit} } ;
218
236
  STDERR.puts sql
219
237
  res = @db.execute(sql)
220
238
  res2 = res.map{|r| an = Annotation.new(@db); an.build_from_db_record(r); an}
data/lib/ezgff/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Ezgff
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -0,0 +1,76 @@
1
+ from fastapi import FastAPI
2
+ from fastapi import Query, Path
3
+ from typing import Optional, List
4
+ import subprocess
5
+ import sys
6
+ import json
7
+ from pydantic import BaseModel, Field
8
+ from enum import Enum
9
+ from pydantic.errors import NoneIsNotAllowedError
10
+ import uvicorn
11
+ import argparse
12
+
13
+ parser = argparse.ArgumentParser()
14
+ parser.add_argument('-d', '--db', required=True)
15
+ parser.add_argument('-b', '--bind', default='0.0.0.0')
16
+ parser.add_argument('-p', '--port', type=int, default=8000)
17
+ args = parser.parse_args()
18
+ print(args)
19
+ ezdb = args.db
20
+
21
+ #print(ezdb)
22
+ #print(args)
23
+
24
+ app = FastAPI()
25
+
26
+ class OptWith(str, Enum):
27
+ none = "none"
28
+ parent = "parent"
29
+ children = "children"
30
+ ancestors = "ancestors"
31
+ descendants = "descendants"
32
+
33
+ class GffColumnStrand(str, Enum):
34
+ plus = "+"
35
+ minus = "-"
36
+ unstranded = "."
37
+ unknown = "?"
38
+
39
class GffRecord(BaseModel):
    """One GFF3 record as returned by the API.

    The first eight fields mirror GFF3 columns 1-8; ``attributes`` carries
    column 9 as a dict. ``line_num``, ``id`` and ``parent_id`` are additional
    fields describing the record's position and relations.
    """

    # Keyword is spelled `description` on every field so all fields pass the
    # same metadata keyword to Field.
    seqid: str = Field(title="seqid", description="GFF3 column 1: sequence ID", example="NC_002528.1")
    source: str = Field(title="source", description="GFF3 column 2: algorithm or operating procedure", example="Refseq")
    type: str = Field(title="type", description="GFF3 column 3: the type of the feature (previously called the \"method\")")
    start: int = Field(title="start", description="GFF3 column 4: the start coordinate of the feature. 1-based integer.")
    end: int = Field(title="end", description="GFF3 column 5: the end coordinate of the feature. 1-based integer.")
    score: Optional[float] = Field(None, description="GFF3 column 6: the score of the feature. A floating point number.")
    strand: GffColumnStrand = Field(title="strand", description="GFF3 column 7: the strand of the feature. +, -, . (unstranded), ? (unknown) are allowed.")
    phase: Optional[int] = Field(None, description="GFF3 column 8: phase for CDS. 0, 1, 2 are allowed.")
    line_num: int = Field(description="Line number in the original GFF3 file. Required and Unique.")
    id: Optional[str] = Field(None, description="ID")
    parent_id: Optional[str] = Field(None, description="Parent ID")
    attributes: Optional[dict] = Field(title="attributes", description="Gff3 column 9: attributes.")
52
+
53
+ class GffRecords(BaseModel):
54
+ gff_records: List[GffRecord]
55
+
56
+ @app.get("/view/{query}", response_model=GffRecords)
57
+ def view(
58
+ query: str = Path(..., example="NC_002528.1"),
59
+ w: OptWith = Query("none", description="with"),
60
+ t: Optional[str] = Query(None, description="type", example="gene")
61
+ ):
62
+ return json.loads(run_ezgff(query, w, t))
63
+
64
+ def run_ezgff(query, w, t):
65
+ cmd = ["ezgff", "view", ezdb, query, "-f", "json", "-w", w]
66
+ if t:
67
+ cmd.extend(["-t", t])
68
+ print(cmd)
69
+ proc = subprocess.run(cmd, stdout=subprocess.PIPE)
70
+ res = proc.stdout
71
+ print(res)
72
+ return res
73
+
74
+
75
+ if __name__ == "__main__":
76
+ uvicorn.run(app, host=args.bind, port=args.port)
data/webapi/app/run.py ADDED
@@ -0,0 +1,5 @@
1
import uvicorn

if __name__ == '__main__':
    # Running `uvicorn main:app --reload` from the console also works.
    # The import string must have the form "<module>:<attribute>".
    uvicorn.run(app="main:app")
metadata CHANGED
@@ -1,21 +1,79 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ezgff
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuji Shigenobu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-07-14 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2021-08-01 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: sqlite3
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bio
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: thor
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: color_echo
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
13
69
  description: Utilities for GFF3, the genome annotation format. Useful to explore the
14
70
  gene model features.
15
71
  email:
16
72
  - sshigenobu@gmail.com
17
73
  executables:
18
74
  - ezgff
75
+ - ezgff_extract_gene_model_features_from_gff3.rb
76
+ - ezgff_rm_fasta.rb
19
77
  extensions: []
20
78
  extra_rdoc_files: []
21
79
  files:
@@ -27,16 +85,18 @@ files:
27
85
  - bin/build_gff_sqlitedb.rb
28
86
  - bin/build_gff_sqlitedb_keywords.rb
29
87
  - bin/build_gff_tabix.rb
30
- - dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz
31
88
  - dev/gff_examples/ApMT_NC_011594.gb.gff.gz
32
89
  - dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
33
90
  - dev/gff_examples/apisum_part.gff3.gz
34
- - dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz
35
91
  - exe/ezgff
92
+ - exe/ezgff_extract_gene_model_features_from_gff3.rb
93
+ - exe/ezgff_rm_fasta.rb
36
94
  - ezgff.gemspec
37
95
  - lib/ezgff.rb
38
96
  - lib/ezgff/gffsqlitedb.rb
39
97
  - lib/ezgff/version.rb
98
+ - webapi/app/main.py
99
+ - webapi/app/run.py
40
100
  homepage: https://github.com/shujishigenobu/ezgff_alpha
41
101
  licenses:
42
102
  - MIT