ezgff 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +56 -0
- data/Gemfile +14 -0
- data/LICENSE +21 -0
- data/README.md +34 -0
- data/Rakefile +6 -0
- data/bin/build_gff_sqlitedb.rb +91 -0
- data/bin/build_gff_sqlitedb_keywords.rb +70 -0
- data/bin/build_gff_tabix.rb +13 -0
- data/dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz +0 -0
- data/dev/gff_examples/ApMT_NC_011594.gb.gff.gz +0 -0
- data/dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz +0 -0
- data/dev/gff_examples/apisum_part.gff3.gz +0 -0
- data/dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz +0 -0
- data/exe/ezgff +168 -0
- data/ezgff.gemspec +30 -0
- data/lib/ezgff.rb +12 -0
- data/lib/ezgff/gffsqlitedb.rb +390 -0
- data/lib/ezgff/version.rb +3 -0
- metadata +66 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c5243c27b01a1b9c51ca19784b7fcce0cc86766de41270aa1a546370d1dcda9c
|
4
|
+
data.tar.gz: 06a3a2491430b10e33bae1e89c4a32d52feaab7a93425de47d6d3a0b15d6de23
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cfe588ee8d77d84540ade9b87cc3d3f6393e87cff783039959105a09a1fd9d6a94fd58aac1ee0964061112db6a5ba3dde4292bc9b4db8e2effee4b6f27bec703
|
7
|
+
data.tar.gz: cf1f4a39764e43456686fe7da5e7cae3cf1bbc2fba55b78ef9d4f52dd6ea232186a944b320a506bfc6499077e98d921ac6933fb774034d19323430724c1b5091
|
data/.gitignore
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/spec/examples.txt
|
9
|
+
/test/tmp/
|
10
|
+
/test/version_tmp/
|
11
|
+
/tmp/
|
12
|
+
|
13
|
+
# Used by dotenv library to load environment variables.
|
14
|
+
# .env
|
15
|
+
|
16
|
+
# Ignore Byebug command history file.
|
17
|
+
.byebug_history
|
18
|
+
|
19
|
+
## Specific to RubyMotion:
|
20
|
+
.dat*
|
21
|
+
.repl_history
|
22
|
+
build/
|
23
|
+
*.bridgesupport
|
24
|
+
build-iPhoneOS/
|
25
|
+
build-iPhoneSimulator/
|
26
|
+
|
27
|
+
## Specific to RubyMotion (use of CocoaPods):
|
28
|
+
#
|
29
|
+
# We recommend against adding the Pods directory to your .gitignore. However
|
30
|
+
# you should judge for yourself, the pros and cons are mentioned at:
|
31
|
+
# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
|
32
|
+
#
|
33
|
+
# vendor/Pods/
|
34
|
+
|
35
|
+
## Documentation cache and generated files:
|
36
|
+
/.yardoc/
|
37
|
+
/_yardoc/
|
38
|
+
/doc/
|
39
|
+
/rdoc/
|
40
|
+
|
41
|
+
## Environment normalization:
|
42
|
+
/.bundle/
|
43
|
+
/vendor/bundle
|
44
|
+
/lib/bundler/man/
|
45
|
+
|
46
|
+
# for a library or gem, you might want to ignore these files since the code is
|
47
|
+
# intended to run in multiple environments; otherwise, check them in:
|
48
|
+
# Gemfile.lock
|
49
|
+
# .ruby-version
|
50
|
+
# .ruby-gemset
|
51
|
+
|
52
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
53
|
+
.rvmrc
|
54
|
+
|
55
|
+
# Used by RuboCop. Remote config files pulled in from inherit_from directive.
|
56
|
+
# .rubocop-https?--*
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2021 Shuji Shigenobu
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# ezgff_alpha
|
2
|
+
|
3
|
+
## What is ezgff_alpha
|
4
|
+
|
5
|
+
## Pre-requisites
|
6
|
+
|
7
|
+
* sqlite3
|
8
|
+
|
9
|
+
## Install
|
10
|
+
|
11
|
+
|
12
|
+
## Quick start
|
13
|
+
|
14
|
+
Build database from GFF3 file.
|
15
|
+
|
16
|
+
```bash
|
17
|
+
ezgff build in.gff3
|
18
|
+
```
|
19
|
+
|
20
|
+
Retrieve GFF3 record by ID.
|
21
|
+
|
22
|
+
```bash
|
23
|
+
ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors
|
24
|
+
```
|
25
|
+
|
26
|
+
```
|
27
|
+
ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq
|
28
|
+
```
|
29
|
+
|
30
|
+
Examples using jq:
|
31
|
+
|
32
|
+
```
|
33
|
+
ezgff_alpha/bin/ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] |@csv'
|
34
|
+
```
|
data/Rakefile
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'sqlite3'
|
2
|
+
require 'bio'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
def attributes_as_json(gffline)
|
6
|
+
gr = Bio::GFF::GFF3::Record.new(gffline.chomp)
|
7
|
+
h = Hash.new
|
8
|
+
gr.attributes.each do |att|
|
9
|
+
k, v = att
|
10
|
+
unless h.has_key?(k)
|
11
|
+
h[k] = []
|
12
|
+
end
|
13
|
+
h[k] << v
|
14
|
+
end
|
15
|
+
h2 = Hash.new
|
16
|
+
h.each do |k, v|
|
17
|
+
h2[k] = v.join(",")
|
18
|
+
end
|
19
|
+
h2.to_json
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
gfffile = ARGV[0]
|
24
|
+
#gfffile = "example_gff/apisum_part.gff3"
|
25
|
+
#gfffile = "example_gff/ApL_HF_liftover_Refseq.gff"
|
26
|
+
|
27
|
+
dbname = gfffile + ".sqlite3"
|
28
|
+
|
29
|
+
db = SQLite3::Database.new(dbname)
|
30
|
+
|
31
|
+
sql = <<-SQL
|
32
|
+
CREATE TABLE gff_records (
|
33
|
+
line_num integer primary key,
|
34
|
+
record text,
|
35
|
+
id text,
|
36
|
+
parent text,
|
37
|
+
seqid text not null,
|
38
|
+
source text,
|
39
|
+
type text,
|
40
|
+
start integer not null,
|
41
|
+
end integer not null,
|
42
|
+
score real,
|
43
|
+
strand varchar(1),
|
44
|
+
phase integer,
|
45
|
+
attributes text,
|
46
|
+
attributes_json text
|
47
|
+
);
|
48
|
+
SQL
|
49
|
+
|
50
|
+
db.execute(sql)
|
51
|
+
|
52
|
+
## Load every feature line of the GFF file into gff_records inside a single
## transaction. line_num is the 0-based index of the line in the input file.
db.transaction do
  insert_sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json) " \
               "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

  ## File.foreach closes the file when iteration ends (including on break).
  File.foreach(gfffile).each_with_index do |line, line_index|
    ## stop at the FASTA sequence section
    break if line.start_with?("##FASTA")

    ## skip header / directive / comment lines
    next if line.start_with?("#")

    record_text = line.chomp
    gr = Bio::GFF::GFF3::Record.new(record_text)

    ## A record may carry at most one ID attribute.
    ids = gr.attributes.select { |att| att[0] == "ID" }
    if ids.size > 1
      STDERR.puts gr.attributes
      raise "Multiple IDs found."
    end
    id = ids.empty? ? nil : ids[0][1]

    ## Only the first Parent attribute is stored in the parent column.
    parent_pair = gr.attributes.find { |att| att[0] == "Parent" }
    parent = parent_pair && parent_pair[1]

    ## The nine tab-separated GFF columns; missing columns become nil.
    cols = record_text.split(/\t/)
    gff_columns = (0..8).map { |c| cols[c] }

    values = [line_index, record_text, id, parent] + gff_columns + [attributes_as_json(line)]
    db.execute(insert_sql, values)
  end
end
|
83
|
+
|
84
|
+
#===
|
85
|
+
# create index
|
86
|
+
table = "gff_records"
|
87
|
+
%w{id parent source type}.each do |col|
|
88
|
+
idxname = "index_#{table}_on_#{col}"
|
89
|
+
sql = "CREATE INDEX #{idxname} ON #{table}(#{col})"
|
90
|
+
db.execute(sql)
|
91
|
+
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'sqlite3'
|
2
|
+
require 'bio'
|
3
|
+
require 'json'
|
4
|
+
require './gffsqlitedb'
|
5
|
+
|
6
|
+
# Insert one keyword row for +key+ if the record has that attribute.
#
# record - object responding to #attributes (Hash-like) and #line_num
# key    - attribute name to look up (e.g. "Name", "gene")
# db_kw  - database handle responding to #execute(sql, values)
#
# Returns whatever db_kw.execute returns, or nil when the attribute is absent.
# The SQL and bound values are echoed to stdout, as before.
def insert_data(record, key, db_kw)
  val = record.attributes[key]
  return unless val

  # record.dbxrefs is intentionally not consulted here: it is not needed for
  # a plain attribute lookup and calling it could fail on unrelated data.
  sql = "INSERT INTO gff_keywords (line_num, key, value) VALUES (?, ?, ?)"
  values = [record.line_num, key, val]
  puts sql
  p values
  db_kw.execute(sql, values)
end
|
18
|
+
|
19
|
+
def insert_data_dbxref(record, key, db_kw)
|
20
|
+
rec = record
|
21
|
+
xdb = rec.dbxrefs
|
22
|
+
val = xdb[key]
|
23
|
+
sql = "INSERT INTO gff_keywords (line_num, key, value, category) VALUES (?, ?, ?, ?)"
|
24
|
+
values = [rec.line_num, key, val, "Dbxref"]
|
25
|
+
puts sql
|
26
|
+
p values
|
27
|
+
db_kw.execute(sql, values)
|
28
|
+
end
|
29
|
+
|
30
|
+
gfffile = ARGV[0]
|
31
|
+
|
32
|
+
db_file = gfffile + ".sqlite3" # altready built
|
33
|
+
#db_keywards = gfffile + ".keywords.sqlite3"
|
34
|
+
|
35
|
+
gffdb = GffDb.new(db_file) # altready built
|
36
|
+
db = SQLite3::Database.new(db_file)
|
37
|
+
|
38
|
+
sql = <<-SQL
|
39
|
+
CREATE TABLE gff_keywords (
|
40
|
+
id integer primary key,
|
41
|
+
line_num integer,
|
42
|
+
key text not null,
|
43
|
+
value text,
|
44
|
+
category text
|
45
|
+
);
|
46
|
+
SQL
|
47
|
+
|
48
|
+
db.execute(sql)
|
49
|
+
|
50
|
+
db.transaction do
|
51
|
+
gffdb.each_record do |r|
|
52
|
+
insert_data(r, "Name", db)
|
53
|
+
insert_data(r, "gbkey", db)
|
54
|
+
insert_data(r, "gene", db)
|
55
|
+
insert_data(r, "product", db)
|
56
|
+
insert_data(r, "transcript_id", db)
|
57
|
+
r.dbxrefs.keys.each do |k|
|
58
|
+
insert_data_dbxref(r, k, db)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
#===
|
64
|
+
# create index
|
65
|
+
table = "gff_keywords"
|
66
|
+
%w{id line_num key value}.each do |col|
|
67
|
+
idxname = "index_#{table}_on_#{col}"
|
68
|
+
sql = "CREATE INDEX #{idxname} ON #{table}(#{col})"
|
69
|
+
db.execute(sql)
|
70
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Sort a GFF file by seqid/start, compress it with bgzip and index it with tabix.
# Usage: build_gff_tabix.rb GFF_FILE
require 'shellwords'

gfffile = ARGV[0]
#gfffile = "example_gff/apisum_part.gff3"
#gfffile = "example_gff/ApL_HF_liftover_Refseq.gff"
abort "Usage: #{File.basename($0)} GFF_FILE" unless gfffile

#=== sort gff by position
gfffile_sorted = gfffile + ".gz"

# Quote the paths so names with spaces or shell metacharacters are safe.
src = Shellwords.escape(gfffile)
dst = Shellwords.escape(gfffile_sorted)

# Header lines first, then feature lines sorted by column 1 and numerically by column 4.
cmd = %Q{(grep '^#' #{src}; grep -v '^#' #{src} | sort -k1,1 -k4,4n) | bgzip > #{dst}}
abort "command failed: #{cmd}" unless system(cmd)

cmd = "tabix -p gff #{dst}"
abort "command failed: #{cmd}" unless system(cmd)

STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/exe/ezgff
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/ezgff'
|
4
|
+
require 'thor'
|
5
|
+
require 'color_echo/get'
|
6
|
+
|
7
|
+
module Ezgff
|
8
|
+
class CLI < Thor
|
9
|
+
|
10
|
+
## hack to enable -h option
|
11
|
+
## ref: https://magazine.rubyist.net/articles/0046/0046-Milkode.html#%E3%82%AB%E3%82%B9%E3%82%BF%E3%83%9E%E3%82%A4%E3%82%BA-milk-add--h-%E3%82%92%E5%AE%9F%E7%8F%BE%E3%81%99%E3%82%8B
|
12
|
+
|
13
|
+
class_option :help, :type => :boolean, :aliases => '-h', :desc => 'Help message'
|
14
|
+
|
15
|
+
no_tasks do
|
16
|
+
# Intercepts command dispatch so that `-h` on any subcommand (except 'grep')
# prints that subcommand's help instead of running it.
def invoke_command(task, *args)
  if options[:help] && task.name != 'grep'
    Ezgff::CLI.task_help(shell, task.name)
  elsif options[:version] && task.name == 'help'
    # Print this program's name and version (was a leftover from another tool).
    puts "ezgff #{Ezgff::VERSION}"
  else
    super
  end
end
|
26
|
+
end
|
27
|
+
## end of -h option setting
|
28
|
+
|
29
|
+
desc "version", "show version number"
|
30
|
+
def version()
|
31
|
+
puts Ezgff::VERSION
|
32
|
+
end
|
33
|
+
|
34
|
+
desc "view DB QUERY", "retrieve GFF record by ID and view it in a specified format.
|
35
|
+
DB path to ezdb
|
36
|
+
ezdb should be created by using 'build' subcommand in advance.
|
37
|
+
QUERY query for search
|
38
|
+
Two modes, simple mode and advanced mode are available.
|
39
|
+
Simple mode
|
40
|
+
ID is given and search by the ID.
|
41
|
+
Advanced mode
|
42
|
+
Query is given in KEY=VALUE style. Available keys are
|
43
|
+
ID: ID (ex. ID=rna-XM_029485812.1)
|
44
|
+
LN: Line number (ex. LN=255)
|
45
|
+
Note: Spaces are not allowed before and after =
|
46
|
+
|
47
|
+
"
|
48
|
+
|
49
|
+
option :format, :aliases => '-f', :enum => ["json", "gff"], :default => "gff", :desc => "Specify output format."
|
50
|
+
option :with, :aliases => '-w', :enum => ["none", "parent", "children", "ancestors", "descendants"], :default => "none", :desc => "Retrieve data with parent or children features."
|
51
|
+
option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
|
52
|
+
|
53
|
+
def view(db, query)
|
54
|
+
ezdb = db
|
55
|
+
files = Dir["#{ezdb}/*.sqlite3"]
|
56
|
+
sq3_file = nil
|
57
|
+
if files.size == 1
|
58
|
+
sq3_file = files[0]
|
59
|
+
elsif files.size == 0
|
60
|
+
raise "sqlite3 file not found"
|
61
|
+
elsif files.size > 1
|
62
|
+
raise "Multiple sqlite3 files found"
|
63
|
+
end
|
64
|
+
sq3_db = GffDb.new(sq3_file)
|
65
|
+
|
66
|
+
if m = /^LN\=/.match(query)
|
67
|
+
## search by line number
|
68
|
+
query2 = m.post_match.strip
|
69
|
+
ann = sq3_db.get_by_line_number(query2)
|
70
|
+
elsif m = /^ID\=/.match(query)
|
71
|
+
## search by ID
|
72
|
+
query2 = m.post_match.strip
|
73
|
+
ann = sq3_db.get(query2)
|
74
|
+
else
|
75
|
+
ann = sq3_db.get(query)
|
76
|
+
end
|
77
|
+
|
78
|
+
results = []
|
79
|
+
if options[:with] == "descendants"
|
80
|
+
results = ann.descendants
|
81
|
+
elsif options[:with] == "ancestors"
|
82
|
+
results = ann.ancestors
|
83
|
+
else
|
84
|
+
parent = nil
|
85
|
+
if options[:with] == "parent"
|
86
|
+
parent = ann.parent
|
87
|
+
end
|
88
|
+
children = []
|
89
|
+
if options[:with] == "children"
|
90
|
+
ann.children.each do |c|
|
91
|
+
children << c
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
results = []
|
96
|
+
results << parent if parent
|
97
|
+
results << ann
|
98
|
+
results += children
|
99
|
+
end
|
100
|
+
|
101
|
+
## Filter results
|
102
|
+
## - type / type
|
103
|
+
if options[:type]
|
104
|
+
target_type = options[:type]
|
105
|
+
results = results.select{|r| r.type == target_type}
|
106
|
+
end
|
107
|
+
|
108
|
+
case options[:format]
|
109
|
+
when "json"
|
110
|
+
h = Hash.new
|
111
|
+
ary = results.map{|r| r.to_h}
|
112
|
+
h["gff_records"] = ary
|
113
|
+
puts h.to_json
|
114
|
+
|
115
|
+
when "gff"
|
116
|
+
puts results
|
117
|
+
else
|
118
|
+
raise "Unknown format: #{options[:format]}"
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
|
123
|
+
desc "build GFF", "build database from GFF file"
|
124
|
+
option :help, :aliases => :h, :type => :boolean
|
125
|
+
def build(gff_in)
|
126
|
+
# puts "build #{gff_file} => #{dbpath}"
|
127
|
+
dbpath = GffDb.build_db(gff_in)
|
128
|
+
STDERR.puts "new database created: #{dbpath}"
|
129
|
+
gff_file = dbpath + "/" + File.basename(gff_in)
|
130
|
+
GffDb.build_tabix(gff_file)
|
131
|
+
end
|
132
|
+
|
133
|
+
desc "search DB QUERY", "search GFF record giving query"
|
134
|
+
option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
|
135
|
+
# Search GFF records in an ezdb directory and print them.
#
# db    - path to the ezdb directory; it must contain exactly one *.sqlite3 file
# query - text to look for
#
# Output format is chosen by options[:format] ("json" or "gff"); in "gff"
# mode, occurrences of the query are highlighted.
def search(db, query)
  files = Dir["#{db}/*.sqlite3"]
  raise "sqlite3 file not found" if files.empty?
  raise "Multiple sqlite3 files found" if files.size > 1

  sq3_db = GffDb.new(files[0])
  results = sq3_db.search(query, 100)

  case options[:format]
  when "json"
    puts({ "gff_records" => results.map { |r| r.to_h } }.to_json)
  when "gff"
    gfftxt = results.map { |r| r.to_s }.join("\n")
    # Escape the query: it is literal text, not a regular expression, and
    # characters such as "(" or "[" would otherwise raise RegexpError.
    highlight = /#{Regexp.escape(query)}/i
    puts CE.pickup(highlight, :green, nil, :bold).get(gfftxt)
  else
    raise "Unknown format: #{options[:format]}"
  end
end
|
161
|
+
|
162
|
+
end
|
163
|
+
end
|
164
|
+
|
165
|
+
Ezgff::CLI.start(ARGV)
|
166
|
+
|
167
|
+
|
168
|
+
|
data/ezgff.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
require_relative 'lib/ezgff/version'
|
2
|
+
|
3
|
+
Gem::Specification.new do |spec|
|
4
|
+
spec.name = "ezgff"
|
5
|
+
spec.version = Ezgff::VERSION
|
6
|
+
spec.authors = ["Shuji Shigenobu"]
|
7
|
+
spec.email = ["sshigenobu@gmail.com"]
|
8
|
+
|
9
|
+
spec.summary = %q{Utilities for GFF3}
|
10
|
+
spec.description = %q{Utilities for GFF3, the genome annotation format. Useful to explore the gene model features.}
|
11
|
+
spec.homepage = "https://github.com/shujishigenobu/ezgff_alpha"
|
12
|
+
spec.license = "MIT"
|
13
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
|
14
|
+
|
15
|
+
# spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
|
16
|
+
|
17
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
18
|
+
spec.metadata["source_code_uri"] = "https://github.com/shujishigenobu/ezgff_alpha"
|
19
|
+
spec.metadata["changelog_uri"] = "https://github.com/shujishigenobu/ezgff_alpha"
|
20
|
+
# TODO: Put your gem's CHANGELOG.md URL here."
|
21
|
+
|
22
|
+
# Specify which files should be added to the gem when it is released.
|
23
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
24
|
+
spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
|
25
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
26
|
+
end
|
27
|
+
spec.bindir = "exe"
|
28
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
29
|
+
spec.require_paths = ["lib"]
|
30
|
+
end
|
data/lib/ezgff.rb
ADDED
@@ -0,0 +1,390 @@
|
|
1
|
+
require 'sqlite3'
|
2
|
+
require 'json'
|
3
|
+
require 'bio'
|
4
|
+
require 'fileutils'
|
5
|
+
|
6
|
+
#
|
7
|
+
# References
|
8
|
+
# * Official specification of GFF3 -- https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
|
9
|
+
#
|
10
|
+
|
11
|
+
module Ezgff
|
12
|
+
|
13
|
+
class GffDb
|
14
|
+
|
15
|
+
#===
|
16
|
+
# sqlite3 schema
|
17
|
+
#
|
18
|
+
# gff_records (
|
19
|
+
# line_num integer primary key,
|
20
|
+
# record text, # original record
|
21
|
+
# id text,
|
22
|
+
# parent text,
|
23
|
+
# seqid text not null,
|
24
|
+
# source text,
|
25
|
+
# type text,
|
26
|
+
# start integer not null,
|
27
|
+
# end integer not null,
|
28
|
+
# score real,
|
29
|
+
# strand varchar(1),
|
30
|
+
# phase integer,
|
31
|
+
# attributes text,
|
32
|
+
# attributes_json json
|
33
|
+
# )
|
34
|
+
|
35
|
+
# Build an ezdb directory from a GFF3 file.
#
# Creates "<ezdb_base>/<basename of gff_in>.ezdb", writes a copy of the GFF
# into it (header lines and the FASTA section removed), and creates a
# sqlite3 database "<copy>.sqlite3" with an indexed gff_records table.
#
# gff_in    - path to the input GFF3 file
# ezdb_base - directory in which the .ezdb directory is created (default ".")
#
# Returns the path of the created .ezdb directory.
# Raises if the directory already exists (Dir.mkdir) or if a record has
# more than one ID attribute.
def self.build_db(gff_in, ezdb_base = nil)
  ezdb_base ||= "."
  gff_name = File.basename(gff_in)
  ezdb_path = "#{ezdb_base}/#{gff_name}.ezdb"
  gff_file = "#{ezdb_path}/#{gff_name}"
  Dir.mkdir(ezdb_path)

  ## Copy feature lines only: stop at the FASTA section, drop "#" lines.
  ## Both handles are closed when their blocks finish.
  File.open(gff_file, "w") do |out|
    File.foreach(gff_in) do |line|
      break if line.start_with?("##FASTA")
      next if line.start_with?("#")
      out.puts line
    end
  end

  sq3_file = gff_file + ".sqlite3"

  ## Create table in sqlite3 RDBMS
  ## table name: gff_records
  sq3_db = SQLite3::Database.new(sq3_file)

  sql = <<-SQL
CREATE TABLE gff_records (
  line_num integer primary key,
  record text,
  id text,
  parent text,
  seqid text not null,
  source text,
  type text,
  start integer not null,
  end integer not null,
  score real,
  strand varchar(1),
  phase integer,
  attributes text,
  attributes_json json
);
  SQL

  sq3_db.execute(sql)

  ## Read the copied GFF file and insert each record into the table.
  ## line_num is the 0-based line index within the copied file.
  insert_sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json) " \
               "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

  sq3_db.transaction do
    File.foreach(gff_file).each_with_index do |line, line_index|
      ## The copy should contain neither of these, but stay defensive.
      break if line.start_with?("##FASTA")
      next if line.start_with?("#")

      record_text = line.chomp
      gr = Bio::GFF::GFF3::Record.new(record_text)

      ## A record may carry at most one ID attribute.
      ids = gr.attributes.select { |att| att[0] == "ID" }
      if ids.size > 1
        STDERR.puts gr.attributes
        raise "Multiple IDs found."
      end
      id = ids.empty? ? nil : ids[0][1]

      ## Only the first Parent attribute is stored in the parent column.
      parent_pair = gr.attributes.find { |att| att[0] == "Parent" }
      parent = parent_pair && parent_pair[1]

      ## The nine tab-separated GFF columns; missing columns become nil.
      cols = record_text.split(/\t/)
      gff_columns = (0..8).map { |c| cols[c] }

      values = [line_index, record_text, id, parent] + gff_columns + [attributes_as_json(line)]
      sq3_db.execute(insert_sql, values)
    end
  end

  ## Indexing the sqlite3 table
  table = "gff_records"
  %w{id parent source type}.each do |col|
    sq3_db.execute("CREATE INDEX index_#{table}_on_#{col} ON #{table}(#{col})")
  end

  ezdb_path
end
|
124
|
+
|
125
|
+
# Sort a GFF file by seqid/start, compress it with bgzip and index it with
# tabix. Produces "<gff_in>.gz" and "<gff_in>.gz.tbi".
#
# gff_in - path to the GFF file
#
# Requires the external commands grep, sort, bgzip and tabix. If a command
# fails, a warning is written to STDERR and the method returns without
# claiming that the files were generated. Returns nil.
def self.build_tabix(gff_in)
  require 'shellwords' # stdlib; required here so the method is self-contained

  gfffile_sorted = gff_in + ".gz"

  ## Quote the paths so names with spaces or shell metacharacters are safe.
  src = Shellwords.escape(gff_in)
  dst = Shellwords.escape(gfffile_sorted)

  ## sort gff by position
  ## "\t" below is expanded by Ruby to a literal TAB, so the shell sees
  ## -t '<TAB>'; this avoids the bash-only $'...' quoting form.
  cmd = %Q{(grep '^#' #{src}; grep -v '^#' #{src} | sort -t '\t' -k1,1 -k4,4n) | bgzip > #{dst}}
  STDERR.puts cmd
  unless system(cmd)
    STDERR.puts "WARNING: command failed; #{gfffile_sorted} was not generated correctly."
    return nil
  end

  cmd = "tabix -p gff #{dst}"
  STDERR.puts cmd
  unless system(cmd)
    STDERR.puts "WARNING: command failed; #{gfffile_sorted}.tbi was not generated."
    return nil
  end

  STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
end
|
138
|
+
|
139
|
+
# Convert the attributes column of one GFF3 line into a JSON object string.
#
# gffline - one GFF3 record line (trailing newline allowed)
#
# Values of a repeated attribute key are joined with ",". "Dbxref" and
# "Ontology_term" are stored as nested objects mapping the text before the
# first ":" to the text after it.
#
# NOTE(review): when several Dbxref/Ontology_term values share the same tag
# (e.g. two "GO:..." terms), only the last one is kept, as before; changing
# that would alter the stored JSON shape.
def self.attributes_as_json(gffline)
  gr = Bio::GFF::GFF3::Record.new(gffline.chomp)

  ## Group values by attribute key, preserving first-seen key order.
  grouped = {}
  gr.attributes.each do |key, value|
    (grouped[key] ||= []) << value
  end

  result = {}
  grouped.each do |key, values|
    if key == "Dbxref" || key == "Ontology_term"
      xrefs = {}
      values.each do |val|
        ## split on the first ":" only; a value without ":" becomes a tag
        ## with a null value instead of raising on a failed match.
        dbtag, dbval = val.split(":", 2)
        xrefs[dbtag] = dbval
      end
      result[key] = xrefs
    else
      result[key] = values.join(",")
    end
  end
  result.to_json
end
|
169
|
+
|
170
|
+
def initialize(path)
|
171
|
+
@db = SQLite3::Database.new(path)
|
172
|
+
end
|
173
|
+
|
174
|
+
def each_record
|
175
|
+
sql = "SELECT * FROM gff_records"
|
176
|
+
@db.execute(sql).each do |r|
|
177
|
+
an = Annotation.new()
|
178
|
+
an.build_from_db_record(r)
|
179
|
+
yield an
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
# Fetch the single record whose ID attribute equals +id+.
#
# id - GFF ID to look up
#
# Returns an Annotation built from the matching row.
# Raises RuntimeError "multiple hits" when more than one row matches and
# "not found: <id>" when none does.
def get(id)
  # Bind the value instead of interpolating it: this prevents SQL injection
  # and stops SQLite from treating a double-quoted value as a column name.
  res = @db.execute("SELECT * FROM gff_records WHERE id = ?", [id])
  raise "multiple hits" if res.size >= 2
  raise "not found: #{id}" if res.empty?

  an = Annotation.new(@db)
  an.build_from_db_record(res[0])
  an
end
|
199
|
+
|
200
|
+
# Fetch the single record stored under line number +n+.
#
# n - line number (Integer or numeric String) as stored in gff_records.line_num
#
# Returns an Annotation built from the matching row.
# Raises RuntimeError "multiple hits" when more than one row matches and
# "not found: <n>" when none does.
def get_by_line_number(n)
  # Bound parameter instead of string interpolation (no SQL injection).
  res = @db.execute("SELECT * FROM gff_records WHERE line_num = ?", [n])
  raise "multiple hits" if res.size >= 2
  # Report the requested line number (the old message referenced an
  # undefined name and failed before it could be raised).
  raise "not found: #{n}" if res.empty?

  an = Annotation.new(@db)
  an.build_from_db_record(res[0])
  an
end
|
215
|
+
|
216
|
+
# Find records whose id, parent or raw attributes text contains +query+.
#
# query     - substring to look for (matched with SQL LIKE)
# num_limit - maximum number of records to return (default 100);
#             nil means no limit
#
# Returns an Array of Annotation objects. The SQL statement is echoed to
# STDERR.
def search(query, num_limit=100)
  pattern = "%#{query}%"
  # A negative LIMIT means "no upper bound" in SQLite.
  limit = num_limit.nil? ? -1 : Integer(num_limit)

  # Values are bound, not interpolated, so the query text cannot alter the SQL.
  sql = "SELECT * FROM gff_records WHERE id LIKE ? OR parent LIKE ? OR attributes LIKE ? LIMIT ?"
  STDERR.puts sql
  rows = @db.execute(sql, [pattern, pattern, pattern, limit])

  rows.map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end
|
223
|
+
|
224
|
+
class Annotation
|
225
|
+
|
226
|
+
def initialize(db = nil)
|
227
|
+
@db = db
|
228
|
+
@seqid
|
229
|
+
@source
|
230
|
+
@type
|
231
|
+
@start
|
232
|
+
@end
|
233
|
+
@score
|
234
|
+
@strand
|
235
|
+
@phase
|
236
|
+
@attributes
|
237
|
+
@id
|
238
|
+
@parent_id
|
239
|
+
@gffline
|
240
|
+
end
|
241
|
+
|
242
|
+
attr_accessor :seqid, :source, :type, :start, :end, :score, :strand, :phase, :attributes
|
243
|
+
attr_accessor :id, :parent_id, :gffline, :line_num
|
244
|
+
|
245
|
+
def to_s
|
246
|
+
gffline
|
247
|
+
end
|
248
|
+
|
249
|
+
def to_hash
|
250
|
+
h = {
|
251
|
+
'seqid' => seqid,
|
252
|
+
'source' => source,
|
253
|
+
'type' => type,
|
254
|
+
'start' => start,
|
255
|
+
'end' => self.end,
|
256
|
+
'score' => score,
|
257
|
+
'strand' => strand,
|
258
|
+
'phase' => phase,
|
259
|
+
'line_num' => line_num,
|
260
|
+
'id' => id,
|
261
|
+
'parent_id' => parent_id,
|
262
|
+
'attributes' => attributes
|
263
|
+
}
|
264
|
+
end
|
265
|
+
|
266
|
+
alias :to_h :to_hash
|
267
|
+
|
268
|
+
def to_json
|
269
|
+
self.to_hash.to_json
|
270
|
+
end
|
271
|
+
|
272
|
+
def build_from_db_record(sql_result)
|
273
|
+
## sql_result: Array returned by @db.execute(sql)
|
274
|
+
v = sql_result
|
275
|
+
@seqid = v[4]
|
276
|
+
@source = v[5]
|
277
|
+
@type = v[6]
|
278
|
+
@start = v[7]
|
279
|
+
@end = v[8]
|
280
|
+
@score = v[9]
|
281
|
+
@strand = v[10]
|
282
|
+
@phase = v[11]
|
283
|
+
@line_num = v[0]
|
284
|
+
@gffline = v[1]
|
285
|
+
@id = v[2]
|
286
|
+
@parent_id = v[3]
|
287
|
+
@attributes = JSON.parse(v[13])
|
288
|
+
end
|
289
|
+
|
290
|
+
# Look up the parent feature of this annotation.
#
# Returns an Annotation for the record whose id equals parent_id, or nil
# when this annotation has no parent_id or no such record exists.
def parent
  return nil unless parent_id

  # Bound parameter instead of string interpolation (no SQL injection).
  rows = @db.execute("SELECT * FROM gff_records WHERE id = ?", [parent_id])
  # A dangling Parent reference yields nil rather than a crash on a nil row.
  return nil if rows.empty?

  an = Annotation.new(@db)
  an.build_from_db_record(rows[0])
  an
end
|
302
|
+
|
303
|
+
# Direct children of this annotation.
#
# Returns an Array of Annotation objects for every record whose parent
# column equals this annotation's id (empty when there are none).
def children
  # Bound parameter instead of string interpolation (no SQL injection).
  rows = @db.execute("SELECT * FROM gff_records WHERE parent = ?", [id])
  rows.map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end
|
315
|
+
|
316
|
+
# This annotation together with all features below it in the Parent
# hierarchy, collected with a recursive SQL query.
#
# Returns an Array of Annotation objects in the order the query yields
# them; the first SELECT of the query matches this annotation itself.
def descendants
  # The id is bound rather than interpolated (no SQL injection).
  sql = %Q{WITH RECURSIVE r AS (
    SELECT * FROM gff_records WHERE id = ?
    UNION ALL
    SELECT gff_records.* FROM gff_records, r WHERE gff_records.parent = r.id
  )
  SELECT * FROM r}
  rows = @db.execute(sql, [id])
  rows.map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end
|
332
|
+
|
333
|
+
# This annotation together with all features above it in the Parent
# hierarchy, collected with a recursive SQL query.
#
# Returns an Array of Annotation objects in the order the query yields
# them; the first SELECT of the query matches this annotation itself.
def ancestors
  # The id is bound rather than interpolated (no SQL injection).
  sql = %Q{WITH RECURSIVE ancestor AS (
    SELECT * FROM gff_records WHERE id = ?
    UNION ALL
    SELECT gff_records.* FROM gff_records, ancestor
    WHERE ancestor.parent = gff_records.id
  )
  SELECT * FROM ancestor;}
  rows = @db.execute(sql, [id])
  rows.map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end
|
350
|
+
|
351
|
+
|
352
|
+
# Length of the feature in bases, computed as end - start + 1.
#
# Returns a positive Integer.
# Raises RuntimeError when the computed length is not positive.
def length
  len = @end - @start + 1
  # Name the offending coordinates instead of raising a bare, message-less error.
  raise "invalid feature coordinates: start=#{@start}, end=#{@end}" unless len > 0
  len
end
|
357
|
+
|
358
|
+
# Database cross-references of this annotation as a Hash of tag => value.
#
# The "Dbxref" attribute may be stored either as a nested Hash (the form
# written by GffDb.attributes_as_json) or as a comma-separated String of
# "tag:value" items; both forms are accepted.
#
# Returns a new Hash (empty when there is no Dbxref attribute).
def dbxrefs
  raw = attributes["Dbxref"]
  return {} unless raw

  # Already structured: hand back a copy so callers cannot mutate attributes.
  return raw.dup if raw.is_a?(Hash)

  refs = {}
  raw.split(/,/).each do |xref|
    # Split on the first ":" only; an item without ":" maps to nil instead
    # of raising on a failed match.
    tag, value = xref.split(":", 2)
    refs[tag] = value
  end
  refs
end
|
370
|
+
|
371
|
+
end
|
372
|
+
|
373
|
+
end
|
374
|
+
|
375
|
+
end
|
376
|
+
|
377
|
+
## Manual check when this file is run directly:
##   ruby gffsqlitedb.rb DB.sqlite3 ID
## Prints the record with the given ID in three forms.
if __FILE__ == $0
  dbname = ARGV[0]
  query = ARGV[1]
  ## GffDb lives inside the Ezgff module, so it must be namespaced here.
  db = Ezgff::GffDb.new(dbname)
  ann = db.get(query)
  p ann
  puts ann.to_s
  p ann.to_hash
  ## To dump every record instead, use:
  ##   db.each_record { |an| p an }
end
|
metadata
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ezgff
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Shuji Shigenobu
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2021-07-14 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Utilities for GFF3, the genome annotation format. Useful to explore the
|
14
|
+
gene model features.
|
15
|
+
email:
|
16
|
+
- sshigenobu@gmail.com
|
17
|
+
executables:
|
18
|
+
- ezgff
|
19
|
+
extensions: []
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- ".gitignore"
|
23
|
+
- Gemfile
|
24
|
+
- LICENSE
|
25
|
+
- README.md
|
26
|
+
- Rakefile
|
27
|
+
- bin/build_gff_sqlitedb.rb
|
28
|
+
- bin/build_gff_sqlitedb_keywords.rb
|
29
|
+
- bin/build_gff_tabix.rb
|
30
|
+
- dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz
|
31
|
+
- dev/gff_examples/ApMT_NC_011594.gb.gff.gz
|
32
|
+
- dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
|
33
|
+
- dev/gff_examples/apisum_part.gff3.gz
|
34
|
+
- dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz
|
35
|
+
- exe/ezgff
|
36
|
+
- ezgff.gemspec
|
37
|
+
- lib/ezgff.rb
|
38
|
+
- lib/ezgff/gffsqlitedb.rb
|
39
|
+
- lib/ezgff/version.rb
|
40
|
+
homepage: https://github.com/shujishigenobu/ezgff_alpha
|
41
|
+
licenses:
|
42
|
+
- MIT
|
43
|
+
metadata:
|
44
|
+
homepage_uri: https://github.com/shujishigenobu/ezgff_alpha
|
45
|
+
source_code_uri: https://github.com/shujishigenobu/ezgff_alpha
|
46
|
+
changelog_uri: https://github.com/shujishigenobu/ezgff_alpha
|
47
|
+
post_install_message:
|
48
|
+
rdoc_options: []
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: 2.3.0
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
requirements: []
|
62
|
+
rubygems_version: 3.1.2
|
63
|
+
signing_key:
|
64
|
+
specification_version: 4
|
65
|
+
summary: Utilities for GFF3
|
66
|
+
test_files: []
|