ezgff 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +56 -0
- data/Gemfile +14 -0
- data/LICENSE +21 -0
- data/README.md +34 -0
- data/Rakefile +6 -0
- data/bin/build_gff_sqlitedb.rb +91 -0
- data/bin/build_gff_sqlitedb_keywords.rb +70 -0
- data/bin/build_gff_tabix.rb +13 -0
- data/dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz +0 -0
- data/dev/gff_examples/ApMT_NC_011594.gb.gff.gz +0 -0
- data/dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz +0 -0
- data/dev/gff_examples/apisum_part.gff3.gz +0 -0
- data/dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz +0 -0
- data/exe/ezgff +168 -0
- data/ezgff.gemspec +30 -0
- data/lib/ezgff.rb +12 -0
- data/lib/ezgff/gffsqlitedb.rb +390 -0
- data/lib/ezgff/version.rb +3 -0
- metadata +66 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: c5243c27b01a1b9c51ca19784b7fcce0cc86766de41270aa1a546370d1dcda9c
  data.tar.gz: 06a3a2491430b10e33bae1e89c4a32d52feaab7a93425de47d6d3a0b15d6de23
SHA512:
  metadata.gz: cfe588ee8d77d84540ade9b87cc3d3f6393e87cff783039959105a09a1fd9d6a94fd58aac1ee0964061112db6a5ba3dde4292bc9b4db8e2effee4b6f27bec703
  data.tar.gz: cf1f4a39764e43456686fe7da5e7cae3cf1bbc2fba55b78ef9d4f52dd6ea232186a944b320a506bfc6499077e98d921ac6933fb774034d19323430724c1b5091
data/.gitignore
ADDED
@@ -0,0 +1,56 @@
*.gem
*.rbc
/.config
/coverage/
/InstalledFiles
/pkg/
/spec/reports/
/spec/examples.txt
/test/tmp/
/test/version_tmp/
/tmp/

# Used by dotenv library to load environment variables.
# .env

# Ignore Byebug command history file.
.byebug_history

## Specific to RubyMotion:
.dat*
.repl_history
build/
*.bridgesupport
build-iPhoneOS/
build-iPhoneSimulator/

## Specific to RubyMotion (use of CocoaPods):
#
# We recommend against adding the Pods directory to your .gitignore. However
# you should judge for yourself, the pros and cons are mentioned at:
# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
#
# vendor/Pods/

## Documentation cache and generated files:
/.yardoc/
/_yardoc/
/doc/
/rdoc/

## Environment normalization:
/.bundle/
/vendor/bundle
/lib/bundler/man/

# for a library or gem, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# Gemfile.lock
# .ruby-version
# .ruby-gemset

# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
.rvmrc

# Used by RuboCop. Remote config files pulled in from inherit_from directive.
# .rubocop-https?--*
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 Shuji Shigenobu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,34 @@
# ezgff_alpha

## What is ezgff_alpha

## Prerequisites

* sqlite3

## Install


## Quick start

Build a database from a GFF3 file.

```bash
ezgff build in.gff3
```

Retrieve a GFF3 record by ID.

```bash
ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors
```

```
ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq
```

Examples using jq:

```
ezgff_alpha/bin/ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] |@csv'
```
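
The JSON output wraps everything in a top-level `gff_records` array, one hash per feature, with keys taken from `Annotation#to_hash` in lib/ezgff/gffsqlitedb.rb further below. A minimal Ruby sketch for consuming it; it assumes `ezgff` is on PATH, and the ezdb name and feature ID are placeholders:

```ruby
# Minimal sketch: consume `ezgff view --format=json` output from Ruby.
# The ezdb name and feature ID below are illustrative placeholders.
require 'json'

out = `ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors --format=json`
records = JSON.parse(out)["gff_records"]   # one hash per feature

records.each do |r|
  # Keys mirror Annotation#to_hash: seqid, source, type, start, end,
  # score, strand, phase, line_num, id, parent_id, attributes
  puts [r["type"], r["seqid"], r["start"], r["end"], r["id"]].join("\t")
end
```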
data/Rakefile
ADDED
data/bin/build_gff_sqlitedb.rb
ADDED
@@ -0,0 +1,91 @@
require 'sqlite3'
require 'bio'
require 'json'

def attributes_as_json(gffline)
  gr = Bio::GFF::GFF3::Record.new(gffline.chomp)
  h = Hash.new
  gr.attributes.each do |att|
    k, v = att
    unless h.has_key?(k)
      h[k] = []
    end
    h[k] << v
  end
  h2 = Hash.new
  h.each do |k, v|
    h2[k] = v.join(",")
  end
  h2.to_json
end


gfffile = ARGV[0]
#gfffile = "example_gff/apisum_part.gff3"
#gfffile = "example_gff/ApL_HF_liftover_Refseq.gff"

dbname = gfffile + ".sqlite3"

db = SQLite3::Database.new(dbname)

sql = <<-SQL
CREATE TABLE gff_records (
  line_num integer primary key,
  record text,
  id text,
  parent text,
  seqid text not null,
  source text,
  type text,
  start integer not null,
  end integer not null,
  score real,
  strand varchar(1),
  phase integer,
  attributes text,
  attributes_json text
);
SQL

db.execute(sql)

db.transaction do
  File.open(gfffile).each_with_index do |l, i|
    # puts l
    ## skip FASTA seq section
    break if /^\#\#FASTA/.match(l)

    ## skip header section
    next if /^\#/.match(l)
    gr = Bio::GFF::GFF3::Record.new(l.chomp)
    # p gr.attributes
    id = nil
    id_found = gr.attributes.select{|a| a[0] == "ID"}
    if id_found.size == 1
      id = id_found[0][1]
    elsif id_found.size == 0
      ## do nothing (id = nil)
    elsif id_found.size > 1
      STDERR.puts gr.attributes
      raise "Multiple IDs found."
    end
    parent = ((gr.attributes.select{|a| a[0] == "Parent"}[0]) || [])[1]
    a = l.chomp.split(/\t/)

    sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
    values = [i, l.chomp, id, parent,
              a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
              attributes_as_json(l)]
    db.execute(sql, values)
  end
end

#===
# create index
table = "gff_records"
%w{id parent source type}.each do |col|
  idxname = "index_#{table}_on_#{col}"
  sql = "CREATE INDEX #{idxname} ON #{table}(#{col})"
  db.execute(sql)
end
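
Since the script above stores each feature as one row of `gff_records` (with `id`, `parent`, `source`, and `type` indexed), the resulting file can be queried directly with the sqlite3 gem. A minimal sketch, assuming the `<gff>.sqlite3` naming convention the script uses and an illustrative feature ID:

```ruby
# Minimal sketch: look up one feature in the gff_records table built above.
# "in.gff3.sqlite3" follows the script's "<gff>.sqlite3" convention; the ID is illustrative.
require 'sqlite3'

db = SQLite3::Database.new("in.gff3.sqlite3")
# `record` holds the original GFF line; `id` and `parent` are indexed for fast lookups.
db.execute("SELECT record FROM gff_records WHERE id = ?", ["cds-WP_010895901.1"]) do |(record)|
  puts record
end
```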
data/bin/build_gff_sqlitedb_keywords.rb
ADDED
@@ -0,0 +1,70 @@
require 'sqlite3'
require 'bio'
require 'json'
require './gffsqlitedb'

def insert_data(record, key, db_kw)
  rec = record
  att = rec.attributes
  xdb = rec.dbxrefs
  if val = att[key]
    sql = "INSERT INTO gff_keywords (line_num, key, value) VALUES (?, ?, ?)"
    values = [rec.line_num, key, val]
    puts sql
    p values
    db_kw.execute(sql, values)
  end
end

def insert_data_dbxref(record, key, db_kw)
  rec = record
  xdb = rec.dbxrefs
  val = xdb[key]
  sql = "INSERT INTO gff_keywords (line_num, key, value, category) VALUES (?, ?, ?, ?)"
  values = [rec.line_num, key, val, "Dbxref"]
  puts sql
  p values
  db_kw.execute(sql, values)
end

gfffile = ARGV[0]

db_file = gfffile + ".sqlite3" # already built
#db_keywords = gfffile + ".keywords.sqlite3"

gffdb = GffDb.new(db_file) # already built
db = SQLite3::Database.new(db_file)

sql = <<-SQL
CREATE TABLE gff_keywords (
  id integer primary key,
  line_num integer,
  key text not null,
  value text,
  category text
);
SQL

db.execute(sql)

db.transaction do
  gffdb.each_record do |r|
    insert_data(r, "Name", db)
    insert_data(r, "gbkey", db)
    insert_data(r, "gene", db)
    insert_data(r, "product", db)
    insert_data(r, "transcript_id", db)
    r.dbxrefs.keys.each do |k|
      insert_data_dbxref(r, k, db)
    end
  end
end

#===
# create index
table = "gff_keywords"
%w{id line_num key value}.each do |col|
  idxname = "index_#{table}_on_#{col}"
  sql = "CREATE INDEX #{idxname} ON #{table}(#{col})"
  db.execute(sql)
end
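
The `gff_keywords` table links attribute values back to `gff_records` through `line_num`, so keyword searches can be joined to the original records. A minimal sketch against the same database file; the search term is illustrative:

```ruby
# Minimal sketch: find features whose `product` keyword matches a term,
# joining gff_keywords back to gff_records via line_num.
require 'sqlite3'

db = SQLite3::Database.new("in.gff3.sqlite3")
sql = <<~SQL
  SELECT r.record
  FROM gff_keywords k JOIN gff_records r ON r.line_num = k.line_num
  WHERE k.key = 'product' AND k.value LIKE ?
SQL
db.execute(sql, ["%polymerase%"]) { |(record)| puts record }
```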
data/bin/build_gff_tabix.rb
ADDED
@@ -0,0 +1,13 @@
gfffile = ARGV[0]
#gfffile = "example_gff/apisum_part.gff3"
#gfffile = "example_gff/ApL_HF_liftover_Refseq.gff"

#=== sort gff by position
gfffile_sorted = gfffile + ".gz"
cmd = %Q{(grep ^"#" #{gfffile}; grep -v ^"#" #{gfffile} | sort -k1,1 -k4,4n) | bgzip > #{gfffile_sorted};}
system cmd

cmd = "tabix -p gff #{gfffile_sorted}"
system cmd

STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
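
Once the sorted, bgzipped GFF and its `.tbi` index exist, region queries go through `tabix`. A minimal Ruby sketch that shells out to it; it assumes the `tabix` executable is installed, and the file name, seqid, and coordinates are illustrative:

```ruby
# Minimal sketch: region query against the bgzipped, tabix-indexed GFF produced above.
# File name and region are illustrative placeholders.
region = "NC_011594.1:1-50000"
IO.popen(["tabix", "in.gff3.gz", region]) do |io|
  io.each_line { |gff_line| puts gff_line }
end
```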
data/dev/gff_examples/ (example .gff.gz / .gff3.gz data)
Binary files -- no text diff shown
data/exe/ezgff
ADDED
@@ -0,0 +1,168 @@
#!/usr/bin/env ruby

require_relative '../lib/ezgff'
require 'thor'
require 'color_echo/get'

module Ezgff
  class CLI < Thor

    ## hack to enable -h option
    ## ref: https://magazine.rubyist.net/articles/0046/0046-Milkode.html#%E3%82%AB%E3%82%B9%E3%82%BF%E3%83%9E%E3%82%A4%E3%82%BA-milk-add--h-%E3%82%92%E5%AE%9F%E7%8F%BE%E3%81%99%E3%82%8B

    class_option :help, :type => :boolean, :aliases => '-h', :desc => 'Help message'

    no_tasks do
      def invoke_command(task, *args)
        if options[:help] &&
           task.name != 'grep'
          Ezgff::CLI.task_help(shell, task.name)
        elsif options[:version] && task.name == 'help'
          puts "ezgff #{Ezgff::VERSION}"
        else
          super
        end
      end
    end
    ## end of -h option setting

    desc "version", "show version number"
    def version()
      puts Ezgff::VERSION
    end

    desc "view DB QUERY", "retrieve GFF record by ID and view it in a specified format.
    DB      path to ezdb
            The ezdb should be created with the 'build' subcommand in advance.
    QUERY   query for the search
            Two modes, simple and advanced, are available.
            Simple mode
              An ID is given and the search is by that ID.
            Advanced mode
              The query is given in KEY=VALUE style. Available keys are
                ID: ID (ex. ID=rna-XM_029485812.1)
                LN: Line number (ex. LN=255)
              Note: Spaces are not allowed before and after =

    "

    option :format, :aliases => '-f', :enum => ["json", "gff"], :default => "gff", :desc => "Specify output format."
    option :with, :aliases => '-w', :enum => ["none", "parent", "children", "ancestors", "descendants"], :default => "none", :desc => "Retrieve data with parent or children features."
    option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"

    def view(db, query)
      ezdb = db
      files = Dir["#{ezdb}/*.sqlite3"]
      sq3_file = nil
      if files.size == 1
        sq3_file = files[0]
      elsif files.size == 0
        raise "sqlite3 file not found"
      elsif files.size > 1
        raise "Multiple sqlite3 files found"
      end
      sq3_db = GffDb.new(sq3_file)

      if m = /^LN\=/.match(query)
        ## search by line number
        query2 = m.post_match.strip
        ann = sq3_db.get_by_line_number(query2)
      elsif m = /^ID\=/.match(query)
        ## search by ID
        query2 = m.post_match.strip
        ann = sq3_db.get(query2)
      else
        ann = sq3_db.get(query)
      end

      results = []
      if options[:with] == "descendants"
        results = ann.descendants
      elsif options[:with] == "ancestors"
        results = ann.ancestors
      else
        parent = nil
        if options[:with] == "parent"
          parent = ann.parent
        end
        children = []
        if options[:with] == "children"
          ann.children.each do |c|
            children << c
          end
        end

        results = []
        results << parent if parent
        results << ann
        results += children
      end

      ## Filter results
      ## - by type
      if options[:type]
        target_type = options[:type]
        results = results.select{|r| r.type == target_type}
      end

      case options[:format]
      when "json"
        h = Hash.new
        ary = results.map{|r| r.to_h}
        h["gff_records"] = ary
        puts h.to_json
      when "gff"
        puts results
      else
        raise "Unknown format: #{options[:format]}"
      end

    end

    desc "build GFF", "build database from GFF file"
    option :help, :aliases => :h, :type => :boolean
    def build(gff_in)
      # puts "build #{gff_file} => #{dbpath}"
      dbpath = GffDb.build_db(gff_in)
      STDERR.puts "new database created: #{dbpath}"
      gff_file = dbpath + "/" + File.basename(gff_in)
      GffDb.build_tabix(gff_file)
    end

    desc "search DB QUERY", "search GFF records matching a query"
    option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
    def search(db, query)
      ezdb = db
      files = Dir["#{ezdb}/*.sqlite3"]
      sq3_file = nil
      if files.size == 1
        sq3_file = files[0]
      elsif files.size == 0
        raise "sqlite3 file not found"
      elsif files.size > 1
        raise "Multiple sqlite3 files found"
      end
      sq3_db = GffDb.new(sq3_file)
      results = sq3_db.search(query, 100)
      case options[:format]
      when "json"
        h = Hash.new
        ary = results.map{|r| r.to_h}
        h["gff_records"] = ary
        puts h.to_json
      when "gff"
        gfftxt = results.map{|r| r.to_s}.join("\n")
        puts CE.pickup(/#{query}/i, :green, nil, :bold).get(gfftxt)
      else
        raise "Unknown format: #{options[:format]}"
      end
    end

  end
end

Ezgff::CLI.start(ARGV)
data/ezgff.gemspec
ADDED
@@ -0,0 +1,30 @@
require_relative 'lib/ezgff/version'

Gem::Specification.new do |spec|
  spec.name          = "ezgff"
  spec.version       = Ezgff::VERSION
  spec.authors       = ["Shuji Shigenobu"]
  spec.email         = ["sshigenobu@gmail.com"]

  spec.summary       = %q{Utilities for GFF3}
  spec.description   = %q{Utilities for GFF3, the genome annotation format. Useful to explore the gene model features.}
  spec.homepage      = "https://github.com/shujishigenobu/ezgff_alpha"
  spec.license       = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/shujishigenobu/ezgff_alpha"
  spec.metadata["changelog_uri"] = "https://github.com/shujishigenobu/ezgff_alpha"
  # TODO: Put your gem's CHANGELOG.md URL here.

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]
end
data/lib/ezgff.rb
ADDED
data/lib/ezgff/gffsqlitedb.rb
ADDED
@@ -0,0 +1,390 @@
require 'sqlite3'
require 'json'
require 'bio'
require 'fileutils'

#
# References
# * Official specification of GFF3 -- https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
#

module Ezgff

  class GffDb

    #===
    # sqlite3 schema
    #
    # gff_records (
    #   line_num integer primary key,
    #   record text,        # original record
    #   id text,
    #   parent text,
    #   seqid text not null,
    #   source text,
    #   type text,
    #   start integer not null,
    #   end integer not null,
    #   score real,
    #   strand varchar(1),
    #   phase integer,
    #   attributes text,
    #   attributes_json json
    # )

    def self.build_db(gff_in, ezdb_base = nil)
      ezdb_base = (ezdb_base || ".")
      ezdb_path = ezdb_base + "/" + File.basename(gff_in) + ".ezdb"
      gff_file = ezdb_path + "/" + File.basename(gff_in)
      Dir.mkdir(ezdb_path)
      File.open(gff_file, "w") do |o|
        File.open(gff_in).each do |l|
          ## skip FASTA seq section
          break if /^\#\#FASTA/.match(l)
          ## skip header section
          next if /^\#/.match(l)
          o.puts l
        end
      end

      # FileUtils.cp(gff_in, gff_file)
      sq3_file = gff_file + ".sqlite3"

      ## Create table in sqlite3 RDBMS
      ## table name: gff_records

      sq3_db = SQLite3::Database.new(sq3_file)

      sql = <<-SQL
      CREATE TABLE gff_records (
        line_num integer primary key,
        record text,
        id text,
        parent text,
        seqid text not null,
        source text,
        type text,
        start integer not null,
        end integer not null,
        score real,
        strand varchar(1),
        phase integer,
        attributes text,
        attributes_json json
      );
      SQL

      sq3_db.execute(sql)

      ## Read GFF file and insert data into
      ## the sqlite3 table

      sq3_db.transaction do
        File.open(gff_file).each_with_index do |l, i|
          # puts l
          ## skip FASTA seq section
          break if /^\#\#FASTA/.match(l)

          ## skip header section
          next if /^\#/.match(l)
          gr = Bio::GFF::GFF3::Record.new(l.chomp)
          # p gr.attributes
          id = nil
          id_found = gr.attributes.select{|a| a[0] == "ID"}
          if id_found.size == 1
            id = id_found[0][1]
          elsif id_found.size == 0
            ## do nothing (id = nil)
          elsif id_found.size > 1
            STDERR.puts gr.attributes
            raise "Multiple IDs found."
          end
          parent = ((gr.attributes.select{|a| a[0] == "Parent"}[0]) || [])[1]
          a = l.chomp.split(/\t/)

          sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
          values = [i, l.chomp, id, parent,
                    a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
                    attributes_as_json(l)]
          sq3_db.execute(sql, values)
        end
      end

      ## Indexing the sqlite3 table
      table = "gff_records"
      %w{id parent source type}.each do |col|
        idxname = "index_#{table}_on_#{col}"
        sql = "CREATE INDEX #{idxname} ON #{table}(#{col})"
        sq3_db.execute(sql)
      end

      return ezdb_path

    end

    def self.build_tabix(gff_in)
      ## sort gff by position
      gfffile_sorted = gff_in + ".gz"
      cmd = %Q{(grep ^"#" #{gff_in}; grep -v ^"#" #{gff_in} | sort -t $'\t' -k1,1 -k4,4n) | bgzip > #{gfffile_sorted};}
      STDERR.puts cmd
      system cmd

      cmd = "tabix -p gff #{gfffile_sorted}"
      STDERR.puts cmd
      system cmd

      STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
    end

    def self.attributes_as_json(gffline)
      keys_multi_val_allowed = %w{Parent Alias Note Dbxref Ontology_term}

      gr = Bio::GFF::GFF3::Record.new(gffline.chomp)

      h = Hash.new
      gr.attributes.each do |att|
        k, v = att
        unless h.has_key?(k)
          h[k] = []
        end
        h[k] << v
      end
      h2 = Hash.new
      h.each do |key, values|
        if key == "Dbxref" || key == "Ontology_term"
          h3 = Hash.new
          values.each do |val|
            m = /(.+?):/.match(val)
            dbtag = m[1]
            dbval = m.post_match
            h3.update({dbtag => dbval})
          end
          h2[key] = h3
        else
          h2[key] = values.join(",")
        end
      end
      h2.to_json
    end

    def initialize(path)
      @db = SQLite3::Database.new(path)
    end

    def each_record
      sql = "SELECT * FROM gff_records"
      @db.execute(sql).each do |r|
        an = Annotation.new()
        an.build_from_db_record(r)
        yield an
      end
    end

    def get(id)
      sql = %Q{SELECT * FROM gff_records WHERE id=="#{id}";}
      # puts sql
      res = @db.execute(sql)
      if res.size == 1
        an = Annotation.new(@db)
        an.build_from_db_record(res[0])
        return an
      else
        if res.size >= 2
          raise "multiple hits"
        elsif res.size == 0
          raise "not found: #{id}"
        end
      end
    end

    def get_by_line_number(n)
      sql = %Q{SELECT * FROM gff_records WHERE line_num=="#{n}";}
      res = @db.execute(sql)
      if res.size == 1
        an = Annotation.new(@db)
        an.build_from_db_record(res[0])
        return an
      else
        if res.size >= 2
          raise "multiple hits"
        elsif res.size == 0
          raise "not found: #{n}"
        end
      end
    end

    def search(query, num_limit=100)
      sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%";}
      STDERR.puts sql
      res = @db.execute(sql)
      res2 = res.map{|r| an = Annotation.new(@db); an.build_from_db_record(r); an}
      res2
    end

    class Annotation

      def initialize(db = nil)
        @db = db
        @seqid
        @source
        @type
        @start
        @end
        @score
        @strand
        @phase
        @attributes
        @id
        @parent_id
        @gffline
      end

      attr_accessor :seqid, :source, :type, :start, :end, :score, :strand, :phase, :attributes
      attr_accessor :id, :parent_id, :gffline, :line_num

      def to_s
        gffline
      end

      def to_hash
        h = {
          'seqid' => seqid,
          'source' => source,
          'type' => type,
          'start' => start,
          'end' => self.end,
          'score' => score,
          'strand' => strand,
          'phase' => phase,
          'line_num' => line_num,
          'id' => id,
          'parent_id' => parent_id,
          'attributes' => attributes
        }
      end

      alias :to_h :to_hash

      def to_json
        self.to_hash.to_json
      end

      def build_from_db_record(sql_result)
        ## sql_result: Array returned by @db.execute(sql)
        v = sql_result
        @seqid = v[4]
        @source = v[5]
        @type = v[6]
        @start = v[7]
        @end = v[8]
        @score = v[9]
        @strand = v[10]
        @phase = v[11]
        @line_num = v[0]
        @gffline = v[1]
        @id = v[2]
        @parent_id = v[3]
        @attributes = JSON.parse(v[13])
      end

      def parent
        if parent_id
          sql = %Q{SELECT * FROM gff_records WHERE id=="#{parent_id}";}
          # puts sql
          res = @db.execute(sql)
          an = Annotation.new(@db)
          an.build_from_db_record(res[0])
          return an
        else
          return nil
        end
      end

      def children
        ary = []
        sql = %Q{SELECT * FROM gff_records WHERE parent=="#{id}";}
        # puts sql
        res = @db.execute(sql)
        res.each do |r|
          an = Annotation.new(@db)
          an.build_from_db_record(r)
          ary << an
        end
        ary
      end

      def descendants
        ary = []
        sql = %Q{WITH RECURSIVE r AS (
                   SELECT * FROM gff_records WHERE id=="#{id}"
                   UNION ALL
                   SELECT gff_records.* FROM gff_records, r WHERE gff_records.parent == r.id
                 )
                 SELECT * FROM r}
        res = @db.execute(sql)
        res.each do |r|
          an = Annotation.new(@db)
          an.build_from_db_record(r)
          ary << an
        end
        ary
      end

      def ancestors
        ary = []
        sql = %Q{WITH RECURSIVE ancestor AS (
                   SELECT * FROM gff_records WHERE id=="#{id}"
                   UNION ALL
                   SELECT gff_records.* FROM gff_records, ancestor
                   WHERE ancestor.parent = gff_records.id
                 )
                 SELECT * FROM ancestor;}
        res = @db.execute(sql)
        res.each do |r|
          an = Annotation.new(@db)
          an.build_from_db_record(r)
          ary << an
        end
        ary
      end


      def length
        len = @end - @start + 1
        raise unless len > 0
        return len
      end

      def dbxrefs
        h = Hash.new
        if attributes["Dbxref"]
          attributes["Dbxref"].split(/,/).each do |x|
            m = /(.+?):/.match(x)
            key = m[1]
            val = m.post_match
            h.update({key => val})
          end
        end
        h
      end

    end

  end

end

if __FILE__ == $0
  dbname = ARGV[0]
  query = ARGV[1]
  db = Ezgff::GffDb.new(dbname)
  ann = db.get(query)
  p ann
  puts ann.to_s
  p ann.to_hash
  exit
  db.each_record do |an|
    p an
    # p [an.id, an.seqid, an.start, an.end, an.attributes["protein_id"]]
  end
end
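
The lookups that the `ezgff view` subcommand performs can also be done through `Ezgff::GffDb` directly. A minimal sketch, assuming `lib/` is on the load path and that `require 'ezgff'` loads this file; the ezdb path and feature ID are illustrative:

```ruby
# Minimal sketch: use the library API directly, mirroring what `ezgff view` does.
# The ezdb directory and feature ID below are illustrative placeholders.
require 'ezgff'

ezdb = "GCF_000009605.1_ASM960v1_genomic.gff.ezdb"
sq3  = Dir["#{ezdb}/*.sqlite3"].first
db   = Ezgff::GffDb.new(sq3)

ann = db.get("cds-WP_010895901.1")
puts ann                                        # original GFF line (Annotation#to_s)
ann.ancestors.each { |a| puts "#{a.type}\t#{a.id}" }  # the feature plus its parents, via the recursive CTE
```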
metadata
ADDED
@@ -0,0 +1,66 @@
--- !ruby/object:Gem::Specification
name: ezgff
version: !ruby/object:Gem::Version
  version: 0.0.2
platform: ruby
authors:
- Shuji Shigenobu
autorequire:
bindir: exe
cert_chain: []
date: 2021-07-14 00:00:00.000000000 Z
dependencies: []
description: Utilities for GFF3, the genome annotation format. Useful to explore the
  gene model features.
email:
- sshigenobu@gmail.com
executables:
- ezgff
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- Gemfile
- LICENSE
- README.md
- Rakefile
- bin/build_gff_sqlitedb.rb
- bin/build_gff_sqlitedb_keywords.rb
- bin/build_gff_tabix.rb
- dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz
- dev/gff_examples/ApMT_NC_011594.gb.gff.gz
- dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
- dev/gff_examples/apisum_part.gff3.gz
- dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz
- exe/ezgff
- ezgff.gemspec
- lib/ezgff.rb
- lib/ezgff/gffsqlitedb.rb
- lib/ezgff/version.rb
homepage: https://github.com/shujishigenobu/ezgff_alpha
licenses:
- MIT
metadata:
  homepage_uri: https://github.com/shujishigenobu/ezgff_alpha
  source_code_uri: https://github.com/shujishigenobu/ezgff_alpha
  changelog_uri: https://github.com/shujishigenobu/ezgff_alpha
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 2.3.0
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.1.2
signing_key:
specification_version: 4
summary: Utilities for GFF3
test_files: []