ezgff 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +7 -0
- data/exe/ezgff +2 -1
- data/exe/ezgff_extract_gene_model_features_from_gff3.rb +155 -0
- data/exe/ezgff_rm_fasta.rb +14 -0
- data/lib/ezgff/gffsqlitedb.rb +9 -3
- data/lib/ezgff/version.rb +1 -1
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d4a07d4b65fd0a41b91f7627a19291d9740748a84f680c93e1307dfe1397c110
|
4
|
+
data.tar.gz: 3bd3d44155e7a92f81e899481706a280aca5b6f1636c67b550ba6244b39f6a8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d665269b3032ad82f9859d737cc31e4393b6f7275a2547b76faa8ea5499d7258e6a8651c211409115e767dd316b5b0ef616ffc45bdc09c131d3b036738ac26d1
|
7
|
+
data.tar.gz: bb76c6640fb0c410d5eb66413073a5cb97efdd6d9167c5a3b5e4ff601ba48233b545139230935fd15032a876b91e26811f306da00e6fd2a956d269c8fccd3b7a
|
data/README.md
CHANGED
@@ -54,3 +54,10 @@ ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=
|
|
54
54
|
|jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] \
|
55
55
|
|@csv'
|
56
56
|
```
|
57
|
+
|
58
|
+
## FAQ
|
59
|
+
### Can I use GFF2 or GTF for ezgff?
|
60
|
+
Ans.
|
61
|
+
|
62
|
+
No. ezgff takes GFF3 format only. GFF2/GTF can be converted to GFF3 easily. gffread and other tools can be used for conversion.
|
63
|
+
|
data/exe/ezgff
CHANGED
@@ -132,6 +132,7 @@ module Ezgff
|
|
132
132
|
|
133
133
|
desc "search DB QUERY", "search GFF record giving query"
|
134
134
|
option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
|
135
|
+
option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
|
135
136
|
def search(db, query)
|
136
137
|
ezdb = db
|
137
138
|
files = Dir["#{ezdb}/*.sqlite3"]
|
@@ -144,7 +145,7 @@ module Ezgff
|
|
144
145
|
raise "Multiple sqlite3 files found"
|
145
146
|
end
|
146
147
|
sq3_db = GffDb.new(sq3_file)
|
147
|
-
results = sq3_db.search(query, 100)
|
148
|
+
results = sq3_db.search(query, 100, options[:type])
|
148
149
|
case options[:format]
|
149
150
|
when "json"
|
150
151
|
h = Hash.new
|
@@ -0,0 +1,155 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/ezgff'
|
4
|
+
# require 'thor'
|
5
|
+
|
6
|
+
include Ezgff
|
7
|
+
|
8
|
+
# Follow parent links from the last element of +ary+ up to the root.
#
# @param ary [Array] path so far; its last element is the node whose
#   parent is looked up next
# @param pdata [Hash] parent table (node => parent node, or nil / absent
#   when the node has no parent)
# @return [Array] +ary+ extended with every ancestor, nearest first.
#   +ary+ itself is returned when its last element has no parent.
def path_to_root(ary, pdata)
  path = ary
  # The original recursive call passed an undefined local (`data`) instead
  # of `pdata`; walking iteratively keeps the lookup table in one place.
  while (parent = pdata[path.last])
    path = [path, parent].flatten
  end
  path
end
|
18
|
+
|
19
|
+
# Expand every path in +paths+ downwards until each one ends at a node
# without children.
#
# @param paths [Array<Array>] paths to extend; the last element of each
#   path is the node whose children are looked up
# @param cdata [Hash] children table (node => Array of child nodes)
# @return [Array<Array>] one path per root-to-leaf route. +paths+ itself is
#   returned when every path already ends at a leaf.
def descendant_paths(paths, cdata)
  # A node with no entry in cdata is treated as a leaf instead of raising
  # NoMethodError on nil.
  children_of = ->(path) { cdata[path.last] || [] }

  until paths.all? { |path| children_of.call(path).empty? }
    paths = paths.flat_map do |path|
      kids = children_of.call(path)
      if kids.empty?
        [path]
      else
        kids.map { |kid| [path, kid].flatten }
      end
    end
  end
  paths
end
|
43
|
+
|
44
|
+
# Command-line arguments: input GFF3 file and output file.
gff_file = ARGV[0]
out_file = ARGV[1]
# Fail with a usage message instead of a NoMethodError on nil below.
abort "Usage: #{File.basename($PROGRAM_NAME)} <in.gff3> <out.gff3>" unless gff_file && out_file
outfile_not_found_parents = out_file + ".ParentsNoteFound.txt"

# Only records whose type (column 3) is listed here are loaded.
include_features = ["gene", "mRNA", "exon", "CDS", "five_prime_UTR", "three_prime_UTR", "ncRNA","polyA_site", "pre_miRNA", "pseudogene", "rRNA", "snRNA", "snoRNA", "tRNA"]
# Not used by the active code; kept for the alternative (commented-out) filter.
exclude_features = ["match", "match_part", "orthologous_to", "paralogous_to", "oligo", "sgRNA"]

data = {}   # key: line_num (int); value: gff_feature (Bio::GFF::GFF3::Record)
id2ln = {}  # key: ID; value: line_num (reference to key of data)
pdata = {}  # Hash to store parent relations
            # key: line_num (int); value: line_num of the parent (or nil)

STDERR.puts "#{Time.now} Loading data..."
# File.foreach closes the file when the loop ends, including on `break`.
File.foreach(gff_file).each_with_index do |l, i|
  STDERR.print "#{i} lines loaded\r" if i % 100000 == 0

  ## stop at the FASTA seq section. This must be tested before the
  ## feature-type filter, otherwise the "##FASTA" line is skipped by the
  ## filter and the whole sequence section is scanned.
  break if l.start_with?("##FASTA")

  ## skip header section
  next if l.start_with?("#")

  # next if exclude_features.include?(cols[2])
  cols = l.chomp.split(/\t/)
  next unless include_features.include?(cols[2])

  gr = Bio::GFF::GFF3::Record.new(l.chomp)
  data[i] = gr

  id_found = gr.attributes.select { |attr| attr[0] == "ID" }
  if id_found.size > 1
    STDERR.puts gr.attributes
    raise "Multiple IDs found."
  end

  # Records without an ID cannot be referenced as a parent, so they are
  # not registered in the ID table.
  id = id_found.empty? ? nil : id_found[0][1]
  id2ln[id] = i if id
end
|
84
|
+
|
85
|
+
STDERR.puts "#{Time.now} Loading data: done."
STDERR.puts "parent-children relations are beeing analyzed..."

# Resolve the first Parent attribute of every loaded record to the line
# number of the parent record.
#   pdata[line_num] = parent line_num, or nil when the record has no Parent
#   notfound        = [line_num, parent ID] pairs whose parent ID is unknown
#                     (such records get no pdata entry)
notfound = []
data.each do |i, gr|
  parent_attr = gr.attributes.select { |attr| attr[0] == "Parent" }[0]
  parent = (parent_attr || [])[1]

  if !parent
    pdata[i] = nil
  elsif id2ln.key?(parent)
    pdata[i] = id2ln[parent]
  else
    notfound << [i, parent]
  end
end
STDERR.puts "parent database were created.\nchildren database is being created..."
|
104
|
+
|
105
|
+
require 'pp'
|
106
|
+
|
107
|
+
#p data
|
108
|
+
#p id2ln
|
109
|
+
#pp pdata
|
110
|
+
|
111
|
+
## build children data from parent data (pdata)
# key: line_num (int); value: Array of line_nums of the direct children
cdata = {}

## init cdata
# Every loaded record gets an (initially empty) child list. Initialising
# from pdata alone misses records whose own Parent could not be resolved
# (they have no pdata entry), and appending their children then fails on nil.
data.each_key { |ln| cdata[ln] = [] }

pdata.each do |child, parent|
  cdata[parent] << child if parent
end

#pp cdata
STDERR.puts "children databse created."
STDERR.puts "parent-children databases were successfully created."
|
129
|
+
|
130
|
+
STDERR.puts "gene and descendant features are being extracted..."

# For every "gene" record, in line order, collect the paths from the gene
# down to each of its leaf descendants.
gene_lines = data.keys.sort.select { |ln| data[ln].feature == "gene" }
passed_paths = gene_lines.map { |ln| descendant_paths([[ln]], cdata) }

#p passed_paths
# Write each record that appears on any collected path once, in the order
# of its line number in the input.
File.open(out_file, "w") do |o|
  passed_paths.flatten.sort.uniq.each { |ln| o.puts data[ln] }
end

STDERR.puts "target features were successfully extracted."

# Report records whose Parent ID was not found: line number, parent ID, record.
File.open(outfile_not_found_parents, "w") do |o|
  notfound.each do |ln, parent|
    o.puts [ln, parent, data[ln]].join("\t")
  end
end
|
data/lib/ezgff/gffsqlitedb.rb
CHANGED
@@ -163,7 +163,7 @@ module Ezgff
|
|
163
163
|
end
|
164
164
|
h2 = Hash.new
|
165
165
|
h.each do |key, values|
|
166
|
-
if key == "
|
166
|
+
if key == "Dbxref2" # dummy (not used currently)
|
167
167
|
h3 = Hash.new
|
168
168
|
values.each do |val|
|
169
169
|
m = /(.+?):/.match(val)
|
@@ -172,6 +172,8 @@ module Ezgff
|
|
172
172
|
h3.update({dbtag => dbval})
|
173
173
|
end
|
174
174
|
h2[key] = h3
|
175
|
+
elsif key == "Ontology_term" || key == "Alias" || key == "Dbxref"
|
176
|
+
h2[key] = values
|
175
177
|
else
|
176
178
|
h2[key] = values.join(",")
|
177
179
|
end
|
@@ -225,8 +227,12 @@ module Ezgff
|
|
225
227
|
end
|
226
228
|
end
|
227
229
|
|
228
|
-
def search(query, num_limit=100)
|
229
|
-
sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%"
|
230
|
+
def search(query, num_limit=100, type=nil)
|
231
|
+
sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%" }
|
232
|
+
if type
|
233
|
+
sql += %Q{ AND type=="#{type}"}
|
234
|
+
end
|
235
|
+
sql += %Q{ LIMIT #{num_limit} } ;
|
230
236
|
STDERR.puts sql
|
231
237
|
res = @db.execute(sql)
|
232
238
|
res2 = res.map{|r| an = Annotation.new(@db); an.build_from_db_record(r); an}
|
data/lib/ezgff/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ezgff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuji Shigenobu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: sqlite3
|
@@ -72,6 +72,8 @@ email:
|
|
72
72
|
- sshigenobu@gmail.com
|
73
73
|
executables:
|
74
74
|
- ezgff
|
75
|
+
- ezgff_extract_gene_model_features_from_gff3.rb
|
76
|
+
- ezgff_rm_fasta.rb
|
75
77
|
extensions: []
|
76
78
|
extra_rdoc_files: []
|
77
79
|
files:
|
@@ -87,6 +89,8 @@ files:
|
|
87
89
|
- dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
|
88
90
|
- dev/gff_examples/apisum_part.gff3.gz
|
89
91
|
- exe/ezgff
|
92
|
+
- exe/ezgff_extract_gene_model_features_from_gff3.rb
|
93
|
+
- exe/ezgff_rm_fasta.rb
|
90
94
|
- ezgff.gemspec
|
91
95
|
- lib/ezgff.rb
|
92
96
|
- lib/ezgff/gffsqlitedb.rb
|