ezgff 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +7 -0
- data/exe/ezgff +2 -1
- data/exe/ezgff_extract_gene_model_features_from_gff3.rb +155 -0
- data/exe/ezgff_rm_fasta.rb +14 -0
- data/lib/ezgff/gffsqlitedb.rb +9 -3
- data/lib/ezgff/version.rb +1 -1
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d4a07d4b65fd0a41b91f7627a19291d9740748a84f680c93e1307dfe1397c110
|
4
|
+
data.tar.gz: 3bd3d44155e7a92f81e899481706a280aca5b6f1636c67b550ba6244b39f6a8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d665269b3032ad82f9859d737cc31e4393b6f7275a2547b76faa8ea5499d7258e6a8651c211409115e767dd316b5b0ef616ffc45bdc09c131d3b036738ac26d1
|
7
|
+
data.tar.gz: bb76c6640fb0c410d5eb66413073a5cb97efdd6d9167c5a3b5e4ff601ba48233b545139230935fd15032a876b91e26811f306da00e6fd2a956d269c8fccd3b7a
|
data/README.md
CHANGED
@@ -54,3 +54,10 @@ ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=
|
|
54
54
|
|jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] \
|
55
55
|
|@csv'
|
56
56
|
```
|
57
|
+
|
58
|
+
## FAQ
|
59
|
+
### Can I use GFF2 or GTF files with ezgff?
|
60
|
+
Ans.
|
61
|
+
|
62
|
+
No. ezgff accepts the GFF3 format only. GFF2/GTF files can easily be converted to GFF3 beforehand; gffread and other tools can be used for the conversion.
|
63
|
+
|
data/exe/ezgff
CHANGED
@@ -132,6 +132,7 @@ module Ezgff
|
|
132
132
|
|
133
133
|
desc "search DB QUERY", "search GFF record giving query"
|
134
134
|
option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
|
135
|
+
option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
|
135
136
|
def search(db, query)
|
136
137
|
ezdb = db
|
137
138
|
files = Dir["#{ezdb}/*.sqlite3"]
|
@@ -144,7 +145,7 @@ module Ezgff
|
|
144
145
|
raise "Multiple sqlite3 files found"
|
145
146
|
end
|
146
147
|
sq3_db = GffDb.new(sq3_file)
|
147
|
-
results = sq3_db.search(query, 100)
|
148
|
+
results = sq3_db.search(query, 100, options[:type])
|
148
149
|
case options[:format]
|
149
150
|
when "json"
|
150
151
|
h = Hash.new
|
@@ -0,0 +1,155 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require_relative '../lib/ezgff'
|
4
|
+
# require 'thor'
|
5
|
+
|
6
|
+
include Ezgff
|
7
|
+
|
8
|
+
# Extends a path upward by following parent links until a node without a
# parent is reached.
#
# @param ary [Array] path collected so far; the parent of its last element
#   is looked up next
# @param pdata [Hash] maps a node to its parent node; a nil value or a
#   missing key means the node has no parent
# @return [Array] +ary+ itself when its last element has no parent,
#   otherwise a new array holding +ary+ followed by every ancestor in order
def path_to_root(ary, pdata)
  par = pdata[ary.last]
  return ary unless par

  # Recurse with the parent table itself. The previous code passed `data`,
  # which is not defined inside this method and raised NameError as soon as
  # any node had a parent.
  path_to_root([ary, par].flatten, pdata)
end
|
18
|
+
|
19
|
+
# Expands a set of paths downward until every path ends in a node that has
# no children.
#
# @param paths [Array<Array>] paths to extend; the last element of each path
#   is the node whose children are looked up
# @param cdata [Hash] maps a node to the array of its direct children
#   (every node that is reached must have an entry)
# @return [Array<Array>] one path per leaf reachable from the input paths;
#   +paths+ itself is returned when nothing needs extending
def descendant_paths(paths, cdata)
  frontier = paths
  # Each pass grows every unfinished path by one generation.
  until frontier.all? { |path| cdata[path.last].size == 0 }
    frontier = frontier.flat_map do |path|
      kids = cdata[path.last]
      if kids.size > 0
        kids.map { |kid| [path, kid].flatten }
      else
        # Already ends in a leaf: carry the path over unchanged.
        [path]
      end
    end
  end
  frontier
end
|
43
|
+
|
44
|
+
# Command-line arguments: path of the input GFF3 file and path of the output file.
gff_file = ARGV[0]
out_file = ARGV[1]
# Stop with a usage message when an argument is missing; otherwise the string
# concatenation below fails with NoMethodError on nil.
unless gff_file && out_file
  abort "Usage: #{File.basename($PROGRAM_NAME)} <input.gff3> <output.gff3>"
end
# Side file that lists records whose Parent could not be resolved.
outfile_not_found_parents = out_file + ".ParentsNoteFound.txt"

# Feature types (GFF column 3) that are loaded; every other type is skipped.
include_features = ["gene", "mRNA", "exon", "CDS", "five_prime_UTR", "three_prime_UTR", "ncRNA","polyA_site", "pre_miRNA", "pseudogene", "rRNA", "snRNA", "snoRNA", "tRNA"]
# Only referenced by a commented-out filter in the loading loop.
exclude_features = ["match", "match_part", "orthologous_to", "paralogous_to", "oligo", "sgRNA"]

data = {}   # key: line_num (int); value: gff_feature (Bio::GFF::GFF3::Record)
id2ln = {}  # key: ID; value: line_num (reference to key of data)
pdata = {}  # Hash to store parent relations
            # key: line_num (int); value: line_num of the parent record
            #   (nil when the record has no Parent attribute)
56
|
+
STDERR.puts "#{Time.now} Loading data..."
|
57
|
+
# Read the GFF3 file line by line. Records whose type (column 3) is listed in
# include_features are parsed and stored in `data` under their 0-based line
# index; the ID attribute of each stored record is registered in `id2ln`.
# File.foreach closes the file when the loop finishes or is left via `break`.
File.foreach(gff_file).each_with_index do |l, i|
  STDERR.print "#{i} lines loaded\r" if i % 100000 == 0

  ## stop at the FASTA seq section
  ## (checked before the feature filter; otherwise this line is never reached
  ## and the whole sequence section is scanned)
  break if /^\#\#FASTA/.match(l)

  ## skip header section
  next if /^\#/.match(l)

  cols = l.chomp.split(/\t/)
  # next if exclude_features.include?(cols[2])
  next unless include_features.include?(cols[2])

  gr = Bio::GFF::GFF3::Record.new(l.chomp)
  data[i] = gr

  id_found = gr.attributes.select { |attr| attr[0] == "ID" }
  # Compare the count, not the array itself: `id_found > 1` raised
  # NoMethodError instead of reporting the duplicate IDs.
  if id_found.size > 1
    STDERR.puts gr.attributes
    raise "Multiple IDs found."
  end

  # A record without an ID cannot be referenced as a Parent, so it gets no
  # id2ln entry (previously it was stored under the nil key).
  id = id_found.empty? ? nil : id_found[0][1]
  id2ln[id] = i if id
end
|
84
|
+
|
85
|
+
STDERR.puts "#{Time.now} Loading data: done."
|
86
|
+
STDERR.puts "parent-children relations are beeing analyzed..."
|
87
|
+
|
88
|
+
|
89
|
+
# Resolve the Parent attribute of every loaded record to the line number of
# the parent record. Only the first Parent attribute of a record is used.
#   pdata[line] = parent line number, or nil when the record has no Parent
#   notfound    = [line, parent ID] pairs whose parent ID is not in id2ln
#                 (these records get no pdata entry)
notfound = []
data.each do |ln, rec|
  parent_attr = rec.attributes.find { |attr| attr[0] == "Parent" }
  parent = (parent_attr || [])[1]

  if !parent
    pdata[ln] = nil
  elsif id2ln.key?(parent)
    pdata[ln] = id2ln[parent]
  else
    notfound << [ln, parent]
  end
end
|
103
|
+
STDERR.puts "parent database were created.\nchildren database is being created..."
|
104
|
+
|
105
|
+
require 'pp'
|
106
|
+
|
107
|
+
#p data
|
108
|
+
#p id2ln
|
109
|
+
#pp pdata
|
110
|
+
|
111
|
+
## build children data from parent data (pdata)
|
112
|
+
# Invert the parent table: cdata maps a record's line number to the line
# numbers of its direct children.
#
# The default block gives any line number an empty child list on first access.
# A record can be a parent without being a key of pdata (its own Parent was
# not found, so it was only recorded in `notfound`); with a plain Hash the
# `<<` below raised NoMethodError on nil for such records.
cdata = Hash.new { |hash, ln| hash[ln] = [] }
## init cdata
pdata.each_key do |ln|
  cdata[ln] = [] unless cdata.has_key?(ln)
end
pdata.each do |child_ln, parent_ln|
  cdata[parent_ln] << child_ln if parent_ln
end
|
125
|
+
|
126
|
+
#pp cdata
|
127
|
+
STDERR.puts "children databse created."
|
128
|
+
STDERR.puts "parent-children databases were successfully created."
|
129
|
+
|
130
|
+
STDERR.puts "gene and descendant features are being extracted..."
|
131
|
+
|
132
|
+
# For every gene record, in ascending line order, collect all paths from the
# gene down to its leaf descendants. passed_paths holds one list of paths per
# gene.
gene_lines = data.keys.sort.select { |ln| data[ln].feature == "gene" }
passed_paths = gene_lines.map { |ln| descendant_paths([[ln]], cdata) }
|
141
|
+
|
142
|
+
#p passed_paths
|
143
|
+
# Write every record that lies on a gene path, once each, in ascending line order.
File.open(out_file, "w") do |out|
  selected_lines = passed_paths.flatten.sort.uniq
  selected_lines.each { |ln| out.puts data[ln] }
end

STDERR.puts "target features were successfully extracted."

# Report records whose Parent ID was not found: line number, parent ID, record.
File.open(outfile_not_found_parents, "w") do |out|
  notfound.each do |ln, parent|
    row = [ln, parent, data[ln]]
    out.puts row.join("\t")
  end
end
|
data/lib/ezgff/gffsqlitedb.rb
CHANGED
@@ -163,7 +163,7 @@ module Ezgff
|
|
163
163
|
end
|
164
164
|
h2 = Hash.new
|
165
165
|
h.each do |key, values|
|
166
|
-
if key == "
|
166
|
+
if key == "Dbxref2" # dummy (not used currently)
|
167
167
|
h3 = Hash.new
|
168
168
|
values.each do |val|
|
169
169
|
m = /(.+?):/.match(val)
|
@@ -172,6 +172,8 @@ module Ezgff
|
|
172
172
|
h3.update({dbtag => dbval})
|
173
173
|
end
|
174
174
|
h2[key] = h3
|
175
|
+
elsif key == "Ontology_term" || key == "Alias" || key == "Dbxref"
|
176
|
+
h2[key] = values
|
175
177
|
else
|
176
178
|
h2[key] = values.join(",")
|
177
179
|
end
|
@@ -225,8 +227,12 @@ module Ezgff
|
|
225
227
|
end
|
226
228
|
end
|
227
229
|
|
228
|
-
def search(query, num_limit=100)
|
229
|
-
sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%"
|
230
|
+
def search(query, num_limit=100, type=nil)
|
231
|
+
sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%" }
|
232
|
+
if type
|
233
|
+
sql += %Q{ AND type=="#{type}"}
|
234
|
+
end
|
235
|
+
sql += %Q{ LIMIT #{num_limit} } ;
|
230
236
|
STDERR.puts sql
|
231
237
|
res = @db.execute(sql)
|
232
238
|
res2 = res.map{|r| an = Annotation.new(@db); an.build_from_db_record(r); an}
|
data/lib/ezgff/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ezgff
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Shuji Shigenobu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: sqlite3
|
@@ -72,6 +72,8 @@ email:
|
|
72
72
|
- sshigenobu@gmail.com
|
73
73
|
executables:
|
74
74
|
- ezgff
|
75
|
+
- ezgff_extract_gene_model_features_from_gff3.rb
|
76
|
+
- ezgff_rm_fasta.rb
|
75
77
|
extensions: []
|
76
78
|
extra_rdoc_files: []
|
77
79
|
files:
|
@@ -87,6 +89,8 @@ files:
|
|
87
89
|
- dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
|
88
90
|
- dev/gff_examples/apisum_part.gff3.gz
|
89
91
|
- exe/ezgff
|
92
|
+
- exe/ezgff_extract_gene_model_features_from_gff3.rb
|
93
|
+
- exe/ezgff_rm_fasta.rb
|
90
94
|
- ezgff.gemspec
|
91
95
|
- lib/ezgff.rb
|
92
96
|
- lib/ezgff/gffsqlitedb.rb
|