ezgff 0.0.5 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3dedc8f2f93b8b91983a4f330d7c8194be3bb6ba72c52ded51b11bcea2cc55eb
4
- data.tar.gz: 7db8dbc168eb71e79576e7efcfc1da8441848c89b47a916e4e212de727aa1c46
3
+ metadata.gz: d4a07d4b65fd0a41b91f7627a19291d9740748a84f680c93e1307dfe1397c110
4
+ data.tar.gz: 3bd3d44155e7a92f81e899481706a280aca5b6f1636c67b550ba6244b39f6a8c
5
5
  SHA512:
6
- metadata.gz: a75c6fd47cfdddf8a2984462be9f985e775a1e8ada520f9c17581573c6d3db3a3b4d00b6b94d06934ec2d0237bbd287b97203c057244700e73f6aa70bc36b701
7
- data.tar.gz: ba88715c8a7855e2b005beb055cff0958f9d7146cb8ac560a3d6660b7daf34c0b5039d8765b19f868fc290828899603f4c9232104bb5440722942426839eba43
6
+ metadata.gz: d665269b3032ad82f9859d737cc31e4393b6f7275a2547b76faa8ea5499d7258e6a8651c211409115e767dd316b5b0ef616ffc45bdc09c131d3b036738ac26d1
7
+ data.tar.gz: bb76c6640fb0c410d5eb66413073a5cb97efdd6d9167c5a3b5e4ff601ba48233b545139230935fd15032a876b91e26811f306da00e6fd2a956d269c8fccd3b7a
data/README.md CHANGED
@@ -54,3 +54,10 @@ ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=
54
54
  |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] \
55
55
  |@csv'
56
56
  ```
57
+
58
+ ## FAQ
59
+ ### Can I use GFF2 or GTF with ezgff?
60
+ Ans.
61
+
62
+ No. ezgff accepts GFF3 format only. GFF2 and GTF files can easily be converted to GFF3; gffread and other tools can be used for the conversion.
63
+
data/exe/ezgff CHANGED
@@ -132,6 +132,7 @@ module Ezgff
132
132
 
133
133
  desc "search DB QUERY", "search GFF record giving query"
134
134
  option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
135
+ option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
135
136
  def search(db, query)
136
137
  ezdb = db
137
138
  files = Dir["#{ezdb}/*.sqlite3"]
@@ -144,7 +145,7 @@ module Ezgff
144
145
  raise "Multiple sqlite3 files found"
145
146
  end
146
147
  sq3_db = GffDb.new(sq3_file)
147
- results = sq3_db.search(query, 100)
148
+ results = sq3_db.search(query, 100, options[:type])
148
149
  case options[:format]
149
150
  when "json"
150
151
  h = Hash.new
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/ezgff'
4
+ # require 'thor'
5
+
6
+ include Ezgff
7
+
8
+ def path_to_root(ary, pdata)
9
+ # p ary
10
+ # p pdata
11
+ if par = pdata[ary.last]
12
+ newary = [ary, par].flatten
13
+ path_to_root(newary, data)
14
+ else
15
+ return ary
16
+ end
17
+ end
18
+
19
+ def descendant_paths(paths, cdata)
20
+ # Extends every path in paths by the children of its last element (looked up in cdata)
21
+ # and recurses until no path ends in an element with children; returns the list of paths.
22
+ if paths.map{|pa| cdata[pa.last].size}.all?{|v| v == 0}
23
+ return paths
24
+ else
25
+ newpaths = []
26
+ paths.each do |pa|
27
+ # a path whose last element has children is replaced by one extended path per child;
28
+ # a path whose last element has no children is carried over unchanged
29
+ if cdata[pa.last].size > 0
30
+ cdata[pa.last].each do |c|
31
+ newary = [pa, c].flatten
32
+ newpaths << newary
33
+ end
34
+ else
35
+ newpaths << pa
36
+ end
37
+ end
38
+ # recurse on the extended paths; the recursion stops at the check above
39
+ # once every path ends in an element whose cdata entry is empty
40
+ descendant_paths(newpaths, cdata)
41
+ end
42
+ end
43
+
44
+ gff_file = ARGV[0]
45
+ out_file = ARGV[1]
46
+ outfile_not_found_parents = out_file + ".ParentsNoteFound.txt"
47
+
48
+ include_features = ["gene", "mRNA", "exon", "CDS", "five_prime_UTR", "three_prime_UTR", "ncRNA","polyA_site", "pre_miRNA", "pseudogene", "rRNA", "snRNA", "snoRNA", "tRNA"]
49
+ exclude_features = ["match", "match_part", "orthologous_to", "paralogous_to", "oligo", "sgRNA"]
50
+
51
+ data = {} # key: line_num (int); value: gff_feature (Bio::GFF::GFF3::Record)
52
+ id2ln = {} # key: ID; value: line_num (reference to key of data)
53
+ pdata = {} # Hash to store parent relations
54
+ # key: line_num (int); value: ID
55
+
56
+ STDERR.puts "#{Time.now} Loading data..."
57
+ File.open(gff_file).each_with_index do |l, i|
58
+ STDERR.print "#{i} lines loaded\r" if i % 100000 == 0
59
+
60
+ # puts l
61
+ a = l.chomp.split(/\t/)
62
+ # next if exclude_features.include?(a[2])
63
+ next unless include_features.include?(a[2])
64
+ ## skip FASTA seq section
65
+ break if /^\#\#FASTA/.match(l)
66
+
67
+ ## skip header section
68
+ next if /^\#/.match(l)
69
+ gr = Bio::GFF::GFF3::Record.new(l.chomp)
70
+ data[i] = gr
71
+
72
+ id = nil
73
+ id_found = gr.attributes.select{|a| a[0] == "ID"}
74
+ if id_found.size == 1
75
+ id = id_found[0][1]
76
+ elsif id_found.size == 0
77
+ ## do nothing (id = nil)
78
+ elsif id_found > 1
79
+ STDERR.puts gr.attributes
80
+ raise "Multiple IDs found."
81
+ end
82
+ id2ln[id] = i
83
+ end
84
+
85
+ STDERR.puts "#{Time.now} Loading data: done."
86
+ STDERR.puts "parent-children relations are beeing analyzed..."
87
+
88
+
89
+ notfound = []
90
+ data.each do |i, v|
91
+ gr = v
92
+ parent = ((gr.attributes.select{|a| a[0] == "Parent"}[0]) || [])[1]
93
+ if parent
94
+ begin
95
+ pdata[i] = id2ln.fetch(parent)
96
+ rescue
97
+ notfound << [i, parent]
98
+ end
99
+ else
100
+ pdata[i] = nil
101
+ end
102
+ end
103
+ STDERR.puts "parent database were created.\nchildren database is being created..."
104
+
105
+ require 'pp'
106
+
107
+ #p data
108
+ #p id2ln
109
+ #pp pdata
110
+
111
+ ## build children data from pareint data (pdata)
112
+ cdata = Hash.new()
113
+ ## init cdata
114
+ pdata.each do |k, v|
115
+ unless cdata.has_key?(k)
116
+ cdata[k] = []
117
+ end
118
+ end
119
+ pdata.each do |k, v|
120
+ if v
121
+ parent = v
122
+ cdata[v] << k
123
+ end
124
+ end
125
+
126
+ #pp cdata
127
+ STDERR.puts "children databse created."
128
+ STDERR.puts "parent-children databases were successfully created."
129
+
130
+ STDERR.puts "gene and descendant features are being extracted..."
131
+
132
+ passed_paths = []
133
+ data.keys.sort.each do |i|
134
+ gr = data[i]
135
+ if gr.feature == "gene"
136
+ # puts gr
137
+ passed_paths << descendant_paths([[i]], cdata)
138
+ end
139
+ # break if i > 100
140
+ end
141
+
142
+ #p passed_paths
143
+ File.open(out_file, "w") do |o|
144
+ passed_paths.flatten.sort.uniq.each do |i|
145
+ o.puts data[i]
146
+ end
147
+ end
148
+
149
+ STDERR.puts "target features were successfully extracted."
150
+
151
+ File.open(outfile_not_found_parents, "w") do |o|
152
+ notfound.each do |i, parent|
153
+ o.puts [i, parent, data[i]].join("\t")
154
+ end
155
+ end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/ezgff'
4
+ require 'thor'
5
+
6
+ include Ezgff
7
+
8
+ gff_file = ARGV[0]
9
+
10
+ File.open(gff_file).each_with_index do |l, i|
11
+ puts l
12
+ ## skip FASTA seq section
13
+ break if /^\#\#FASTA/.match(l)
14
+ end
@@ -163,7 +163,7 @@ module Ezgff
163
163
  end
164
164
  h2 = Hash.new
165
165
  h.each do |key, values|
166
- if key == "Dbxref" || key == "Ontology_term"
166
+ if key == "Dbxref2" # dummy (not used currently)
167
167
  h3 = Hash.new
168
168
  values.each do |val|
169
169
  m = /(.+?):/.match(val)
@@ -172,6 +172,8 @@ module Ezgff
172
172
  h3.update({dbtag => dbval})
173
173
  end
174
174
  h2[key] = h3
175
+ elsif key == "Ontology_term" || key == "Alias" || key == "Dbxref"
176
+ h2[key] = values
175
177
  else
176
178
  h2[key] = values.join(",")
177
179
  end
@@ -225,8 +227,12 @@ module Ezgff
225
227
  end
226
228
  end
227
229
 
228
- def search(query, num_limit=100)
229
- sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%";}
230
+ def search(query, num_limit=100, type=nil)
231
+ sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%" }
232
+ if type
233
+ sql += %Q{ AND type=="#{type}"}
234
+ end
235
+ sql += %Q{ LIMIT #{num_limit} } ;
230
236
  STDERR.puts sql
231
237
  res = @db.execute(sql)
232
238
  res2 = res.map{|r| an = Annotation.new(@db); an.build_from_db_record(r); an}
data/lib/ezgff/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Ezgff
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ezgff
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuji Shigenobu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-07-22 00:00:00.000000000 Z
11
+ date: 2021-08-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: sqlite3
@@ -72,6 +72,8 @@ email:
72
72
  - sshigenobu@gmail.com
73
73
  executables:
74
74
  - ezgff
75
+ - ezgff_extract_gene_model_features_from_gff3.rb
76
+ - ezgff_rm_fasta.rb
75
77
  extensions: []
76
78
  extra_rdoc_files: []
77
79
  files:
@@ -87,6 +89,8 @@ files:
87
89
  - dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
88
90
  - dev/gff_examples/apisum_part.gff3.gz
89
91
  - exe/ezgff
92
+ - exe/ezgff_extract_gene_model_features_from_gff3.rb
93
+ - exe/ezgff_rm_fasta.rb
90
94
  - ezgff.gemspec
91
95
  - lib/ezgff.rb
92
96
  - lib/ezgff/gffsqlitedb.rb