ezgff 0.0.5 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3dedc8f2f93b8b91983a4f330d7c8194be3bb6ba72c52ded51b11bcea2cc55eb
4
- data.tar.gz: 7db8dbc168eb71e79576e7efcfc1da8441848c89b47a916e4e212de727aa1c46
3
+ metadata.gz: d4a07d4b65fd0a41b91f7627a19291d9740748a84f680c93e1307dfe1397c110
4
+ data.tar.gz: 3bd3d44155e7a92f81e899481706a280aca5b6f1636c67b550ba6244b39f6a8c
5
5
  SHA512:
6
- metadata.gz: a75c6fd47cfdddf8a2984462be9f985e775a1e8ada520f9c17581573c6d3db3a3b4d00b6b94d06934ec2d0237bbd287b97203c057244700e73f6aa70bc36b701
7
- data.tar.gz: ba88715c8a7855e2b005beb055cff0958f9d7146cb8ac560a3d6660b7daf34c0b5039d8765b19f868fc290828899603f4c9232104bb5440722942426839eba43
6
+ metadata.gz: d665269b3032ad82f9859d737cc31e4393b6f7275a2547b76faa8ea5499d7258e6a8651c211409115e767dd316b5b0ef616ffc45bdc09c131d3b036738ac26d1
7
+ data.tar.gz: bb76c6640fb0c410d5eb66413073a5cb97efdd6d9167c5a3b5e4ff601ba48233b545139230935fd15032a876b91e26811f306da00e6fd2a956d269c8fccd3b7a
data/README.md CHANGED
@@ -54,3 +54,10 @@ ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=
54
54
  |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] \
55
55
  |@csv'
56
56
  ```
57
+
58
+ ## FAQ
59
+ ### Can I use GFF2 or GTF files with ezgff?
60
+ Ans.
61
+
62
+ No. ezgff accepts the GFF3 format only. GFF2/GTF files can easily be converted to GFF3 beforehand; gffread and other tools can be used for the conversion.
63
+
data/exe/ezgff CHANGED
@@ -132,6 +132,7 @@ module Ezgff
132
132
 
133
133
  desc "search DB QUERY", "search GFF record giving query"
134
134
  option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
135
+ option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
135
136
  def search(db, query)
136
137
  ezdb = db
137
138
  files = Dir["#{ezdb}/*.sqlite3"]
@@ -144,7 +145,7 @@ module Ezgff
144
145
  raise "Multiple sqlite3 files found"
145
146
  end
146
147
  sq3_db = GffDb.new(sq3_file)
147
- results = sq3_db.search(query, 100)
148
+ results = sq3_db.search(query, 100, options[:type])
148
149
  case options[:format]
149
150
  when "json"
150
151
  h = Hash.new
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/ezgff'
4
+ # require 'thor'
5
+
6
+ include Ezgff
7
+
8
# Climbs parent links starting from the last element of +ary+ until a node
# without a parent is reached.
#
# @param ary [Array] path collected so far; its last element is the node to
#   climb from
# @param pdata [Hash] maps a node to its parent; a missing key or a nil value
#   marks a root
# @return [Array] +ary+ itself when its last element has no parent, otherwise
#   a new flat array with every ancestor appended in order
# @raise [ArgumentError] if the parent links form a cycle
def path_to_root(ary, pdata)
  path = ary
  # The original recursed with `data`, which is not visible inside a method
  # body; the parent table has to be carried along instead.
  while (parent = pdata[path.last])
    # Without this guard a self-referencing Parent would never terminate.
    raise ArgumentError, "cyclic parent relation at #{parent.inspect}" if path.include?(parent)

    path = [path, parent].flatten
  end
  path
end
18
+
19
# Extends every path downwards until each one ends in a node without
# children, branching a path once per child.
#
# @param paths [Array<Array>] paths to extend; the last element of each path
#   is the node whose children are looked up
# @param cdata [Hash] maps a node to the array of its children; a node with
#   no entry is treated as having no children
# @return [Array<Array>] the fully extended paths (the input object itself
#   when nothing had to be extended)
def descendant_paths(paths, cdata)
  # A missing entry used to raise NoMethodError on nil; treat it as a leaf.
  children_of = ->(path) { cdata[path.last] || [] }

  current = paths
  until current.all? { |path| children_of.call(path).empty? }
    current = current.flat_map do |path|
      kids = children_of.call(path)
      # Leaves are carried over unchanged; inner nodes fan out per child.
      kids.empty? ? [path] : kids.map { |kid| [path, kid].flatten }
    end
  end
  current
end
43
+
44
# Extracts gene-model features from a GFF3 file: every "gene" record plus all
# records reachable from it through Parent attributes.
#
# Arguments: ARGV[0] = input GFF3 file, ARGV[1] = output file.
# Writes the selected records to the output file in input line order, and
# writes "<output>.ParentsNoteFound.txt" listing (line index, Parent value,
# record) for every kept record whose Parent ID was not seen in the input.
require 'pp' # kept from the original script; only its commented-out debug dumps used it

gff_file = ARGV[0]
out_file = ARGV[1]
abort "Usage: #{File.basename($PROGRAM_NAME)} INPUT_GFF3 OUTPUT_GFF3" unless gff_file && out_file

# The file name spelling is kept as released so existing pipelines still find it.
outfile_not_found_parents = out_file + ".ParentsNoteFound.txt"

# Only records whose type (column 3) is listed here are loaded.
include_features = ["gene", "mRNA", "exon", "CDS", "five_prime_UTR", "three_prime_UTR", "ncRNA", "polyA_site", "pre_miRNA", "pseudogene", "rRNA", "snRNA", "snoRNA", "tRNA"]

data = {}   # key: line index (0-based Integer); value: Bio::GFF::GFF3::Record
id2ln = {}  # key: ID attribute; value: line index (a key of data)
pdata = {}  # key: line index; value: line index of the parent record, or nil

STDERR.puts "#{Time.now} Loading data..."
File.open(gff_file) do |gff|
  gff.each_line.with_index do |line, ln|
    STDERR.print "#{ln} lines loaded\r" if ln % 100000 == 0

    ## stop at the FASTA seq section; this must run before the feature-type
    ## filter, otherwise the directive line is skipped and never ends the loop
    break if /^\#\#FASTA/.match(line)

    ## skip header section
    next if /^\#/.match(line)

    record_text = line.chomp
    next unless include_features.include?(record_text.split(/\t/)[2])

    gr = Bio::GFF::GFF3::Record.new(record_text)
    data[ln] = gr

    ids = gr.attributes.select { |attr| attr[0] == "ID" }
    if ids.size > 1
      STDERR.puts gr.attributes
      raise "Multiple IDs found."
    end
    # Records without an ID cannot be referenced as a parent, so they are
    # not indexed (the original stored them under a nil key).
    id2ln[ids[0][1]] = ln unless ids.empty?
  end
end

STDERR.puts "#{Time.now} Loading data: done."
STDERR.puts "parent-children relations are beeing analyzed..."

notfound = [] # [line index, Parent value] pairs whose Parent ID is unknown
data.each do |ln, gr|
  # Only the first Parent attribute pair is considered.
  parent_attr = gr.attributes.find { |attr| attr[0] == "Parent" }
  parent = parent_attr && parent_attr[1]
  if !parent
    pdata[ln] = nil
  elsif id2ln.key?(parent)
    pdata[ln] = id2ln[parent]
  else
    notfound << [ln, parent]
  end
end
STDERR.puts "parent database were created.\nchildren database is being created..."

## build children data from parent data (pdata)
cdata = {}
# Every loaded record gets an entry, including those whose parent was not
# found, so that looking up the children of any record yields an array.
data.each_key { |ln| cdata[ln] = [] }
pdata.each do |child_ln, parent_ln|
  cdata[parent_ln] << child_ln if parent_ln
end

STDERR.puts "children databse created."
STDERR.puts "parent-children databases were successfully created."

STDERR.puts "gene and descendant features are being extracted..."

passed_paths = []
data.keys.sort.each do |ln|
  passed_paths << descendant_paths([[ln]], cdata) if data[ln].feature == "gene"
end

File.open(out_file, "w") do |o|
  # Paths share ancestors, so flatten to line indices and de-duplicate.
  passed_paths.flatten.sort.uniq.each do |ln|
    o.puts data[ln]
  end
end

STDERR.puts "target features were successfully extracted."

File.open(outfile_not_found_parents, "w") do |o|
  notfound.each do |ln, parent|
    o.puts [ln, parent, data[ln]].join("\t")
  end
end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/ezgff'
4
+ require 'thor'
5
+
6
+ include Ezgff
7
+
8
# Copies a GFF3 file (ARGV[0]) to standard output without its embedded FASTA
# section.
gff_file = ARGV[0]
abort "Usage: #{File.basename($PROGRAM_NAME)} GFF3_FILE" unless gff_file

# File.foreach closes the file when the block finishes or breaks.
File.foreach(gff_file) do |line|
  ## skip FASTA seq section; checked before printing so the ##FASTA
  ## directive itself is not left dangling at the end of the output
  break if /^\#\#FASTA/.match(line)

  puts line
end
@@ -163,7 +163,7 @@ module Ezgff
163
163
  end
164
164
  h2 = Hash.new
165
165
  h.each do |key, values|
166
- if key == "Dbxref" || key == "Ontology_term"
166
+ if key == "Dbxref2" # dummy (not used currently)
167
167
  h3 = Hash.new
168
168
  values.each do |val|
169
169
  m = /(.+?):/.match(val)
@@ -172,6 +172,8 @@ module Ezgff
172
172
  h3.update({dbtag => dbval})
173
173
  end
174
174
  h2[key] = h3
175
+ elsif key == "Ontology_term" || key == "Alias" || key == "Dbxref"
176
+ h2[key] = values
175
177
  else
176
178
  h2[key] = values.join(",")
177
179
  end
@@ -225,8 +227,12 @@ module Ezgff
225
227
  end
226
228
  end
227
229
 
228
- def search(query, num_limit=100)
229
- sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%";}
230
+ def search(query, num_limit=100, type=nil)
231
+ sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%" }
232
+ if type
233
+ sql += %Q{ AND type=="#{type}"}
234
+ end
235
+ sql += %Q{ LIMIT #{num_limit} } ;
230
236
  STDERR.puts sql
231
237
  res = @db.execute(sql)
232
238
  res2 = res.map{|r| an = Annotation.new(@db); an.build_from_db_record(r); an}
data/lib/ezgff/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Ezgff
2
- VERSION = "0.0.5"
2
+ VERSION = "0.0.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ezgff
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.5
4
+ version: 0.0.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Shuji Shigenobu
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2021-07-22 00:00:00.000000000 Z
11
+ date: 2021-08-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: sqlite3
@@ -72,6 +72,8 @@ email:
72
72
  - sshigenobu@gmail.com
73
73
  executables:
74
74
  - ezgff
75
+ - ezgff_extract_gene_model_features_from_gff3.rb
76
+ - ezgff_rm_fasta.rb
75
77
  extensions: []
76
78
  extra_rdoc_files: []
77
79
  files:
@@ -87,6 +89,8 @@ files:
87
89
  - dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
88
90
  - dev/gff_examples/apisum_part.gff3.gz
89
91
  - exe/ezgff
92
+ - exe/ezgff_extract_gene_model_features_from_gff3.rb
93
+ - exe/ezgff_rm_fasta.rb
90
94
  - ezgff.gemspec
91
95
  - lib/ezgff.rb
92
96
  - lib/ezgff/gffsqlitedb.rb