ezgff 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c5243c27b01a1b9c51ca19784b7fcce0cc86766de41270aa1a546370d1dcda9c
4
+ data.tar.gz: 06a3a2491430b10e33bae1e89c4a32d52feaab7a93425de47d6d3a0b15d6de23
5
+ SHA512:
6
+ metadata.gz: cfe588ee8d77d84540ade9b87cc3d3f6393e87cff783039959105a09a1fd9d6a94fd58aac1ee0964061112db6a5ba3dde4292bc9b4db8e2effee4b6f27bec703
7
+ data.tar.gz: cf1f4a39764e43456686fe7da5e7cae3cf1bbc2fba55b78ef9d4f52dd6ea232186a944b320a506bfc6499077e98d921ac6933fb774034d19323430724c1b5091
data/.gitignore ADDED
@@ -0,0 +1,56 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ # Ignore Byebug command history file.
17
+ .byebug_history
18
+
19
+ ## Specific to RubyMotion:
20
+ .dat*
21
+ .repl_history
22
+ build/
23
+ *.bridgesupport
24
+ build-iPhoneOS/
25
+ build-iPhoneSimulator/
26
+
27
+ ## Specific to RubyMotion (use of CocoaPods):
28
+ #
29
+ # We recommend against adding the Pods directory to your .gitignore. However
30
+ # you should judge for yourself, the pros and cons are mentioned at:
31
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
32
+ #
33
+ # vendor/Pods/
34
+
35
+ ## Documentation cache and generated files:
36
+ /.yardoc/
37
+ /_yardoc/
38
+ /doc/
39
+ /rdoc/
40
+
41
+ ## Environment normalization:
42
+ /.bundle/
43
+ /vendor/bundle
44
+ /lib/bundler/man/
45
+
46
+ # for a library or gem, you might want to ignore these files since the code is
47
+ # intended to run in multiple environments; otherwise, check them in:
48
+ # Gemfile.lock
49
+ # .ruby-version
50
+ # .ruby-gemset
51
+
52
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
53
+ .rvmrc
54
+
55
+ # Used by RuboCop. Remote config files pulled in from inherit_from directive.
56
+ # .rubocop-https?--*
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in ezgff.gemspec
4
+ gemspec
5
+
6
+ gem "rake", "~> 12.0"
7
+ gem "rspec", "~> 3.0"
8
+
9
+ gem "sqlite3"
10
+ gem "color_echo"
11
+ gem "bio"
12
+ gem "thor"
13
+
14
+
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Shuji Shigenobu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # ezgff_alpha
2
+
3
+ ## What is ezgff_alpha
4
+
5
+ ## Pre-requisites
6
+
7
+ * sqlite3
8
+
9
+ ## Install
10
+
11
+
12
+ ## Quick start
13
+
14
+ Build database from GFF3 file.
15
+
16
+ ```bash
17
+ ezgff build in.gff3
18
+ ```
19
+
20
+ Retrieve a GFF3 record by ID.
21
+
22
+ ```bash
23
+ ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors
24
+ ```
25
+
26
+ ```
27
+ ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq
28
+ ```
29
+
30
+ Example of post-processing the JSON output with jq:
31
+
32
+ ```
33
+ ezgff_alpha/bin/ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] |@csv'
34
+ ```
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,91 @@
1
+ require 'sqlite3'
2
+ require 'bio'
3
+ require 'json'
4
+
5
+ def attributes_as_json(gffline)
6
+ gr = Bio::GFF::GFF3::Record.new(gffline.chomp)
7
+ h = Hash.new
8
+ gr.attributes.each do |att|
9
+ k, v = att
10
+ unless h.has_key?(k)
11
+ h[k] = []
12
+ end
13
+ h[k] << v
14
+ end
15
+ h2 = Hash.new
16
+ h.each do |k, v|
17
+ h2[k] = v.join(",")
18
+ end
19
+ h2.to_json
20
+ end
21
+
22
+
23
## Build a sqlite3 database (<GFF>.sqlite3) from the GFF3 file given as the
## first command line argument.
gfffile = ARGV[0]
#gfffile = "example_gff/apisum_part.gff3"
#gfffile = "example_gff/ApL_HF_liftover_Refseq.gff"
abort "usage: #{$0} GFF3_FILE" unless gfffile

dbname = gfffile + ".sqlite3"

db = SQLite3::Database.new(dbname)

create_sql = <<-SQL
CREATE TABLE gff_records (
  line_num integer primary key,
  record text,
  id text,
  parent text,
  seqid text not null,
  source text,
  type text,
  start integer not null,
  end integer not null,
  score real,
  strand varchar(1),
  phase integer,
  attributes text,
  attributes_json text
);
SQL

db.execute(create_sql)

insert_sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

db.transaction do
  ## File.foreach closes the file when the iteration ends (or is broken out of).
  File.foreach(gfffile).with_index do |l, i|
    ## stop at the FASTA seq section
    break if /^\#\#FASTA/.match(l)

    ## skip header / comment lines
    next if /^\#/.match(l)

    record = l.chomp
    gr = Bio::GFF::GFF3::Record.new(record)

    ## A record may carry at most one ID attribute.
    ids = gr.attributes.select { |att| att[0] == "ID" }
    if ids.size > 1
      STDERR.puts gr.attributes
      raise "Multiple IDs found."
    end
    id = ids.empty? ? nil : ids[0][1]

    ## Only the first Parent attribute is stored in the parent column.
    parent = ((gr.attributes.select { |att| att[0] == "Parent" }[0]) || [])[1]
    a = record.split(/\t/)

    values = [i, record, id, parent,
              a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
              attributes_as_json(l)]
    db.execute(insert_sql, values)
  end
end

#===
# create index
table = "gff_records"
%w{id parent source type}.each do |col|
  idxname = "index_#{table}_on_#{col}"
  db.execute("CREATE INDEX #{idxname} ON #{table}(#{col})")
end
@@ -0,0 +1,70 @@
1
+ require 'sqlite3'
2
+ require 'bio'
3
+ require 'json'
4
+ require './gffsqlitedb'
5
+
6
## Insert one (line_num, key, value) row into gff_keywords when the record
## has a truthy value for the attribute +key+; otherwise do nothing.
##
## record :: object responding to #attributes (Hash-like) and #line_num
## key    :: attribute name to look up
## db_kw  :: database handle responding to #execute(sql, values)
def insert_data(record, key, db_kw)
  val = record.attributes[key]
  return unless val

  sql = "INSERT INTO gff_keywords (line_num, key, value) VALUES (?, ?, ?)"
  values = [record.line_num, key, val]
  ## progress / debug output
  puts sql
  p values
  db_kw.execute(sql, values)
end
18
+
19
+ def insert_data_dbxref(record, key, db_kw)
20
+ rec = record
21
+ xdb = rec.dbxrefs
22
+ val = xdb[key]
23
+ sql = "INSERT INTO gff_keywords (line_num, key, value, category) VALUES (?, ?, ?, ?)"
24
+ values = [rec.line_num, key, val, "Dbxref"]
25
+ puts sql
26
+ p values
27
+ db_kw.execute(sql, values)
28
+ end
29
+
30
+ gfffile = ARGV[0]
31
+
32
+ db_file = gfffile + ".sqlite3" # altready built
33
+ #db_keywards = gfffile + ".keywords.sqlite3"
34
+
35
+ gffdb = GffDb.new(db_file) # altready built
36
+ db = SQLite3::Database.new(db_file)
37
+
38
+ sql = <<-SQL
39
+ CREATE TABLE gff_keywords (
40
+ id integer primary key,
41
+ line_num integer,
42
+ key text not null,
43
+ value text,
44
+ category text
45
+ );
46
+ SQL
47
+
48
+ db.execute(sql)
49
+
50
+ db.transaction do
51
+ gffdb.each_record do |r|
52
+ insert_data(r, "Name", db)
53
+ insert_data(r, "gbkey", db)
54
+ insert_data(r, "gene", db)
55
+ insert_data(r, "product", db)
56
+ insert_data(r, "transcript_id", db)
57
+ r.dbxrefs.keys.each do |k|
58
+ insert_data_dbxref(r, k, db)
59
+ end
60
+ end
61
+ end
62
+
63
+ #===
64
+ # create index
65
+ table = "gff_keywords"
66
+ %w{id line_num key value}.each do |col|
67
+ idxname = "index_#{table}_on_#{col}"
68
+ sql = "CREATE INDEX #{idxname} ON #{table}(#{col})"
69
+ db.execute(sql)
70
+ end
@@ -0,0 +1,13 @@
1
## Sort a GFF file by position, compress it with bgzip and index it with tabix.
## The GFF path is taken from the first command line argument; the bundled
## example path is used only when no argument is given.
require 'shellwords'

gfffile = ARGV[0] || "example_gff/apisum_part.gff3"
#gfffile = "example_gff/ApL_HF_liftover_Refseq.gff"

#=== sort gff by position
gfffile_sorted = gfffile + ".gz"
## Escape the paths so that spaces or shell metacharacters in file names
## cannot break (or inject into) the shell pipeline.
src = Shellwords.escape(gfffile)
dst = Shellwords.escape(gfffile_sorted)

cmd = %Q{(grep ^"#" #{src}; grep -v ^"#" #{src} | sort -k1,1 -k4,4n) | bgzip > #{dst};}
sorted_ok = system(cmd)

cmd = "tabix -p gff #{dst}"
indexed_ok = system(cmd)

if sorted_ok && indexed_ok
  STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
else
  STDERR.puts "failed to build tabix index for #{gfffile}"
end
data/exe/ezgff ADDED
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/ezgff'
4
+ require 'thor'
5
+ require 'color_echo/get'
6
+
7
+ module Ezgff
8
+ class CLI < Thor
9
+
10
+ ## hack to enable -h option
11
+ ## ref: https://magazine.rubyist.net/articles/0046/0046-Milkode.html#%E3%82%AB%E3%82%B9%E3%82%BF%E3%83%9E%E3%82%A4%E3%82%BA-milk-add--h-%E3%82%92%E5%AE%9F%E7%8F%BE%E3%81%99%E3%82%8B
12
+
13
+ class_option :help, :type => :boolean, :aliases => '-h', :desc => 'Help message'
14
+
15
+ no_tasks do
16
## Hook into command dispatch so that -h / --help given to any subcommand
## prints the help of that subcommand instead of running it.
def invoke_command(task, *args)
  if options[:help]
    Ezgff::CLI.task_help(shell, task.name)
  elsif options[:version] && task.name == 'help'
    ## report this tool's name and version constant
    puts "ezgff #{Ezgff::VERSION}"
  else
    super
  end
end
26
+ end
27
+ ## end of -h option setting
28
+
29
+ desc "version", "show version number"
30
+ def version()
31
+ puts Ezgff::VERSION
32
+ end
33
+
34
+ desc "view DB QUERY", "retrieve GFF record by ID and view it in a specified format.
35
+ DB path to ezdb
36
+ ezdb should be created by using 'build' subcommand in advance.
37
+ QUERY query for search
38
+ Two modes, simple mode and advanced mode are available.
39
+ Simple mode
40
+ ID is given and search by the ID.
41
+ Advanced mode
42
+ Query is given in KEY=VALUE style. Available keys are
43
+ ID: ID (ex. ID=rna-XM_029485812.1)
44
+ LN: Line number (ex. LN=255)
45
+ Note: Spaces are not allowed before and after =
46
+
47
+ "
48
+
49
+ option :format, :aliases => '-f', :enum => ["json", "gff"], :default => "gff", :desc => "Specify output format."
50
+ option :with, :aliases => '-w', :enum => ["none", "parent", "children", "ancestors", "descendants"], :default => "none", :desc => "Retrieve data with parent or children features."
51
+ option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
52
+
53
+ def view(db, query)
54
+ ezdb = db
55
+ files = Dir["#{ezdb}/*.sqlite3"]
56
+ sq3_file = nil
57
+ if files.size == 1
58
+ sq3_file = files[0]
59
+ elsif files.size == 0
60
+ raise "sqlite3 file not found"
61
+ elsif files.size > 1
62
+ raise "Multiple sqlite3 files found"
63
+ end
64
+ sq3_db = GffDb.new(sq3_file)
65
+
66
+ if m = /^LN\=/.match(query)
67
+ ## search by line number
68
+ query2 = m.post_match.strip
69
+ ann = sq3_db.get_by_line_number(query2)
70
+ elsif m = /^ID\=/.match(query)
71
+ ## search by ID
72
+ query2 = m.post_match.strip
73
+ ann = sq3_db.get(query2)
74
+ else
75
+ ann = sq3_db.get(query)
76
+ end
77
+
78
+ results = []
79
+ if options[:with] == "descendants"
80
+ results = ann.descendants
81
+ elsif options[:with] == "ancestors"
82
+ results = ann.ancestors
83
+ else
84
+ parent = nil
85
+ if options[:with] == "parent"
86
+ parent = ann.parent
87
+ end
88
+ children = []
89
+ if options[:with] == "children"
90
+ ann.children.each do |c|
91
+ children << c
92
+ end
93
+ end
94
+
95
+ results = []
96
+ results << parent if parent
97
+ results << ann
98
+ results += children
99
+ end
100
+
101
+ ## Filter results
102
+ ## - type / type
103
+ if options[:type]
104
+ target_type = options[:type]
105
+ results = results.select{|r| r.type == target_type}
106
+ end
107
+
108
+ case options[:format]
109
+ when "json"
110
+ h = Hash.new
111
+ ary = results.map{|r| r.to_h}
112
+ h["gff_records"] = ary
113
+ puts h.to_json
114
+
115
+ when "gff"
116
+ puts results
117
+ else
118
+ raise "Unknown format: #{options[:format]}"
119
+ end
120
+
121
+ end
122
+
123
+ desc "build GFF", "build database from GFF file"
124
+ option :help, :aliases => :h, :type => :boolean
125
+ def build(gff_in)
126
+ # puts "build #{gff_file} => #{dbpath}"
127
+ dbpath = GffDb.build_db(gff_in)
128
+ STDERR.puts "new database created: #{dbpath}"
129
+ gff_file = dbpath + "/" + File.basename(gff_in)
130
+ GffDb.build_tabix(gff_file)
131
+ end
132
+
133
+ desc "search DB QUERY", "search GFF record giving query"
134
+ option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
135
## Search the ezdb directory +db+ for GFF records matching +query+ and print
## them as JSON or as GFF text with the matched text highlighted.
##
## Raises RuntimeError when the ezdb directory does not contain exactly one
## *.sqlite3 file, or when the output format is unknown.
def search(db, query)
  files = Dir["#{db}/*.sqlite3"]
  raise "sqlite3 file not found" if files.empty?
  raise "Multiple sqlite3 files found" if files.size > 1

  sq3_db = GffDb.new(files[0])
  results = sq3_db.search(query, 100)

  case options[:format]
  when "json"
    puts({ "gff_records" => results.map { |r| r.to_h } }.to_json)
  when "gff"
    gfftxt = results.map { |r| r.to_s }.join("\n")
    ## The query is literal text, not a pattern: escape it so that characters
    ## such as "." "(" or "[" neither change the match nor raise RegexpError.
    highlight = /#{Regexp.escape(query)}/i
    puts CE.pickup(highlight, :green, nil, :bold).get(gfftxt)
  else
    raise "Unknown format: #{options[:format]}"
  end
end
161
+
162
+ end
163
+ end
164
+
165
+ Ezgff::CLI.start(ARGV)
166
+
167
+
168
+
data/ezgff.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ require_relative 'lib/ezgff/version'
2
+
3
+ Gem::Specification.new do |spec|
4
+ spec.name = "ezgff"
5
+ spec.version = Ezgff::VERSION
6
+ spec.authors = ["Shuji Shigenobu"]
7
+ spec.email = ["sshigenobu@gmail.com"]
8
+
9
+ spec.summary = %q{Utilities for GFF3}
10
+ spec.description = %q{Utilities for GFF3, the genome annotation format. Useful to explore the gene model features.}
11
+ spec.homepage = "https://github.com/shujishigenobu/ezgff_alpha"
12
+ spec.license = "MIT"
13
+ spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
+
15
+ # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
16
+
17
+ spec.metadata["homepage_uri"] = spec.homepage
18
+ spec.metadata["source_code_uri"] = "https://github.com/shujishigenobu/ezgff_alpha"
19
+ spec.metadata["changelog_uri"] = "https://github.com/shujishigenobu/ezgff_alpha"
20
+ # TODO: Put your gem's CHANGELOG.md URL here."
21
+
22
+ # Specify which files should be added to the gem when it is released.
23
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
24
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
25
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
26
+ end
27
+ spec.bindir = "exe"
28
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
29
+ spec.require_paths = ["lib"]
30
+ end
data/lib/ezgff.rb ADDED
@@ -0,0 +1,12 @@
1
+ require_relative "ezgff/version"
2
+ require_relative "ezgff/gffsqlitedb"
3
+
4
+ #require "ezgff/version"
5
+ #require "ezgff/gffsqlitedb"
6
+
7
+
8
+
9
+ module Ezgff
10
+ class Error < StandardError; end
11
+ # Your code goes here...
12
+ end
@@ -0,0 +1,390 @@
1
+ require 'sqlite3'
2
+ require 'json'
3
+ require 'bio'
4
+ require 'fileutils'
5
+
6
+ #
7
+ # References
8
+ # * Official specification of GFF3 -- https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
9
+ #
10
+
11
+ module Ezgff
12
+
13
+ class GffDb
14
+
15
+ #===
16
+ # sqlite3 schema
17
+ #
18
+ # gff_records (
19
+ # line_num integer primary key,
20
+ # record text, # original record
21
+ # id text,
22
+ # parent text,
23
+ # seqid text not null,
24
+ # source text,
25
+ # type text,
26
+ # start integer not null,
27
+ # end integer not null,
28
+ # score real,
29
+ # strand varchar(1),
30
+ # phase integer,
31
+ # attributes text,
32
+ # attributes_json json
33
+ # )
34
+
35
## Build an ezdb directory (<ezdb_base>/<basename of gff_in>.ezdb) holding a
## copy of the GFF feature lines and a sqlite3 database built from them.
##
## gff_in    :: path to the input GFF3 file
## ezdb_base :: directory in which the .ezdb directory is created (default ".")
##
## Returns the path of the created .ezdb directory.
## Raises RuntimeError ("Multiple IDs found.") when a record has more than
## one ID attribute. Dir.mkdir raises when the directory already exists.
def self.build_db(gff_in, ezdb_base = nil)
  ezdb_base = (ezdb_base || ".")
  ezdb_path = ezdb_base + "/" + File.basename(gff_in) + ".ezdb"
  gff_file = ezdb_path + "/" + File.basename(gff_in)
  Dir.mkdir(ezdb_path)

  ## Copy the feature lines only: comment/directive lines are dropped and
  ## copying stops at the FASTA section. File.foreach closes the input file.
  File.open(gff_file, "w") do |o|
    File.foreach(gff_in) do |l|
      break if /^\#\#FASTA/.match(l)
      next if /^\#/.match(l)
      o.puts l
    end
  end

  sq3_file = gff_file + ".sqlite3"

  ## Create table in sqlite3 RDBMS
  ## table name: gff_records
  sq3_db = SQLite3::Database.new(sq3_file)
  begin
    create_sql = <<-SQL
    CREATE TABLE gff_records (
      line_num integer primary key,
      record text,
      id text,
      parent text,
      seqid text not null,
      source text,
      type text,
      start integer not null,
      end integer not null,
      score real,
      strand varchar(1),
      phase integer,
      attributes text,
      attributes_json json
    );
    SQL
    sq3_db.execute(create_sql)

    insert_sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

    ## Read the copied GFF file and insert one row per feature line.
    ## line_num is the 0-based line index within the copied file.
    sq3_db.transaction do
      File.foreach(gff_file).with_index do |l, i|
        ## stop at the FASTA seq section
        break if /^\#\#FASTA/.match(l)

        ## skip header section
        next if /^\#/.match(l)

        record = l.chomp
        gr = Bio::GFF::GFF3::Record.new(record)

        ## A record may carry at most one ID attribute.
        ids = gr.attributes.select { |att| att[0] == "ID" }
        if ids.size > 1
          STDERR.puts gr.attributes
          raise "Multiple IDs found."
        end
        id = ids.empty? ? nil : ids[0][1]

        ## Only the first Parent attribute is stored in the parent column.
        parent = ((gr.attributes.select { |att| att[0] == "Parent" }[0]) || [])[1]
        a = record.split(/\t/)

        values = [i, record, id, parent,
                  a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
                  attributes_as_json(l)]
        sq3_db.execute(insert_sql, values)
      end
    end

    ## Indexing the sqlite3 table
    table = "gff_records"
    %w{id parent source type}.each do |col|
      idxname = "index_#{table}_on_#{col}"
      sq3_db.execute("CREATE INDEX #{idxname} ON #{table}(#{col})")
    end
  ensure
    ## release the database handle even when building fails
    sq3_db.close
  end

  ezdb_path
end
124
+
125
## Sort the GFF file +gff_in+ by seqid and start position, compress it with
## bgzip into <gff_in>.gz and index it with tabix (<gff_in>.gz.tbi).
## The external commands grep, sort, bgzip and tabix are run through the shell.
def self.build_tabix(gff_in)
  require 'shellwords' # only needed by this method

  gfffile_sorted = gff_in + ".gz"
  ## Escape paths so spaces or shell metacharacters cannot break the pipeline.
  src = Shellwords.escape(gff_in)
  dst = Shellwords.escape(gfffile_sorted)

  ## sort gff by position
  ## The field separator is passed as a literal TAB inside double quotes;
  ## the $'\t' form is a bash extension and system() runs /bin/sh.
  cmd = %Q{(grep ^"#" #{src}; grep -v ^"#" #{src} | sort -t "\t" -k1,1 -k4,4n) | bgzip > #{dst};}
  STDERR.puts cmd
  sorted_ok = system(cmd)

  cmd = "tabix -p gff #{dst}"
  STDERR.puts cmd
  indexed_ok = system(cmd)

  if sorted_ok && indexed_ok
    STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
  else
    STDERR.puts "failed to build tabix index for #{gff_in}"
  end
end
138
+
139
## Convert the attributes column of one GFF3 line into a JSON string.
##
## Values of a repeated key are joined with ",". Dbxref and Ontology_term
## are stored as an object mapping the text before the first ":" to the text
## after it; when the same tag occurs more than once the last value wins.
##
## gffline :: one GFF3 feature line (trailing newline allowed)
## Returns a String containing a JSON object.
def self.attributes_as_json(gffline)
  gr = Bio::GFF::GFF3::Record.new(gffline.chomp)

  ## group values by attribute key, preserving first-seen key order
  grouped = {}
  gr.attributes.each do |k, v|
    (grouped[k] ||= []) << v
  end

  result = {}
  grouped.each do |key, values|
    if key == "Dbxref" || key == "Ontology_term"
      xrefs = {}
      values.each do |val|
        m = /(.+?):/.match(val)
        if m
          xrefs[m[1]] = m.post_match
        else
          ## NOTE(review): value without a "TAG:" prefix; it is kept under its
          ## own text with an empty value instead of crashing -- confirm this
          ## is the desired representation.
          xrefs[val] = ""
        end
      end
      result[key] = xrefs
    else
      result[key] = values.join(",")
    end
  end
  result.to_json
end
169
+
170
+ def initialize(path)
171
+ @db = SQLite3::Database.new(path)
172
+ end
173
+
174
## Yield an Annotation for every row of gff_records.
## Each Annotation is given the database handle so that its navigation
## methods (parent, children, ...) can run queries.
## Returns an Enumerator when no block is given.
def each_record
  return enum_for(:each_record) unless block_given?

  @db.execute("SELECT * FROM gff_records").each do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    yield an
  end
end
182
+
183
## Fetch the single record whose id column equals +id+.
##
## Returns an Annotation.
## Raises RuntimeError "not found: <id>" when no row matches and
## "multiple hits" when more than one row matches.
def get(id)
  ## Bind the value instead of interpolating it: a double-quoted token can be
  ## read by SQLite as a column name (e.g. "type"), and a quote character
  ## inside the value would break the statement.
  rows = @db.execute("SELECT * FROM gff_records WHERE id = ?", [id])
  raise "not found: #{id}" if rows.empty?
  raise "multiple hits" if rows.size >= 2

  an = Annotation.new(@db)
  an.build_from_db_record(rows[0])
  an
end
199
+
200
## Fetch the single record whose line_num column equals +n+.
##
## Returns an Annotation.
## Raises RuntimeError "not found: <n>" when no row matches and
## "multiple hits" when more than one row matches.
def get_by_line_number(n)
  ## the value is bound, never interpolated into the SQL text
  rows = @db.execute("SELECT * FROM gff_records WHERE line_num = ?", [n])
  raise "not found: #{n}" if rows.empty?
  raise "multiple hits" if rows.size >= 2

  an = Annotation.new(@db)
  an.build_from_db_record(rows[0])
  an
end
215
+
216
## Find records whose id, parent or attributes column contains +query+
## (SQL LIKE with the query wrapped in "%").
##
## query     :: text to look for
## num_limit :: maximum number of records returned (default 100);
##              nil means no limit
##
## Returns an Array of Annotation in line_num order.
def search(query, num_limit=100)
  pattern = "%#{query}%"
  sql = "SELECT * FROM gff_records WHERE id LIKE ? OR parent LIKE ? OR attributes LIKE ? ORDER BY line_num LIMIT ?"
  STDERR.puts sql
  ## A negative LIMIT means "no upper bound" in SQLite.
  limit = num_limit.nil? ? -1 : num_limit
  rows = @db.execute(sql, [pattern, pattern, pattern, limit])
  rows.map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end
223
+
224
+ class Annotation
225
+
226
+ def initialize(db = nil)
227
+ @db = db
228
+ @seqid
229
+ @source
230
+ @type
231
+ @start
232
+ @end
233
+ @score
234
+ @strand
235
+ @phase
236
+ @attributes
237
+ @id
238
+ @parent_id
239
+ @gffline
240
+ end
241
+
242
+ attr_accessor :seqid, :source, :type, :start, :end, :score, :strand, :phase, :attributes
243
+ attr_accessor :id, :parent_id, :gffline, :line_num
244
+
245
+ def to_s
246
+ gffline
247
+ end
248
+
249
+ def to_hash
250
+ h = {
251
+ 'seqid' => seqid,
252
+ 'source' => source,
253
+ 'type' => type,
254
+ 'start' => start,
255
+ 'end' => self.end,
256
+ 'score' => score,
257
+ 'strand' => strand,
258
+ 'phase' => phase,
259
+ 'line_num' => line_num,
260
+ 'id' => id,
261
+ 'parent_id' => parent_id,
262
+ 'attributes' => attributes
263
+ }
264
+ end
265
+
266
+ alias :to_h :to_hash
267
+
268
## JSON representation of #to_hash.
## Accepts and forwards the optional generator arguments, so the method also
## works when it is called as to_json(state) by a JSON generator.
def to_json(*args)
  to_hash.to_json(*args)
end
271
+
272
+ def build_from_db_record(sql_result)
273
+ ## sql_result: Array returned by @db.execute(sql)
274
+ v = sql_result
275
+ @seqid = v[4]
276
+ @source = v[5]
277
+ @type = v[6]
278
+ @start = v[7]
279
+ @end = v[8]
280
+ @score = v[9]
281
+ @strand = v[10]
282
+ @phase = v[11]
283
+ @line_num = v[0]
284
+ @gffline = v[1]
285
+ @id = v[2]
286
+ @parent_id = v[3]
287
+ @attributes = JSON.parse(v[13])
288
+ end
289
+
290
## Return the Annotation whose id equals this record's parent_id.
## Returns nil when the record has no parent_id or when no record with that
## id exists in the database.
## NOTE(review): parent_id holds a single value; a feature with several
## parents would need separate handling -- confirm against the data.
def parent
  return nil unless parent_id

  ## the value is bound, never interpolated into the SQL text
  rows = @db.execute("SELECT * FROM gff_records WHERE id = ?", [parent_id])
  return nil if rows.empty?

  an = Annotation.new(@db)
  an.build_from_db_record(rows[0])
  an
end
302
+
303
## Return the records whose parent column equals this record's id,
## as an Array of Annotation (empty when there are none).
def children
  ## the value is bound, never interpolated into the SQL text
  rows = @db.execute("SELECT * FROM gff_records WHERE parent = ?", [id])
  rows.map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end
315
+
316
## Return this record followed by every record reachable from it by
## repeatedly following parent -> child links, as an Array of Annotation.
def descendants
  ## Recursive CTE: seed with the record itself, then add rows whose parent
  ## is an id already collected. The id is bound, not interpolated.
  sql = <<-SQL
    WITH RECURSIVE r AS (
      SELECT * FROM gff_records WHERE id = ?
      UNION ALL
      SELECT gff_records.* FROM gff_records, r WHERE gff_records.parent == r.id
    )
    SELECT * FROM r
  SQL
  @db.execute(sql, [id]).map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end
332
+
333
## Return this record followed by every record reachable from it by
## repeatedly following child -> parent links, as an Array of Annotation.
def ancestors
  ## Recursive CTE: seed with the record itself, then add the row whose id
  ## is the parent of a row already collected. The id is bound, not
  ## interpolated.
  sql = <<-SQL
    WITH RECURSIVE ancestor AS (
      SELECT * FROM gff_records WHERE id = ?
      UNION ALL
      SELECT gff_records.* FROM gff_records, ancestor
      WHERE ancestor.parent = gff_records.id
    )
    SELECT * FROM ancestor;
  SQL
  @db.execute(sql, [id]).map do |row|
    an = Annotation.new(@db)
    an.build_from_db_record(row)
    an
  end
end
350
+
351
+
352
## Feature length computed as end - start + 1.
## Raises RuntimeError when the result is not positive.
def length
  len = @end - @start + 1
  raise "invalid coordinates: start=#{@start}, end=#{@end}" unless len > 0
  len
end
357
+
358
## Return the Dbxref attribute as a Hash mapping each database tag to its
## value, or an empty Hash when the record has no Dbxref.
##
## Two stored shapes are accepted: a Hash (as written by
## GffDb.attributes_as_json) and a comma-separated "TAG:value" String.
def dbxrefs
  raw = attributes["Dbxref"]
  return {} unless raw
  ## already a tag => value mapping; return a copy so callers cannot mutate
  ## the attributes of this record
  return raw.dup if raw.is_a?(Hash)

  refs = {}
  raw.split(/,/).each do |x|
    m = /(.+?):/.match(x)
    next unless m # entry without a "TAG:" prefix
    refs[m[1]] = m.post_match
  end
  refs
end
370
+
371
+ end
372
+
373
+ end
374
+
375
+ end
376
+
377
## Ad-hoc check when this file is run directly:
##   ruby gffsqlitedb.rb SQLITE3_FILE ID
## Looks up one record by ID and prints it in several forms.
if __FILE__ == $0
  dbname = ARGV[0]
  query = ARGV[1]
  abort "usage: #{$0} SQLITE3_FILE ID" unless dbname && query

  ## GffDb is defined inside the Ezgff module, so it must be qualified here.
  db = Ezgff::GffDb.new(dbname)
  ann = db.get(query)
  p ann
  puts ann.to_s
  p ann.to_hash
end
@@ -0,0 +1,3 @@
1
+ module Ezgff
2
+ VERSION = "0.0.2"
3
+ end
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ezgff
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Shuji Shigenobu
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-07-14 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Utilities for GFF3, the genome annotation format. Useful to explore the
14
+ gene model features.
15
+ email:
16
+ - sshigenobu@gmail.com
17
+ executables:
18
+ - ezgff
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - ".gitignore"
23
+ - Gemfile
24
+ - LICENSE
25
+ - README.md
26
+ - Rakefile
27
+ - bin/build_gff_sqlitedb.rb
28
+ - bin/build_gff_sqlitedb_keywords.rb
29
+ - bin/build_gff_tabix.rb
30
+ - dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz
31
+ - dev/gff_examples/ApMT_NC_011594.gb.gff.gz
32
+ - dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
33
+ - dev/gff_examples/apisum_part.gff3.gz
34
+ - dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz
35
+ - exe/ezgff
36
+ - ezgff.gemspec
37
+ - lib/ezgff.rb
38
+ - lib/ezgff/gffsqlitedb.rb
39
+ - lib/ezgff/version.rb
40
+ homepage: https://github.com/shujishigenobu/ezgff_alpha
41
+ licenses:
42
+ - MIT
43
+ metadata:
44
+ homepage_uri: https://github.com/shujishigenobu/ezgff_alpha
45
+ source_code_uri: https://github.com/shujishigenobu/ezgff_alpha
46
+ changelog_uri: https://github.com/shujishigenobu/ezgff_alpha
47
+ post_install_message:
48
+ rdoc_options: []
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: 2.3.0
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ requirements: []
62
+ rubygems_version: 3.1.2
63
+ signing_key:
64
+ specification_version: 4
65
+ summary: Utilities for GFF3
66
+ test_files: []