ezgff 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: c5243c27b01a1b9c51ca19784b7fcce0cc86766de41270aa1a546370d1dcda9c
4
+ data.tar.gz: 06a3a2491430b10e33bae1e89c4a32d52feaab7a93425de47d6d3a0b15d6de23
5
+ SHA512:
6
+ metadata.gz: cfe588ee8d77d84540ade9b87cc3d3f6393e87cff783039959105a09a1fd9d6a94fd58aac1ee0964061112db6a5ba3dde4292bc9b4db8e2effee4b6f27bec703
7
+ data.tar.gz: cf1f4a39764e43456686fe7da5e7cae3cf1bbc2fba55b78ef9d4f52dd6ea232186a944b320a506bfc6499077e98d921ac6933fb774034d19323430724c1b5091
data/.gitignore ADDED
@@ -0,0 +1,56 @@
1
+ *.gem
2
+ *.rbc
3
+ /.config
4
+ /coverage/
5
+ /InstalledFiles
6
+ /pkg/
7
+ /spec/reports/
8
+ /spec/examples.txt
9
+ /test/tmp/
10
+ /test/version_tmp/
11
+ /tmp/
12
+
13
+ # Used by dotenv library to load environment variables.
14
+ # .env
15
+
16
+ # Ignore Byebug command history file.
17
+ .byebug_history
18
+
19
+ ## Specific to RubyMotion:
20
+ .dat*
21
+ .repl_history
22
+ build/
23
+ *.bridgesupport
24
+ build-iPhoneOS/
25
+ build-iPhoneSimulator/
26
+
27
+ ## Specific to RubyMotion (use of CocoaPods):
28
+ #
29
+ # We recommend against adding the Pods directory to your .gitignore. However
30
+ # you should judge for yourself, the pros and cons are mentioned at:
31
+ # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
32
+ #
33
+ # vendor/Pods/
34
+
35
+ ## Documentation cache and generated files:
36
+ /.yardoc/
37
+ /_yardoc/
38
+ /doc/
39
+ /rdoc/
40
+
41
+ ## Environment normalization:
42
+ /.bundle/
43
+ /vendor/bundle
44
+ /lib/bundler/man/
45
+
46
+ # for a library or gem, you might want to ignore these files since the code is
47
+ # intended to run in multiple environments; otherwise, check them in:
48
+ # Gemfile.lock
49
+ # .ruby-version
50
+ # .ruby-gemset
51
+
52
+ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
53
+ .rvmrc
54
+
55
+ # Used by RuboCop. Remote config files pulled in from inherit_from directive.
56
+ # .rubocop-https?--*
data/Gemfile ADDED
@@ -0,0 +1,14 @@
1
+ source "https://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in ezgff.gemspec
4
+ gemspec
5
+
6
+ gem "rake", "~> 12.0"
7
+ gem "rspec", "~> 3.0"
8
+
9
+ gem "sqlite3"
10
+ gem "color_echo"
11
+ gem "bio"
12
+ gem "thor"
13
+
14
+
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2021 Shuji Shigenobu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # ezgff_alpha
2
+
3
+ ## What is ezgff_alpha
4
+
5
+ ## Pre-requisites
6
+
7
+ * sqlite3
8
+
9
+ ## Install
10
+
11
+
12
+ ## Quick start
13
+
14
+ Build database from GFF3 file.
15
+
16
+ ```bash
17
+ ezgff build in.gff3
18
+ ```
19
+
20
+ Retrieve a GFF3 record by ID.
21
+
22
+ ```bash
23
+ ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors
24
+ ```
25
+
26
+ ```
27
+ ezgff view data.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq
28
+ ```
29
+
30
+ Example of post-processing the JSON output with jq:
31
+
32
+ ```
33
+ ezgff_alpha/bin/ezgff view GCF_000009605.1_ASM960v1_genomic.gff.ezdb cds-WP_010895901.1 --with=ancestors --format=json |jq -r '.gff_records | map(select(.type == "gene"))[0] | [.seqid, .start, .end, .attributes.gene] |@csv'
34
+ ```
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,91 @@
1
+ require 'sqlite3'
2
+ require 'bio'
3
+ require 'json'
4
+
5
# Serialize the attributes of one GFF3 line as a JSON object string.
#
# gffline - one GFF3 record line (a trailing newline is removed before parsing).
#
# Values that appear under the same attribute key are collected in order and
# joined with "," so every key maps to a single string.
def attributes_as_json(gffline)
  record = Bio::GFF::GFF3::Record.new(gffline.chomp)

  grouped = record.attributes.each_with_object({}) do |(key, value), acc|
    acc[key] = [] unless acc.has_key?(key)
    acc[key] << value
  end

  joined = {}
  grouped.each_pair { |key, values| joined[key] = values.join(",") }
  joined.to_json
end
21
+
22
+
23
# Usage: ruby build_gff_sqlitedb.rb <in.gff3>
#
# Creates <in.gff3>.sqlite3 with a gff_records table, inserts one row per
# feature line of the GFF3 file, then indexes the id/parent/source/type columns.
gfffile = ARGV[0]
#gfffile = "example_gff/apisum_part.gff3"
#gfffile = "example_gff/ApL_HF_liftover_Refseq.gff"

dbname = gfffile + ".sqlite3"

db = SQLite3::Database.new(dbname)

sql = <<-SQL
CREATE TABLE gff_records (
  line_num integer primary key,
  record text,
  id text,
  parent text,
  seqid text not null,
  source text,
  type text,
  start integer not null,
  end integer not null,
  score real,
  strand varchar(1),
  phase integer,
  attributes text,
  attributes_json text
);
SQL

db.execute(sql)

insert_sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json) " \
             "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"

db.transaction do
  # Block form of File.open closes the input even when an insert raises.
  File.open(gfffile) do |io|
    io.each_with_index do |l, i|
      ## stop reading at the FASTA seq section
      break if /^\#\#FASTA/.match(l)

      ## skip header / comment lines
      next if /^\#/.match(l)

      line = l.chomp
      gr = Bio::GFF::GFF3::Record.new(line)

      id_found = gr.attributes.select { |a| a[0] == "ID" }
      # Compare the number of ID attributes; comparing the Array itself with
      # an Integer raises NoMethodError instead of reporting the duplicate.
      if id_found.size > 1
        STDERR.puts gr.attributes
        raise "Multiple IDs found."
      end
      id = id_found.empty? ? nil : id_found[0][1]

      # Only the first Parent attribute is stored in the parent column.
      parent = ((gr.attributes.select { |a| a[0] == "Parent" }[0]) || [])[1]
      a = line.split(/\t/)

      values = [i, line, id, parent,
                a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
                attributes_as_json(l)]
      db.execute(insert_sql, values)
    end
  end
end

#===
# create index
table = "gff_records"
%w{id parent source type}.each do |col|
  idxname = "index_#{table}_on_#{col}"
  db.execute("CREATE INDEX #{idxname} ON #{table}(#{col})")
end
@@ -0,0 +1,70 @@
1
+ require 'sqlite3'
2
+ require 'bio'
3
+ require 'json'
4
+ require './gffsqlitedb'
5
+
6
# Insert one (line_num, key, value) row into gff_keywords when the record
# carries a truthy value for the attribute +key+.
#
# record - object responding to #attributes (key => value lookup) and #line_num.
# key    - attribute name to look up.
# db_kw  - database handle responding to #execute(sql, values).
#
# Returns whatever db_kw.execute returns, or nil when the attribute is absent.
# The statement and its values are echoed to standard output.
def insert_data(record, key, db_kw)
  # The Dbxref table of the record is not needed here, so it is no longer
  # computed; a malformed Dbxref must not abort the insert of another key.
  val = record.attributes[key]
  return unless val

  sql = "INSERT INTO gff_keywords (line_num, key, value) VALUES (?, ?, ?)"
  values = [record.line_num, key, val]
  puts sql
  p values
  db_kw.execute(sql, values)
end
18
+
19
# Insert one Dbxref entry of +record+ into gff_keywords with category "Dbxref".
#
# record - object responding to #dbxrefs (key => value lookup) and #line_num.
# key    - key to read from record.dbxrefs.
# db_kw  - database handle responding to #execute(sql, values).
#
# The statement and its values are echoed to standard output before executing.
def insert_data_dbxref(record, key, db_kw)
  value = record.dbxrefs[key]
  row = [record.line_num, key, value, "Dbxref"]
  statement = "INSERT INTO gff_keywords (line_num, key, value, category) VALUES (?, ?, ?, ?)"
  puts statement
  p row
  db_kw.execute(statement, row)
end
29
+
30
# Usage: ruby build_gff_sqlitedb_keywords.rb <in.gff3>
#
# Works on <in.gff3>.sqlite3, which must already have been built: adds a
# gff_keywords table, fills it from selected attributes and all Dbxref
# entries of every record, then indexes the table.
gfffile = ARGV[0]

db_file = gfffile + ".sqlite3" # already built
#db_keywards = gfffile + ".keywords.sqlite3"

gffdb = GffDb.new(db_file) # already built
db = SQLite3::Database.new(db_file)

db.execute(<<-SQL)
CREATE TABLE gff_keywords (
  id integer primary key,
  line_num integer,
  key text not null,
  value text,
  category text
);
SQL

# Attribute keys copied into gff_keywords, in insertion order.
attribute_keys = %w{Name gbkey gene product transcript_id}

db.transaction do
  gffdb.each_record do |r|
    attribute_keys.each { |key| insert_data(r, key, db) }
    r.dbxrefs.keys.each { |k| insert_data_dbxref(r, k, db) }
  end
end

#===
# create index
table = "gff_keywords"
%w{id line_num key value}.each do |col|
  db.execute("CREATE INDEX index_#{table}_on_#{col} ON #{table}(#{col})")
end
@@ -0,0 +1,13 @@
1
require "shellwords"

# Usage: ruby build_gff_tabix.rb [in.gff3]
#
# Sorts the GFF file by seqid and start, compresses it with bgzip and indexes
# it with tabix. The example file is used only when no argument is given
# (previously the argument was always overwritten by the example path).
gfffile = ARGV[0] || "example_gff/apisum_part.gff3"
#gfffile = "example_gff/ApL_HF_liftover_Refseq.gff"

#=== sort gff by position
gfffile_sorted = gfffile + ".gz"

# Escape paths so that spaces or shell metacharacters in file names are safe.
src = Shellwords.escape(gfffile)
dst = Shellwords.escape(gfffile_sorted)

cmd = %Q{(grep ^"#" #{src}; grep -v ^"#" #{src} | sort -k1,1 -k4,4n) | bgzip > #{dst};}
abort "command failed: #{cmd}" unless system(cmd)

cmd = "tabix -p gff #{dst}"
abort "command failed: #{cmd}" unless system(cmd)

STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
data/exe/ezgff ADDED
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require_relative '../lib/ezgff'
4
+ require 'thor'
5
+ require 'color_echo/get'
6
+
7
+ module Ezgff
8
+ class CLI < Thor
9
+
10
+ ## hack to enable -h option
11
+ ## ref: https://magazine.rubyist.net/articles/0046/0046-Milkode.html#%E3%82%AB%E3%82%B9%E3%82%BF%E3%83%9E%E3%82%A4%E3%82%BA-milk-add--h-%E3%82%92%E5%AE%9F%E7%8F%BE%E3%81%99%E3%82%8B
12
+
13
+ class_option :help, :type => :boolean, :aliases => '-h', :desc => 'Help message'
14
+
15
+ no_tasks do
16
+ def invoke_command(task, *args)
17
+ if options[:help] &&
18
+ task.name != 'grep'
19
+ Ezgff::CLI.task_help(shell, task.name)
20
+ elsif options[:version] && task.name == 'help'
21
+ puts "milk #{Version}"
22
+ else
23
+ super
24
+ end
25
+ end
26
+ end
27
+ ## end of -h option setting
28
+
29
+ desc "version", "show version number"
30
+ def version()
31
+ puts Ezgff::VERSION
32
+ end
33
+
34
+ desc "view DB QUERY", "retrieve GFF record by ID and view it in a specified format.
35
+ DB path to ezdb
36
+ ezdb should be created by using 'build' subcommand in advance.
37
+ QUERY query for search
38
+ Two modes, simple mode and advanced mode are available.
39
+ Simple mode
40
+ ID is given and search by the ID.
41
+ Advanced mode
42
+ Query is given in KEY=VALUE style. Available keys are
43
+ ID: ID (ex. ID=rna-XM_029485812.1)
44
+ LN: Line number (ex. LN=255)
45
+ Note: Spaces are not allowed before and after =
46
+
47
+ "
48
+
49
+ option :format, :aliases => '-f', :enum => ["json", "gff"], :default => "gff", :desc => "Specify output format."
50
+ option :with, :aliases => '-w', :enum => ["none", "parent", "children", "ancestors", "descendants"], :default => "none", :desc => "Retrieve data with parent or children features."
51
+ option :type, :aliases => '-t', :desc => "Limit type (Column \#3 in GFF file) such as gene, mRNA and CDS"
52
+
53
+ def view(db, query)
54
+ ezdb = db
55
+ files = Dir["#{ezdb}/*.sqlite3"]
56
+ sq3_file = nil
57
+ if files.size == 1
58
+ sq3_file = files[0]
59
+ elsif files.size == 0
60
+ raise "sqlite3 file not found"
61
+ elsif files.size > 1
62
+ raise "Multiple sqlite3 files found"
63
+ end
64
+ sq3_db = GffDb.new(sq3_file)
65
+
66
+ if m = /^LN\=/.match(query)
67
+ ## search by line number
68
+ query2 = m.post_match.strip
69
+ ann = sq3_db.get_by_line_number(query2)
70
+ elsif m = /^ID\=/.match(query)
71
+ ## search by ID
72
+ query2 = m.post_match.strip
73
+ ann = sq3_db.get(query2)
74
+ else
75
+ ann = sq3_db.get(query)
76
+ end
77
+
78
+ results = []
79
+ if options[:with] == "descendants"
80
+ results = ann.descendants
81
+ elsif options[:with] == "ancestors"
82
+ results = ann.ancestors
83
+ else
84
+ parent = nil
85
+ if options[:with] == "parent"
86
+ parent = ann.parent
87
+ end
88
+ children = []
89
+ if options[:with] == "children"
90
+ ann.children.each do |c|
91
+ children << c
92
+ end
93
+ end
94
+
95
+ results = []
96
+ results << parent if parent
97
+ results << ann
98
+ results += children
99
+ end
100
+
101
+ ## Filter results
102
+ ## - type / type
103
+ if options[:type]
104
+ target_type = options[:type]
105
+ results = results.select{|r| r.type == target_type}
106
+ end
107
+
108
+ case options[:format]
109
+ when "json"
110
+ h = Hash.new
111
+ ary = results.map{|r| r.to_h}
112
+ h["gff_records"] = ary
113
+ puts h.to_json
114
+
115
+ when "gff"
116
+ puts results
117
+ else
118
+ raise "Unknown format: #{options[:format]}"
119
+ end
120
+
121
+ end
122
+
123
+ desc "build GFF", "build database from GFF file"
124
+ option :help, :aliases => :h, :type => :boolean
125
+ def build(gff_in)
126
+ # puts "build #{gff_file} => #{dbpath}"
127
+ dbpath = GffDb.build_db(gff_in)
128
+ STDERR.puts "new database created: #{dbpath}"
129
+ gff_file = dbpath + "/" + File.basename(gff_in)
130
+ GffDb.build_tabix(gff_file)
131
+ end
132
+
133
+ desc "search DB QUERY", "search GFF record giving query"
134
+ option :format, :aliases => :f, :enum => ["json", "gff"], :default => "gff"
135
+ def search(db, query)
136
+ ezdb = db
137
+ files = Dir["#{ezdb}/*.sqlite3"]
138
+ sq3_file = nil
139
+ if files.size == 1
140
+ sq3_file = files[0]
141
+ elsif files.size == 0
142
+ raise "sqlite3 file not found"
143
+ elsif files.size > 1
144
+ raise "Multiple sqlite3 files found"
145
+ end
146
+ sq3_db = GffDb.new(sq3_file)
147
+ results = sq3_db.search(query, 100)
148
+ case options[:format]
149
+ when "json"
150
+ h = Hash.new
151
+ ary = results.map{|r| r.to_h}
152
+ h["gff_records"] = ary
153
+ puts h.to_json
154
+ when "gff"
155
+ gfftxt = results.map{|r| r.to_s}.join("\n")
156
+ puts CE.pickup(/#{query}/i, :green, nil, :bold).get(gfftxt)
157
+ else
158
+ raise "Unknown format: #{options[:format]}"
159
+ end
160
+ end
161
+
162
+ end
163
+ end
164
+
165
+ Ezgff::CLI.start(ARGV)
166
+
167
+
168
+
data/ezgff.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ require_relative 'lib/ezgff/version'
2
+
3
Gem::Specification.new do |spec|
  spec.name          = "ezgff"
  spec.version       = Ezgff::VERSION
  spec.authors       = ["Shuji Shigenobu"]
  spec.email         = ["sshigenobu@gmail.com"]

  spec.summary       = %q{Utilities for GFF3}
  spec.description   = %q{Utilities for GFF3, the genome annotation format. Useful to explore the gene model features.}
  spec.homepage      = "https://github.com/shujishigenobu/ezgff_alpha"
  spec.license       = "MIT"
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")

  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = "https://github.com/shujishigenobu/ezgff_alpha"
  spec.metadata["changelog_uri"] = "https://github.com/shujishigenobu/ezgff_alpha"
  # TODO: Put your gem's CHANGELOG.md URL here.

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies. The library requires sqlite3 and bio, and the
  # executable requires thor and color_echo; without these declarations an
  # installed gem fails at load time unless the user installs them by hand.
  spec.add_dependency "sqlite3"
  spec.add_dependency "bio"
  spec.add_dependency "thor"
  spec.add_dependency "color_echo"
end
data/lib/ezgff.rb ADDED
@@ -0,0 +1,12 @@
1
+ require_relative "ezgff/version"
2
+ require_relative "ezgff/gffsqlitedb"
3
+
4
+ #require "ezgff/version"
5
+ #require "ezgff/gffsqlitedb"
6
+
7
+
8
+
9
# Namespace of the ezgff gem.
module Ezgff
  # Error class declared by the gem; a direct subclass of StandardError.
  class Error < StandardError
  end
end
@@ -0,0 +1,390 @@
1
+ require 'sqlite3'
2
+ require 'json'
3
+ require 'bio'
4
+ require 'fileutils'
5
+
6
+ #
7
+ # References
8
+ # * Official specification of GFF3 -- https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md
9
+ #
10
+
11
+ module Ezgff
12
+
13
+ class GffDb
14
+
15
+ #===
16
+ # sqlite3 schema
17
+ #
18
+ # gff_records (
19
+ # line_num integer primary key,
20
+ # record text, # original record
21
+ # id text,
22
+ # parent text,
23
+ # seqid text not null,
24
+ # source text,
25
+ # type text,
26
+ # start integer not null,
27
+ # end integer not null,
28
+ # score real,
29
+ # strand varchar(1),
30
+ # phase integer,
31
+ # attributes text,
32
+ # attributes_json json
33
+ # )
34
+
35
+ def self.build_db(gff_in, ezdb_base = nil)
36
+ ezdb_base = (ezdb_base || ".")
37
+ ezdb_path = ezdb_base + "/" + File.basename(gff_in) + ".ezdb"
38
+ gff_file = ezdb_path + "/" + File.basename(gff_in)
39
+ Dir.mkdir(ezdb_path)
40
+ File.open(gff_file, "w") do |o|
41
+ File.open(gff_in).each do |l|
42
+ break if /^\#\#FASTA/.match(l)
43
+ ## skip header section
44
+ next if /^\#/.match(l)
45
+ o.puts l
46
+ end
47
+ end
48
+
49
+ # FileUtils.cp(gff_in, gff_file)
50
+ sq3_file = gff_file + ".sqlite3"
51
+
52
+ ## Create table in sqlite3 RDBMS
53
+ ## table name: gff_record
54
+
55
+ sq3_db = SQLite3::Database.new(sq3_file)
56
+
57
+ sql = <<-SQL
58
+ CREATE TABLE gff_records (
59
+ line_num integer primary key,
60
+ record text,
61
+ id text,
62
+ parent text,
63
+ seqid text not null,
64
+ source text,
65
+ type text,
66
+ start integer not null,
67
+ end integer not null,
68
+ score real,
69
+ strand varchar(1),
70
+ phase integer,
71
+ attributes text,
72
+ attributes_json json
73
+ );
74
+ SQL
75
+
76
+ sq3_db.execute(sql)
77
+
78
+ ## Read GFF file and insert data into
79
+ ## the sqlite3 table
80
+
81
+ sq3_db.transaction do
82
+ File.open(gff_file).each_with_index do |l, i|
83
+ # puts l
84
+ ## skip FASTA seq section
85
+ break if /^\#\#FASTA/.match(l)
86
+
87
+ ## skip header section
88
+ next if /^\#/.match(l)
89
+ gr = Bio::GFF::GFF3::Record.new(l.chomp)
90
+ # p gr.attributes
91
+ id = nil
92
+ id_found = gr.attributes.select{|a| a[0] == "ID"}
93
+ if id_found.size == 1
94
+ id = id_found[0][1]
95
+ elsif id_found.size == 0
96
+ ## do nothing (id = nil)
97
+ elsif id_found > 1
98
+ STDERR.puts gr.attributes
99
+ raise "Multiple IDs found."
100
+ end
101
+ parent = ((gr.attributes.select{|a| a[0] == "Parent"}[0]) || [])[1]
102
+ a = l.chomp.split(/\t/)
103
+
104
+ sql = "INSERT INTO gff_records (line_num, record, id, parent, seqid, source, type, start, end, score, strand, phase, attributes, attributes_json)
105
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
106
+ values = [i, l.chomp, id, parent,
107
+ a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], a[8],
108
+ attributes_as_json(l)]
109
+ sq3_db.execute(sql, values)
110
+ end
111
+ end
112
+
113
+ ## Indexing the sqlite3 table
114
+ table = "gff_records"
115
+ %w{id parent source type}.each do |col|
116
+ idxname = "index_#{table}_on_#{col}"
117
+ sql = "CREATE INDEX #{idxname} ON #{table}(#{col})"
118
+ sq3_db.execute(sql)
119
+ end
120
+
121
+ return ezdb_path
122
+
123
+ end
124
+
125
+ def self.build_tabix(gff_in)
126
+ ## sort gff by position
127
+ gfffile_sorted = gff_in + ".gz"
128
+ cmd = %Q{(grep ^"#" #{gff_in}; grep -v ^"#" #{gff_in} | sort -t $'\t' -k1,1 -k4,4n) | bgzip > #{gfffile_sorted};}
129
+ STDERR.puts cmd
130
+ system cmd
131
+
132
+ cmd = "tabix -p gff #{gfffile_sorted}"
133
+ STDERR.puts cmd
134
+ system cmd
135
+
136
+ STDERR.puts "#{gfffile_sorted} and #{gfffile_sorted}.tbi were generated."
137
+ end
138
+
139
+ def self.attributes_as_json(gffline)
140
+ keys_multi_val_allowed = %{Parent Alias Note Dbxref Ontology_term}
141
+
142
+ gr = Bio::GFF::GFF3::Record.new(gffline.chomp)
143
+
144
+ h = Hash.new
145
+ gr.attributes.each do |att|
146
+ k, v = att
147
+ unless h.has_key?(k)
148
+ h[k] = []
149
+ end
150
+ h[k] << v
151
+ end
152
+ h2 = Hash.new
153
+ h.each do |key, values|
154
+ if key == "Dbxref" || key == "Ontology_term"
155
+ h3 = Hash.new
156
+ values.each do |val|
157
+ m = /(.+?):/.match(val)
158
+ dbtag = m[1]
159
+ dbval = m.post_match
160
+ h3.update({dbtag => dbval})
161
+ end
162
+ h2[key] = h3
163
+ else
164
+ h2[key] = values.join(",")
165
+ end
166
+ end
167
+ h2.to_json
168
+ end
169
+
170
+ def initialize(path)
171
+ @db = SQLite3::Database.new(path)
172
+ end
173
+
174
+ def each_record
175
+ sql = "SELECT * FROM gff_records"
176
+ @db.execute(sql).each do |r|
177
+ an = Annotation.new()
178
+ an.build_from_db_record(r)
179
+ yield an
180
+ end
181
+ end
182
+
183
+ def get(id)
184
+ sql = %Q{SELECT * FROM gff_records WHERE id=="#{id}";}
185
+ # puts sql
186
+ res = @db.execute(sql)
187
+ if res.size == 1
188
+ an = Annotation.new(@db)
189
+ an.build_from_db_record(res[0])
190
+ return an
191
+ else
192
+ if res.size >= 2
193
+ raise "multiple hits"
194
+ elsif res.size == 0
195
+ raise "not found: #{id}"
196
+ end
197
+ end
198
+ end
199
+
200
+ def get_by_line_number(n)
201
+ sql = %Q{SELECT * FROM gff_records WHERE line_num=="#{n}";}
202
+ res = @db.execute(sql)
203
+ if res.size == 1
204
+ an = Annotation.new(@db)
205
+ an.build_from_db_record(res[0])
206
+ return an
207
+ else
208
+ if res.size >= 2
209
+ raise "multiple hits"
210
+ elsif res.size == 0
211
+ raise "not found: #{id}"
212
+ end
213
+ end
214
+ end
215
+
216
+ def search(query, num_limit=100)
217
+ sql = %Q{SELECT * FROM gff_records WHERE id LIKE "%#{query}%" OR parent LIKE "%#{query}%" OR attributes LIKE "%#{query}%";}
218
+ STDERR.puts sql
219
+ res = @db.execute(sql)
220
+ res2 = res.map{|r| an = Annotation.new(@db); an.build_from_db_record(r); an}
221
+ res2
222
+ end
223
+
224
+ class Annotation
225
+
226
+ def initialize(db = nil)
227
+ @db = db
228
+ @seqid
229
+ @source
230
+ @type
231
+ @start
232
+ @end
233
+ @score
234
+ @strand
235
+ @phase
236
+ @attributes
237
+ @id
238
+ @parent_id
239
+ @gffline
240
+ end
241
+
242
+ attr_accessor :seqid, :source, :type, :start, :end, :score, :strand, :phase, :attributes
243
+ attr_accessor :id, :parent_id, :gffline, :line_num
244
+
245
+ def to_s
246
+ gffline
247
+ end
248
+
249
+ def to_hash
250
+ h = {
251
+ 'seqid' => seqid,
252
+ 'source' => source,
253
+ 'type' => type,
254
+ 'start' => start,
255
+ 'end' => self.end,
256
+ 'score' => score,
257
+ 'strand' => strand,
258
+ 'phase' => phase,
259
+ 'line_num' => line_num,
260
+ 'id' => id,
261
+ 'parent_id' => parent_id,
262
+ 'attributes' => attributes
263
+ }
264
+ end
265
+
266
+ alias :to_h :to_hash
267
+
268
+ def to_json
269
+ self.to_hash.to_json
270
+ end
271
+
272
+ def build_from_db_record(sql_result)
273
+ ## sql_result: Array returned by @db.execute(sql)
274
+ v = sql_result
275
+ @seqid = v[4]
276
+ @source = v[5]
277
+ @type = v[6]
278
+ @start = v[7]
279
+ @end = v[8]
280
+ @score = v[9]
281
+ @strand = v[10]
282
+ @phase = v[11]
283
+ @line_num = v[0]
284
+ @gffline = v[1]
285
+ @id = v[2]
286
+ @parent_id = v[3]
287
+ @attributes = JSON.parse(v[13])
288
+ end
289
+
290
+ def parent
291
+ if parent_id
292
+ sql = %Q{SELECT * FROM gff_records WHERE id=="#{parent_id}";}
293
+ # puts sql
294
+ res = @db.execute(sql)
295
+ an = Annotation.new(@db)
296
+ an.build_from_db_record(res[0])
297
+ return an
298
+ else
299
+ return nil
300
+ end
301
+ end
302
+
303
+ def children
304
+ ary = []
305
+ sql = %Q{SELECT * FROM gff_records WHERE parent=="#{id}";}
306
+ # puts sql
307
+ res = @db.execute(sql)
308
+ res.each do |r|
309
+ an = Annotation.new(@db)
310
+ an.build_from_db_record(r)
311
+ ary << an
312
+ end
313
+ ary
314
+ end
315
+
316
+ def descendants
317
+ ary = []
318
+ sql = %Q{WITH RECURSIVE r AS (
319
+ SELECT * FROM gff_records WHERE id=="#{id}"
320
+ UNION ALL
321
+ SELECT gff_records.* FROM gff_records, r WHERE gff_records.parent == r.id
322
+ )
323
+ SELECT * FROM r}
324
+ res = @db.execute(sql)
325
+ res.each do |r|
326
+ an = Annotation.new(@db)
327
+ an.build_from_db_record(r)
328
+ ary << an
329
+ end
330
+ ary
331
+ end
332
+
333
+ def ancestors
334
+ ary = []
335
+ sql = %Q{WITH RECURSIVE ancestor AS (
336
+ SELECT * FROM gff_records WHERE id=="#{id}"
337
+ UNION ALL
338
+ SELECT gff_records.* FROM gff_records, ancestor
339
+ WHERE ancestor.parent = gff_records.id
340
+ )
341
+ SELECT * FROM ancestor;}
342
+ res = @db.execute(sql)
343
+ res.each do |r|
344
+ an = Annotation.new(@db)
345
+ an.build_from_db_record(r)
346
+ ary << an
347
+ end
348
+ ary
349
+ end
350
+
351
+
352
+ def length
353
+ len = @end - @start + 1
354
+ raise unless len > 0
355
+ return len
356
+ end
357
+
358
+ def dbxrefs
359
+ h = Hash.new
360
+ if attributes["Dbxref"]
361
+ attributes["Dbxref"].split(/,/).each do |x|
362
+ m = /(.+?):/.match(x)
363
+ key = m[1]
364
+ val = m.post_match
365
+ h.update({key => val})
366
+ end
367
+ end
368
+ h
369
+ end
370
+
371
+ end
372
+
373
+ end
374
+
375
+ end
376
+
377
# Manual check when this file is run directly:
#   ruby gffsqlitedb.rb <sqlite3 file> <record ID>
# Prints the matching record as an inspected object, as its GFF line and as a Hash.
if __FILE__ == $0
  dbname = ARGV[0]
  query = ARGV[1]
  # GffDb lives inside the Ezgff module, so it must be qualified out here.
  db = Ezgff::GffDb.new(dbname)
  ann = db.get(query)
  p ann
  puts ann.to_s
  p ann.to_hash
end
@@ -0,0 +1,3 @@
1
+ module Ezgff
2
+ VERSION = "0.0.2"
3
+ end
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ezgff
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.2
5
+ platform: ruby
6
+ authors:
7
+ - Shuji Shigenobu
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2021-07-14 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Utilities for GFF3, the genome annotation format. Useful to explore the
14
+ gene model features.
15
+ email:
16
+ - sshigenobu@gmail.com
17
+ executables:
18
+ - ezgff
19
+ extensions: []
20
+ extra_rdoc_files: []
21
+ files:
22
+ - ".gitignore"
23
+ - Gemfile
24
+ - LICENSE
25
+ - README.md
26
+ - Rakefile
27
+ - bin/build_gff_sqlitedb.rb
28
+ - bin/build_gff_sqlitedb_keywords.rb
29
+ - bin/build_gff_tabix.rb
30
+ - dev/gff_examples/ApL_HF_liftover_Refseq.gff.gz
31
+ - dev/gff_examples/ApMT_NC_011594.gb.gff.gz
32
+ - dev/gff_examples/GCF_000009605.1_ASM960v1_genomic.gff.gz
33
+ - dev/gff_examples/apisum_part.gff3.gz
34
+ - dev/gff_examples/ref_pea_aphid_22Mar2018_4r6ur_top_level.chrNamed.gff3.gz
35
+ - exe/ezgff
36
+ - ezgff.gemspec
37
+ - lib/ezgff.rb
38
+ - lib/ezgff/gffsqlitedb.rb
39
+ - lib/ezgff/version.rb
40
+ homepage: https://github.com/shujishigenobu/ezgff_alpha
41
+ licenses:
42
+ - MIT
43
+ metadata:
44
+ homepage_uri: https://github.com/shujishigenobu/ezgff_alpha
45
+ source_code_uri: https://github.com/shujishigenobu/ezgff_alpha
46
+ changelog_uri: https://github.com/shujishigenobu/ezgff_alpha
47
+ post_install_message:
48
+ rdoc_options: []
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: 2.3.0
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ requirements: []
62
+ rubygems_version: 3.1.2
63
+ signing_key:
64
+ specification_version: 4
65
+ summary: Utilities for GFF3
66
+ test_files: []