ncbi-taxonomy 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b77d9733ddf67d4f5ee94e8beae5d55826ba6c1e
4
+ data.tar.gz: c29cb5552e8e97c991ece58aa8eb0eb413a28a6b
5
+ SHA512:
6
+ metadata.gz: 541dcb47a69210bd72a562027c8950d5bcc2e8e3bbeaeb6754469427599de0effb71769efd202d3c0d6c9104dcc3c2479be045cb916a130577241f928b819f6b
7
+ data.tar.gz: b87802c2886c9ad7c88fd70a87fe91af302111a1792acadc393347c5a298dbf9341a522bd470ec930a5af2627657202779dfaefebf8b5f3251d7c07961c0f191
@@ -0,0 +1,129 @@
1
+ #!/usr/bin/env ruby
2
+ require 'ncbi_taxonomy'
3
+ require 'ncbi_taxonomy_update'
4
+
5
# Print +msg+ followed by the complete usage text on standard error, then
# terminate the process with exit status 1.
#
# msg - String shown as the first line (pass '' to print a blank line).
#
# This method never returns: it always ends with `exit 1`.
def help(msg)
  program = File.basename($PROGRAM_NAME)
  usage = [
    "usage: #{program} <command> {-r} {-f} <args>",
    '',
    'These commands are used for search.',
    '  name    Search taxonomic names using organism name with all ranks',
    '              example) $ ncbi_taxonomy name "Escherichia coli SE11"',
    '',
    '  fname   Search taxonomic names using organism name with fixed ranks',
    '            Superkingdom(aka Domain) Phylum Class Order Family Genus Species strain',
    '              example) $ ncbi_taxonomy fname "Acidibacter ferrireducens"',
    '',
    '  id      Search taxonomic names using NCBI taxon id with all ranks',
    '              example) $ ncbi_taxonomy id 409438',
    '',
    '  fid     Search taxonomic names using NCBI taxon id with fixed ranks',
    '              example) $ ncbi_taxonomy fid 409438',
    '',
    'Common option for search command',
    '  -r      show each rank name in the result (default is hiding rank name)',
    '              example) $ ncbi_taxonomy name -r "Escherichia coli SE11"',
    '',
    'We recommend to download the latest database using this command.',
    '  update  Update the NCBI Taxonomy database',
    '            downloaded database is stored in $HOME/.ncbi_taxonomy/',
    '              example) $ ncbi_taxonomy update',
    '',
    'Common option for update command',
    '  -f      force to update the database',
    # The flag is read after the command word, so the example shows that order.
    '              example) $ ncbi_taxonomy update -f',
    '',
    'About rank type',
    '  all ranks   : all ranks registered in NCBI Taxonomy database',
    '  fixed ranks : major ranks for common usage (especially microorganism)',
    '              ==> Superkingdom(aka Domain) Phylum Class Order Family Genus Species strain',
    '',
    'Bioinformatician can access methods of this software directly using Ruby.',
    'More information is in GitLab web page.',
    'https://gitlab.com/javamint/ncbi-taxonomy',
    ''
  ]

  $stderr.puts msg
  $stderr.puts usage
  exit 1
end
52
+
53
# Run one lookup and print every resulting lineage on standard output, one
# line per taxon with the entries separated by tabs.
#
# command - String selecting the lookup by its first two letters:
#           "na…" name/all ranks, "fn…" name/fixed ranks,
#           "id…" taxon id/all ranks, "fi…" taxon id/fixed ranks.
#           Anything else is reported through `help`.
# arg     - String organism name or taxon id (ids are converted with to_i).
# opts    - Array of option strings; when it contains '-r' each entry is
#           printed as "name (rank)" instead of just "name".
#
# A warning goes to standard error when a name resolves to several taxa.
def search(command, arg, opts)
  taxonomy = Taxonomy.new

  lineages =
    case command
    when /^na/ then taxonomy.get_allrank_by_name(arg)
    when /^fn/ then taxonomy.get_fixedrank_by_name(arg)
    # The id lookups return a single lineage; wrap it so printing is uniform.
    when /^id/ then [taxonomy.get_allrank_by_id(arg.to_i)]
    when /^fi/ then [taxonomy.get_fixedrank_by_id(arg.to_i)]
    else
      help "Incorrect command: #{command}"
    end

  # Only the name lookups can produce more than one lineage.
  $stderr.puts "[WARNING] name in multiple taxon" if lineages.size > 1

  show_ranks = opts.include?('-r')
  lineages.each do |lineage|
    cells = lineage.map do |entry|
      show_ranks ? "#{entry[1]} (#{entry[0]})" : entry[1]
    end
    puts cells.join("\t")
  end
end
91
+
92
# Refresh the local taxonomy database when the updater reports a newer dump
# or when the caller forces it.
#
# force_flag - truthy to download even when the local copy is current.
#
# The updater's status is true (newer dump available), false (local copy is
# current), or anything else, which is treated as an error report: it is
# printed on standard error and the process exits with status 1.
def update(force_flag)
  updater = Update.new
  status = updater.status

  # An error report (for example a String) is truthy, so it has to be handled
  # before the download check; otherwise a download would be attempted.
  unless [true, false, nil].include?(status)
    $stderr.puts status
    exit 1
  end

  if status || force_flag
    $stderr.print "NCBI Taxonomy database downloading "
    updater.do
    $stderr.puts "DONE"
  elsif status == false
    $stderr.puts "Current local database is the latest one."
  else
    # No status at all and no force flag: nothing sensible to do.
    $stderr.puts status
    exit 1
  end
end
106
+
107
# Command-line entry point. The first argument is the command; the remaining
# arguments are options (anything starting with "-") and search terms.
help '' if ARGV.empty?

command = ARGV.shift
if command == 'update' || (command == '-f' && ARGV.first == 'update')
  # The force flag is accepted on either side of the command word, so both
  # "update -f" and "-f update" force a fresh download.
  force = command == '-f' || ARGV.first == '-f'
  update force
else
  # Split what is left into options and search terms, then run one search
  # per term with the shared option list.
  opts, args = ARGV.partition { |token| token =~ /^-/ }
  args.each { |term| search command, term, opts }
end
@@ -0,0 +1,160 @@
1
+ require 'sqlite3'
2
+
3
# Query interface over the local SQLite copy of the NCBI Taxonomy dump stored
# at $HOME/.ncbi_taxonomy/taxonomy.db.
#
# Lineages are Arrays of [rank, name] pairs ordered from the highest rank
# down to the queried taxon. All SQL values are passed as bind parameters.
class Taxonomy
  # Hash of names mapped to 1. It is not referenced anywhere else in this
  # class. NOTE(review): purpose not visible from here — confirm with callers.
  def using_unique_name
    { "Ponticoccus" => 1, "Bacillus" => 1 }
  end

  # The eight "fixed" ranks in display order, each mapped to a rank number.
  # NOTE(review): "genus" is 21 here but 20 in tax_rank_all — confirm which
  # value is intended before changing either table.
  def tax_rank_fixed
    { "superkingdom" => 0, "phylum" => 4, "class" => 7, "order" => 11, "family" => 16, "genus" => 21, "species" => 24, "strain" => 28 }
  end

  # Every rank name this class knows, mapped to its position from highest (0)
  # to lowest (28).
  def tax_rank_all
    { "superkingdom" => 0, "kingdom" => 1, "subkingdom" => 2, "superphylum" => 3, "phylum" => 4, "subphylum" => 5, "superclass" => 6, "class" => 7, "infraclass" => 8, "subclass" => 9, "superorder" => 10, "order" => 11, "suborder" => 12, "infraorder" => 13, "parvorder" => 14, "superfamily" => 15, "family" => 16, "subfamily" => 17, "tribe" => 18, "subtribe" => 19, "genus" => 20, "subgenus" => 21, "species group" => 22, "species subgroup" => 23, "species" => 24, "subspecies" => 25, "varietas" => 26, "forma" => 27, "no rank" => 28 }
  end

  # Open the released database. When SQLite cannot open the file a hint is
  # printed on standard error and the process exits with status 1.
  def initialize
    @home_dir = Dir.home
    @work_dir = @home_dir + "/.ncbi_taxonomy"
    @taxdb_release = @work_dir + "/taxonomy.db"
    @db = SQLite3::Database.new @taxdb_release
  rescue SQLite3::CantOpenException => e
    STDERR.puts "Please download the NCBI Taxonomy database using 'ncbi_taxonomy update' command."
    STDERR.puts "[MSG]" + e.message
    exit 1
  end

  # Copy the whole database into an in-memory SQLite database and use that
  # copy for all later queries.
  def memory
    in_memory = SQLite3::Database.new ':memory:'
    backup = SQLite3::Backup.new in_memory, 'main', @db, 'main'
    backup.step(-1) # a negative page count copies everything in one step
    backup.finish
    @db = in_memory
  end

  # Returns true when the linked SQLite library is at least 3.8.3, the
  # version threshold used to choose the recursive query in get_allrank_by_id.
  def check_sqlite_version
    rows = @db.execute "SELECT SQLITE_VERSION()"
    Gem::Version.new(rows[0][0]) >= Gem::Version.new('3.8.3')
  end

  # Returns the distinct taxon ids whose names table entry equals +name+.
  def get_taxonids_by_name(name)
    @db.execute("SELECT DISTINCT tax_id FROM names WHERE name_txt=?", [name]).flatten
  end

  # Returns [name_class, name_txt] rows for the taxon id.
  def get_names_by_taxonid(id)
    @db.execute "SELECT name_class, name_txt FROM names WHERE tax_id=?", [id.to_i]
  end

  # Returns the name of the first [name_class, name_txt] row whose class is
  # 'scientific name', or nil when there is none.
  def get_scientific_name_by_names(names)
    row = names.find { |entry| entry[0] == 'scientific name' }
    row && row[1]
  end

  # Returns the scientific name for the taxon id, or nil when the id has no
  # scientific name row.
  def get_scientific_name_by_id(id)
    rows = @db.execute "SELECT name_txt FROM names WHERE tax_id=? AND name_class='scientific name'", [id.to_i]
    rows.empty? ? nil : rows[0][0]
  end

  # Returns every name row (one-element Arrays) recorded for the taxon id.
  def get_all_names_by_id(id)
    @db.execute "SELECT name_txt FROM names WHERE tax_id=?", [id.to_i]
  end

  # Returns [parent_tax_id, rank] for the taxon id, or nil when unknown.
  def get_rank_ptaxonid_by_id(id)
    rows = @db.execute "SELECT parent_tax_id, rank FROM nodes WHERE tax_id=?", [id.to_i]
    rows[0]
  end

  # Returns [parent_tax_id, rank, scientific name] for the taxon id, or nil
  # when unknown.
  def get_rank_ptaxonid_scientificname_by_id(id)
    id = id.to_i
    rows = @db.execute "SELECT nodes.parent_tax_id, nodes.rank, names.name_txt FROM nodes, names WHERE nodes.tax_id=? AND names.tax_id=? AND names.name_class='scientific name'", [id, id]
    rows[0]
  end

  # Returns the full lineage of the taxon id as [rank, name] pairs, highest
  # rank first. When the id yields nothing, the id it was merged into (see
  # get_missing_id) is tried; an unknown or deleted id gives [].
  def get_allrank_by_id(id)
    id = id.to_i
    out = []
    if check_sqlite_version
      # Walk up the parent chain in one recursive query. The first row is the
      # seed and the last row is the node directly below taxon 1; both are
      # dropped.
      rows = @db.execute "WITH RECURSIVE allrank(id, pid, rank, name) AS ( VALUES(0, ?, 'no_rank', 'Homo sapiens javamintus') UNION ALL SELECT nodes.tax_id, nodes.parent_tax_id, nodes.rank, names.name_txt FROM nodes, names, allrank WHERE nodes.tax_id=allrank.pid AND names.tax_id = nodes.tax_id AND names.name_class='scientific name' AND nodes.tax_id<>1) SELECT * FROM allrank;", [id]
      rows[1..-2].each { |row| out << [row[2], row[3]] }
    else
      current = id
      loop do
        # Taxon 1 is excluded by the recursive query as well; stopping here
        # keeps a lineage without a superkingdom from looping forever.
        break if current == 1
        row = get_rank_ptaxonid_scientificname_by_id current
        break if row.nil? # unknown id: fall through to the merged lookup
        out << [row[1], row[2]]
        break if row[1] == 'superkingdom'
        current = row[0].to_i
      end
    end

    out = get_allrank_by_id(get_missing_id(id)).reverse if out.empty? && id > -1
    out.reverse
  end

  # Returns the lineage reduced to the eight fixed ranks (see tax_rank_fixed)
  # as [rank, name] pairs.
  #
  # A fixed rank missing between two present ones gets the placeholder name
  # "@<nearest higher name>_<rank>". Ranks missing at the end get nil, except
  # that a lineage ending at species repeats the species name as strain.
  # Ranks absent from tax_rank_all are skipped.
  def get_fixedrank_by_id(id)
    lineage = get_allrank_by_id(id.to_i)
    fixed = tax_rank_fixed.to_a
    all_ranks = tax_rank_all
    arr = []
    pos = 0
    alt_name = ''

    lineage.each do |rank, name|
      fixed_index = fixed.index { |entry| entry[0] == rank }
      rank_all_no = all_ranks[rank]
      if fixed_index
        # Fill every fixed rank skipped since the previous one.
        (arr.size...fixed_index).each do |x|
          arr << [fixed[x][0], "@#{alt_name}_#{fixed[x][0]}"]
          pos += 1
        end
        arr << [rank, name]
        pos += 1
        alt_name = name
      elsif arr.size == 7 && rank_all_no == 28
        # First "no rank" entry after species is reported as the strain.
        arr << ['strain', name]
      elsif rank_all_no && rank_all_no != 28
        alt_name = name if fixed[pos - 1][1] > rank_all_no
      end
    end

    if arr.size < 7
      (arr.size..7).each { |x| arr << [fixed[x][0], nil] }
    elsif arr.size == 7
      arr << ['strain', arr[-1][1]]
    end
    arr
  end

  # Returns one full lineage per taxon id matching +name+.
  def get_allrank_by_name(name)
    get_taxonids_by_name(name).map { |taxon_id| get_allrank_by_id(taxon_id) }
  end

  # Returns one fixed-rank lineage per taxon id matching +name+.
  def get_fixedrank_by_name(name)
    get_taxonids_by_name(name).map { |taxon_id| get_fixedrank_by_id(taxon_id) }
  end

  # Returns the id that +id+ was merged into, or -1 when the id is listed in
  # delnodes or has no single row in merged.
  def get_missing_id(id)
    id = id.to_i
    deleted = @db.execute "SELECT tax_id FROM delnodes WHERE tax_id=?", [id]
    return -1 unless deleted.empty?

    merged = @db.execute "SELECT new_tax_id FROM merged WHERE old_tax_id=?", [id]
    merged.size == 1 ? merged[0][0].to_i : -1
  end
end
@@ -0,0 +1,179 @@
1
+ require 'sqlite3'
2
+ require 'fileutils'
3
+ # Bug : 2014. 04. 05
4
+ # Error: citations.dmp.trim line 32659: expected 7 columns of data but found 8
5
+ # Actually this software does not use this table. Therefore, this error can be ignored.
6
+ # Later, I should fix this.
7
+
8
# Downloads the NCBI taxonomy dump and rebuilds the local SQLite database
# kept under $HOME/.ncbi_taxonomy.
#
# Creating an instance already touches the disk and the network: it creates
# the working directory when needed, changes the current directory to it and
# downloads the dump's md5 file to decide whether an update is needed.
class Update
  # true   - a rebuild is needed (no previous md5 file, or it differs)
  # false  - the downloaded md5 file equals the previous one
  # String - the working directory cannot be used; the text explains why
  attr_reader :status

  def initialize
    @home_dir = Dir.home
    @work_dir = @home_dir + "/.ncbi_taxonomy"
    md5_file = @work_dir + "/taxdump.tar.gz.md5"
    md5_old_file = @work_dir + "/taxdump.tar.gz.md5.old"
    @taxdb = @work_dir + "/taxonomy.db.prep"
    @taxdb_release = @work_dir + "/taxonomy.db"
    @status = nil

    # Check the working directory; if it does not exist, make it.
    if File.exist?(@work_dir) && !File.directory?(@work_dir)
      @status = "This software uses $HOME/.ncbi_taxonomy directory. However, in your home directory there is same name of file. We recommend you change that file name to another name."
      return
    end
    Dir.mkdir @work_dir unless File.exist?(@work_dir)
    # The download and import steps use file names relative to this directory.
    Dir.chdir @work_dir

    unless File.exist?(md5_file)
      download_md5
      @status = true
      return
    end

    # Keep the previous md5 file, fetch the current one and compare them.
    FileUtils.rm_f md5_old_file
    File.rename(md5_file, md5_old_file)
    download_md5
    unchanged = File.exist?(md5_file) && FileUtils.identical?(md5_file, md5_old_file)
    @status = !unchanged
  end

  # Fetch the dump's md5 file into the current directory using curl.
  def download_md5
    `curl -s ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz.md5 > taxdump.tar.gz.md5 2>/dev/null`
  end

  # Fetch the dump and unpack it into the current directory using curl and tar.
  def download_dump
    `curl -s ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz 2>/dev/null | tar zxf - `
  end

  # Convert every "*dmp" file in the working directory into a tab-separated
  # "<name>.trim" file: the text is re-encoded from ISO-8859-1 to UTF-8, the
  # "<TAB>|<TAB>" field separators become a single tab, the trailing "<TAB>|"
  # is removed and double quotes become "%22".
  def substitution
    Dir.entries(@work_dir).each do |file|
      next unless file =~ /dmp$/

      source = @work_dir + "/" + file
      text = File.read(source).force_encoding('iso-8859-1').encode('utf-8')
      # A tab that is not next to a "|" sits inside a field and would add a
      # column on import, so it is replaced by a space.
      # NOTE(review): presumably the intent of the first substitution — confirm.
      text = text.gsub(/([^|])\t([^|])/, '\1 \2')
                 .gsub(/\t\|\t/, "\t")
                 .gsub(/\t\|$/, "")
                 .gsub(/\"/, "%22")
      File.open(source + ".trim", "w") { |out| out << text }
    end
  end

  # Promote the freshly built database: the current release (if any) is kept
  # as "taxonomy.db.old" and the prepared file becomes "taxonomy.db".
  def release
    previous = "#{@taxdb_release}.old"
    File.delete(previous) if File.exist?(previous)
    File.rename(@taxdb_release, previous) if File.exist?(@taxdb_release)
    File.rename(@taxdb, @taxdb_release)
  end

  # Run the complete update: download, convert, load, release.
  def do
    download_dump
    substitution
    load_db
    release
  end

  # Create the tables and indexes in the prepared database file and import
  # the ".trim" files through the sqlite3 command-line shell.
  def load_db
    # The header comment of this file notes that the citations import can
    # report a column-count error; that table is not queried by this software.
    sql = <<EOF
PRAGMA page_size=4096;
PRAGMA main.locking_mode=EXCLUSIVE;

.separator '\t'

CREATE TABLE citations (
  cit_id BIGINT,
  cit_key VARCHAR(255),
  pubmed_id BIGINT,
  medline_id BIGINT,
  ur LONGTEXT,
  text LONGTEXT,
  taxid_list LONGTEXT
);

CREATE TABLE delnodes (
  tax_id BIGINT
);

CREATE TABLE division (
  division_id BIGINT,
  division_cde VARCHAR(255),
  division_name VARCHAR(255),
  comments VARCHAR(255)
);

CREATE TABLE gencode (
  genetic_code_id INT,
  abbreviation VARCHAR(255),
  name VARCHAR(255),
  cde LONGTEXT,
  starts LONGTEXT
);

CREATE TABLE merged (
  old_tax_id BIGINT,
  new_tax_id BIGINT
);

CREATE TABLE names (
  tax_id BIGINT,
  name_txt VARCHAR(255),
  unique_name VARCHAR(255),
  name_class VARCHAR(255)
);

CREATE TABLE nodes (
  tax_id BIGINT,
  parent_tax_id BIGINT,
  rank VARCHAR(64),
  embl_code VARCHAR(64),
  division_id INTEGER,
  inherited_div_flag BOOLEAN,
  genetic_code_id INTEGER,
  inherited_GC_flag BOOLEAN,
  mitochondrial_genetic_code_id INTEGER,
  inherited_MGC_flag BOOLEAN,
  GenBank_hidden_flag BOOLEAN,
  hidden_subtree_root_flag BOOLEAN,
  comments VARCHAR(255)
);


CREATE INDEX citations_idx ON citations(cit_id,cit_key,pubmed_id,medline_id,ur,text,taxid_list);
CREATE INDEX delnodes_idx ON delnodes(tax_id);
CREATE INDEX division_idx ON division(division_id,division_cde,division_name,comments);
CREATE INDEX gencode_idx ON gencode(genetic_code_id,abbreviation,name,cde,starts);
CREATE INDEX merged_idx ON merged(old_tax_id,new_tax_id);
CREATE INDEX names_idx ON names(tax_id,name_txt,unique_name,name_class);
CREATE INDEX nodes_idx ON nodes(tax_id,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_GC_flag,mitochondrial_genetic_code_id,inherited_MGC_flag,GenBank_hidden_flag,hidden_subtree_root_flag,comments);


.import citations.dmp.trim citations
.import delnodes.dmp.trim delnodes
.import division.dmp.trim division
.import gencode.dmp.trim gencode
.import merged.dmp.trim merged
.import names.dmp.trim names
.import nodes.dmp.trim nodes

EOF
    # A leftover file from an interrupted run would make CREATE TABLE fail
    # and the imports append to old rows, so start from an empty file.
    FileUtils.rm_f @taxdb
    # Passing the command as an array avoids shell quoting of the path; the
    # .import file names are relative, hence the explicit working directory.
    IO.popen(['sqlite3', @taxdb], 'w', chdir: @work_dir) { |shell| shell.write sql }
  end
end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ncbi-taxonomy
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.2.7
5
+ platform: ruby
6
+ authors:
7
+ - Seok-Won Kim
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-08-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: sqlite3
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.0.0
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.0'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.0.0
33
+ description: This gem supports essential functions for fast access of NCBI Taxonomy
34
+ database
35
+ email: javamint@gmail.com
36
+ executables:
37
+ - ncbi_taxonomy
38
+ extensions: []
39
+ extra_rdoc_files: []
40
+ files:
41
+ - bin/ncbi_taxonomy
42
+ - lib/ncbi_taxonomy.rb
43
+ - lib/ncbi_taxonomy_update.rb
44
+ homepage: https://gitlab.com/javamint/ncbi-taxonomy
45
+ licenses:
46
+ - Nonstandard
47
+ metadata: {}
48
+ post_install_message:
49
+ rdoc_options: []
50
+ require_paths:
51
+ - lib
52
+ required_ruby_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 2.5.1
65
+ signing_key:
66
+ specification_version: 4
67
+ summary: NCBI Taxonomy search using local repository
68
+ test_files: []