bio-exominer 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,58 @@
1
+ # encoding: utf-8
2
+
3
+ require 'rubygems'
4
+ require 'bundler'
5
+ begin
6
+ Bundler.setup(:default, :development)
7
+ rescue Bundler::BundlerError => e
8
+ $stderr.puts e.message
9
+ $stderr.puts "Run `bundle install` to install missing gems"
10
+ exit e.status_code
11
+ end
12
+ require 'rake'
13
+
14
+ require 'jeweler'
15
+ Jeweler::Tasks.new do |gem|
16
+ # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
17
+ gem.name = "bio-exominer"
18
+ gem.homepage = "http://github.com/pjotrp/bioruby-exominer"
19
+ gem.license = "MIT"
20
+ gem.summary = %Q{Mine publications for gene names}
21
+ gem.description = %Q{Parse publications for gene names in a fuzzy fashion}
22
+ gem.email = "pjotr.public01@thebird.nl"
23
+ gem.authors = ["Pjotr Prins"]
24
+ # dependencies defined in Gemfile
25
+ end
26
+ Jeweler::RubygemsDotOrgTasks.new
27
+
28
+ # require 'rspec/core'
29
+ # require 'rspec/core/rake_task'
30
+ # RSpec::Core::RakeTask.new(:spec) do |spec|
31
+ # spec.pattern = FileList['spec/**/*_spec.rb']
32
+ # end
33
+
34
+ # RSpec::Core::RakeTask.new(:rcov) do |spec|
35
+ # spec.pattern = 'spec/**/*_spec.rb'
36
+ # spec.rcov = true
37
+ # end
38
+
39
+ require 'rake/testtask'
40
+
41
+ Rake::TestTask.new do |t|
42
+ t.pattern = "spec/*_spec.rb"
43
+ end
44
+
45
+ require 'cucumber/rake/task'
46
+ Cucumber::Rake::Task.new(:features)
47
+
48
+ task :default => :test
49
+
50
+ require 'rdoc/task'
51
+ Rake::RDocTask.new do |rdoc|
52
+ version = File.exist?('VERSION') ? File.read('VERSION') : ""
53
+
54
+ rdoc.rdoc_dir = 'rdoc'
55
+ rdoc.title = "bio-exominer #{version}"
56
+ rdoc.rdoc_files.include('README*')
57
+ rdoc.rdoc_files.include('lib/**/*.rb')
58
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.3
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # BioRuby exominer Plugin BioExominer
4
+ # Author:: Pjotr Prins
5
+ #
6
+ # Copyright (C) 2013,2014 Cuppen Group & Pjotr Prins <pjotr.prins@thebird.nl>
7
+
8
+ USAGE = "exominer takes a symbol file and parses the piped data for gene symbols"
9
+
10
+ gempath = File.dirname(File.dirname(__FILE__))
11
+ $: << File.join(gempath,'lib')
12
+
13
+ VERSION_FILENAME=File.join(gempath,'VERSION')
14
+ version = File.new(VERSION_FILENAME).read.chomp
15
+
16
+ # print banner
17
+ $stderr.print "exominer #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n"
18
+
19
+ if ARGV.size == 0
20
+ print USAGE
21
+ end
22
+
23
+ require 'bio-exominer'
24
+ require 'optparse'
25
+
26
+ # Uncomment when using the bio-logger
27
+ # require 'bio-logger'
28
+ # log = Bio::Log::LoggerPlus.new 'exominer'
29
+ # log.outputters = Bio::Log::Outputter.stderr
30
+ # Bio::Log::CLI.logger('stderr')
31
+ # Bio::Log::CLI.trace('info')
32
+
33
+ options = { show_help: false, symbols: []}
34
+ opts = OptionParser.new do |o|
35
+ o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} -s ncbi_symbols.tab --rdf < test.txt"
36
+
37
+ o.on("--rdf", "Generate RDF") do |b|
38
+ options[:rdf] = true
39
+ end
40
+ o.on("--name name", 'Set name of result set') do |name|
41
+ options[:name] = name
42
+ end
43
+ o.on("-s","--symbols fn", 'Symbol file') do |fn|
44
+ options[:symbols] += [fn]
45
+ end
46
+ o.on("--hugo [fn]", 'Hugo symbol file') do |fn|
47
+ if fn
48
+ options[:hugo] = fn
49
+ else
50
+ options[:hugo] = gempath + '/test/data/input/hugo_symbols'
51
+ end
52
+ end
53
+ o.on("-i","--ignore fn", 'Ignore symbols in fn (NYI)') do |fn|
54
+ options[:ignore] = fn
55
+ end
56
+ o.on("--context [TYPE]",[:off,:line], 'Context parser mode (off,line)') do |context|
57
+ options[:context] = context
58
+ end
59
+ o.on("--doi doi", 'DOI') do |doi|
60
+ options[:doi] = doi
61
+ end
62
# --tag takes a semicolon-separated list of key=value pairs, for example
# "name=paper1; type=article". The pairs are stored in options[:tags] as a
# Hash with Symbol keys.
o.on("--tag string", 'Tag string') do |tag|
  tags = {}
  tag.split(/\s?;\s?/).each do |field|
    # Limit of 2: only the first '=' separates key from value, so a value
    # that itself contains '=' is kept intact.
    key, value = field.strip.split(/\s?=\s?/, 2)
    # Skip empty fields (e.g. a leading ';') instead of failing on a nil key.
    next if key.nil? || key.empty?
    # A key without a value is stored as nil.
    value = nil if value && value.empty?
    tags[key.to_sym] = value
  end
  options[:tags] = tags
end
71
+
72
+ # Uncomment the following when using the bio-logger
73
+ # o.separator ""
74
+ # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
75
+ # Bio::Log::CLI.logger(name)
76
+ # end
77
+ #
78
+ # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
79
+ # Bio::Log::CLI.trace(s)
80
+ # end
81
+ #
82
+ # o.on("-q", "--quiet", "Run quietly") do |q|
83
+ # Bio::Log::CLI.trace('error')
84
+ # end
85
+ #
86
+ # o.on("-v", "--verbose", "Run verbosely") do |v|
87
+ # Bio::Log::CLI.trace('info')
88
+ # end
89
+ #
90
+ # o.on("--debug", "Show debug messages") do |v|
91
+ # Bio::Log::CLI.trace('debug')
92
+ # end
93
+
94
+ o.separator ""
95
+ o.on_tail('-h', '--help', 'display this help and exit') do
96
+ options[:show_help] = true
97
+ end
98
+ end
99
+
100
+ require 'bio-exominer/textparser'
101
+ require 'bio-exominer/symbols'
102
+ require 'yaml'
103
+
104
+ include BioExominer
105
+
106
+ begin
107
+ opts.parse!(ARGV)
108
+
109
+ if options[:show_help]
110
+ print opts
111
+ exit 1
112
+ end
113
+
114
+ $stderr.print options
115
+
116
+ # Create unique name for the resultset
117
+ doi = options[:doi]
118
+ name = options[:name]
119
+ name = options[:tags][:name].to_sym if not name and options[:tags] and options[:tags][:name]
120
+ name = doi if not name and doi
121
+ name = rand(36**8).to_s(36) if not name # finally a random ID, if nothing else works
122
+ give_context = options[:context] != :off
123
+
124
+ # context may be override by tags
125
+ options[:context]=options[:tags][:context] if options[:tags][:context]
126
+
127
+ # Uncomment when using the bio-logger
128
+ # Bio::Log::CLI.configure('exominer')
129
+ # logger = Bio::Log::LoggerPlus['exominer']
130
+ # Log parsed options and remaining arguments in ARGV
131
+ # logger.info [options, ARGV]
132
+
133
+ $stderr.print "\nLoading text..."
134
+ buf = ARGF.read
135
+ $stderr.print "\nTokenizing..."
136
+ tokens,context =
137
+ TextParser::tokenize_with_context(buf,options[:context])
138
+
139
+ symbol_count = 0
140
+ alias_count = 0
141
+ hugo_count = 0
142
+ hugo_matches = {}
143
+ symbol_matches = {} # match symbols
144
+ alias_matches = {} # match aliases
145
+ info = {} # the main symbol match tracker
146
+ hugo = {} # HUGO tracker
147
+
148
+ $stderr.print "\nParse symbol files..."
149
+ parse_symbols = lambda { |symbolfn,is_hugo=false|
150
+ # ---- for every symbol file
151
+ $stderr.print "\nParse symbol file #{symbolfn}..."
152
+ Symbols::each(symbolfn) do | symbol,aliases,descr |
153
+ # ---- for every symbol and aliases
154
+ # $stderr.print "\nHUGO-"+symbol if symbol =~ /L3MBTL/
155
+ hugo[symbol] = true if is_hugo
156
+ # alias_count += aliases.size if aliases
157
+ # ---- If the symbol has a match, and it is not in the list, add it
158
+ if tokens[symbol] and not info[symbol]
159
+ symbol_matches[symbol] = tokens[symbol]
160
+ hugo_matches[symbol] ||= true if is_hugo
161
+ info[symbol] = { :symbol => symbol, :is_hugo=>is_hugo, :aliases => aliases, :descr => descr, :symbolfn => symbolfn }
162
+ end
163
+ # ---- If an alias has a match and is not in the list, add it
164
+ if aliases
165
+ aliases.each do | word |
166
+ # $stderr.print "\n!!"+word+':'+symbol if word == "L3MBTL"
167
+ # full = word + ' (' + symbol + ')'
168
+ if tokens[word] and not info[word]
169
+ # $stderr.print "\n=="+word+':'+symbol if word == "L3MBTL"
170
+ hugo_matches[word] ||= true if hugo[word]
171
+ alias_matches[word] = tokens[word]
172
+ info[word] = { :symbol => symbol, :is_hugo=>hugo[word], :aliases => aliases, :descr => descr, :symbolfn => symbolfn }
173
+ end
174
+ end
175
+ end
176
+ end
177
+ }
178
+
179
+ parse_symbols.call(options[:hugo],is_hugo = true) if options[:hugo]
180
+ options[:symbols].each { |fn| parse_symbols.call(fn) }
181
+
182
+ if options[:rdf]
183
+ # Write RDF!
184
+ print <<HEADER
185
+
186
+ # RDF output by bio-exominer https://github.com/pjotrp/bioruby-exominer
187
+ @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
188
+ @prefix dc: <http://purl.org/dc/elements/1.1/> .
189
+ @prefix hgnc: <http://identifiers.org/hgnc.symbol/> .
190
+ @prefix doi: <http://dx.doi.org/> .
191
+ @prefix bibo: <http://purl.org/ontology/bibo/> .
192
+ @prefix ncbigene: <https://www.google.nl/search?q=ncbi+gene+alias+> .
193
+ @prefix : <http://biobeat.org/rdf/exominer/ns#> .
194
+
195
+ HEADER
196
+
197
+ # Fix DOI with http://www.doi.org/doi_handbook/2_Numbering.html#2.6.2
198
+ print ":#{name} :doi \"doi:#{doi}\" . \n" if doi
199
+ print ":#{name} rdf:label \"#{name}\" . \n"
200
+ print ":#{name} a \"text resource\" . \n"
201
+ print ":#{name} dc:type \"#{options[:type]}\" . \n" if options[:type]
202
+ if options[:tags]
203
+ options[:tags].each do | k,v |
204
+ print ":#{name} :#{k.to_sym} \"#{v}\" .\n"
205
+ end
206
+ end
207
+ print "\n"
208
+
209
+ write_symbol_rdf = lambda { |symbol,freq,is_alias=false|
210
+ match_info = info[symbol]
211
+ match_context = context[symbol].join("; ")
212
+ symbol1 = match_info[:symbol]
213
+ symboluri = Symbols::uri(symbol1,hugo)
214
+ symbolidentifier = RDF::make_identifier(symbol1)
215
+ symbol1 = symbolidentifier if not hugo[symbol1]
216
+ symbolref = name + '_' + symbol1
217
+ print ":#{symbolref} a #{symboluri} .\n"
218
+ # print ":#{symbolref} :alias #{Symbols::uri(symbol,hugo)} .\n" if symbol1 != symbol
219
+ print ":#{symbolref} :textmatch \"#{symbol}\" .\n"
220
+ print ":#{symbolref} dc:partOf :#{name} .\n"
221
+ print ":#{symbolref} :frequency #{freq} .\n"
222
+ print ":#{symbolref} :context \"#{match_context.encode("UTF-8").encode(:xml => :text)}\" .\n" if give_context
223
+ print "#{symboluri} rdf:label \"#{symboluri}\" .\n"
224
+ print "#{symboluri} rdf:comment \"#{info[symbol][:descr]}\" .\n"
225
+ print "\n"
226
+ }
227
+
228
+ alias_matches.each do | symbol, freq |
229
+ write_symbol_rdf.call(symbol,freq,is_alias=true) if not symbol_matches[symbol]
230
+ end
231
+ symbol_matches.each do | symbol, freq |
232
+ write_symbol_rdf.call(symbol,freq,is_alias=false)
233
+ end
234
+ else
235
+ print "\nMatching symbol aliases:\n"
236
+ alias_matches.sort_by{|k,v| v}.each do | k,v |
237
+ print v,"\t",(hugo[info[k][:symbol]]?" HUGO":""),"\t",k,"\t",info[k][:symbol],"\t",info[k][:descr],"\n" if not symbol_matches[k]
238
+ end
239
+ print "\nMatching symbols:\n"
240
+ symbol_matches.sort_by{|k,v| v}.each do | k,v |
241
+ print v,"\t",(hugo[k]?" HUGO":""),"\t",k,"\t",info[k][:descr],"\n"
242
+ end
243
+ end
244
+ # $stderr.print "\nA total of #{symbol_count} symbols and #{alias_count} aliases scanned."
245
+ $stderr.print "\nThere were #{hugo_matches.size} HUGO matches out of #{hugo.size} symbols."
246
+ $stderr.print "\nDone!\n"
247
+ rescue OptionParser::InvalidOption => e
248
+ options[:invalid_argument] = e.message
249
+ end
250
+
@@ -0,0 +1,74 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # Convert HUGO (HGNC) gene data to symbol file and calculate used letter
4
+ # frequencies. Note: all symbols that are numbers are removed.
5
+ #
6
+ # Copyright (C) 2013 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ #
8
+
9
+ require 'yaml'
10
+
11
# Decide whether +s+ is usable as a gene symbol.
#
# Surrounding whitespace is ignored. A missing (nil) symbol, a blank symbol
# and a symbol consisting only of digits are rejected. A symbol containing
# whitespace is accepted, but a warning is written to $stderr.
#
# Returns true or false
def valid_symbol s
  return false if s.nil?           # column absent on a short input line
  s = s.strip
  return false if s.empty?
  return false if s =~ /\A\d+\z/   # drop all digit id's
  $stderr.print "Symbol contains a space! <"+s+">\n" if s =~ /\s/
  true
end
18
+
19
# Helper for reporting character frequencies.
module Freq
  # Render the frequency table +freq+ (a Hash) as its YAML dump, followed by
  # an empty line and all keys concatenated in sorted order.
  #
  # Called without an argument it falls back to the regular Module#to_s, so
  # interpolating or printing the module itself keeps working.
  def Freq::to_s(freq = nil)
    return super() if freq.nil?
    # join builds the key string in one pass instead of reallocating per key
    freq.to_yaml + "\n" + freq.keys.sort.join
  end
end
28
+
29
+ freq = {}
30
+
31
+ counter = 0
32
+
33
+ ARGF.each_line do | line |
34
+ counter += 1
35
+ $stderr.print "." if counter % 10_000 == 0
36
+
37
+ next if line =~ /^HGNC ID/
38
+ a = line.strip.split(/\t/)
39
+ symbol = a[1]
40
+ next if not valid_symbol(symbol)
41
+ name = a[2]
42
+ oldnames = nil
43
+ oldnames = a[4].strip.split(/\s?,\s?/) if a.size > 4
44
+ aliases = nil
45
+ aliases = a[6].strip.split(/\s?,\s?/) if a.size > 6
46
+ # p [a[4],a[6]]
47
+ as = []
48
+ as = aliases if aliases
49
+ as += oldnames if oldnames
50
+ as.reject! { |c| c.empty? }
51
+ aliases = if as.size == 0
52
+ 'NA'
53
+ else
54
+ as.uniq.join('|')
55
+ end
56
+ descr = a[2]
57
+ descr = '' if descr == '-'
58
+ print symbol,"\t",aliases,"\t",descr,"\n"
59
+ # Add stats
60
+ cs = symbol.scan(/./)
61
+ if aliases != 'NA'
62
+ cs += aliases.scan(/./) - ['|']
63
+ end
64
+ cs.each do |c|
65
+ freq[c] = 0 if not freq[c]
66
+ freq[c] += 1
67
+ end
68
+ end
69
+
70
+ File.open('hugo_exominer_symbols.freq','w') do |f|
71
+ f.print(Freq::to_s(freq))
72
+ end
73
+
74
+
@@ -0,0 +1,79 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # Convert NCBI gene_info to symbol file and calculate used letter
4
+ # frequencies. Note: all symbols that are numbers are removed.
5
+ #
6
+ # Copyright (C) 2013 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ #
8
+
9
+ require 'yaml'
10
+
11
# Decide whether +s+ is usable as a gene symbol or alias.
#
# Surrounding whitespace is ignored. A missing (nil) value, a blank value
# and a value consisting only of digits are rejected. A value containing
# whitespace is accepted, but a warning is written to $stderr.
#
# Returns true or false
def valid_symbol s
  return false if s.nil?           # column absent on a short input line
  s = s.strip
  return false if s.empty?
  return false if s =~ /\A\d+\z/   # drop all digit id's
  $stderr.print "Symbol contains a space! <"+s+">\n" if s =~ /\s/
  true
end
18
+
19
# Helper for reporting character frequencies.
module Freq
  # Render the frequency table +freq+ (a Hash) as its YAML dump, followed by
  # an empty line and all keys concatenated in sorted order.
  #
  # Called without an argument it falls back to the regular Module#to_s, so
  # interpolating or printing the module itself keeps working.
  def Freq::to_s(freq = nil)
    return super() if freq.nil?
    # join builds the key string in one pass instead of reallocating per key
    freq.to_yaml + "\n" + freq.keys.sort.join
  end
end
28
+
29
+ freq = {}
30
+
31
+ counter = 0
32
+
33
+ ARGF.each_line do | line |
34
+ counter += 1
35
+ $stderr.print "." if counter % 100_000 == 0
36
+
37
+ next if line =~ /^#/
38
+ a = line.strip.split(/\t/)
39
+ symbol = a[2]
40
+ next if symbol == 'NEWENTRY' or symbol == '-'
41
+ # Skip gene names that are numbers only
42
+ next if not valid_symbol(symbol)
43
+ name = a[3]
44
+ aliases = a[4]
45
+ if name != '-' and name != symbol and valid_symbol(name)
46
+ if aliases == '-'
47
+ aliases = name
48
+ else
49
+ aliases += '|'+name
50
+ end
51
+ end
52
+ aliases =
53
+ if aliases == '-'
54
+ 'NA'
55
+ else
56
+ as1 = aliases.split(/\|/)
57
+ # Skip gene names that are numbers only
58
+ as2 = as1.delete_if { |a| not valid_symbol(a) }
59
+ as2.uniq.join('|')
60
+ end
61
+ descr = a[8]
62
+ descr = '' if descr == '-'
63
+ print symbol,"\t",aliases,"\t",descr,"\n"
64
+ # Add stats
65
+ cs = symbol.scan(/./)
66
+ if aliases != 'NA'
67
+ cs += aliases.scan(/./) - ['|']
68
+ end
69
+ cs.each do |c|
70
+ freq[c] = 0 if not freq[c]
71
+ freq[c] += 1
72
+ end
73
+ end
74
+
75
+ File.open('ncbi_exominer_symbols.freq','w') do |f|
76
+ f.print(Freq::to_s(freq))
77
+ end
78
+
79
+