bio-exominer 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,58 @@
# encoding: utf-8

# Rake tasks for the bio-exominer gem: gem packaging through Jeweler, a
# test task over spec/*_spec.rb, Cucumber features and RDoc generation.

require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "bio-exominer"
  gem.homepage = "http://github.com/pjotrp/bioruby-exominer"
  gem.license = "MIT"
  gem.summary = %Q{Mine publications for gene names}
  gem.description = %Q{Parse publications for gene names in a fuzzy fashion}
  gem.email = "pjotr.public01@thebird.nl"
  gem.authors = ["Pjotr Prins"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# require 'rspec/core'
# require 'rspec/core/rake_task'
# RSpec::Core::RakeTask.new(:spec) do |spec|
#   spec.pattern = FileList['spec/**/*_spec.rb']
# end

# RSpec::Core::RakeTask.new(:rcov) do |spec|
#   spec.pattern = 'spec/**/*_spec.rb'
#   spec.rcov = true
# end

require 'rake/testtask'

Rake::TestTask.new do |t|
  t.pattern = "spec/*_spec.rb"
end

require 'cucumber/rake/task'
Cucumber::Rake::Task.new(:features)

task :default => :test

require 'rdoc/task'
# RDoc::Task is the class defined by rdoc/task; Rake::RDocTask is only a
# backwards-compatibility alias for it.
RDoc::Task.new do |rdoc|
  # Strip the trailing newline so it does not end up inside the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "bio-exominer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.3
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # BioRuby exominer Plugin BioExominer
4
+ # Author:: Pjotr Prins
5
+ #
6
+ # Copyright (C) 2013,2014 Cuppen Group & Pjotr Prins <pjotr.prins@thebird.nl>
7
+
# One-line description printed when the tool is started without arguments.
USAGE = "exominer takes a symbol file and parses the piped data for gene symbols"

# The gem root is the parent of the directory that holds this script; its
# lib/ directory is appended to the load path.
gempath = File.dirname(File.dirname(__FILE__))
$: << File.join(gempath, 'lib')

VERSION_FILENAME = File.join(gempath, 'VERSION')
# File.read closes the file after reading; File.new(...).read left the
# handle open until garbage collection.
version = File.read(VERSION_FILENAME).chomp

# print banner
$stderr.print "exominer #{version} (biogem Ruby #{RUBY_VERSION}) by Pjotr Prins 2014\n"

print USAGE if ARGV.empty?
22
+
23
+ require 'bio-exominer'
24
+ require 'optparse'
25
+
26
+ # Uncomment when using the bio-logger
27
+ # require 'bio-logger'
28
+ # log = Bio::Log::LoggerPlus.new 'exominer'
29
+ # log.outputters = Bio::Log::Outputter.stderr
30
+ # Bio::Log::CLI.logger('stderr')
31
+ # Bio::Log::CLI.trace('info')
32
+
# Parsed command line settings. :symbols collects every -s/--symbols file in
# the order given; the other keys are only set when their switch is used.
options = { show_help: false, symbols: [] }

# Command line definition. Nothing is parsed here; the caller runs
# opts.parse! and the handlers below fill +options+.
opts = OptionParser.new do |o|
  o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} -s ncbi_symbols.tab --rdf < test.txt"

  o.on("--rdf", "Generate RDF") do
    options[:rdf] = true
  end
  o.on("--name name", 'Set name of result set') do |name|
    options[:name] = name
  end
  o.on("-s","--symbols fn", 'Symbol file') do |fn|
    options[:symbols] << fn
  end
  o.on("--hugo [fn]", 'Hugo symbol file') do |fn|
    # Without an argument fall back to the symbol file shipped in the gem tree
    options[:hugo] = fn || gempath + '/test/data/input/hugo_symbols'
  end
  o.on("-i","--ignore fn", 'Ignore symbols in fn (NYI)') do |fn|
    options[:ignore] = fn
  end
  o.on("--context [TYPE]",[:off,:line], 'Context parser mode (off,line)') do |context|
    options[:context] = context
  end
  o.on("--doi doi", 'DOI') do |doi|
    options[:doi] = doi
  end
  # Tags are given as "key=value; key=value". Keys become symbols; a key
  # without a value maps to nil.
  o.on("--tag string", 'Tag string') do |tag|
    tags = {}
    tag.split(/\s?;\s?/).each do |field|
      # Limit 2 keeps any further '=' inside the value (e.g. URLs)
      key, value = field.strip.split(/\s?=\s?/, 2)
      # Skip empty fields such as the one between ';;'
      next if key.nil? || key.empty?
      value = nil if value && value.empty?
      tags[key.to_sym] = value
    end
    options[:tags] = tags
  end

  # Uncomment the following when using the bio-logger
  # o.separator ""
  # o.on("--logger filename",String,"Log to file (default stderr)") do | name |
  #   Bio::Log::CLI.logger(name)
  # end
  #
  # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s |
  #   Bio::Log::CLI.trace(s)
  # end
  #
  # o.on("-q", "--quiet", "Run quietly") do |q|
  #   Bio::Log::CLI.trace('error')
  # end
  #
  # o.on("-v", "--verbose", "Run verbosely") do |v|
  #   Bio::Log::CLI.trace('info')
  # end
  #
  # o.on("--debug", "Show debug messages") do |v|
  #   Bio::Log::CLI.trace('debug')
  # end

  o.separator ""
  o.on_tail('-h', '--help', 'display this help and exit') do
    options[:show_help] = true
  end
end
99
+
100
+ require 'bio-exominer/textparser'
101
+ require 'bio-exominer/symbols'
102
+ require 'yaml'
103
+
104
+ include BioExominer
105
+
# Main program: parse the command line, tokenize the text read from ARGF,
# match the tokens against every symbol file and write the matches either
# as RDF (--rdf) or as tab separated tables on stdout. Progress messages go
# to stderr.
begin
  opts.parse!(ARGV)

  if options[:show_help]
    print opts
    exit 1
  end

  $stderr.print options

  # --tag is optional, so fall back to an empty set for the lookups below
  tags = options[:tags] || {}

  # Create unique name for the resultset
  doi  = options[:doi]
  name = options[:name]
  # Keep the name a String: it is joined with the symbol name further down
  name ||= tags[:name].to_s if tags[:name]
  name ||= doi
  name ||= rand(36**8).to_s(36) # finally a random ID, if nothing else works

  # context may be overridden by tags
  options[:context] = tags[:context] if tags[:context]
  # Decide after the override; to_s covers both the Symbol set by --context
  # and the String that comes from a tag
  give_context = options[:context].to_s != 'off'

  # Uncomment when using the bio-logger
  # Bio::Log::CLI.configure('exominer')
  # logger = Bio::Log::LoggerPlus['exominer']
  # Log parsed options and remaining arguments in ARGV
  # logger.info [options, ARGV]

  $stderr.print "\nLoading text..."
  buf = ARGF.read
  $stderr.print "\nTokenizing..."
  tokens, context = TextParser::tokenize_with_context(buf, options[:context])

  hugo_matches   = {}
  symbol_matches = {} # match symbols
  alias_matches  = {} # match aliases
  info = {}           # the main symbol match tracker
  hugo = {}           # HUGO tracker

  $stderr.print "\nParse symbol files..."
  # Scan one symbol file and record every symbol or alias found in the text.
  # The first file that matches a word wins; later files do not overwrite it.
  parse_symbols = lambda { |symbolfn, is_hugo = false|
    # ---- for every symbol file
    $stderr.print "\nParse symbol file #{symbolfn}..."
    Symbols::each(symbolfn) do |symbol, aliases, descr|
      # ---- for every symbol and aliases
      # $stderr.print "\nHUGO-"+symbol if symbol =~ /L3MBTL/
      hugo[symbol] = true if is_hugo
      # ---- If the symbol has a match, and it is not in the list, add it
      if tokens[symbol] and not info[symbol]
        symbol_matches[symbol] = tokens[symbol]
        hugo_matches[symbol] ||= true if is_hugo
        info[symbol] = { :symbol => symbol, :is_hugo => is_hugo, :aliases => aliases, :descr => descr, :symbolfn => symbolfn }
      end
      # ---- If an alias has a match and is not in the list, add it
      next unless aliases
      aliases.each do |word|
        # $stderr.print "\n!!"+word+':'+symbol if word == "L3MBTL"
        next unless tokens[word] and not info[word]
        # $stderr.print "\n=="+word+':'+symbol if word == "L3MBTL"
        hugo_matches[word] ||= true if hugo[word]
        alias_matches[word] = tokens[word]
        info[word] = { :symbol => symbol, :is_hugo => hugo[word], :aliases => aliases, :descr => descr, :symbolfn => symbolfn }
      end
    end
  }

  # The HUGO file goes first so its entries take precedence
  parse_symbols.call(options[:hugo], true) if options[:hugo]
  options[:symbols].each { |fn| parse_symbols.call(fn) }

  if options[:rdf]
    # Write RDF!
    print <<HEADER

# RDF output by bio-exominer https://github.com/pjotrp/bioruby-exominer
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix dc: <http://purl.org/dc/elements/1.1/> .
@prefix hgnc: <http://identifiers.org/hgnc.symbol/> .
@prefix doi: <http://dx.doi.org/> .
@prefix bibo: <http://purl.org/ontology/bibo/> .
@prefix ncbigene: <https://www.google.nl/search?q=ncbi+gene+alias+> .
@prefix : <http://biobeat.org/rdf/exominer/ns#> .

HEADER

    # Fix DOI with http://www.doi.org/doi_handbook/2_Numbering.html#2.6.2
    print ":#{name} :doi \"doi:#{doi}\" . \n" if doi
    print ":#{name} rdf:label \"#{name}\" . \n"
    print ":#{name} a \"text resource\" . \n"
    print ":#{name} dc:type \"#{options[:type]}\" . \n" if options[:type]
    tags.each do |k, v|
      print ":#{name} :#{k} \"#{v}\" .\n"
    end
    print "\n"

    # Emit the triples for one matched word. +symbol+ is the word as found in
    # the text, +freq+ its frequency; +is_alias+ is accepted but not used.
    write_symbol_rdf = lambda { |symbol, freq, is_alias = false|
      match_info = info[symbol]
      symbol1 = match_info[:symbol]
      symboluri = Symbols::uri(symbol1, hugo)
      symbolidentifier = RDF::make_identifier(symbol1)
      symbol1 = symbolidentifier if not hugo[symbol1]
      # Interpolation instead of '+', so any name type can be joined
      symbolref = "#{name}_#{symbol1}"
      print ":#{symbolref} a #{symboluri} .\n"
      # print ":#{symbolref} :alias #{Symbols::uri(symbol,hugo)} .\n" if symbol1 != symbol
      print ":#{symbolref} :textmatch \"#{symbol}\" .\n"
      print ":#{symbolref} dc:partOf :#{name} .\n"
      print ":#{symbolref} :frequency #{freq} .\n"
      if give_context
        # Only build the context string when it is actually written
        match_context = context[symbol].join("; ")
        print ":#{symbolref} :context \"#{match_context.encode("UTF-8").encode(:xml => :text)}\" .\n"
      end
      print "#{symboluri} rdf:label \"#{symboluri}\" .\n"
      print "#{symboluri} rdf:comment \"#{match_info[:descr]}\" .\n"
      print "\n"
    }

    # Aliases first, skipping words that also matched as a primary symbol
    alias_matches.each do |symbol, freq|
      write_symbol_rdf.call(symbol, freq, true) if not symbol_matches[symbol]
    end
    symbol_matches.each do |symbol, freq|
      write_symbol_rdf.call(symbol, freq, false)
    end
  else
    print "\nMatching symbol aliases:\n"
    alias_matches.sort_by { |k, v| v }.each do |k, v|
      print v,"\t",(hugo[info[k][:symbol]]?" HUGO":""),"\t",k,"\t",info[k][:symbol],"\t",info[k][:descr],"\n" if not symbol_matches[k]
    end
    print "\nMatching symbols:\n"
    symbol_matches.sort_by { |k, v| v }.each do |k, v|
      print v,"\t",(hugo[k]?" HUGO":""),"\t",k,"\t",info[k][:descr],"\n"
    end
  end
  # $stderr.print "\nA total of #{symbol_count} symbols and #{alias_count} aliases scanned."
  $stderr.print "\nThere were #{hugo_matches.size} HUGO matches out of #{hugo.size} symbols."
  $stderr.print "\nDone!\n"
rescue OptionParser::ParseError => e
  # ParseError covers invalid options as well as missing or invalid
  # arguments. Report the problem and fail instead of exiting silently.
  options[:invalid_argument] = e.message
  $stderr.print "#{e.message}\n#{opts}\n"
  exit 1
end
250
+
@@ -0,0 +1,74 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # Convert a HUGO (HGNC) gene table to symbol file and calculate used letter
4
+ # frequencies. Note: all symbols that are numbers are removed.
5
+ #
6
+ # Copyright (C) 2013 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ #
8
+
9
+ require 'yaml'
10
+
# Decide whether +s+ can be used as a gene symbol or alias.
#
# Returns false for nil, for empty or whitespace-only strings and for
# strings that consist of digits only (numeric ids); returns true otherwise.
# A name that contains whitespace is still accepted, but a warning is
# written to stderr.
def valid_symbol(s)
  # Blank or truncated input lines yield nil columns
  return false if s.nil?
  s = s.strip
  return false if s.empty?
  return false if s =~ /\A\d+\z/ # drop all digit id's
  $stderr.print "Symbol contains a space! <"+s+">\n" if s =~ /\s/
  true
end
18
+
# Formatting helper for the character frequency table.
module Freq
  # Render +freq+ (a Hash) as its YAML dump, followed by an empty line and
  # all keys joined in sorted order. Returns the resulting String.
  #
  # The keys are joined in one step rather than appended one by one, which
  # avoids allocating a new string per key.
  def self.to_s(freq)
    freq.to_yaml + "\n" + freq.keys.sort.join
  end
end
28
+
# Character => number of occurrences over all printed symbols and aliases
freq = {}

counter = 0

# Read the tab separated table from ARGF and print one
# "symbol<TAB>aliases<TAB>description" line per accepted symbol.
ARGF.each_line do |line|
  counter += 1
  $stderr.print "." if counter % 10_000 == 0

  next if line =~ /^HGNC ID/
  a = line.strip.split(/\t/)
  symbol = a[1]
  # Blank or truncated lines have no symbol column
  next if symbol.nil?
  next if not valid_symbol(symbol)
  # Column 4 and column 6 both hold comma separated names; both end up in
  # the alias list, column 6 first
  oldnames = a.size > 4 ? a[4].strip.split(/\s?,\s?/) : []
  aliases  = a.size > 6 ? a[6].strip.split(/\s?,\s?/) : []
  as = (aliases + oldnames).reject { |c| c.empty? }
  aliases = as.empty? ? 'NA' : as.uniq.join('|')
  descr = a[2]
  descr = '' if descr == '-'
  print symbol,"\t",aliases,"\t",descr,"\n"
  # Add stats
  cs = symbol.scan(/./)
  cs += aliases.scan(/./) - ['|'] if aliases != 'NA'
  cs.each do |c|
    freq[c] = freq.fetch(c, 0) + 1
  end
end

# Store the character statistics next to the generated symbol list
File.open('hugo_exominer_symbols.freq','w') do |f|
  f.print(Freq::to_s(freq))
end
73
+
74
+
@@ -0,0 +1,79 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # Convert NCBI gene_info to symbol file and calculate used letter
4
+ # frequencies. Note: all symbols that are numbers are removed.
5
+ #
6
+ # Copyright (C) 2013 Pjotr Prins <pjotr.prins@thebird.nl>
7
+ #
8
+
9
+ require 'yaml'
10
+
# Decide whether +s+ can be used as a gene symbol or alias.
#
# Returns false for nil, for empty or whitespace-only strings and for
# strings that consist of digits only (numeric ids); returns true otherwise.
# A name that contains whitespace is still accepted, but a warning is
# written to stderr.
def valid_symbol(s)
  # Blank or truncated input lines yield nil columns
  return false if s.nil?
  s = s.strip
  return false if s.empty?
  return false if s =~ /\A\d+\z/ # drop all digit id's
  $stderr.print "Symbol contains a space! <"+s+">\n" if s =~ /\s/
  true
end
18
+
# Formatting helper for the character frequency table.
module Freq
  # Render +freq+ (a Hash) as its YAML dump, followed by an empty line and
  # all keys joined in sorted order. Returns the resulting String.
  #
  # The keys are joined in one step rather than appended one by one, which
  # avoids allocating a new string per key.
  def self.to_s(freq)
    freq.to_yaml + "\n" + freq.keys.sort.join
  end
end
28
+
# Character => number of occurrences over all printed symbols and aliases
freq = {}

counter = 0

# Read the tab separated table from ARGF and print one
# "symbol<TAB>aliases<TAB>description" line per accepted symbol.
ARGF.each_line do |line|
  counter += 1
  $stderr.print "." if counter % 100_000 == 0

  next if line =~ /^#/
  a = line.strip.split(/\t/)
  symbol = a[2]
  # Blank or truncated lines have no symbol column
  next if symbol.nil?
  next if symbol == 'NEWENTRY' || symbol == '-'
  # Skip gene names that are numbers only
  next if not valid_symbol(symbol)
  name = a[3]
  # '-' marks an empty alias column; a missing column is treated the same
  listed = a[4] || '-'
  candidates = listed == '-' ? [] : listed.split(/\|/)
  # The name in column 3 counts as one more alias unless it repeats the symbol
  candidates << name if name && name != '-' && name != symbol
  # Skip gene names that are numbers only
  usable = candidates.select { |candidate| valid_symbol(candidate) }.uniq
  # 'NA' also when every candidate was filtered out, so the column is never empty
  aliases = usable.empty? ? 'NA' : usable.join('|')
  descr = a[8]
  descr = '' if descr == '-'
  print symbol,"\t",aliases,"\t",descr,"\n"
  # Add stats
  cs = symbol.scan(/./)
  cs += aliases.scan(/./) - ['|'] if aliases != 'NA'
  cs.each do |c|
    freq[c] = freq.fetch(c, 0) + 1
  end
end

# Store the character statistics next to the generated symbol list
File.open('ncbi_exominer_symbols.freq','w') do |f|
  f.print(Freq::to_s(freq))
end
78
+
79
+