bio-exominer 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +14 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +20 -0
- data/README.md +413 -0
- data/Rakefile +58 -0
- data/VERSION +1 -0
- data/bin/exominer +250 -0
- data/bin/hugo_exominer_symbols +74 -0
- data/bin/ncbi_exominer_symbols +79 -0
- data/bin/pack_exominer_symbols +38 -0
- data/features/bio-exominer.feature +9 -0
- data/features/step_definitions/bio-exominer_steps.rb +0 -0
- data/features/support/env.rb +13 -0
- data/lib/bio-exominer.rb +14 -0
- data/lib/bio-exominer/exominer.rb +3 -0
- data/lib/bio-exominer/rdf.rb +38 -0
- data/lib/bio-exominer/symbols.rb +49 -0
- data/lib/bio-exominer/textparser.rb +124 -0
- data/scripts/4store.sh +30 -0
- data/scripts/example.sh +9 -0
- data/scripts/example_rdf.sh +7 -0
- data/scripts/load_rdf.sh +15 -0
- data/spec/bio-exominer_spec.rb +8 -0
- data/spec/rdf_spec.rb +28 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/text_parser_spec.rb +59 -0
- data/test/data/input/hugo_symbols +38106 -0
- metadata +195 -0
@@ -0,0 +1,38 @@
|
|
1
|
+
#! /usr/bin/env ruby
#
# Pack symbol file
#
# Without options: reads tab-delimited symbol lines from ARGF (files named on
# the command line, or stdin) and writes them as MessagePack records to
# ./symbols.bin.
#
# With -d: reads ./symbols.bin and prints every record to stdout as
# symbol<TAB>alias|alias (or NA)<TAB>description.
#
# Copyright (C) 2013 Pjotr Prins <pjotr.prins@thebird.nl>
#

gempath = File.dirname(File.dirname(__FILE__))
$: << File.join(gempath,'lib')

require 'msgpack'
require 'bio-exominer/symbols'

include BioExominer

if ARGV[0] == '-d'
  ARGV.shift

  # Status goes to stderr so that stdout carries nothing but the records;
  # otherwise the message is glued onto the first output line.
  $stderr.puts "Unpacking symbols.bin..."
  File.open('symbols.bin','rb') do |io|
    begin
      MessagePack::Unpacker.new(io).each do |obj|
        print obj[0],"\t",(obj[1] ? obj[1].join('|') : "NA"),"\t",obj[2],"\n"
      end
    rescue EOFError
      # end of the packed stream: nothing more to print
    end
  end
else

  $stderr.puts "Writing symbols.bin..."
  # The block form closes (and thereby flushes) the file even on error.
  File.open('symbols.bin','wb') do |io|
    packer = MessagePack::Packer.new(io)

    ARGF.each_line do | line |
      # A blank line has no symbol column and cannot be parsed
      next if line.strip.empty?
      symbol,aliases,descr = Symbols::parse_line(line)
      packer.write([symbol,aliases,descr])
    end
    # Push whatever the packer still buffers into the file
    packer.flush
  end

end
|
File without changes
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'bundler'

# Activate the :default and :development gem groups; when Bundler cannot do
# that, report why on stderr and exit with the status Bundler suggests.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message, "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

# Put the gem's own lib directory first on the load path
lib_dir = File.dirname(__FILE__) + '/../../lib'
$LOAD_PATH.unshift(lib_dir)
require 'bio-exominer'

require 'rspec/expectations'
|
data/lib/bio-exominer.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-exominer/rdf'
|
12
|
+
require 'bio-exominer/textparser'
|
13
|
+
require 'bio-exominer/exominer'
|
14
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module BioExominer
|
2
|
+
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
# FIXME: use bioruby-rdf modules instead! It is all there now.
|
6
|
+
module RDF

  # Characters accepted in an identifier: a fixed set of unreserved and
  # punctuation characters, or a percent sign followed by two hex digits.
  #
  # `#` and `$` are escaped on purpose. Written bare, the sequence `#$&`
  # inside a regexp literal is read by Ruby as interpolation of the
  # last-match global `$&`, which silently changes the character class.
  # \A and \z anchor the whole string; ^ and $ would accept any string
  # whose first line happens to be valid.
  VALID_URI = /\A(?:[!\#\$&-;=?_a-z~]|%[0-9a-f]{2})+\z/i

  # Returns a truthy value (the match position, 0) when +uri+ consists
  # only of characters allowed by VALID_URI, and nil otherwise.
  def RDF::valid_uri? uri
    uri =~ VALID_URI
  end

  # An identifier is used for the subject and predicate in RDF. This is a case-sensitive
  # (shortened) URI. You can change default behaviour for identifiers using the options
  # --transform-ids (i.e. in the input side, rather than the output side)
  #
  # The string +s+ is stripped and mangled:
  #   * non-printable characters and the characters # ) ( , are dropped
  #   * % becomes "perc"
  #   * runs of whitespace . $ / \ become a single underscore
  #   * | - and : each become an underscore
  #   * a leading digit gets an 'r' prefix
  #
  # A warning is written to stderr whenever the result differs from +s+.
  # Raises RuntimeError when the mangled identifier still is not a valid URI
  # (for example when nothing is left of it).
  def RDF::make_identifier(s)
    id = s.strip.gsub(/[^[:print:]]/, '').gsub(/[#)(,]/,"").gsub(/[%]/,"perc").gsub(/(\s|\.|\$|\/|\\)+/,"_")
    # id = URI::escape(id)
    id = id.gsub(/\|/,'_')
    id = id.gsub(/\-|:/,'_')
    if id != s
      # logger = Bio::Log::LoggerPlus['bio-table']
      $stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
    end
    unless RDF::valid_uri?(id)
      raise "Invalid URI after mangling <#{s}> to <#{id}>!"
    end
    # The check above runs on the unprefixed id; the prefix is added last
    id =~ /^\d/ ? 'r' + id : id
  end

end
|
37
|
+
end
|
38
|
+
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module BioExominer
|
2
|
+
|
3
|
+
require 'msgpack'
|
4
|
+
require 'bio-exominer/rdf'
|
5
|
+
|
6
|
+
module Symbols

  # Make a prefixed URI out of a symbol: "hgnc:" when +hugo+ has a truthy
  # entry for the symbol (e.g. hgnc:RAD51C), "ncbigene:" otherwise. The
  # symbol itself is mangled by RDF::make_identifier.
  def Symbols::uri(symbol,hugo)
    prefix = hugo[symbol] ? "hgnc:" : "ncbigene:"
    prefix + RDF::make_identifier(symbol)
  end

  # Split one tab-delimited line into [symbol, aliases, description].
  #
  # aliases is an Array of the '|' separated names, or nil when the column
  # reads 'NA' or is missing altogether. description is stripped and
  # defaults to "" when the column is missing.
  def Symbols::parse_line(line)
    symbol,aliases,descr = line.strip.split(/\t/)
    aliases =
      if aliases.nil? or aliases == 'NA'
        nil
      else
        aliases.split(/\|/)
      end
    return symbol,aliases,(descr ? descr.strip : "")
  end

  # Yield symbol, aliases, description for every record in file +fn+.
  #
  # A file name ending in ".bin" is read as a stream of MessagePack
  # records; anything else is read as tab-delimited text through
  # parse_line, skipping blank lines. The file is closed when iteration
  # ends. Without a block an Enumerator is returned.
  def Symbols::each(fn)
    return enum_for(:each, fn) unless block_given?

    if fn =~ /\.bin\z/
      File.open(fn,'rb') do |io|
        begin
          MessagePack::Unpacker.new(io).each do |obj|
            yield obj[0],obj[1],obj[2]
          end
        rescue EOFError
          # end of the packed stream
        end
      end
    else
      File.open(fn) do |io|
        io.each_line do | line |
          next if line.strip.empty?
          yield parse_line(line)
        end
      end
    end
  end
end
|
48
|
+
|
49
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# Text parsing
|
2
|
+
|
3
|
+
module BioExominer
|
4
|
+
|
5
|
+
module TextParser

  # Longest context excerpt (in characters, roughly) kept around a token
  MAX_SIZE = 120

  # Words that are never counted as tokens
  SKIP_TOKENS = %w{ can has Ma van large was polymerase had far a impact was East early
    face Park ali and team tag ras ac tail at al age ac TA tag small this pure such
    type gene pmc but is ten org we an term not as by lost et out how up per for
    end beta der The Ten Out At No How pdf Ding Jan To cell gov even Jun
    Sun DNA Nat in hit doc bin with set Nov unknown key link to cgi
    and or RDF NPG
  }.uniq.freeze

  # L3MBTL

  # A token is valid when it is not blank, is not made of digits and commas
  # only, and contains at least one ASCII letter. The whole string is
  # tested (\A..\z), not just one of its lines.
  def TextParser::valid_token? token
    return false if token.strip == ""
    return false if token =~ /\A[\d,]+\z/
    return false if token !~ /[a-zA-Z]/  # at least one word char
    true
  end

  # Count +word+ in the +tokens+ hash, unless it is listed in SKIP_TOKENS
  def TextParser::add tokens, word
    return if SKIP_TOKENS.include?(word)
    # return if word.size < 2
    tokens[word] = (tokens[word] || 0) + 1
  end

  # Return a copy of +w+ without a leading "[n]" reference, one leading "(",
  # one trailing ")", one trailing , : ; or ! and one leading and trailing
  # quote character - applied in that order. Returns nil for nil.
  def TextParser::rm_punctuation w
    return nil if w == nil
    word = w.sub(/^\[\d+\]/,'')
    word = word.sub(/^\(/,'')
    word = word.sub(/\)$/,'')
    word = word.sub(/[,:;!]$/,'')  # remove punctuation
    word = word.sub(/^[`"']/,'')   # remove starting quotes
    word = word.sub(/[`"']$/,'')   # remove ending quotes
    word
  end

  # Return tokens with count
  #
  # +buf+ is split on whitespace. Every word is looked at together with its
  # cleaned-up predecessor (p1) and successor (n1) to filter out likely
  # non-symbols. Returns a Hash of token => count.
  def TextParser::tokenize buf
    tokens = {}
    list = buf.split(/[\r\n\s]+/)
    last = list.size - 1
    list.each_with_index do | word,idx |
      p1 = (idx > 0    ? rm_punctuation(list[idx-1]) : nil)
      w1 = rm_punctuation(word)
      n1 = (idx < last ? rm_punctuation(list[idx+1]) : nil)
      next if w1.size < 2
      # Skip whatever follows a word like 'table' or 'figure' (S11, XX...)
      next if p1 =~ /table|dataset|supplement|figure|chapter|section|paragraph/i
      # Filter out letters+name
      if w1 =~ /^[A-Z]/ && w1.capitalize == w1
        next if n1 && n1.size == 1
        next if p1 && p1.size == 1
        next if n1 && n1.size == 2 && n1 =~ /^[A-Z][A-Z]/
        next if p1 && p1.size == 2 && p1 =~ /^[A-Z][A-Z]/
      end
      # ... and the two capitals standing next to a capitalized name
      if w1.size == 2 && w1 =~ /^[A-Z][A-Z]/
        next if p1 && p1 =~ /^[A-Z]/ && p1.capitalize == p1
        next if n1 && n1 =~ /^[A-Z]/ && n1.capitalize == n1
      end
      # Filter out all lowercase small names
      next if w1.size < 4 && w1 == w1.downcase && w1 !~ /\d/

      # Count the cleaned word and, when it differs, the raw word too.
      # Note that validity is judged on the raw word in both cases.
      if TextParser.valid_token?(word)
        add(tokens,w1)
        add(tokens,word) if word != w1
      end
      # split on dash or underscore
      if word =~ /-|_/
        word.split(/-|_/).each do |part|
          add(tokens,part) if TextParser.valid_token?(part)
        end
      end
    end
    tokens
  end

  # Return a list of tokens with count and context
  #
  # +buf+ is cut into lines (context_type :line or 'line') or otherwise
  # into sentences ending in a dot plus whitespace. Each piece is tokenized
  # on its own. Returns two hashes: token => total count, and token =>
  # Array with one context string per piece the token was found in. A
  # context longer than MAX_SIZE+2 is cut down to a window around the
  # first occurrence of the token.
  def TextParser::tokenize_with_context buf, context_type = :sentence
    tokens_context = {}
    tokens_count = {}
    half_size = MAX_SIZE/2
    # Split buf into sentences based on dots or newlines
    sentences =
      if context_type == :line or context_type == 'line'
        buf.split(/\n/)
      else
        buf.split(/\.\s+/)
      end
    sentences.each do | raw |
      # join wrapped lines and remove quotes
      sentence = raw.strip.gsub(/(\r|\n)\s*/,' ').gsub(/"/,'')
      tokenize(sentence).each do | token, count |
        # shorten the sentence
        excerpt =
          if sentence.size > MAX_SIZE+2
            pos = sentence.index(token)
            start = [pos-half_size, 0].max
            stop = pos+half_size
            # drop the (possibly cut) first and last word of the window
            sentence[start..stop].sub(/^\w+\s+/,'').sub(/\s+\w+$/,'')
          else
            sentence
          end
        tokens_count[token] = (tokens_count[token] || 0) + count
        (tokens_context[token] ||= []) << excerpt
      end
    end
    return tokens_count, tokens_context
  end
end
|
123
|
+
|
124
|
+
end
|
data/scripts/4store.sh
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#! /bin/sh
#
# Start the 4store backend and HTTP server (port 8081) for a database.
#
# Usage: 4store.sh [dbname [-r|-d]]
#
# The first argument, when given, is the database name (default: exominer);
# the option is read from the argument after it.
#
# Options
#
#   -r    Restart server
#   -d    Delete DB and restart server

dbname=exominer
if [ -n "$1" ] ; then
  dbname=$1
  shift
fi

echo "Starting DB $dbname"

if [ "$1" = "-r" ]; then
  killall 4s-httpd
  killall 4s-backend
fi

if [ "$1" = "-d" ]; then
  killall 4s-httpd
  killall 4s-backend
  4s-backend-setup "$dbname"
fi


4s-backend "$dbname"
4s-httpd -p 8081 "$dbname"
|
30
|
+
|
data/scripts/example.sh
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
#! /bin/sh

# Mine tcga_bc.txt for symbols and write the result as RDF
./bin/exominer --rdf --name tcga_bc --hugo \
  --tag 'title=Comprehensive molecular portraits of human breast tumours' \
  --tag 'year=2012;species=human;type=breast cancer' \
  -s ncbi_symbols.bin \
  < tcga_bc.txt > tcga_bc.rdf

# Upload the RDF to the HTTP server on port 8081
curl -T tcga_bc.rdf \
  -H 'Content-Type: application/x-turtle' \
  http://localhost:8081/data/exominer.rdf

# Query a few triples back
~/opt/bin/sparql-query http://localhost:8081/sparql/ \
  'SELECT * WHERE { ?s ?p ?o } LIMIT 5'
|
9
|
+
|
@@ -0,0 +1,7 @@
|
|
1
|
+
#! /bin/sh

# First run: show the RDF on stdout
./bin/exominer --rdf --name tcga_bc --hugo \
  --doi doi:10.1038/nature11412 \
  --tag 'title=Comprehensive molecular portraits of human breast tumours' \
  --tag 'year=2012;species=human;type=breast cancer' \
  < tcga_bc.txt

# Second run: same command, written to tcga_bc.rdf
./bin/exominer --rdf --name tcga_bc --hugo \
  --doi doi:10.1038/nature11412 \
  --tag 'title=Comprehensive molecular portraits of human breast tumours' \
  --tag 'year=2012;species=human;type=breast cancer' \
  < tcga_bc.txt > tcga_bc.rdf

# Upload the RDF to the HTTP server on port 8081
curl -T tcga_bc.rdf \
  -H 'Content-Type: application/x-turtle' \
  http://localhost:8081/data/exominer.rdf
|
data/scripts/load_rdf.sh
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#! /bin/sh
#
# Usage: load_rdf.sh file.rdf [file.rdf ...]
#
# Replace the graph of every given Turtle file on the local HTTP server.

PORT=8081

for rdf in "$@" ; do
  echo "Loading $rdf"
  # ---- test syntax; a file that does not parse is skipped, so that it
  #      never replaces the data that is already loaded
  if ! rapper -i turtle "$rdf" > /dev/null ; then
    echo "Skipping $rdf: rapper could not parse it as turtle" >&2
    continue
  fi

  uri="http://localhost:$PORT/data/http://biobeat.org/data/$rdf"

  curl -X DELETE "$uri"
  curl -T "$rdf" -H 'Content-Type: application/x-turtle' "$uri"
done
|
15
|
+
|
data/spec/rdf_spec.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
require 'minitest'
|
3
|
+
|
4
|
+
# Tests for the identifier helpers in BioExominer::RDF
class TestRDF < Minitest::Test

  include BioExominer

  # def test_uri_escape
  #   assert_equal !RDF::escape("")
  # end

  def test_uri_validator
    # invalid
    refute RDF::valid_uri?("use`quote")
    # refute RDF::valid_uri?("use%7quote")

    # valid
    assert RDF::valid_uri?("use%07quote")

  end

  # assert_equal takes the expected value first, so that failure messages
  # label the two values correctly
  def test_make_identifier
    assert_equal "AA", RDF::make_identifier("AA")
    assert_equal "use_colon_", RDF::make_identifier("use:colon:")
    assert_equal "use_pipe", RDF::make_identifier("use|pipe")
  end

end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Make the gem's lib directory and this spec directory loadable; the spec
# directory is added last and therefore ends up first on the load path.
spec_dir = File.dirname(__FILE__)
[File.join(spec_dir, '..', 'lib'), spec_dir].each do |dir|
  $LOAD_PATH.unshift(dir)
end
# require 'rspec'
require 'rubygems'
gem 'minitest' # ensures you're using the gem, and not the built in MT
require 'minitest/autorun'

require 'bio-exominer'

include BioExominer


# Requires supporting files with custom matchers and macros, etc,
# in ./support/ and its subdirectories.
# Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}

# RSpec.configure do |config|

# end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
# Tests for BioExominer::TextParser: tokenizing with and without context
class TestTextParser < Minitest::Test

  include BioExominer

  # Sentence fixture; the run of 'Fish' lines forms one very long sentence
  # (no dots) with the quoted token GEN2.X in the middle.
  BUF =<<TEXT

Hello world. Test gene GEN1.X. This is with context! I don't believe this is true,
and that you can do this. Love Ruby, love RDF. Love the combination.
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token "GEN2.X" out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
TEXT

  def test_tokenize_with_context
    counts,match = TextParser::tokenize_with_context(BUF)
    assert_equal 1, counts['world']
    assert_equal 2, counts['Love']
    assert_equal ['Hello world'], match['world']
    assert_equal ['Love Ruby, love RDF', 'Love the combination'], match['Love']
    assert_equal ['This is with context! I don\'t believe this is true, and that you can do this'], match['context']
    assert_equal ['Test gene GEN1.X'], match['GEN1.X']
    assert_equal ['Fish the token out of a very long sentence Fish the token GEN2.X out of a very long sentence Fish the token out of a'], match['GEN2.X']
    # ---- Line based context
    # context_type is a positional parameter, so the value is passed as is
    counts,match = TextParser::tokenize_with_context(BUF, :line)
    assert_equal ['Fish the token GEN2.X out of a very long sentence'], match['GEN2.X']
  end

  BUF2 =<<TEXT2
valid token figure S11 table XX p53
Invalid MD, and RD Jester, Wikkel W, Wokkel WOS
TEXT2

  def test_valid_tokens
    match = TextParser::tokenize(BUF2)
    assert match['token']
    refute match['S11']
    refute match['XX']
    refute match['Wokkel']
    assert match['p53']
    assert match['WOS']
    refute match['MD']
    refute match['Invalid']
    refute match['RD']
    refute match['Jester']
    refute match['Wikkel']
    refute match['W']
  end
end
|