bio-exominer 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/.travis.yml +14 -0
- data/Gemfile +17 -0
- data/LICENSE.txt +20 -0
- data/README.md +413 -0
- data/Rakefile +58 -0
- data/VERSION +1 -0
- data/bin/exominer +250 -0
- data/bin/hugo_exominer_symbols +74 -0
- data/bin/ncbi_exominer_symbols +79 -0
- data/bin/pack_exominer_symbols +38 -0
- data/features/bio-exominer.feature +9 -0
- data/features/step_definitions/bio-exominer_steps.rb +0 -0
- data/features/support/env.rb +13 -0
- data/lib/bio-exominer.rb +14 -0
- data/lib/bio-exominer/exominer.rb +3 -0
- data/lib/bio-exominer/rdf.rb +38 -0
- data/lib/bio-exominer/symbols.rb +49 -0
- data/lib/bio-exominer/textparser.rb +124 -0
- data/scripts/4store.sh +30 -0
- data/scripts/example.sh +9 -0
- data/scripts/example_rdf.sh +7 -0
- data/scripts/load_rdf.sh +15 -0
- data/spec/bio-exominer_spec.rb +8 -0
- data/spec/rdf_spec.rb +28 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/text_parser_spec.rb +59 -0
- data/test/data/input/hugo_symbols +38106 -0
- metadata +195 -0
@@ -0,0 +1,38 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
# Pack symbol file
|
4
|
+
#
|
5
|
+
# Copyright (C) 2013 Pjotr Prins <pjotr.prins@thebird.nl>
|
6
|
+
#
|
7
|
+
|
8
|
+
gempath = File.dirname(File.dirname(__FILE__))
|
9
|
+
$: << File.join(gempath,'lib')
|
10
|
+
|
11
|
+
require 'msgpack'
|
12
|
+
require 'bio-exominer/symbols'
|
13
|
+
|
14
|
+
include BioExominer
|
15
|
+
|
16
|
+
# Convert between the tab-delimited symbol table and its MessagePack form.
#
#   pack_exominer_symbols -d       dump symbols.bin as tab-delimited text on stdout
#   pack_exominer_symbols [files]  pack the table read from ARGF into symbols.bin
#
# Progress messages are written to stderr so that they never end up inside
# the dumped table (on stdout they were glued to the first record).
# Files are opened in block form so that they are closed when done.
if ARGV[0] == '-d'
  ARGV.shift

  $stderr.print "Unpacking symbols.bin..."
  File.open('symbols.bin', 'rb') do |f|
    begin
      MessagePack::Unpacker.new(f).each do |obj|
        # obj is [symbol, aliases-or-nil, description]
        print obj[0], "\t", (obj[1] ? obj[1].join('|') : "NA"), "\t", obj[2], "\n"
      end
    rescue EOFError
      # EOFError is treated as the normal end of the input
    end
  end
else
  $stderr.print "Writing symbols.bin..."
  File.open('symbols.bin', 'wb') do |f|
    bin = MessagePack::Packer.new(f)
    ARGF.each_line do |line|
      symbol, aliases, descr = Symbols::parse_line(line)
      bin.write([symbol, aliases, descr])
    end
    # Push everything buffered in the packer to the file before it is closed
    bin.flush
  end
end
|
File without changes
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Test-support bootstrap: activate the bundle, put the gem's lib directory
# on the load path, then load the library and the RSpec expectations.
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # Report the problem and leave with the status code Bundler supplies
  [bundler_error.message,
   "Run `bundle install` to install missing gems"].each { |msg| $stderr.puts msg }
  exit bundler_error.status_code
end

lib_dir = File.dirname(__FILE__) + '/../../lib'
$LOAD_PATH.unshift(lib_dir)
require 'bio-exominer'

require 'rspec/expectations'
|
data/lib/bio-exominer.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# Please require your code below, respecting the naming conventions in the
|
2
|
+
# bioruby directory tree.
|
3
|
+
#
|
4
|
+
# For example, say you have a plugin named bio-plugin, the only uncommented
|
5
|
+
# line in this file would be
|
6
|
+
#
|
7
|
+
# require 'bio/bio-plugin/plugin'
|
8
|
+
#
|
9
|
+
# In this file only require other files. Avoid other source code.
|
10
|
+
|
11
|
+
require 'bio-exominer/rdf'
|
12
|
+
require 'bio-exominer/textparser'
|
13
|
+
require 'bio-exominer/exominer'
|
14
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module BioExominer
|
2
|
+
|
3
|
+
require 'uri'
|
4
|
+
|
5
|
+
# FIXME: use bioruby-rdf modules instead! It is all there now.
|
6
|
+
module RDF

  # Check whether +uri+ consists only of characters accepted in a
  # (shortened) URI, or of %XX percent escapes.
  #
  # The whole string has to match (\A..\z); with ^ and $ a single valid
  # line inside a multi-line string was enough to pass. '#' and '$' are
  # escaped inside the character class: unescaped, Ruby reads `#$&` in a
  # regexp literal as interpolation of the last-match global, which
  # silently changed the accepted character set (a bare '%' and '"'
  # were let through).
  #
  # Returns true or false.
  def RDF::valid_uri? uri
    !!(uri =~ /\A([!\#\$&-;=?_a-z~]|%[0-9a-f]{2})+\z/i)
  end

  # An identifier is used for the subject and predicate in RDF. This is a case-sensitive
  # (shortened) URI. You can change default behaviour for identifiers using the options
  # --transform-ids (i.e. in the input side, rather than the output side)
  #
  # The string +s+ is mangled as follows: surrounding whitespace and
  # non-printable characters are dropped, as are '#', parentheses and
  # commas; '%' becomes 'perc'; runs of whitespace, '.', '$', '/' and '\'
  # become a single '_'; '|', '-' and ':' become '_'. A warning is printed
  # on stderr when the result differs from +s+. An identifier that starts
  # with a digit gets an 'r' prefix.
  #
  # Raises RuntimeError when the mangled identifier is still not a valid URI.
  def RDF::make_identifier(s)
    id = s.strip.gsub(/[^[:print:]]/, '')
    id = id.gsub(/[#)(,]/, "")
    id = id.gsub(/[%]/, "perc")
    id = id.gsub(/(\s|\.|\$|\/|\\)+/, "_")
    # id = URI::escape(id)
    id = id.gsub(/\|/, '_')
    id = id.gsub(/\-|:/, '_')
    if id != s
      # logger = Bio::Log::LoggerPlus['bio-table']
      $stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
    end
    unless RDF::valid_uri?(id)
      raise "Invalid URI after mangling <#{s}> to <#{id}>!"
    end
    # Identifiers are not allowed to start with a digit here
    id =~ /\A\d/ ? 'r' + id : id
  end

end
|
37
|
+
end
|
38
|
+
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module BioExominer
|
2
|
+
|
3
|
+
require 'msgpack'
|
4
|
+
require 'bio-exominer/rdf'
|
5
|
+
|
6
|
+
module Symbols

  # Make a (prefixed) URI out of a symbol. Symbols that are present in the
  # +hugo+ lookup get the 'hgnc:' prefix, all others 'ncbigene:'. The
  # symbol itself is mangled with RDF::make_identifier.
  def Symbols::uri(symbol,hugo)
    # http://bio2rdf.org/hugo:RAD51C
    prefix = hugo[symbol] ? "hgnc:" : "ncbigene:"
    prefix + RDF::make_identifier(symbol)
  end

  # Parse one tab-delimited line of the symbol table:
  #
  #   symbol <tab> alias1|alias2|... <tab> description
  #
  # Returns symbol, aliases and description. The aliases are an Array, or
  # nil when the column reads 'NA' or is missing altogether (a missing
  # column used to raise NoMethodError). A missing description is
  # returned as the empty string.
  def Symbols::parse_line(line)
    symbol, aliases, descr = line.strip.split(/\t/)
    aliases =
      if aliases.nil? || aliases == 'NA'
        nil
      else
        aliases.split(/\|/)
      end
    return symbol, aliases, (descr ? descr.strip : "")
  end

  # Yield symbol, aliases and description for every record in the file
  # named +fn+. A name ending in '.bin' is read as a MessagePack stream of
  # [symbol, aliases, description] records, anything else as the
  # tab-delimited text format understood by parse_line.
  #
  # The file is opened in block form so that it is closed after reading.
  def Symbols::each(fn)
    # The dot is escaped: /.bin$/ also matched names such as 'robin'
    if fn =~ /\.bin\z/
      File.open(fn, 'rb') do |f|
        begin
          MessagePack::Unpacker.new(f).each do |obj|
            yield obj[0], obj[1], obj[2]
          end
        rescue EOFError
          # EOFError is treated as the normal end of the input
        end
      end
    else
      File.open(fn) do |f|
        f.each_line do |line|
          yield parse_line(line)
        end
      end
    end
  end
end
|
48
|
+
|
49
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# Text parsing
|
2
|
+
|
3
|
+
module BioExominer
|
4
|
+
|
5
|
+
module TextParser

  # Sentences longer than MAX_SIZE+2 characters are shortened to a window
  # of MAX_SIZE/2 characters on either side of the token
  MAX_SIZE = 120
  # Words that are never counted as tokens
  SKIP_TOKENS = %w{ can has Ma van large was polymerase had far a impact was East early
    face Park ali and team tag ras ac tail at al age ac TA tag small this pure such
    type gene pmc but is ten org we an term not as by lost et out how up per for
    end beta der The Ten Out At No How pdf Ding Jan To cell gov even Jun
    Sun DNA Nat in hit doc bin with set Nov unknown key link to cgi
    and or RDF NPG
  }

  # L3MBTL

  # A token is valid when it is not blank, does not consist of digits and
  # commas only, and contains at least one ASCII letter.
  def TextParser::valid_token? token
    return false if token.strip == ""
    return false if token =~ /^(\d|[,])+$/
    return false if token !~ /[a-zA-Z]/ # at least one word char
    true
  end

  # Increase the count of +word+ in the +tokens+ hash, unless the word is
  # listed in SKIP_TOKENS.
  def TextParser::add tokens, word
    return if SKIP_TOKENS.include?(word)
    # return if word.size < 2
    tokens[word] ||= 0
    tokens[word] += 1
  end

  # Return a copy of +w+ without a leading reference marker such as [12],
  # one leading '(' and one trailing ')', one trailing , : ; or !, and one
  # leading and trailing quote character. Returns nil for nil.
  def TextParser::rm_punctuation w
    return nil if w == nil
    word = w.sub(/^\[\d+\]/,'')
    word = word.sub(/^\(/,'')
    word = word.sub(/\)$/,'')
    word = word.sub(/[,:;!]$/,'') # remove punctuation
    word = word.sub(/^[`"']/,'')  # remove starting quotes
    word = word.sub(/[`"']$/,'')  # remove ending quotes
    word
  end

  # Return tokens with count
  #
  # The buffer is split on whitespace. Every word is looked at together
  # with the word before (p1) and after it (n1), all stripped of
  # punctuation, to filter out references to tables/figures, author names
  # with initials, and short lowercase words.
  def TextParser::tokenize buf
    tokens = {}
    list = buf.split(/[\r\n\s]+/)
    last_idx = list.size - 1
    list.each_with_index do | word,idx |
      p1 = idx > 0 ? rm_punctuation(list[idx-1]) : nil
      w1 = rm_punctuation(word)
      # The last word has no successor (the old guard idx<list.size was always true)
      n1 = idx < last_idx ? rm_punctuation(list[idx+1]) : nil
      next if w1.size < 2
      next if p1 =~ /table|dataset|supplement|figure|chapter|section|paragraph/i
      # Filter out letters+name
      if w1 =~ /^[A-Z]/ && w1.capitalize == w1
        next if n1 && n1.size == 1
        next if p1 && p1.size == 1
        next if n1 && n1.size == 2 && n1 =~ /^[A-Z][A-Z]/
        next if p1 && p1.size == 2 && p1 =~ /^[A-Z][A-Z]/
      end
      if w1.size == 2 && w1 =~ /^[A-Z][A-Z]/
        next if p1 && p1 =~ /^[A-Z]/ && p1.capitalize == p1
        next if n1 && n1 =~ /^[A-Z]/ && n1.capitalize == n1
      end
      # Filter out all lowercase small names
      next if w1.size < 4 && w1 == w1.downcase && w1 !~ /\d/
      # Count the word without punctuation and, when different, the raw word
      valid = TextParser.valid_token?(word)
      add(tokens,w1) if valid
      add(tokens,word) if valid && word != w1
      # split on dash or underscore
      if word =~ /-|_/
        word.split(/-|_/).each do |w|
          add(tokens,w) if TextParser.valid_token?(w)
        end
      end
    end
    tokens
  end

  # Return a list of tokens with count and context
  #
  # The buffer is split into lines when +context_type+ is :line or 'line',
  # otherwise into sentences on a dot followed by whitespace. Two hashes
  # are returned: token => total count, and token => list of contexts (one
  # per sentence or line the token occurs in). A context longer than
  # MAX_SIZE+2 characters is cut down to a window around the first
  # occurrence of the token.
  def TextParser::tokenize_with_context buf, context_type = :sentence
    tokens_context = {}
    tokens_count = {}
    # Split buf into sentences based on dots or newlines
    sentences =
      if context_type == :line || context_type == 'line'
        buf.split(/\n/)
      else
        buf.split(/\.\s+/)
      end
    sentences.each do | sentence1 |
      sentence = sentence1.strip.gsub(/(\r|\n)\s*/,' ')
      # remove quotes
      sentence = sentence.gsub(/"/,'')
      tokenize(sentence).each do | token, count |
        # shorten the sentence
        context =
          if sentence.size > MAX_SIZE+2
            half_size = MAX_SIZE/2
            pos = sentence.index(token)
            start = (pos-half_size<0 ? 0 : pos-half_size)
            stop = pos+half_size
            snippet = sentence[start..stop]
            # Drop the (possibly partial) word at an edge only when that edge
            # was really cut off. Trimming unconditionally removed the token
            # itself when it was the first or last word of the sentence.
            snippet = snippet.sub(/^\w+\s+/,'') if start > 0
            snippet = snippet.sub(/\s+\w+$/,'') if stop < sentence.size-1
            snippet
          else
            sentence
          end
        tokens_count[token] ||= 0
        tokens_count[token] += count
        tokens_context[token] ||= []
        tokens_context[token] << context
      end
    end
    return tokens_count, tokens_context
  end
end
|
123
|
+
|
124
|
+
end
|
data/scripts/4store.sh
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#! /bin/sh
#
# Start the 4store backend and HTTP server (port 8081) for a database.
#
# Usage: 4store.sh [dbname] [-r|-d]
#
#   dbname   Name of the database (default: exominer)
#
# Options
#
#   -r       Restart server
#   -d       Delete DB and restart server

dbname=exominer
# Only a first argument that is not an option names the database, so that
# the options also work with the default database name
case "$1" in
  ''|-*)
    ;;
  *)
    dbname=$1
    shift
    ;;
esac

echo "Starting DB $dbname"

# Both options stop the running servers first
case "$1" in
  -r|-d)
    killall 4s-httpd
    killall 4s-backend
    ;;
esac

if [ "$1" = "-d" ]; then
  4s-backend-setup "$dbname"
fi

4s-backend "$dbname"
4s-httpd -p 8081 "$dbname"
|
30
|
+
|
data/scripts/example.sh
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
#! /bin/sh
#
# Example: mine tcga_bc.txt, upload the result and run a test query.

# Run exominer on tcga_bc.txt with RDF output, using the symbols in
# ncbi_symbols.bin, and store the result in tcga_bc.rdf
./bin/exominer --rdf --name tcga_bc --hugo --tag 'title=Comprehensive molecular portraits of human breast tumours' --tag 'year=2012;species=human;type=breast cancer' -s ncbi_symbols.bin < tcga_bc.txt > tcga_bc.rdf

# Upload the file as Turtle to the server on localhost port 8081
curl -T tcga_bc.rdf -H 'Content-Type: application/x-turtle' http://localhost:8081/data/exominer.rdf

# Fetch five triples from the SPARQL endpoint
~/opt/bin/sparql-query http://localhost:8081/sparql/ 'SELECT * WHERE { ?s ?p ?o } LIMIT 5'
|
9
|
+
|
@@ -0,0 +1,7 @@
|
|
1
|
+
#! /bin/sh
#
# Example: mine tcga_bc.txt with a DOI attached and upload the result.

# First run writes the RDF to stdout, the second run stores it in tcga_bc.rdf
./bin/exominer --rdf --name tcga_bc --hugo --doi doi:10.1038/nature11412 --tag 'title=Comprehensive molecular portraits of human breast tumours' --tag 'year=2012;species=human;type=breast cancer' < tcga_bc.txt
./bin/exominer --rdf --name tcga_bc --hugo --doi doi:10.1038/nature11412 --tag 'title=Comprehensive molecular portraits of human breast tumours' --tag 'year=2012;species=human;type=breast cancer' < tcga_bc.txt > tcga_bc.rdf

# Upload the file as Turtle to the server on localhost port 8081
curl -T tcga_bc.rdf -H 'Content-Type: application/x-turtle' http://localhost:8081/data/exominer.rdf
|
data/scripts/load_rdf.sh
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#! /bin/sh
#
# Upload Turtle files to the server on localhost, replacing the graph
# that was stored earlier under the same name.
#
# Usage: load_rdf.sh file.rdf [file.rdf ...]

PORT=8081

# "$@" keeps file names containing whitespace in one piece
for rdf in "$@" ; do
  echo "Loading $rdf"
  # ---- test syntax; a file that does not parse is skipped, so that the
  #      graph already on the server is not deleted for nothing
  if ! rapper -i turtle "$rdf" > /dev/null ; then
    echo "Skipping $rdf: Turtle syntax check failed" >&2
    continue
  fi

  uri="http://localhost:$PORT/data/http://biobeat.org/data/$rdf"

  curl -X DELETE "$uri"
  curl -T "$rdf" -H 'Content-Type: application/x-turtle' "$uri"
done
|
15
|
+
|
data/spec/rdf_spec.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
require 'minitest'
|
3
|
+
|
4
|
+
# Tests for the URI validation and identifier mangling in BioExominer::RDF.
#
# Minitest::Test is used rather than the legacy MiniTest alias, which
# recent minitest releases no longer define by default.
class TestRDF < Minitest::Test

  include BioExominer

  # def test_uri_escape
  #   assert_equal !RDF::escape("")
  # end

  def test_uri_validator
    # invalid
    assert !RDF::valid_uri?("use`quote")
    # assert !RDF::valid_uri?("use%7quote")

    # valid
    assert RDF::valid_uri?("use%07quote")
  end

  # assert_equal takes the expected value first, so that failure messages
  # label expected and actual correctly
  def test_make_identifier
    assert_equal "AA", RDF::make_identifier("AA")
    assert_equal "use_colon_", RDF::make_identifier("use:colon:")
    assert_equal "use_pipe", RDF::make_identifier("use|pipe")
  end

end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
3
|
+
# require 'rspec'
|
4
|
+
require 'rubygems'
|
5
|
+
gem 'minitest' # ensures you're using the gem, and not the built in MT
|
6
|
+
require 'minitest/autorun'
|
7
|
+
|
8
|
+
require 'bio-exominer'
|
9
|
+
|
10
|
+
include BioExominer
|
11
|
+
|
12
|
+
|
13
|
+
# Requires supporting files with custom matchers and macros, etc,
|
14
|
+
# in ./support/ and its subdirectories.
|
15
|
+
# Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
|
16
|
+
|
17
|
+
# RSpec.configure do |config|
|
18
|
+
|
19
|
+
# end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
# Tests for BioExominer::TextParser.
#
# Minitest::Test is used rather than the legacy MiniTest alias, which
# recent minitest releases no longer define by default. assert_equal takes
# the expected value first, so that failure messages label expected and
# actual correctly.
class TestTextParser < Minitest::Test

  include BioExominer

  # A few short sentences followed by one long run of lines without a
  # period, with the token GEN2.X in the middle
  BUF =<<TEXT

Hello world. Test gene GEN1.X. This is with context! I don't believe this is true,
and that you can do this. Love Ruby, love RDF. Love the combination.
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token "GEN2.X" out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
TEXT

  def test_tokenize_with_context
    counts,match = TextParser::tokenize_with_context(BUF)
    assert_equal 1, counts['world']
    assert_equal 2, counts['Love']
    assert_equal ['Hello world'], match['world']
    assert_equal ['Love Ruby, love RDF', 'Love the combination'], match['Love']
    assert_equal ['This is with context! I don\'t believe this is true, and that you can do this'], match['context']
    assert_equal ['Test gene GEN1.X'], match['GEN1.X']
    assert_equal ['Fish the token out of a very long sentence Fish the token GEN2.X out of a very long sentence Fish the token out of a'], match['GEN2.X']
    # ---- Line based context
    counts,match = TextParser::tokenize_with_context(BUF,:line)
    assert_equal ['Fish the token GEN2.X out of a very long sentence'], match['GEN2.X']
  end

  # Words after 'figure'/'table', names with initials and two-letter
  # capitals next to a name have to be filtered out
  BUF2 =<<TEXT2
valid token figure S11 table XX p53
Invalid MD, and RD Jester, Wikkel W, Wokkel WOS
TEXT2

  def test_valid_tokens
    match = TextParser::tokenize(BUF2)
    assert match['token']
    assert !match['S11']
    assert !match['XX']
    assert !match['Wokkel']
    assert match['p53']
    assert match['WOS']
    assert !match['MD']
    assert !match['Invalid']
    assert !match['RD']
    assert !match['Jester']
    assert !match['Wikkel']
    assert !match['W']
  end
end
|