bio-exominer 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,38 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ # Pack symbol file
4
+ #
5
+ # Copyright (C) 2013 Pjotr Prins <pjotr.prins@thebird.nl>
6
+ #
7
+
8
+ gempath = File.dirname(File.dirname(__FILE__))
9
+ $: << File.join(gempath,'lib')
10
+
11
+ require 'msgpack'
12
+ require 'bio-exominer/symbols'
13
+
14
+ include BioExominer
15
+
16
# Convert between the tab-delimited symbol table and its MessagePack form.
#
#   -d          decode symbols.bin and print one tab-delimited record per line
#   (default)   read tab-delimited records from ARGF and write symbols.bin
#
# Progress messages are written to stderr so they never end up glued to the
# first decoded record on stdout. Files are opened in block form so the
# handles are flushed and closed even when an error interrupts the run.
if ARGV[0] == '-d'
  ARGV.shift

  $stderr.print "Unpacking symbols.bin..."
  File.open('symbols.bin', 'rb') do |f|
    begin
      MessagePack::Unpacker.new(f).each do |obj|
        # obj is [symbol, aliases-or-nil, description]; nil aliases print as "NA"
        print obj[0], "\t", (obj[1] ? obj[1].join('|') : "NA"), "\t", obj[2], "\n"
      end
    rescue EOFError
      # end of the packed stream: nothing left to decode
    end
  end
else

  $stderr.print "Writing symbols.bin..."
  File.open('symbols.bin', 'wb') do |f|
    bin = MessagePack::Packer.new(f)

    ARGF.each_line do |line|
      symbol, aliases, descr = Symbols::parse_line(line)
      bin.write([symbol, aliases, descr])
    end
    # push the packer's buffered bytes into the file before it is closed
    bin.flush
  end

end
@@ -0,0 +1,9 @@
1
+ Feature: something something
2
+ In order to something something
3
+ A user something something
4
+ something something something
5
+
6
+ Scenario: something something
7
+ Given inspiration
8
+ When I create a sweet new gem
9
+ Then everyone should see how awesome I am
@@ -0,0 +1,13 @@
1
+ require 'bundler'
2
+ begin
3
+ Bundler.setup(:default, :development)
4
+ rescue Bundler::BundlerError => e
5
+ $stderr.puts e.message
6
+ $stderr.puts "Run `bundle install` to install missing gems"
7
+ exit e.status_code
8
+ end
9
+
10
+ $LOAD_PATH.unshift(File.dirname(__FILE__) + '/../../lib')
11
+ require 'bio-exominer'
12
+
13
+ require 'rspec/expectations'
@@ -0,0 +1,14 @@
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio-exominer/rdf'
12
+ require 'bio-exominer/textparser'
13
+ require 'bio-exominer/exominer'
14
+
@@ -0,0 +1,3 @@
1
+
2
+ module BioExominer
3
+ end
@@ -0,0 +1,38 @@
1
+ module BioExominer
2
+
3
+ require 'uri'
4
+
5
+ # FIXME: use bioruby-rdf modules instead! It is all there now.
6
module RDF

  # Check that +uri+ consists only of characters accepted in a URI, or of
  # %XX escapes with two hex digits.
  #
  # The '#' and '$' in the character class are escaped on purpose: an
  # unescaped "#$&" inside a regex literal is parsed by Ruby as interpolation
  # of the last-match global, which silently turned the class into the range
  # '!'..';' and let '"' and a bare '%' through. \A and \z anchor the whole
  # string, so a valid first line can no longer hide invalid trailing lines.
  #
  # Returns 0 (truthy) when the string is valid, nil otherwise.
  def RDF::valid_uri? uri
    uri =~ /\A([!\#\$&-;=?_a-z~]|%[0-9a-f]{2})+\z/i
  end

  # An identifier is used for the subject and predicate in RDF. This is a case-sensitive
  # (shortened) URI. You can change default behaviour for identifiers using the options
  # --transform-ids (i.e. in the input side, rather than the output side)
  #
  # The input is stripped, non-printable characters and the characters
  # '#', ')', '(' and ',' are removed, '%' becomes "perc", runs of whitespace,
  # '.', '$', '/' and '\' collapse into a single '_', and '|', '-' and ':'
  # each become '_'. A warning is written to stderr when the result differs
  # from +s+. An identifier starting with a digit is prefixed with 'r'.
  #
  # Raises RuntimeError when the mangled identifier is still not a valid URI.
  def RDF::make_identifier(s)
    id = s.strip.gsub(/[^[:print:]]/, '').gsub(/[#)(,]/,"").gsub(/[%]/,"perc").gsub(/(\s|\.|\$|\/|\\)+/,"_")
    # id = URI::escape(id)
    id = id.gsub(/\|/,'_')
    id = id.gsub(/\-|:/,'_')
    if id != s
      # logger = Bio::Log::LoggerPlus['bio-table']
      $stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
    end
    if not RDF::valid_uri?(id)
      raise "Invalid URI after mangling <#{s}> to <#{id}>!"
    end
    # a leading digit is not acceptable here, so prefix such identifiers
    id =~ /\A\d/ ? 'r' + id : id
  end

end
37
+ end
38
+
@@ -0,0 +1,49 @@
1
+ module BioExominer
2
+
3
+ require 'msgpack'
4
+ require 'bio-exominer/rdf'
5
+
6
module Symbols

  # Make a full URI out of a symbol.
  #
  # Symbols present in the +hugo+ lookup get the "hgnc:" prefix
  # (cf. http://bio2rdf.org/hugo:RAD51C), all others "ncbigene:". The symbol
  # itself is passed through RDF::make_identifier.
  def Symbols::uri(symbol,hugo)
    prefix = hugo[symbol] ? "hgnc:" : "ncbigene:"
    prefix + RDF::make_identifier(symbol)
  end

  # Split one tab-delimited line into symbol, aliases and description.
  #
  # The alias column is split on '|'. An alias column that reads 'NA', or
  # that is missing altogether, yields nil (a missing column used to raise
  # NoMethodError). A missing description yields "".
  #
  # Returns [symbol, aliases, description].
  def Symbols::parse_line(line)
    symbol, aliases, descr = line.strip.split(/\t/)
    aliases = (aliases.nil? || aliases == 'NA') ? nil : aliases.split(/\|/)
    return symbol, aliases, (descr ? descr.strip : "")
  end

  # Yield symbol, aliases and description for every record in file +fn+.
  #
  # A name ending in ".bin" is read as a MessagePack stream of
  # [symbol, aliases, description] arrays; anything else is read as
  # tab-delimited text via parse_line. Blank text lines are skipped. Files
  # are opened in block form so the handle is closed when iteration ends or
  # the caller's block raises.
  def Symbols::each(fn)
    if fn =~ /\.bin\z/
      File.open(fn,'rb') do |f|
        begin
          MessagePack::Unpacker.new(f).each do |obj|
            yield obj[0],obj[1],obj[2]
          end
        rescue EOFError
          # end of the packed stream
        end
      end
    else
      File.open(fn) do |f|
        f.each_line do | line |
          next if line.strip.empty?
          yield parse_line(line)
        end
      end
    end
  end
end
48
+
49
+ end
@@ -0,0 +1,124 @@
1
+ # Text parsing
2
+
3
+ module BioExominer
4
+
5
module TextParser

  # Sentences longer than MAX_SIZE+2 characters are shortened to a window
  # around the token when they are stored as context.
  MAX_SIZE = 120

  # Words that are never counted as tokens.
  SKIP_TOKENS = %w{ can has Ma van large was polymerase had far a impact was East early
    face Park ali and team tag ras ac tail at al age ac TA tag small this pure such
    type gene pmc but is ten org we an term not as by lost et out how up per for
    end beta der The Ten Out At No How pdf Ding Jan To cell gov even Jun
    Sun DNA Nat in hit doc bin with set Nov unknown key link to cgi
    and or RDF NPG
  }.freeze

  # L3MBTL

  # A token is valid when it is not blank, is not made up of digits and
  # commas only, and contains at least one ASCII letter.
  def TextParser::valid_token? token
    return false if token.strip == ""
    return false if token =~ /^(\d|[,])+$/
    return false if token !~ /[a-zA-Z]/  # at least one word char
    true
  end

  # Increase the count of +word+ in the +tokens+ hash, unless the word is
  # listed in SKIP_TOKENS.
  def TextParser::add tokens, word
    return if SKIP_TOKENS.include?(word)
    # return if word.size < 2
    tokens[word] ||= 0
    tokens[word] += 1
  end

  # Return a copy of +w+ without a leading "[n]" reference marker, one
  # leading '(' and trailing ')', one trailing ',', ':', ';' or '!', and one
  # leading and trailing quote character. Returns nil for nil.
  def TextParser::rm_punctuation w
    return nil if w == nil
    word = w.dup
    if word =~ /^\[\d+\]/
      word = word.sub(/^\[\d+\]/,'')
    end
    word = word.sub(/^\(/,'')
    word = word.sub(/\)$/,'')
    word = word.sub(/[,:;!]$/,'') # remove punctuation
    word = word.sub(/^[`"']/,'')  # remove starting quotes
    word = word.sub(/[`"']$/,'')  # remove ending quotes
    word
  end

  # Return tokens with count
  #
  # +buf+ is split on whitespace. Each word is looked at together with its
  # cleaned-up predecessor and successor to filter out references such as
  # "table XX", initials next to names, and short all-lowercase words.
  # Returns a hash of token => count.
  def TextParser::tokenize buf
    tokens = {}
    list = buf.split(/[\r\n\s]+/)
    list.each_with_index do | word,idx |
      n1 = p1 = nil
      p1 = rm_punctuation(list[idx-1]) if idx>0
      w1 = rm_punctuation(word)
      # the last word has no successor
      n1 = rm_punctuation(list[idx+1]) if idx<list.size-1
      next if w1.size < 2
      next if p1 =~ /table|dataset|supplement|figure|chapter|section|paragraph/i
      # Filter out letters+name
      if w1 =~ /^[A-Z]/ and w1.capitalize == w1
        next if n1 and n1.size == 1
        next if p1 and p1.size == 1
        next if n1 and n1.size == 2 and n1 =~ /^[A-Z][A-Z]/
        next if p1 and p1.size == 2 and p1 =~ /^[A-Z][A-Z]/
      end
      if w1.size == 2 and w1 =~ /^[A-Z][A-Z]/
        next if p1 and p1 =~ /^[A-Z]/ and p1.capitalize == p1
        next if n1 and n1 =~ /^[A-Z]/ and n1.capitalize == n1
      end
      # Filter out all lowercase small names
      next if w1.size < 4 and w1 == w1.downcase and w1 !~ /\d/
      # Count the cleaned-up word and, when different, the raw word as well
      add(tokens,w1) if TextParser.valid_token?(word)
      add(tokens,word) if TextParser.valid_token?(word) and word != w1
      # split on dash or underscore
      if word =~ /-|_/
        word.split(/-|_/).each do |w|
          add(tokens,w) if TextParser.valid_token?(w)
        end
      end
    end
    tokens
  end

  # Return a list of tokens with count and context
  #
  # With +context_type+ :line (or 'line') the buffer is split on newlines,
  # otherwise on a dot followed by whitespace. Every token found in a
  # sentence gets that sentence appended to its context list. Sentences
  # longer than MAX_SIZE+2 are cut to a window of MAX_SIZE/2 characters on
  # either side of the first occurrence of the token; a possibly truncated
  # word is dropped from a side of the window only when that side was
  # actually cut, so a token at the very start or end of a long sentence
  # stays part of its own context.
  #
  # Returns [token => total count, token => array of context strings].
  def TextParser::tokenize_with_context buf, context_type = :sentence
    tokens_context = {}
    tokens_count = {}
    # Split buf into sentences based on dots or newlines
    sentences =
      if context_type == :line or context_type == 'line'
        buf.split(/\n/)
      else
        buf.split(/\.\s+/)
      end
    sentences.each do | sentence1 |
      sentence = sentence1.strip.gsub(/(\r|\n)\s*/,' ')
      # remove quotes
      sentence = sentence.gsub(/"/,'')
      tokens = tokenize(sentence)
      tokens.each { | token, count |
        # shorten the sentence
        sentence2 =
          if sentence.size > MAX_SIZE+2
            half_size = MAX_SIZE/2
            pos = sentence.index(token) || 0
            start = (pos-half_size<0 ? 0 : pos-half_size)
            stop = pos+half_size
            s2 = sentence[start..stop]
            # only trim a side that was cut in the middle of the sentence
            s2 = s2.sub(/^\w+\s+/,'') if start > 0
            s2 = s2.sub(/\s+\w+$/,'') if stop < sentence.size-1
            s2
          else
            sentence
          end
        tokens_count[token] ||= 0
        tokens_count[token] += count
        tokens_context[token] ||= []
        tokens_context[token] << sentence2
      }
    end
    return tokens_count, tokens_context
  end
end
123
+
124
+ end
@@ -0,0 +1,30 @@
1
+ #! /bin/sh
2
+ #
3
+ # Options
4
+ #
5
+ # -r Restart server
6
+ # -d Delete DB and restart server
7
+
8
# The first argument, when present, names the database (default: exominer);
# the argument after it selects the option (-r or -d).
# All expansions are quoted so that names with whitespace or glob
# characters reach the 4store tools as a single word.
dbname=exominer
if [ -n "$1" ] ; then
  dbname=$1
  shift
fi

echo "Starting DB $dbname"

# -r: stop the running server processes before starting them again
if [ "$1" = "-r" ]; then
  killall 4s-httpd
  killall 4s-backend
fi

# -d: stop the server processes and run 4s-backend-setup on the database
if [ "$1" = "-d" ]; then
  killall 4s-httpd
  killall 4s-backend
  4s-backend-setup "$dbname"
fi


4s-backend "$dbname"
4s-httpd -p 8081 "$dbname"
30
+
@@ -0,0 +1,9 @@
1
+ #! /bin/sh
2
+
3
+
4
+ ./bin/exominer --rdf --name tcga_bc --hugo --tag 'title=Comprehensive molecular portraits of human breast tumours' --tag 'year=2012;species=human;type=breast cancer' -s ncbi_symbols.bin < tcga_bc.txt > tcga_bc.rdf
5
+
6
+ curl -T tcga_bc.rdf -H 'Content-Type: application/x-turtle' http://localhost:8081/data/exominer.rdf
7
+
8
+ ~/opt/bin/sparql-query http://localhost:8081/sparql/ 'SELECT * WHERE { ?s ?p ?o } LIMIT 5'
9
+
@@ -0,0 +1,7 @@
1
+ #! /bin/sh
2
+
3
+
4
+ ./bin/exominer --rdf --name tcga_bc --hugo --doi doi:10.1038/nature11412 --tag 'title=Comprehensive molecular portraits of human breast tumours' --tag 'year=2012;species=human;type=breast cancer' < tcga_bc.txt
5
+ ./bin/exominer --rdf --name tcga_bc --hugo --doi doi:10.1038/nature11412 --tag 'title=Comprehensive molecular portraits of human breast tumours' --tag 'year=2012;species=human;type=breast cancer' < tcga_bc.txt > tcga_bc.rdf
6
+
7
+ curl -T tcga_bc.rdf -H 'Content-Type: application/x-turtle' http://localhost:8081/data/exominer.rdf
@@ -0,0 +1,15 @@
1
+ #! /bin/sh
2
+
3
# Port of the local HTTP endpoint the files are uploaded to.
PORT=8081

# "$@" (rather than $*) keeps every file name intact, including names that
# contain whitespace; the same goes for the quoted expansions below.
for rdf in "$@" ; do
  echo "Loading $rdf"
  # ---- test syntax
  rapper -i turtle "$rdf" > /dev/null

  uri="http://localhost:$PORT/data/http://biobeat.org/data/$rdf"

  # drop whatever is stored under this URI, then upload the file again
  curl -X DELETE "$uri"
  curl -T "$rdf" -H 'Content-Type: application/x-turtle' "$uri"
done
15
+
@@ -0,0 +1,8 @@
1
+ # require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ #
3
+ # describe "BioExominer" do
4
+ # it "fails" do
5
+ # fail "hey buddy, you should probably rename this file and start specing for real"
6
+ # end
7
+ # end
8
+
@@ -0,0 +1,28 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ require 'minitest'
3
+
4
# Tests for the URI validation and identifier mangling in BioExominer::RDF.
#
# The class derives from Minitest::Test: the older "MiniTest" spelling is
# only a compatibility alias and is not defined by current minitest releases.
class TestRDF < Minitest::Test

  include BioExominer

  # def test_uri_escape
  #   assert_equal !RDF::escape("")
  # end

  def test_uri_validator
    # invalid
    assert !RDF::valid_uri?("use`quote")
    # assert !RDF::valid_uri?("use%7quote")

    # valid
    assert RDF::valid_uri?("use%07quote")

  end

  # assert_equal takes the expected value first, so that failure messages
  # label expected and actual correctly.
  def test_make_identifier
    assert_equal "AA", RDF::make_identifier("AA")
    assert_equal "use_colon_", RDF::make_identifier("use:colon:")
    assert_equal "use_pipe", RDF::make_identifier("use|pipe")
  end

end
@@ -0,0 +1,19 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
3
+ # require 'rspec'
4
+ require 'rubygems'
5
+ gem 'minitest' # ensures you're using the gem, and not the built in MT
6
+ require 'minitest/autorun'
7
+
8
+ require 'bio-exominer'
9
+
10
+ include BioExominer
11
+
12
+
13
+ # Requires supporting files with custom matchers and macros, etc,
14
+ # in ./support/ and its subdirectories.
15
+ # Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
16
+
17
+ # RSpec.configure do |config|
18
+
19
+ # end
@@ -0,0 +1,59 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
# Tests for BioExominer::TextParser tokenizing, with and without context.
#
# The class derives from Minitest::Test: the older "MiniTest" spelling is
# only a compatibility alias and is not defined by current minitest releases.
class TestTextParser < Minitest::Test

  include BioExominer

  BUF =<<TEXT

Hello world. Test gene GEN1.X. This is with context! I don't believe this is true,
and that you can do this. Love Ruby, love RDF. Love the combination.
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token "GEN2.X" out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
TEXT

  # assert_equal takes the expected value first, so that failure messages
  # label expected and actual correctly.
  def test_tokenize_with_context
    counts,match = TextParser::tokenize_with_context(BUF)
    assert_equal 1, counts['world']
    assert_equal 2, counts['Love']
    assert_equal ['Hello world'], match['world']
    assert_equal ['Love Ruby, love RDF', 'Love the combination'], match['Love']
    assert_equal ['This is with context! I don\'t believe this is true, and that you can do this'], match['context']
    assert_equal ['Test gene GEN1.X'], match['GEN1.X']
    assert_equal ['Fish the token out of a very long sentence Fish the token GEN2.X out of a very long sentence Fish the token out of a'], match['GEN2.X']
    # ---- Line based context
    counts,match = TextParser::tokenize_with_context(BUF,:line)
    assert_equal ['Fish the token GEN2.X out of a very long sentence'], match['GEN2.X']
  end

  BUF2 =<<TEXT2
valid token figure S11 table XX p53
Invalid MD, and RD Jester, Wikkel W, Wokkel WOS
TEXT2

  def test_valid_tokens
    match = TextParser::tokenize(BUF2)
    assert match['token']
    assert !match['S11']
    assert !match['XX']
    assert !match['Wokkel']
    assert match['p53']
    assert match['WOS']
    assert !match['MD']
    assert !match['Invalid']
    assert !match['RD']
    assert !match['Jester']
    assert !match['Wikkel']
    assert !match['W']
  end
end