bio-exominer 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
#! /usr/bin/env ruby
#
# Pack symbol file
#
# Copyright (C) 2013 Pjotr Prins <pjotr.prins@thebird.nl>
#
# Without arguments (or with input file names) the tab-delimited symbol
# lines read through ARGF are packed into ./symbols.bin with MessagePack.
# With -d as the first argument ./symbols.bin is unpacked and written to
# stdout in the same tab-delimited layout.

gempath = File.dirname(File.dirname(__FILE__))
$: << File.join(gempath,'lib')

require 'msgpack'
require 'bio-exominer/symbols'

include BioExominer

if ARGV[0] == '-d'
  ARGV.shift

  # Progress goes to stderr: stdout carries the dumped table, and the
  # message would otherwise be glued to the first symbol of the output.
  $stderr.print "Unpacking symbols.bin..."
  # Block form closes the file handle when the dump is done
  File.open('symbols.bin','rb') do |f|
    u = MessagePack::Unpacker.new(f)
    begin
      u.each do |obj|
        # obj is [symbol, aliases-or-nil, description]
        print obj[0],"\t",(obj[1] ? obj[1].join('|') : "NA"),"\t",obj[2],"\n"
      end
    rescue EOFError
      # end of the packed stream - nothing left to print
    end
  end
else

  print "Writing symbols.bin..."
  # Block form guarantees the output file is flushed and closed
  File.open('symbols.bin','wb') do |f|
    bin = MessagePack::Packer.new(f)

    ARGF.each_line do | line |
      symbol,aliases,descr = Symbols::parse_line(line)
      bin.write([symbol,aliases,descr])
    end
    bin.flush
  end

end
@@ -0,0 +1,9 @@
1
+ Feature: something something
2
+ In order to something something
3
+ A user something something
4
+ something something something
5
+
6
+ Scenario: something something
7
+ Given inspiration
8
+ When I create a sweet new gem
9
+ Then everyone should see how awesome I am
@@ -0,0 +1,13 @@
# Activate the bundled :default and :development gem groups, put the gem's
# lib directory (two levels up from this file) on the load path and load
# the library together with the RSpec expectation matchers.
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  # Report the problem and exit with the status Bundler attached to it
  $stderr.puts bundler_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

lib_dir = File.join(File.dirname(__FILE__), '..', '..', 'lib')
$LOAD_PATH.unshift(lib_dir)
require 'bio-exominer'

require 'rspec/expectations'
@@ -0,0 +1,14 @@
1
+ # Please require your code below, respecting the naming conventions in the
2
+ # bioruby directory tree.
3
+ #
4
+ # For example, say you have a plugin named bio-plugin, the only uncommented
5
+ # line in this file would be
6
+ #
7
+ # require 'bio/bio-plugin/plugin'
8
+ #
9
+ # In this file only require other files. Avoid other source code.
10
+
11
+ require 'bio-exominer/rdf'
12
+ require 'bio-exominer/textparser'
13
+ require 'bio-exominer/exominer'
14
+
@@ -0,0 +1,3 @@

# Namespace module of the bio-exominer gem. Nothing is defined in it here.
module BioExominer; end
@@ -0,0 +1,38 @@
module BioExominer

  require 'uri'

  # FIXME: use bioruby-rdf modules instead! It is all there now.
  module RDF

    # Tell whether +uri+ consists only of characters accepted in a URI:
    # the listed punctuation, letters, digits and %XX hex escapes.
    #
    # The '#' in the character class is escaped because an unescaped "#$&"
    # inside a regexp literal interpolates the global $&, which silently
    # widened the class to the range '!'..';' (accepting a bare '%' and '"').
    # \A and \z anchor the whole string; ^ and $ would accept any multi-line
    # string that has a single valid line.
    #
    # Returns true or false.
    def RDF::valid_uri? uri
      !!(uri =~ /\A(?:[!\#$&-;=?_a-z~]|%[0-9a-f]{2})+\z/i)
    end

    # An identifier is used for the subject and predicate in RDF. This is a case-sensitive
    # (shortened) URI. You can change default behaviour for identifiers using the options
    # --transform-ids (i.e. in the input side, rather than the output side)
    #
    # The string is stripped, non-printable characters and the characters
    # # ) ( , are dropped, '%' becomes 'perc', and runs of whitespace, dots,
    # dollars and slashes as well as each '|', '-' and ':' become '_'.
    # A warning is written to stderr when the result differs from +s+.
    # Identifiers starting with a digit get an 'r' prefix.
    #
    # Raises RuntimeError when the mangled identifier is not a valid URI.
    def RDF::make_identifier(s)
      id = s.strip
            .gsub(/[^[:print:]]/, '')
            .gsub(/[#)(,]/, "")
            .gsub(/[%]/, "perc")
            .gsub(/(\s|\.|\$|\/|\\)+/, "_")
      # id = URI::escape(id)
      id = id.tr('|:-', '_')
      if id != s
        # logger = Bio::Log::LoggerPlus['bio-table']
        $stderr.print "\nWARNING: Changed identifier <#{s}> to <#{id}>"
      end
      unless RDF::valid_uri?(id)
        raise "Invalid URI after mangling <#{s}> to <#{id}>!"
      end
      id =~ /^\d/ ? 'r' + id : id
    end

  end
end
38
+
@@ -0,0 +1,49 @@
module BioExominer

  require 'msgpack'
  require 'bio-exominer/rdf'

  # Reading of gene symbol tables, either tab-delimited text or the
  # MessagePack file (*.bin) with one [symbol, aliases, description]
  # array per record.
  module Symbols

    # Make a full URI out of a symbol. Symbols present in +hugo+ get the
    # hgnc: prefix (cf. http://bio2rdf.org/hugo:RAD51C), all others the
    # ncbigene: prefix. The symbol itself is passed through
    # RDF::make_identifier.
    def Symbols::uri(symbol,hugo)
      prefix = hugo[symbol] ? "hgnc:" : "ncbigene:"
      prefix + RDF::make_identifier(symbol)
    end

    # Split one tab-delimited line into symbol, aliases and description.
    #
    # Returns [symbol, aliases, descr] where aliases is an Array of
    # Strings, or nil when the column holds 'NA' or is absent, and descr
    # is the stripped description or "" when absent.
    def Symbols::parse_line(line)
      symbol,aliases,descr = line.strip.split(/\t/)
      # A line without an alias column yields nil here; treat it like 'NA'
      # instead of failing on nil.split
      aliases = (aliases.nil? || aliases == 'NA') ? nil : aliases.split(/\|/)
      return symbol,aliases,(descr ? descr.strip : "")
    end

    # Yield every record of symbol file +fn+ as symbol, aliases,
    # description. File names ending in .bin are read as MessagePack,
    # anything else as tab-delimited text. The file is closed when the
    # iteration ends, also when the block raises.
    def Symbols::each(fn)
      if fn =~ /\.bin\z/
        File.open(fn,'rb') do |f|
          u = MessagePack::Unpacker.new(f)
          begin
            u.each do |obj|
              yield obj[0],obj[1],obj[2]
            end
          rescue EOFError
            # end of the packed stream
          end
        end
      else
        File.open(fn) do |f|
          f.each_line do | line |
            yield parse_line(line)
          end
        end
      end
    end
  end

end
@@ -0,0 +1,124 @@
# Text parsing

module BioExominer

  # Heuristic tokenizer that counts candidate tokens in free text and can
  # record the sentence or line each token was found in.
  module TextParser

    # Contexts longer than MAX_SIZE+2 characters are cut down to a window
    # of MAX_SIZE/2 characters on either side of the token
    MAX_SIZE = 120
    # Words that are never counted
    SKIP_TOKENS = %w{ can has Ma van large was polymerase had far a impact was East early
      face Park ali and team tag ras ac tail at al age ac TA tag small this pure such
      type gene pmc but is ten org we an term not as by lost et out how up per for
      end beta der The Ten Out At No How pdf Ding Jan To cell gov even Jun
      Sun DNA Nat in hit doc bin with set Nov unknown key link to cgi
      and or RDF NPG
    }

    # L3MBTL

    # A token is valid when it is not blank, is not made up of digits and
    # commas only, and contains at least one ASCII letter.
    def TextParser::valid_token? token
      return false if token.strip == ""
      return false if token =~ /^(\d|[,])+$/
      return false if token !~ /[a-zA-Z]/  # at least one word char
      true
    end

    # Increase the count of +word+ in the +tokens+ hash, unless the word
    # is listed in SKIP_TOKENS.
    def TextParser::add tokens, word
      return if SKIP_TOKENS.include?(word)
      # return if word.size < 2
      tokens[word] = tokens.fetch(word, 0) + 1
    end

    # Return a copy of +w+ without a leading reference marker such as
    # [12], one leading '(' and trailing ')', one trailing , : ; or !,
    # and one quote character at either end. Returns nil for nil.
    def TextParser::rm_punctuation w
      return nil if w.nil?
      w.sub(/^\[\d+\]/,'')
       .sub(/^\(/,'')
       .sub(/\)$/,'')
       .sub(/[,:;!]$/,'')    # remove punctuation
       .sub(/^[`"']/,'')     # remove starting quotes
       .sub(/[`"']$/,'')     # remove ending quotes
    end

    # Return tokens with count
    #
    # +buf+ is split on whitespace. Every word is judged together with its
    # cleaned-up predecessor (p1) and successor (n1). The result is a Hash
    # of token => count.
    def TextParser::tokenize buf
      tokens = {}
      list = buf.split(/[\r\n\s]+/)
      list.each_with_index do | word,idx |
        p1 = idx > 0 ? rm_punctuation(list[idx-1]) : nil
        w1 = rm_punctuation(word)
        # list[idx+1] is nil for the last word and rm_punctuation passes
        # nil through, so no bounds check is needed
        n1 = rm_punctuation(list[idx+1])
        next if w1.size < 2
        # Skip references such as "figure S11" or "table XX"
        next if p1 && p1 =~ /table|dataset|supplement|figure|chapter|section|paragraph/i
        # Filter out letters+name
        if w1 =~ /^[A-Z]/ && w1.capitalize == w1
          next if n1 && n1.size == 1
          next if p1 && p1.size == 1
          next if n1 && n1.size == 2 && n1 =~ /^[A-Z][A-Z]/
          next if p1 && p1.size == 2 && p1 =~ /^[A-Z][A-Z]/
        end
        # Filter out two capitals next to a capitalized word
        if w1.size == 2 && w1 =~ /^[A-Z][A-Z]/
          next if p1 && p1 =~ /^[A-Z]/ && p1.capitalize == p1
          next if n1 && n1 =~ /^[A-Z]/ && n1.capitalize == n1
        end
        # Filter out all lowercase small names
        next if w1.size < 4 && w1 == w1.downcase && w1 !~ /\d/
        # Count the cleaned-up word, and the raw word too when it differs
        if valid_token?(word)
          add(tokens,w1)
          add(tokens,word) if word != w1
        end
        # split on dash or underscore
        if word =~ /-|_/
          word.split(/-|_/).each do |w|
            add(tokens,w) if valid_token?(w)
          end
        end
      end
      tokens
    end

    # Return a list of tokens with count and context
    #
    # +context_type+ :line (or 'line') uses every line of +buf+ as a
    # context, any other value splits +buf+ on a dot followed by
    # whitespace. Double quotes are removed from each context before it
    # is tokenized.
    #
    # Returns two Hashes: token => total count, and token => Array with
    # one context String per sentence (or line) the token occurs in.
    def TextParser::tokenize_with_context buf, context_type = :sentence
      tokens_context = {}
      tokens_count = {}
      # Split buf into sentences based on dots or newlines
      line_based = (context_type == :line || context_type == 'line')
      sentences = line_based ? buf.split(/\n/) : buf.split(/\.\s+/)
      half_size = MAX_SIZE/2
      sentences.each do | raw |
        # join wrapped lines and remove quotes
        sentence = raw.strip.gsub(/(\r|\n)\s*/,' ').gsub(/"/,'')
        too_long = sentence.size > MAX_SIZE+2
        tokenize(sentence).each do | token, count |
          context =
            if too_long
              # shorten the sentence to a window around the first
              # occurrence of the token and drop the partial words at
              # both edges
              pos = sentence.index(token)
              start = [pos-half_size, 0].max
              window = sentence[start..pos+half_size]
              window.sub(/^\w+\s+/,'').sub(/\s+\w+$/,'')
            else
              sentence
            end
          tokens_count[token] = tokens_count.fetch(token, 0) + count
          (tokens_context[token] ||= []) << context
        end
      end
      return tokens_count, tokens_context
    end
  end

end
@@ -0,0 +1,30 @@
#! /bin/sh
#
# Start the 4store backend and its HTTP endpoint (port 8081) for a database.
# The optional first argument is the database name (default: exominer).
#
# Options
#
#   -r     Restart server
#   -d     Delete DB and restart server

dbname=exominer
# A first argument that starts with a dash is an option, not a database
# name, so that "-r" and "-d" also work with the default database
case "$1" in
  ""|-*) ;;
  *)
    dbname=$1
    shift
    ;;
esac

echo "Starting DB $dbname"

if [ "$1" = "-r" ]; then
  killall 4s-httpd
  killall 4s-backend
fi

if [ "$1" = "-d" ]; then
  killall 4s-httpd
  killall 4s-backend
  4s-backend-setup "$dbname"
fi


4s-backend "$dbname"
4s-httpd -p 8081 "$dbname"
30
+
@@ -0,0 +1,9 @@
#! /bin/sh
#
# Mine tcga_bc.txt into tcga_bc.rdf, upload the result to the local store
# on port 8081 and run a sample SPARQL query against it.

# Stop at the first failing step, so that a truncated tcga_bc.rdf left
# behind by a failed exominer run is never uploaded
set -e

./bin/exominer --rdf --name tcga_bc --hugo --tag 'title=Comprehensive molecular portraits of human breast tumours' --tag 'year=2012;species=human;type=breast cancer' -s ncbi_symbols.bin < tcga_bc.txt > tcga_bc.rdf

curl -T tcga_bc.rdf -H 'Content-Type: application/x-turtle' http://localhost:8081/data/exominer.rdf

~/opt/bin/sparql-query http://localhost:8081/sparql/ 'SELECT * WHERE { ?s ?p ?o } LIMIT 5'
9
+
@@ -0,0 +1,7 @@
#! /bin/sh
#
# Mine tcga_bc.txt twice - first to stdout for inspection, then into
# tcga_bc.rdf - and upload the file to the local store on port 8081.

# Stop at the first failing step, so that a truncated tcga_bc.rdf left
# behind by a failed exominer run is never uploaded
set -e

./bin/exominer --rdf --name tcga_bc --hugo --doi doi:10.1038/nature11412 --tag 'title=Comprehensive molecular portraits of human breast tumours' --tag 'year=2012;species=human;type=breast cancer' < tcga_bc.txt
./bin/exominer --rdf --name tcga_bc --hugo --doi doi:10.1038/nature11412 --tag 'title=Comprehensive molecular portraits of human breast tumours' --tag 'year=2012;species=human;type=breast cancer' < tcga_bc.txt > tcga_bc.rdf

curl -T tcga_bc.rdf -H 'Content-Type: application/x-turtle' http://localhost:8081/data/exominer.rdf
@@ -0,0 +1,15 @@
#! /bin/sh
#
# Load every Turtle file given on the command line into the store that
# listens on localhost:$PORT, replacing the graph named after the file.

PORT=8081

# "$@" keeps file names that contain whitespace intact
for rdf in "$@" ; do
  echo "Loading $rdf"
  # ---- test syntax; a file that does not parse is skipped, so that the
  #      existing graph is not deleted and replaced by broken data
  if ! rapper -i turtle "$rdf" > /dev/null ; then
    echo "Skipping $rdf: rapper reported a Turtle syntax error" >&2
    continue
  fi

  uri="http://localhost:$PORT/data/http://biobeat.org/data/$rdf"

  curl -X DELETE "$uri"
  curl -T "$rdf" -H 'Content-Type: application/x-turtle' "$uri"
done
15
+
@@ -0,0 +1,8 @@
1
+ # require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+ #
3
+ # describe "BioExominer" do
4
+ # it "fails" do
5
+ # fail "hey buddy, you should probably rename this file and start specing for real"
6
+ # end
7
+ # end
8
+
@@ -0,0 +1,28 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
require 'minitest'

# Unit tests for BioExominer::RDF.
#
# The class derives from Minitest::Test: the older MiniTest spelling of the
# constant is no longer defined by default in current minitest releases.
class TestRDF < Minitest::Test

  include BioExominer

  # def test_uri_escape
  #   assert_equal !RDF::escape("")
  # end

  def test_uri_validator
    # invalid
    assert !RDF::valid_uri?("use`quote")
    # assert !RDF::valid_uri?("use%7quote")

    # valid
    assert RDF::valid_uri?("use%07quote")
  end

  # assert_equal takes the expected value first, so that a failure message
  # labels expected and actual correctly
  def test_make_identifier
    assert_equal "AA", RDF::make_identifier("AA")
    assert_equal "use_colon_", RDF::make_identifier("use:colon:")
    assert_equal "use_pipe", RDF::make_identifier("use|pipe")
  end

end
@@ -0,0 +1,19 @@
1
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
2
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
3
+ # require 'rspec'
4
+ require 'rubygems'
5
+ gem 'minitest' # ensures you're using the gem, and not the built in MT
6
+ require 'minitest/autorun'
7
+
8
+ require 'bio-exominer'
9
+
10
+ include BioExominer
11
+
12
+
13
+ # Requires supporting files with custom matchers and macros, etc,
14
+ # in ./support/ and its subdirectories.
15
+ # Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
16
+
17
+ # RSpec.configure do |config|
18
+
19
+ # end
@@ -0,0 +1,59 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

# Unit tests for BioExominer::TextParser.
#
# The class derives from Minitest::Test: the older MiniTest spelling of the
# constant is no longer defined by default in current minitest releases.
class TestTextParser < Minitest::Test

  include BioExominer

  # Short sentences followed by one very long "sentence" (no dots) that
  # has the token GEN2.X in the middle
  BUF =<<TEXT

Hello world. Test gene GEN1.X. This is with context! I don't believe this is true,
and that you can do this. Love Ruby, love RDF. Love the combination.
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token "GEN2.X" out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
Fish the token out of a very long sentence
TEXT

  # assert_equal takes the expected value first, so that a failure message
  # labels expected and actual correctly
  def test_tokenize_with_context
    counts,match = TextParser::tokenize_with_context(BUF)
    assert_equal 1, counts['world']
    assert_equal 2, counts['Love']
    assert_equal ['Hello world'], match['world']
    assert_equal ['Love Ruby, love RDF', 'Love the combination'], match['Love']
    assert_equal ['This is with context! I don\'t believe this is true, and that you can do this'], match['context']
    assert_equal ['Test gene GEN1.X'], match['GEN1.X']
    assert_equal ['Fish the token out of a very long sentence Fish the token GEN2.X out of a very long sentence Fish the token out of a'], match['GEN2.X']
    # ---- Line based context
    counts,match = TextParser::tokenize_with_context(BUF,:line)
    assert_equal ['Fish the token GEN2.X out of a very long sentence'], match['GEN2.X']
  end

  # Words that must be filtered out: references after figure/table,
  # initials and the names next to them
  BUF2 =<<TEXT2
valid token figure S11 table XX p53
Invalid MD, and RD Jester, Wikkel W, Wokkel WOS
TEXT2

  def test_valid_tokens
    match = TextParser::tokenize(BUF2)
    assert match['token']
    assert !match['S11']
    assert !match['XX']
    assert !match['Wokkel']
    assert match['p53']
    assert match['WOS']
    assert !match['MD']
    assert !match['Invalid']
    assert !match['RD']
    assert !match['Jester']
    assert !match['Wikkel']
    assert !match['W']
  end
end