rbbt-text 1.1.9 → 1.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +56 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +61 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +42 -12
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -361
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -355
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -52
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
data/test/rbbt/ner/test_rnorm.rb
CHANGED
@@ -27,10 +27,9 @@ S000000376 AAA GENE1 DDD
|
|
27
27
|
assert_equal(["S000000029"], @norm.match("FUN21"))
|
28
28
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN").sort)
|
29
29
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN 2").sort)
|
30
|
-
assert_equal(["
|
31
|
-
assert_equal([], @norm.match("
|
32
|
-
|
33
|
-
@norm.match("FUN21")
|
30
|
+
assert_equal(["S000000029"].sort, @norm.match("FUN 21").sort)
|
31
|
+
assert_equal([], @norm.match("Non-sense"))
|
32
|
+
assert_equal(["S000000029", "S000000374"], @norm.match("GER4"))
|
34
33
|
end
|
35
34
|
|
36
35
|
def test_select
|
@@ -74,6 +74,7 @@ C2;11;22;3 3;bb
|
|
74
74
|
index = TokenTrieNER.new("test", TSV.open(file, :flat, :sep => ';'))
|
75
75
|
|
76
76
|
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
|
+
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
@@ -1,9 +1,43 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/genia/sentence_splitter'
|
3
3
|
|
4
|
-
class
|
5
|
-
def
|
6
|
-
|
4
|
+
class TestNLP < Test::Unit::TestCase
|
5
|
+
def test_sentences
|
6
|
+
text =<<-EOF
|
7
|
+
This is a sentence.
|
8
|
+
A funky character ™ in a sentence.
|
9
|
+
This is a sentence.
|
10
|
+
This is a broken
|
11
|
+
sentence. This is
|
12
|
+
another broken sentence.
|
13
|
+
EOF
|
14
|
+
|
15
|
+
iii NLP.geniass_sentence_splitter(text)
|
16
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_sentences_2
|
20
|
+
text =<<-EOF
|
21
|
+
This is a sentence.
|
22
|
+
This is a sentence.
|
23
|
+
This is a broken
|
24
|
+
sentence. This is
|
25
|
+
another broken sentence.
|
26
|
+
EOF
|
27
|
+
|
28
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_sentences_ext
|
32
|
+
text =<<-EOF
|
33
|
+
This is a sentence.
|
34
|
+
This is a sentence.
|
35
|
+
This is a broken
|
36
|
+
sentence. This is
|
37
|
+
another broken sentence.
|
38
|
+
EOF
|
39
|
+
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
7
41
|
end
|
8
42
|
end
|
9
43
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/open_nlp/sentence_splitter'
|
3
|
-
require 'rbbt/
|
3
|
+
require 'rbbt/segment'
|
4
4
|
|
5
5
|
$text=<<-EOF
|
6
6
|
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors
|
@@ -22,6 +22,22 @@ class TestClass < Test::Unit::TestCase
|
|
22
22
|
def test_sentences
|
23
23
|
text =<<-EOF
|
24
24
|
This is a sentence.
|
25
|
+
No funky character in this sentence.
|
26
|
+
This is a sentence.
|
27
|
+
This is a
|
28
|
+
sentence. This is
|
29
|
+
another sentence.
|
30
|
+
EOF
|
31
|
+
|
32
|
+
assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
|
33
|
+
|
34
|
+
assert_equal 5, OpenNLP.sentence_splitter(text).length
|
35
|
+
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_sentences_fix_utf8
|
39
|
+
text =<<-EOF
|
40
|
+
This is a sentence.
|
25
41
|
A funky character ™ in a sentence.
|
26
42
|
This is a sentence.
|
27
43
|
This is a
|
@@ -35,12 +51,12 @@ another sentence.
|
|
35
51
|
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
52
|
end
|
37
53
|
|
38
|
-
def
|
54
|
+
def test_text_sentences
|
39
55
|
Misc.benchmark(100) do
|
40
|
-
OpenNLP.sentence_splitter($text).include?
|
56
|
+
assert OpenNLP.sentence_splitter($text).include?("Our
|
41
57
|
findings highlight the role of SMARCA4 in the pathogenesis of SMARCB1-positive
|
42
58
|
AT/RT and the usefulness of antibodies directed against SMARCA4 in this
|
43
|
-
diagnostic setting."
|
59
|
+
diagnostic setting.")
|
44
60
|
end
|
45
61
|
end
|
46
62
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/annotation'
|
6
|
+
|
7
|
+
class TestAnnotation < Test::Unit::TestCase
|
8
|
+
def test_annotation
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
13
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
14
|
+
|
15
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
16
|
+
|
17
|
+
annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
|
18
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_annotid
|
22
|
+
text = "This is a document"
|
23
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
24
|
+
|
25
|
+
corpus = Document::Corpus.setup({})
|
26
|
+
|
27
|
+
corpus.add_document(text)
|
28
|
+
|
29
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
30
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
31
|
+
|
32
|
+
annotid = annotation.annotid(corpus)
|
33
|
+
|
34
|
+
assert_equal 'verb', annotid.type
|
35
|
+
assert_equal 'verb', annotid.annotation.type
|
36
|
+
assert_equal 'is', annotid.annotation
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/corpus'
|
6
|
+
|
7
|
+
class TestSegmentCorpus < Test::Unit::TestCase
|
8
|
+
def test_corpus
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = {}
|
13
|
+
corpus.extend Document::Corpus
|
14
|
+
|
15
|
+
corpus.add_document(text)
|
16
|
+
|
17
|
+
docid = text.docid(corpus)
|
18
|
+
|
19
|
+
assert_equal docid.document, text
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_find
|
23
|
+
text = "This is a document"
|
24
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
25
|
+
|
26
|
+
TmpFile.with_file do |path|
|
27
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
28
|
+
corpus.extend Document::Corpus
|
29
|
+
|
30
|
+
corpus.add_document(text)
|
31
|
+
|
32
|
+
assert corpus.prefix("TEST:").include?(text.docid)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/encoding'
|
3
|
+
|
4
|
+
class TestEncoding < Test::Unit::TestCase
|
5
|
+
def test_bad_chars
|
6
|
+
text = "A funky character ™ in a sentence."
|
7
|
+
|
8
|
+
assert_equal ["™"], Segment.bad_chars(text)
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_ascii
|
12
|
+
text = "A funky character ™ in a sentence."
|
13
|
+
|
14
|
+
Segment.ascii(text) do
|
15
|
+
assert_equal "A funky character ? in a sentence.", text
|
16
|
+
end
|
17
|
+
|
18
|
+
Segment.ascii(text, "NONASCII") do
|
19
|
+
assert_equal "A funky character NONASCII in a sentence.", text
|
20
|
+
end
|
21
|
+
|
22
|
+
assert_equal "A funky character ™ in a sentence.", text
|
23
|
+
end
|
24
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
|
5
5
|
class TestClass < Test::Unit::TestCase
|
6
6
|
def test_info
|
@@ -15,35 +15,39 @@ class TestClass < Test::Unit::TestCase
|
|
15
15
|
|
16
16
|
def test_all_args
|
17
17
|
a = "test"
|
18
|
-
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
18
|
+
NamedEntity.setup a, 10, "TEST:doc1:test_type:hash", "NamedEntity", "TYPE", "CODE", "SCORE"
|
19
19
|
assert_equal 10, a.offset
|
20
|
+
assert_equal "NamedEntity", a.type
|
21
|
+
assert_equal "TYPE", a.entity_type
|
22
|
+
assert_equal "SCORE", a.score
|
20
23
|
end
|
21
24
|
|
22
25
|
def test_tsv
|
23
26
|
a = "test"
|
24
27
|
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
25
|
-
assert
|
26
|
-
assert
|
27
|
-
assert
|
28
|
+
assert Annotated.tsv([a]).fields.include? "code"
|
29
|
+
assert Annotated.tsv([a], nil).fields.include? "code"
|
30
|
+
assert Annotated.tsv([a], :all).fields.include? "code"
|
31
|
+
assert Annotated.tsv([a], :all).fields.include? "literal"
|
28
32
|
end
|
29
33
|
|
30
|
-
def
|
34
|
+
def __test_segment_brat
|
31
35
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
32
36
|
|
33
37
|
gene1 = "TP53"
|
34
38
|
gene1.extend NamedEntity
|
35
39
|
gene1.offset = a.index gene1
|
36
|
-
gene1.
|
40
|
+
gene1.entity_type = "Gene"
|
37
41
|
|
38
42
|
gene2 = "CDK5R1"
|
39
43
|
gene2.extend NamedEntity
|
40
44
|
gene2.offset = a.index gene2
|
41
|
-
gene2.
|
45
|
+
gene2.entity_type = "Gene"
|
42
46
|
|
43
47
|
gene3 = "TP53 gene"
|
44
48
|
gene3.extend NamedEntity
|
45
49
|
gene3.offset = a.index gene3
|
46
|
-
gene3.
|
50
|
+
gene3.entity_type = "Gene"
|
47
51
|
|
48
52
|
segments = [gene1, gene2, gene3]
|
49
53
|
assert segments.collect{|s| s.to_brat}.include? "Gene 27 35"
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/overlaps'
|
4
|
+
|
5
|
+
class TestOverlaps < Test::Unit::TestCase
|
6
|
+
def setup
|
7
|
+
@text = <<-EOF
|
8
|
+
This is a first sentence. More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
9
|
+
EOF
|
10
|
+
|
11
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
12
|
+
Segment.setup(literal, :offset => @text.index(literal))
|
13
|
+
end
|
14
|
+
|
15
|
+
@sentences = @text.partition(".").values_at(0, 2).collect do |sentence|
|
16
|
+
Segment.setup sentence, :offset => @text.index(sentence)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_make_relative
|
21
|
+
sentence = @sentences[1]
|
22
|
+
|
23
|
+
@entities.each do |e|
|
24
|
+
assert_equal e, @text[e.range]
|
25
|
+
end
|
26
|
+
|
27
|
+
sentence.make_relative @entities do
|
28
|
+
@entities.each do |e|
|
29
|
+
assert_equal e, sentence[e.range]
|
30
|
+
end
|
31
|
+
|
32
|
+
@entities.each do |e|
|
33
|
+
assert_not_equal e, @text[e.range]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
@entities.each do |e|
|
38
|
+
assert_equal e, @text[e.range]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_range_in
|
43
|
+
sentence = @sentences[1]
|
44
|
+
|
45
|
+
@entities.each do |e|
|
46
|
+
assert_equal e.range_in(sentence).begin, sentence.index(e)
|
47
|
+
assert_equal e.range.begin - sentence.offset, sentence.index(e)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_includes
|
52
|
+
@entities.each do |e|
|
53
|
+
assert ! @sentences[0].include?(e)
|
54
|
+
assert @sentences[1].include?(e)
|
55
|
+
assert ! e.include?(@sentences[0])
|
56
|
+
assert ! e.include?(@sentences[1])
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_overlaps?
|
61
|
+
@entities.each do |e|
|
62
|
+
assert ! @sentences[0].overlaps?(e)
|
63
|
+
assert @sentences[1].overlaps?(e)
|
64
|
+
assert ! e.overlaps?(@sentences[0])
|
65
|
+
assert e.overlaps?(@sentences[1])
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/range_index'
|
6
|
+
|
7
|
+
class TestRangeIndex < Test::Unit::TestCase
|
8
|
+
def test_segment_index
|
9
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = Document::Corpus.setup({})
|
13
|
+
|
14
|
+
corpus.add_document(text)
|
15
|
+
|
16
|
+
gene1 = "TP53"
|
17
|
+
gene1.extend Segment
|
18
|
+
gene1.offset = text.index gene1
|
19
|
+
gene1.docid = text.docid
|
20
|
+
|
21
|
+
gene2 = "CDK5R1"
|
22
|
+
gene2.extend Segment
|
23
|
+
gene2.offset = text.index gene2
|
24
|
+
gene2.docid = text.docid
|
25
|
+
|
26
|
+
gene3 = "TP53 gene"
|
27
|
+
gene3.extend Segment
|
28
|
+
gene3.offset = text.index gene1
|
29
|
+
gene3.docid = text.docid
|
30
|
+
|
31
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus)
|
32
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
33
|
+
|
34
|
+
TmpFile.with_file do |fwt|
|
35
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
36
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
37
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
38
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
@@ -1,10 +1,21 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/transformed'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
require 'rexml/document'
|
5
5
|
|
6
|
-
class
|
7
|
-
|
6
|
+
class TestTransformed < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@text = <<-EOF
|
10
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
11
|
+
EOF
|
12
|
+
|
13
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
14
|
+
NamedEntity.setup(literal, :offset => @text.index(literal))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_transform
|
8
19
|
text = <<-EOF
|
9
20
|
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
10
21
|
EOF
|
@@ -13,52 +24,25 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
13
24
|
NamedEntity.setup(literal, :offset => text.index(literal))
|
14
25
|
end
|
15
26
|
|
16
|
-
Transformed.
|
17
|
-
|
18
|
-
end
|
27
|
+
Transformed.transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" })
|
28
|
+
assert text.include? "such as [IL-2]"
|
19
29
|
end
|
20
30
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
gene1 = "TP53"
|
26
|
-
gene1.extend Segment
|
27
|
-
gene1.offset = a.index gene1
|
28
|
-
|
29
|
-
gene2 = "CDK5"
|
30
|
-
gene2.extend Segment
|
31
|
-
gene2.offset = a.index gene2
|
32
|
-
|
33
|
-
assert_equal gene1, a[gene1.range]
|
34
|
-
assert_equal gene2, a[gene2.range]
|
35
|
-
|
36
|
-
c = a.dup
|
37
|
-
|
38
|
-
c[gene2.range] = "GN"
|
39
|
-
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
40
|
-
c[gene1.range] = "GN"
|
41
|
-
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
42
|
-
|
43
|
-
iii a.transformation_offset_differences
|
44
|
-
raise
|
45
|
-
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
46
|
-
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
47
|
-
|
48
|
-
|
49
|
-
gene3 = "GN gene"
|
50
|
-
gene3.extend Segment
|
51
|
-
gene3.offset = a.index gene3
|
52
|
-
|
53
|
-
assert_equal gene3, a[gene3.range]
|
31
|
+
def test_with_transform
|
32
|
+
text = <<-EOF
|
33
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
34
|
+
EOF
|
54
35
|
|
55
|
-
|
56
|
-
|
57
|
-
|
36
|
+
entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
|
37
|
+
NamedEntity.setup(literal, :offset => text.index(literal))
|
38
|
+
end
|
58
39
|
|
40
|
+
Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
|
41
|
+
assert text.include? "such as [IL-2]"
|
42
|
+
end
|
59
43
|
end
|
60
44
|
|
61
|
-
def
|
45
|
+
def test_with_transform_2
|
62
46
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
63
47
|
original = a.dup
|
64
48
|
|
@@ -117,18 +101,47 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
117
101
|
assert_equal "CDK5R1 protein", exp2
|
118
102
|
end
|
119
103
|
|
104
|
+
def test_with_transform_sentences
|
105
|
+
a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
|
+
original = a.dup
|
107
|
+
|
108
|
+
gene1 = "TP53"
|
109
|
+
gene1.extend NamedEntity
|
110
|
+
gene1.offset = a.index gene1
|
111
|
+
|
112
|
+
gene2 = "CDK5R1"
|
113
|
+
gene2.extend NamedEntity
|
114
|
+
gene2.offset = a.index gene2
|
115
|
+
|
116
|
+
bread = "Bread"
|
117
|
+
bread.extend NamedEntity
|
118
|
+
bread.offset = a.index bread
|
119
|
+
|
120
|
+
sentences = Segment.align(a, a.split(". "))
|
121
|
+
|
122
|
+
Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
|
123
|
+
assert sentences[1].include?("GN gene and the GN protein")
|
124
|
+
end
|
125
|
+
|
126
|
+
Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
|
127
|
+
assert sentences[0].include?("first sentence mentions BR")
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
end
|
132
|
+
|
120
133
|
def test_html
|
121
134
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
122
135
|
|
123
136
|
gene1 = "TP53"
|
124
137
|
gene1.extend NamedEntity
|
125
138
|
gene1.offset = a.index gene1
|
126
|
-
gene1.
|
139
|
+
gene1.entity_type = "Gene"
|
127
140
|
|
128
141
|
gene2 = "CDK5R1"
|
129
142
|
gene2.extend NamedEntity
|
130
143
|
gene2.offset = a.index gene2
|
131
|
-
gene2.
|
144
|
+
gene2.entity_type = "Protein"
|
132
145
|
|
133
146
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
134
147
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -143,13 +156,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
143
156
|
gene1.extend NamedEntity
|
144
157
|
gene1.offset = a.index gene1
|
145
158
|
gene1.offset += 10
|
146
|
-
gene1.
|
159
|
+
gene1.entity_type = "Gene"
|
147
160
|
|
148
161
|
gene2 = "CDK5R1"
|
149
162
|
gene2.extend NamedEntity
|
150
163
|
gene2.offset = a.index gene2
|
151
164
|
gene2.offset += 10
|
152
|
-
gene2.
|
165
|
+
gene2.entity_type = "Protein"
|
153
166
|
|
154
167
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
155
168
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -162,12 +175,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
162
175
|
gene1 = "TP53"
|
163
176
|
gene1.extend NamedEntity
|
164
177
|
gene1.offset = a.index gene1
|
165
|
-
gene1.
|
178
|
+
gene1.entity_type = "Gene"
|
166
179
|
|
167
180
|
gene2 = "TP53 gene"
|
168
181
|
gene2.extend NamedEntity
|
169
182
|
gene2.offset = a.index gene2
|
170
|
-
gene2.
|
183
|
+
gene2.entity_type = "Expanded Gene"
|
171
184
|
|
172
185
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
173
186
|
|
@@ -379,5 +392,46 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
379
392
|
end
|
380
393
|
end
|
381
394
|
end
|
395
|
+
|
396
|
+
def ___test_transform
|
397
|
+
a = "This sentence mentions the TP53 gene and the CDK5 protein"
|
398
|
+
original = a.dup
|
399
|
+
|
400
|
+
gene1 = "TP53"
|
401
|
+
gene1.extend Segment
|
402
|
+
gene1.offset = a.index gene1
|
403
|
+
|
404
|
+
gene2 = "CDK5"
|
405
|
+
gene2.extend Segment
|
406
|
+
gene2.offset = a.index gene2
|
407
|
+
|
408
|
+
assert_equal gene1, a[gene1.range]
|
409
|
+
assert_equal gene2, a[gene2.range]
|
410
|
+
|
411
|
+
c = a.dup
|
412
|
+
|
413
|
+
c[gene2.range] = "GN"
|
414
|
+
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
415
|
+
c[gene1.range] = "GN"
|
416
|
+
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
417
|
+
|
418
|
+
iii a.transformation_offset_differences
|
419
|
+
raise
|
420
|
+
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
421
|
+
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
422
|
+
|
423
|
+
|
424
|
+
gene3 = "GN gene"
|
425
|
+
gene3.extend Segment
|
426
|
+
gene3.offset = a.index gene3
|
427
|
+
|
428
|
+
assert_equal gene3, a[gene3.range]
|
429
|
+
|
430
|
+
a.restore([gene3])
|
431
|
+
assert_equal original, a
|
432
|
+
assert_equal "TP53 gene", a[gene3.range]
|
433
|
+
|
434
|
+
end
|
435
|
+
|
382
436
|
end
|
383
437
|
|