rbbt-text 1.1.9 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +56 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +61 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +42 -12
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -10
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +10 -8
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -3
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -361
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -355
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -52
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
data/test/rbbt/ner/test_rnorm.rb
CHANGED
@@ -27,10 +27,9 @@ S000000376 AAA GENE1 DDD
|
|
27
27
|
assert_equal(["S000000029"], @norm.match("FUN21"))
|
28
28
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN").sort)
|
29
29
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN 2").sort)
|
30
|
-
assert_equal(["
|
31
|
-
assert_equal([], @norm.match("
|
32
|
-
|
33
|
-
@norm.match("FUN21")
|
30
|
+
assert_equal(["S000000029"].sort, @norm.match("FUN 21").sort)
|
31
|
+
assert_equal([], @norm.match("Non-sense"))
|
32
|
+
assert_equal(["S000000029", "S000000374"], @norm.match("GER4"))
|
34
33
|
end
|
35
34
|
|
36
35
|
def test_select
|
@@ -74,6 +74,7 @@ C2;11;22;3 3;bb
|
|
74
74
|
index = TokenTrieNER.new("test", TSV.open(file, :flat, :sep => ';'))
|
75
75
|
|
76
76
|
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
|
+
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
@@ -1,9 +1,43 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/genia/sentence_splitter'
|
3
3
|
|
4
|
-
class
|
5
|
-
def
|
6
|
-
|
4
|
+
class TestNLP < Test::Unit::TestCase
|
5
|
+
def test_sentences
|
6
|
+
text =<<-EOF
|
7
|
+
This is a sentence.
|
8
|
+
A funky character ™ in a sentence.
|
9
|
+
This is a sentence.
|
10
|
+
This is a broken
|
11
|
+
sentence. This is
|
12
|
+
another broken sentence.
|
13
|
+
EOF
|
14
|
+
|
15
|
+
iii NLP.geniass_sentence_splitter(text)
|
16
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_sentences_2
|
20
|
+
text =<<-EOF
|
21
|
+
This is a sentence.
|
22
|
+
This is a sentence.
|
23
|
+
This is a broken
|
24
|
+
sentence. This is
|
25
|
+
another broken sentence.
|
26
|
+
EOF
|
27
|
+
|
28
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_sentences_ext
|
32
|
+
text =<<-EOF
|
33
|
+
This is a sentence.
|
34
|
+
This is a sentence.
|
35
|
+
This is a broken
|
36
|
+
sentence. This is
|
37
|
+
another broken sentence.
|
38
|
+
EOF
|
39
|
+
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
7
41
|
end
|
8
42
|
end
|
9
43
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/open_nlp/sentence_splitter'
|
3
|
-
require 'rbbt/
|
3
|
+
require 'rbbt/segment'
|
4
4
|
|
5
5
|
$text=<<-EOF
|
6
6
|
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors
|
@@ -22,6 +22,22 @@ class TestClass < Test::Unit::TestCase
|
|
22
22
|
def test_sentences
|
23
23
|
text =<<-EOF
|
24
24
|
This is a sentence.
|
25
|
+
No funky character in this sentence.
|
26
|
+
This is a sentence.
|
27
|
+
This is a
|
28
|
+
sentence. This is
|
29
|
+
another sentence.
|
30
|
+
EOF
|
31
|
+
|
32
|
+
assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
|
33
|
+
|
34
|
+
assert_equal 5, OpenNLP.sentence_splitter(text).length
|
35
|
+
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_sentences_fix_utf8
|
39
|
+
text =<<-EOF
|
40
|
+
This is a sentence.
|
25
41
|
A funky character ™ in a sentence.
|
26
42
|
This is a sentence.
|
27
43
|
This is a
|
@@ -35,12 +51,12 @@ another sentence.
|
|
35
51
|
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
52
|
end
|
37
53
|
|
38
|
-
def
|
54
|
+
def test_text_sentences
|
39
55
|
Misc.benchmark(100) do
|
40
|
-
OpenNLP.sentence_splitter($text).include?
|
56
|
+
assert OpenNLP.sentence_splitter($text).include?("Our
|
41
57
|
findings highlight the role of SMARCA4 in the pathogenesis of SMARCB1-positive
|
42
58
|
AT/RT and the usefulness of antibodies directed against SMARCA4 in this
|
43
|
-
diagnostic setting."
|
59
|
+
diagnostic setting.")
|
44
60
|
end
|
45
61
|
end
|
46
62
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/annotation'
|
6
|
+
|
7
|
+
class TestAnnotation < Test::Unit::TestCase
|
8
|
+
def test_annotation
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
13
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
14
|
+
|
15
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
16
|
+
|
17
|
+
annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
|
18
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_annotid
|
22
|
+
text = "This is a document"
|
23
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
24
|
+
|
25
|
+
corpus = Document::Corpus.setup({})
|
26
|
+
|
27
|
+
corpus.add_document(text)
|
28
|
+
|
29
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
30
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
31
|
+
|
32
|
+
annotid = annotation.annotid(corpus)
|
33
|
+
|
34
|
+
assert_equal 'verb', annotid.type
|
35
|
+
assert_equal 'verb', annotid.annotation.type
|
36
|
+
assert_equal 'is', annotid.annotation
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/corpus'
|
6
|
+
|
7
|
+
class TestSegmentCorpus < Test::Unit::TestCase
|
8
|
+
def test_corpus
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = {}
|
13
|
+
corpus.extend Document::Corpus
|
14
|
+
|
15
|
+
corpus.add_document(text)
|
16
|
+
|
17
|
+
docid = text.docid(corpus)
|
18
|
+
|
19
|
+
assert_equal docid.document, text
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_find
|
23
|
+
text = "This is a document"
|
24
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
25
|
+
|
26
|
+
TmpFile.with_file do |path|
|
27
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
28
|
+
corpus.extend Document::Corpus
|
29
|
+
|
30
|
+
corpus.add_document(text)
|
31
|
+
|
32
|
+
assert corpus.prefix("TEST:").include?(text.docid)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/encoding'
|
3
|
+
|
4
|
+
class TestEncoding < Test::Unit::TestCase
|
5
|
+
def test_bad_chars
|
6
|
+
text = "A funky character ™ in a sentence."
|
7
|
+
|
8
|
+
assert_equal ["™"], Segment.bad_chars(text)
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_ascii
|
12
|
+
text = "A funky character ™ in a sentence."
|
13
|
+
|
14
|
+
Segment.ascii(text) do
|
15
|
+
assert_equal "A funky character ? in a sentence.", text
|
16
|
+
end
|
17
|
+
|
18
|
+
Segment.ascii(text, "NONASCII") do
|
19
|
+
assert_equal "A funky character NONASCII in a sentence.", text
|
20
|
+
end
|
21
|
+
|
22
|
+
assert_equal "A funky character ™ in a sentence.", text
|
23
|
+
end
|
24
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
|
5
5
|
class TestClass < Test::Unit::TestCase
|
6
6
|
def test_info
|
@@ -15,35 +15,39 @@ class TestClass < Test::Unit::TestCase
|
|
15
15
|
|
16
16
|
def test_all_args
|
17
17
|
a = "test"
|
18
|
-
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
18
|
+
NamedEntity.setup a, 10, "TEST:doc1:test_type:hash", "NamedEntity", "TYPE", "CODE", "SCORE"
|
19
19
|
assert_equal 10, a.offset
|
20
|
+
assert_equal "NamedEntity", a.type
|
21
|
+
assert_equal "TYPE", a.entity_type
|
22
|
+
assert_equal "SCORE", a.score
|
20
23
|
end
|
21
24
|
|
22
25
|
def test_tsv
|
23
26
|
a = "test"
|
24
27
|
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
25
|
-
assert
|
26
|
-
assert
|
27
|
-
assert
|
28
|
+
assert Annotated.tsv([a]).fields.include? "code"
|
29
|
+
assert Annotated.tsv([a], nil).fields.include? "code"
|
30
|
+
assert Annotated.tsv([a], :all).fields.include? "code"
|
31
|
+
assert Annotated.tsv([a], :all).fields.include? "literal"
|
28
32
|
end
|
29
33
|
|
30
|
-
def
|
34
|
+
def __test_segment_brat
|
31
35
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
32
36
|
|
33
37
|
gene1 = "TP53"
|
34
38
|
gene1.extend NamedEntity
|
35
39
|
gene1.offset = a.index gene1
|
36
|
-
gene1.
|
40
|
+
gene1.entity_type = "Gene"
|
37
41
|
|
38
42
|
gene2 = "CDK5R1"
|
39
43
|
gene2.extend NamedEntity
|
40
44
|
gene2.offset = a.index gene2
|
41
|
-
gene2.
|
45
|
+
gene2.entity_type = "Gene"
|
42
46
|
|
43
47
|
gene3 = "TP53 gene"
|
44
48
|
gene3.extend NamedEntity
|
45
49
|
gene3.offset = a.index gene3
|
46
|
-
gene3.
|
50
|
+
gene3.entity_type = "Gene"
|
47
51
|
|
48
52
|
segments = [gene1, gene2, gene3]
|
49
53
|
assert segments.collect{|s| s.to_brat}.include? "Gene 27 35"
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/overlaps'
|
4
|
+
|
5
|
+
class TestOverlaps < Test::Unit::TestCase
|
6
|
+
def setup
|
7
|
+
@text = <<-EOF
|
8
|
+
This is a first sentence. More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
9
|
+
EOF
|
10
|
+
|
11
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
12
|
+
Segment.setup(literal, :offset => @text.index(literal))
|
13
|
+
end
|
14
|
+
|
15
|
+
@sentences = @text.partition(".").values_at(0, 2).collect do |sentence|
|
16
|
+
Segment.setup sentence, :offset => @text.index(sentence)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_make_relative
|
21
|
+
sentence = @sentences[1]
|
22
|
+
|
23
|
+
@entities.each do |e|
|
24
|
+
assert_equal e, @text[e.range]
|
25
|
+
end
|
26
|
+
|
27
|
+
sentence.make_relative @entities do
|
28
|
+
@entities.each do |e|
|
29
|
+
assert_equal e, sentence[e.range]
|
30
|
+
end
|
31
|
+
|
32
|
+
@entities.each do |e|
|
33
|
+
assert_not_equal e, @text[e.range]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
@entities.each do |e|
|
38
|
+
assert_equal e, @text[e.range]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_range_in
|
43
|
+
sentence = @sentences[1]
|
44
|
+
|
45
|
+
@entities.each do |e|
|
46
|
+
assert_equal e.range_in(sentence).begin, sentence.index(e)
|
47
|
+
assert_equal e.range.begin - sentence.offset, sentence.index(e)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_includes
|
52
|
+
@entities.each do |e|
|
53
|
+
assert ! @sentences[0].include?(e)
|
54
|
+
assert @sentences[1].include?(e)
|
55
|
+
assert ! e.include?(@sentences[0])
|
56
|
+
assert ! e.include?(@sentences[1])
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_overlaps?
|
61
|
+
@entities.each do |e|
|
62
|
+
assert ! @sentences[0].overlaps?(e)
|
63
|
+
assert @sentences[1].overlaps?(e)
|
64
|
+
assert ! e.overlaps?(@sentences[0])
|
65
|
+
assert e.overlaps?(@sentences[1])
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/range_index'
|
6
|
+
|
7
|
+
class TestRangeIndex < Test::Unit::TestCase
|
8
|
+
def test_segment_index
|
9
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = Document::Corpus.setup({})
|
13
|
+
|
14
|
+
corpus.add_document(text)
|
15
|
+
|
16
|
+
gene1 = "TP53"
|
17
|
+
gene1.extend Segment
|
18
|
+
gene1.offset = text.index gene1
|
19
|
+
gene1.docid = text.docid
|
20
|
+
|
21
|
+
gene2 = "CDK5R1"
|
22
|
+
gene2.extend Segment
|
23
|
+
gene2.offset = text.index gene2
|
24
|
+
gene2.docid = text.docid
|
25
|
+
|
26
|
+
gene3 = "TP53 gene"
|
27
|
+
gene3.extend Segment
|
28
|
+
gene3.offset = text.index gene1
|
29
|
+
gene3.docid = text.docid
|
30
|
+
|
31
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus)
|
32
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
33
|
+
|
34
|
+
TmpFile.with_file do |fwt|
|
35
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
36
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
37
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
38
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
@@ -1,10 +1,21 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/transformed'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
require 'rexml/document'
|
5
5
|
|
6
|
-
class
|
7
|
-
|
6
|
+
class TestTransformed < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@text = <<-EOF
|
10
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
11
|
+
EOF
|
12
|
+
|
13
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
14
|
+
NamedEntity.setup(literal, :offset => @text.index(literal))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_transform
|
8
19
|
text = <<-EOF
|
9
20
|
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
10
21
|
EOF
|
@@ -13,52 +24,25 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
13
24
|
NamedEntity.setup(literal, :offset => text.index(literal))
|
14
25
|
end
|
15
26
|
|
16
|
-
Transformed.
|
17
|
-
|
18
|
-
end
|
27
|
+
Transformed.transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" })
|
28
|
+
assert text.include? "such as [IL-2]"
|
19
29
|
end
|
20
30
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
gene1 = "TP53"
|
26
|
-
gene1.extend Segment
|
27
|
-
gene1.offset = a.index gene1
|
28
|
-
|
29
|
-
gene2 = "CDK5"
|
30
|
-
gene2.extend Segment
|
31
|
-
gene2.offset = a.index gene2
|
32
|
-
|
33
|
-
assert_equal gene1, a[gene1.range]
|
34
|
-
assert_equal gene2, a[gene2.range]
|
35
|
-
|
36
|
-
c = a.dup
|
37
|
-
|
38
|
-
c[gene2.range] = "GN"
|
39
|
-
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
40
|
-
c[gene1.range] = "GN"
|
41
|
-
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
42
|
-
|
43
|
-
iii a.transformation_offset_differences
|
44
|
-
raise
|
45
|
-
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
46
|
-
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
47
|
-
|
48
|
-
|
49
|
-
gene3 = "GN gene"
|
50
|
-
gene3.extend Segment
|
51
|
-
gene3.offset = a.index gene3
|
52
|
-
|
53
|
-
assert_equal gene3, a[gene3.range]
|
31
|
+
def test_with_transform
|
32
|
+
text = <<-EOF
|
33
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
34
|
+
EOF
|
54
35
|
|
55
|
-
|
56
|
-
|
57
|
-
|
36
|
+
entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
|
37
|
+
NamedEntity.setup(literal, :offset => text.index(literal))
|
38
|
+
end
|
58
39
|
|
40
|
+
Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
|
41
|
+
assert text.include? "such as [IL-2]"
|
42
|
+
end
|
59
43
|
end
|
60
44
|
|
61
|
-
def
|
45
|
+
def test_with_transform_2
|
62
46
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
63
47
|
original = a.dup
|
64
48
|
|
@@ -117,18 +101,47 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
117
101
|
assert_equal "CDK5R1 protein", exp2
|
118
102
|
end
|
119
103
|
|
104
|
+
def test_with_transform_sentences
|
105
|
+
a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
|
+
original = a.dup
|
107
|
+
|
108
|
+
gene1 = "TP53"
|
109
|
+
gene1.extend NamedEntity
|
110
|
+
gene1.offset = a.index gene1
|
111
|
+
|
112
|
+
gene2 = "CDK5R1"
|
113
|
+
gene2.extend NamedEntity
|
114
|
+
gene2.offset = a.index gene2
|
115
|
+
|
116
|
+
bread = "Bread"
|
117
|
+
bread.extend NamedEntity
|
118
|
+
bread.offset = a.index bread
|
119
|
+
|
120
|
+
sentences = Segment.align(a, a.split(". "))
|
121
|
+
|
122
|
+
Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
|
123
|
+
assert sentences[1].include?("GN gene and the GN protein")
|
124
|
+
end
|
125
|
+
|
126
|
+
Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
|
127
|
+
assert sentences[0].include?("first sentence mentions BR")
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
end
|
132
|
+
|
120
133
|
def test_html
|
121
134
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
122
135
|
|
123
136
|
gene1 = "TP53"
|
124
137
|
gene1.extend NamedEntity
|
125
138
|
gene1.offset = a.index gene1
|
126
|
-
gene1.
|
139
|
+
gene1.entity_type = "Gene"
|
127
140
|
|
128
141
|
gene2 = "CDK5R1"
|
129
142
|
gene2.extend NamedEntity
|
130
143
|
gene2.offset = a.index gene2
|
131
|
-
gene2.
|
144
|
+
gene2.entity_type = "Protein"
|
132
145
|
|
133
146
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
134
147
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -143,13 +156,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
143
156
|
gene1.extend NamedEntity
|
144
157
|
gene1.offset = a.index gene1
|
145
158
|
gene1.offset += 10
|
146
|
-
gene1.
|
159
|
+
gene1.entity_type = "Gene"
|
147
160
|
|
148
161
|
gene2 = "CDK5R1"
|
149
162
|
gene2.extend NamedEntity
|
150
163
|
gene2.offset = a.index gene2
|
151
164
|
gene2.offset += 10
|
152
|
-
gene2.
|
165
|
+
gene2.entity_type = "Protein"
|
153
166
|
|
154
167
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
155
168
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -162,12 +175,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
162
175
|
gene1 = "TP53"
|
163
176
|
gene1.extend NamedEntity
|
164
177
|
gene1.offset = a.index gene1
|
165
|
-
gene1.
|
178
|
+
gene1.entity_type = "Gene"
|
166
179
|
|
167
180
|
gene2 = "TP53 gene"
|
168
181
|
gene2.extend NamedEntity
|
169
182
|
gene2.offset = a.index gene2
|
170
|
-
gene2.
|
183
|
+
gene2.entity_type = "Expanded Gene"
|
171
184
|
|
172
185
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
173
186
|
|
@@ -379,5 +392,46 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
379
392
|
end
|
380
393
|
end
|
381
394
|
end
|
395
|
+
|
396
|
+
def ___test_transform
|
397
|
+
a = "This sentence mentions the TP53 gene and the CDK5 protein"
|
398
|
+
original = a.dup
|
399
|
+
|
400
|
+
gene1 = "TP53"
|
401
|
+
gene1.extend Segment
|
402
|
+
gene1.offset = a.index gene1
|
403
|
+
|
404
|
+
gene2 = "CDK5"
|
405
|
+
gene2.extend Segment
|
406
|
+
gene2.offset = a.index gene2
|
407
|
+
|
408
|
+
assert_equal gene1, a[gene1.range]
|
409
|
+
assert_equal gene2, a[gene2.range]
|
410
|
+
|
411
|
+
c = a.dup
|
412
|
+
|
413
|
+
c[gene2.range] = "GN"
|
414
|
+
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
415
|
+
c[gene1.range] = "GN"
|
416
|
+
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
417
|
+
|
418
|
+
iii a.transformation_offset_differences
|
419
|
+
raise
|
420
|
+
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
421
|
+
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
422
|
+
|
423
|
+
|
424
|
+
gene3 = "GN gene"
|
425
|
+
gene3.extend Segment
|
426
|
+
gene3.offset = a.index gene3
|
427
|
+
|
428
|
+
assert_equal gene3, a[gene3.range]
|
429
|
+
|
430
|
+
a.restore([gene3])
|
431
|
+
assert_equal original, a
|
432
|
+
assert_equal "TP53 gene", a[gene3.range]
|
433
|
+
|
434
|
+
end
|
435
|
+
|
382
436
|
end
|
383
437
|
|