rbbt-text 1.2.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/document.rb +46 -0
- data/lib/rbbt/document/annotation.rb +42 -0
- data/lib/rbbt/document/corpus.rb +38 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +19 -2
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +6 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +6 -6
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/segment.rb +177 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +40 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +43 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +187 -0
- data/test/test_helper.rb +5 -3
- metadata +40 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -23,9 +23,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
23
23
|
matches = RegExpNER.match_regexp_hash(sentence, regexp_hash)
|
24
24
|
|
25
25
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
26
|
-
assert_equal "In ".length, matches.select{|m| m.
|
27
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
28
|
-
assert_equal :this, matches.select{|m| m.
|
26
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
27
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
28
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
29
29
|
end
|
30
30
|
|
31
31
|
def test_define_regexps
|
@@ -39,9 +39,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
39
39
|
|
40
40
|
matches = ner.entities(sentence)
|
41
41
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
42
|
-
assert_equal "In ".length, matches.select{|m| m.
|
43
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
44
|
-
assert_equal :this, matches.select{|m| m.
|
42
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this }[0].offset
|
43
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this }[1].offset
|
44
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this }[0].entity_type
|
45
45
|
end
|
46
46
|
|
47
47
|
|
@@ -51,9 +51,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
51
51
|
ner = RegExpNER.new({:this => /this/, :that => /that/})
|
52
52
|
matches = ner.entities(sentence)
|
53
53
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
54
|
-
assert_equal "In ".length, matches.select{|m| m.
|
55
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
56
|
-
assert_equal :this, matches.select{|m| m.
|
54
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
55
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
56
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
57
57
|
|
58
58
|
Segmented.setup(sentence)
|
59
59
|
ner_this = RegExpNER.new({:this => /this/})
|
@@ -64,9 +64,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
64
64
|
matches = sentence.segments
|
65
65
|
|
66
66
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
67
|
-
assert_equal "In ".length, matches.select{|m| m.
|
68
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
69
|
-
assert_equal :this, matches.select{|m| m.
|
67
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
68
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
69
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
70
70
|
end
|
71
71
|
|
72
72
|
def test_entities_captures
|
@@ -75,8 +75,8 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
75
75
|
ner = RegExpNER.new({:this => /this/, :that => /that/, :should => /I (should)/})
|
76
76
|
matches = ner.entities(sentence)
|
77
77
|
assert_equal ["this", "this", "that", "should"].sort, matches.sort
|
78
|
-
assert_equal "In this sentence I ".length, matches.select{|m| m.
|
79
|
-
assert_equal :should, matches.select{|m| m.
|
78
|
+
assert_equal "In this sentence I ".length, matches.select{|m| m.entity_type == :should}[0].offset
|
79
|
+
assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
|
80
80
|
end
|
81
81
|
|
82
82
|
|
data/test/rbbt/ner/test_rnorm.rb
CHANGED
@@ -27,10 +27,9 @@ S000000376 AAA GENE1 DDD
|
|
27
27
|
assert_equal(["S000000029"], @norm.match("FUN21"))
|
28
28
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN").sort)
|
29
29
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN 2").sort)
|
30
|
-
assert_equal(["
|
31
|
-
assert_equal([], @norm.match("
|
32
|
-
|
33
|
-
@norm.match("FUN21")
|
30
|
+
assert_equal(["S000000029"].sort, @norm.match("FUN 21").sort)
|
31
|
+
assert_equal([], @norm.match("Non-sense"))
|
32
|
+
assert_equal(["S000000029", "S000000374"], @norm.match("GER4"))
|
34
33
|
end
|
35
34
|
|
36
35
|
def test_select
|
@@ -74,6 +74,7 @@ C2;11;22;3 3;bb
|
|
74
74
|
index = TokenTrieNER.new("test", TSV.open(file, :flat, :sep => ';'))
|
75
75
|
|
76
76
|
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
|
+
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
@@ -1,9 +1,19 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/genia/sentence_splitter'
|
3
3
|
|
4
|
-
class
|
5
|
-
def
|
6
|
-
|
4
|
+
class TestNLP < Test::Unit::TestCase
|
5
|
+
def test_sentences
|
6
|
+
text =<<-EOF
|
7
|
+
This is a sentence.
|
8
|
+
A funky character ™ in a sentence.
|
9
|
+
This is a sentence.
|
10
|
+
This is a
|
11
|
+
sentence. This is
|
12
|
+
another sentence.
|
13
|
+
EOF
|
14
|
+
|
15
|
+
assert_equal "This is a \nsentence.", NLP.geniass_sentence_splitter(text)[3]
|
7
16
|
end
|
17
|
+
|
8
18
|
end
|
9
19
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/open_nlp/sentence_splitter'
|
3
|
-
require 'rbbt/
|
3
|
+
require 'rbbt/segment'
|
4
4
|
|
5
5
|
$text=<<-EOF
|
6
6
|
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors
|
@@ -22,6 +22,22 @@ class TestClass < Test::Unit::TestCase
|
|
22
22
|
def test_sentences
|
23
23
|
text =<<-EOF
|
24
24
|
This is a sentence.
|
25
|
+
No funky character in this sentence.
|
26
|
+
This is a sentence.
|
27
|
+
This is a
|
28
|
+
sentence. This is
|
29
|
+
another sentence.
|
30
|
+
EOF
|
31
|
+
|
32
|
+
assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
|
33
|
+
|
34
|
+
assert_equal 5, OpenNLP.sentence_splitter(text).length
|
35
|
+
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_sentences_fix_utf8
|
39
|
+
text =<<-EOF
|
40
|
+
This is a sentence.
|
25
41
|
A funky character ™ in a sentence.
|
26
42
|
This is a sentence.
|
27
43
|
This is a
|
@@ -35,12 +51,12 @@ another sentence.
|
|
35
51
|
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
52
|
end
|
37
53
|
|
38
|
-
def
|
54
|
+
def test_text_sentences
|
39
55
|
Misc.benchmark(100) do
|
40
|
-
OpenNLP.sentence_splitter($text).include?
|
56
|
+
assert OpenNLP.sentence_splitter($text).include?("Our
|
41
57
|
findings highlight the role of SMARCA4 in the pathogenesis of SMARCB1-positive
|
42
58
|
AT/RT and the usefulness of antibodies directed against SMARCA4 in this
|
43
|
-
diagnostic setting."
|
59
|
+
diagnostic setting.")
|
44
60
|
end
|
45
61
|
end
|
46
62
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/annotation'
|
6
|
+
|
7
|
+
class TestAnnotation < Test::Unit::TestCase
|
8
|
+
def test_annotation
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
13
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
14
|
+
|
15
|
+
assert_equal 'verb', annotation.annotid.split(":").last
|
16
|
+
|
17
|
+
annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
|
18
|
+
assert_equal 'verb', annotation.annotid.split(":").last
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_annotid
|
22
|
+
text = "This is a document"
|
23
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
24
|
+
|
25
|
+
corpus = {}
|
26
|
+
corpus.extend Document::Corpus
|
27
|
+
|
28
|
+
corpus.add_document(text)
|
29
|
+
|
30
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
31
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
32
|
+
|
33
|
+
annotid = annotation.annotid(corpus)
|
34
|
+
|
35
|
+
assert_equal 'verb', annotid.type
|
36
|
+
assert_equal 'verb', annotid.annotation.type
|
37
|
+
assert_equal 'is', annotid.annotation
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/corpus'
|
6
|
+
|
7
|
+
class TestSegmentCorpus < Test::Unit::TestCase
|
8
|
+
def test_corpus
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = {}
|
13
|
+
corpus.extend Document::Corpus
|
14
|
+
|
15
|
+
corpus.add_document(text)
|
16
|
+
|
17
|
+
docid = text.docid(corpus)
|
18
|
+
|
19
|
+
assert_equal docid.document, text
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_find
|
23
|
+
text = "This is a document"
|
24
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
25
|
+
|
26
|
+
TmpFile.with_file do |path|
|
27
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
28
|
+
corpus.extend Document::Corpus
|
29
|
+
|
30
|
+
corpus.add_document(text)
|
31
|
+
|
32
|
+
assert corpus.prefix("TEST:").include?(text.docid)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/encoding'
|
3
|
+
|
4
|
+
class TestEncoding < Test::Unit::TestCase
|
5
|
+
def _test_bad_chars
|
6
|
+
text = "A funky character ™ in a sentence."
|
7
|
+
|
8
|
+
assert_equal ["™"], Segment.bad_chars(text)
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_ascii
|
12
|
+
text = "A funky character ™ in a sentence."
|
13
|
+
|
14
|
+
Segment.ascii(text) do
|
15
|
+
assert_equal "A funky character ? in a sentence.", text
|
16
|
+
end
|
17
|
+
|
18
|
+
Segment.ascii(text, "NONASCII") do
|
19
|
+
assert_equal "A funky character NONASCII in a sentence.", text
|
20
|
+
end
|
21
|
+
|
22
|
+
assert_equal "A funky character ™ in a sentence.", text
|
23
|
+
end
|
24
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
|
5
5
|
class TestClass < Test::Unit::TestCase
|
6
6
|
def test_info
|
@@ -15,11 +15,14 @@ class TestClass < Test::Unit::TestCase
|
|
15
15
|
|
16
16
|
def test_all_args
|
17
17
|
a = "test"
|
18
|
-
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
18
|
+
NamedEntity.setup a, 10, "TEST:doc1:test_type:hash", "NamedEntity", "TYPE", "CODE", "SCORE"
|
19
19
|
assert_equal 10, a.offset
|
20
|
+
assert_equal "NamedEntity", a.type
|
21
|
+
assert_equal "TYPE", a.entity_type
|
22
|
+
assert_equal "SCORE", a.score
|
20
23
|
end
|
21
24
|
|
22
|
-
def
|
25
|
+
def __test_tsv
|
23
26
|
a = "test"
|
24
27
|
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
25
28
|
assert Segment.tsv([a]).fields.include? "code"
|
@@ -27,23 +30,23 @@ class TestClass < Test::Unit::TestCase
|
|
27
30
|
assert Segment.tsv([a], "literal").fields.include? "code"
|
28
31
|
end
|
29
32
|
|
30
|
-
def
|
33
|
+
def __test_segment_brat
|
31
34
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
32
35
|
|
33
36
|
gene1 = "TP53"
|
34
37
|
gene1.extend NamedEntity
|
35
38
|
gene1.offset = a.index gene1
|
36
|
-
gene1.
|
39
|
+
gene1.entity_type = "Gene"
|
37
40
|
|
38
41
|
gene2 = "CDK5R1"
|
39
42
|
gene2.extend NamedEntity
|
40
43
|
gene2.offset = a.index gene2
|
41
|
-
gene2.
|
44
|
+
gene2.entity_type = "Gene"
|
42
45
|
|
43
46
|
gene3 = "TP53 gene"
|
44
47
|
gene3.extend NamedEntity
|
45
48
|
gene3.offset = a.index gene3
|
46
|
-
gene3.
|
49
|
+
gene3.entity_type = "Gene"
|
47
50
|
|
48
51
|
segments = [gene1, gene2, gene3]
|
49
52
|
assert segments.collect{|s| s.to_brat}.include? "Gene 27 35"
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/overlaps'
|
4
|
+
|
5
|
+
class TestOverlaps < Test::Unit::TestCase
|
6
|
+
def setup
|
7
|
+
@text = <<-EOF
|
8
|
+
This is a first sentence. More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
9
|
+
EOF
|
10
|
+
|
11
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
12
|
+
Segment.setup(literal, :offset => @text.index(literal))
|
13
|
+
end
|
14
|
+
|
15
|
+
@sentences = @text.partition(".").values_at(0, 2).collect do |sentence|
|
16
|
+
Segment.setup sentence, :offset => @text.index(sentence)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_make_relative
|
21
|
+
sentence = @sentences[1]
|
22
|
+
|
23
|
+
@entities.each do |e|
|
24
|
+
assert_equal e, @text[e.range]
|
25
|
+
end
|
26
|
+
|
27
|
+
sentence.make_relative @entities do
|
28
|
+
@entities.each do |e|
|
29
|
+
assert_equal e, sentence[e.range]
|
30
|
+
end
|
31
|
+
|
32
|
+
@entities.each do |e|
|
33
|
+
assert_not_equal e, @text[e.range]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
@entities.each do |e|
|
38
|
+
assert_equal e, @text[e.range]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_range_in
|
43
|
+
sentence = @sentences[1]
|
44
|
+
|
45
|
+
@entities.each do |e|
|
46
|
+
assert_equal e.range_in(sentence).begin, sentence.index(e)
|
47
|
+
assert_equal e.range.begin - sentence.offset, sentence.index(e)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_includes
|
52
|
+
@entities.each do |e|
|
53
|
+
assert ! @sentences[0].include?(e)
|
54
|
+
assert @sentences[1].include?(e)
|
55
|
+
assert ! e.include?(@sentences[0])
|
56
|
+
assert ! e.include?(@sentences[1])
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_overlaps?
|
61
|
+
@entities.each do |e|
|
62
|
+
assert ! @sentences[0].overlaps?(e)
|
63
|
+
assert @sentences[1].overlaps?(e)
|
64
|
+
assert ! e.overlaps?(@sentences[0])
|
65
|
+
assert e.overlaps?(@sentences[1])
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/range_index'
|
6
|
+
|
7
|
+
class TestRangeIndex < Test::Unit::TestCase
|
8
|
+
def test_segment_index
|
9
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = {}
|
13
|
+
corpus.extend Document::Corpus
|
14
|
+
|
15
|
+
corpus.add_document(text)
|
16
|
+
|
17
|
+
gene1 = "TP53"
|
18
|
+
gene1.extend Segment
|
19
|
+
gene1.offset = text.index gene1
|
20
|
+
gene1.docid = text.docid
|
21
|
+
|
22
|
+
gene2 = "CDK5R1"
|
23
|
+
gene2.extend Segment
|
24
|
+
gene2.offset = text.index gene2
|
25
|
+
gene2.docid = text.docid
|
26
|
+
|
27
|
+
gene3 = "TP53 gene"
|
28
|
+
gene3.extend Segment
|
29
|
+
gene3.offset = text.index gene1
|
30
|
+
gene3.docid = text.docid
|
31
|
+
|
32
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus)
|
33
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
34
|
+
|
35
|
+
TmpFile.with_file do |fwt|
|
36
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
37
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
38
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
39
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
@@ -1,10 +1,21 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/transformed'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
require 'rexml/document'
|
5
5
|
|
6
|
-
class
|
7
|
-
|
6
|
+
class TestTransformed < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@text = <<-EOF
|
10
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
11
|
+
EOF
|
12
|
+
|
13
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
14
|
+
NamedEntity.setup(literal, :offset => @text.index(literal))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_transform
|
8
19
|
text = <<-EOF
|
9
20
|
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
10
21
|
EOF
|
@@ -13,52 +24,25 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
13
24
|
NamedEntity.setup(literal, :offset => text.index(literal))
|
14
25
|
end
|
15
26
|
|
16
|
-
Transformed.
|
17
|
-
|
18
|
-
end
|
27
|
+
Transformed.transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" })
|
28
|
+
assert text.include? "such as [IL-2]"
|
19
29
|
end
|
20
30
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
gene1 = "TP53"
|
26
|
-
gene1.extend Segment
|
27
|
-
gene1.offset = a.index gene1
|
28
|
-
|
29
|
-
gene2 = "CDK5"
|
30
|
-
gene2.extend Segment
|
31
|
-
gene2.offset = a.index gene2
|
32
|
-
|
33
|
-
assert_equal gene1, a[gene1.range]
|
34
|
-
assert_equal gene2, a[gene2.range]
|
35
|
-
|
36
|
-
c = a.dup
|
37
|
-
|
38
|
-
c[gene2.range] = "GN"
|
39
|
-
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
40
|
-
c[gene1.range] = "GN"
|
41
|
-
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
42
|
-
|
43
|
-
iii a.transformation_offset_differences
|
44
|
-
raise
|
45
|
-
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
46
|
-
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
47
|
-
|
48
|
-
|
49
|
-
gene3 = "GN gene"
|
50
|
-
gene3.extend Segment
|
51
|
-
gene3.offset = a.index gene3
|
52
|
-
|
53
|
-
assert_equal gene3, a[gene3.range]
|
31
|
+
def test_with_transform
|
32
|
+
text = <<-EOF
|
33
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
34
|
+
EOF
|
54
35
|
|
55
|
-
|
56
|
-
|
57
|
-
|
36
|
+
entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
|
37
|
+
NamedEntity.setup(literal, :offset => text.index(literal))
|
38
|
+
end
|
58
39
|
|
40
|
+
Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
|
41
|
+
assert text.include? "such as [IL-2]"
|
42
|
+
end
|
59
43
|
end
|
60
44
|
|
61
|
-
def
|
45
|
+
def test_with_transform_2
|
62
46
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
63
47
|
original = a.dup
|
64
48
|
|
@@ -123,12 +107,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
123
107
|
gene1 = "TP53"
|
124
108
|
gene1.extend NamedEntity
|
125
109
|
gene1.offset = a.index gene1
|
126
|
-
gene1.
|
110
|
+
gene1.entity_type = "Gene"
|
127
111
|
|
128
112
|
gene2 = "CDK5R1"
|
129
113
|
gene2.extend NamedEntity
|
130
114
|
gene2.offset = a.index gene2
|
131
|
-
gene2.
|
115
|
+
gene2.entity_type = "Protein"
|
132
116
|
|
133
117
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
134
118
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -143,13 +127,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
143
127
|
gene1.extend NamedEntity
|
144
128
|
gene1.offset = a.index gene1
|
145
129
|
gene1.offset += 10
|
146
|
-
gene1.
|
130
|
+
gene1.entity_type = "Gene"
|
147
131
|
|
148
132
|
gene2 = "CDK5R1"
|
149
133
|
gene2.extend NamedEntity
|
150
134
|
gene2.offset = a.index gene2
|
151
135
|
gene2.offset += 10
|
152
|
-
gene2.
|
136
|
+
gene2.entity_type = "Protein"
|
153
137
|
|
154
138
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
155
139
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -162,12 +146,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
162
146
|
gene1 = "TP53"
|
163
147
|
gene1.extend NamedEntity
|
164
148
|
gene1.offset = a.index gene1
|
165
|
-
gene1.
|
149
|
+
gene1.entity_type = "Gene"
|
166
150
|
|
167
151
|
gene2 = "TP53 gene"
|
168
152
|
gene2.extend NamedEntity
|
169
153
|
gene2.offset = a.index gene2
|
170
|
-
gene2.
|
154
|
+
gene2.entity_type = "Expanded Gene"
|
171
155
|
|
172
156
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
173
157
|
|
@@ -379,5 +363,46 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
379
363
|
end
|
380
364
|
end
|
381
365
|
end
|
366
|
+
|
367
|
+
def ___test_transform
|
368
|
+
a = "This sentence mentions the TP53 gene and the CDK5 protein"
|
369
|
+
original = a.dup
|
370
|
+
|
371
|
+
gene1 = "TP53"
|
372
|
+
gene1.extend Segment
|
373
|
+
gene1.offset = a.index gene1
|
374
|
+
|
375
|
+
gene2 = "CDK5"
|
376
|
+
gene2.extend Segment
|
377
|
+
gene2.offset = a.index gene2
|
378
|
+
|
379
|
+
assert_equal gene1, a[gene1.range]
|
380
|
+
assert_equal gene2, a[gene2.range]
|
381
|
+
|
382
|
+
c = a.dup
|
383
|
+
|
384
|
+
c[gene2.range] = "GN"
|
385
|
+
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
386
|
+
c[gene1.range] = "GN"
|
387
|
+
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
388
|
+
|
389
|
+
iii a.transformation_offset_differences
|
390
|
+
raise
|
391
|
+
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
392
|
+
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
393
|
+
|
394
|
+
|
395
|
+
gene3 = "GN gene"
|
396
|
+
gene3.extend Segment
|
397
|
+
gene3.offset = a.index gene3
|
398
|
+
|
399
|
+
assert_equal gene3, a[gene3.range]
|
400
|
+
|
401
|
+
a.restore([gene3])
|
402
|
+
assert_equal original, a
|
403
|
+
assert_equal "TP53 gene", a[gene3.range]
|
404
|
+
|
405
|
+
end
|
406
|
+
|
382
407
|
end
|
383
408
|
|