rbbt-text 1.2.0 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/document.rb +46 -0
- data/lib/rbbt/document/annotation.rb +42 -0
- data/lib/rbbt/document/corpus.rb +38 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +19 -2
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +6 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +6 -6
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +1 -1
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/segment.rb +177 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +11 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +7 -9
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -1
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +13 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +40 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +12 -9
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +43 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +76 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +187 -0
- data/test/test_helper.rb +5 -3
- metadata +40 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -23,9 +23,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
23
23
|
matches = RegExpNER.match_regexp_hash(sentence, regexp_hash)
|
24
24
|
|
25
25
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
26
|
-
assert_equal "In ".length, matches.select{|m| m.
|
27
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
28
|
-
assert_equal :this, matches.select{|m| m.
|
26
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
27
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
28
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
29
29
|
end
|
30
30
|
|
31
31
|
def test_define_regexps
|
@@ -39,9 +39,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
39
39
|
|
40
40
|
matches = ner.entities(sentence)
|
41
41
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
42
|
-
assert_equal "In ".length, matches.select{|m| m.
|
43
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
44
|
-
assert_equal :this, matches.select{|m| m.
|
42
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this }[0].offset
|
43
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this }[1].offset
|
44
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this }[0].entity_type
|
45
45
|
end
|
46
46
|
|
47
47
|
|
@@ -51,9 +51,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
51
51
|
ner = RegExpNER.new({:this => /this/, :that => /that/})
|
52
52
|
matches = ner.entities(sentence)
|
53
53
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
54
|
-
assert_equal "In ".length, matches.select{|m| m.
|
55
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
56
|
-
assert_equal :this, matches.select{|m| m.
|
54
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
55
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
56
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
57
57
|
|
58
58
|
Segmented.setup(sentence)
|
59
59
|
ner_this = RegExpNER.new({:this => /this/})
|
@@ -64,9 +64,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
64
64
|
matches = sentence.segments
|
65
65
|
|
66
66
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
67
|
-
assert_equal "In ".length, matches.select{|m| m.
|
68
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
69
|
-
assert_equal :this, matches.select{|m| m.
|
67
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
68
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
69
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
70
70
|
end
|
71
71
|
|
72
72
|
def test_entities_captures
|
@@ -75,8 +75,8 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
75
75
|
ner = RegExpNER.new({:this => /this/, :that => /that/, :should => /I (should)/})
|
76
76
|
matches = ner.entities(sentence)
|
77
77
|
assert_equal ["this", "this", "that", "should"].sort, matches.sort
|
78
|
-
assert_equal "In this sentence I ".length, matches.select{|m| m.
|
79
|
-
assert_equal :should, matches.select{|m| m.
|
78
|
+
assert_equal "In this sentence I ".length, matches.select{|m| m.entity_type == :should}[0].offset
|
79
|
+
assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
|
80
80
|
end
|
81
81
|
|
82
82
|
|
data/test/rbbt/ner/test_rnorm.rb
CHANGED
@@ -27,10 +27,9 @@ S000000376 AAA GENE1 DDD
|
|
27
27
|
assert_equal(["S000000029"], @norm.match("FUN21"))
|
28
28
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN").sort)
|
29
29
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN 2").sort)
|
30
|
-
assert_equal(["
|
31
|
-
assert_equal([], @norm.match("
|
32
|
-
|
33
|
-
@norm.match("FUN21")
|
30
|
+
assert_equal(["S000000029"].sort, @norm.match("FUN 21").sort)
|
31
|
+
assert_equal([], @norm.match("Non-sense"))
|
32
|
+
assert_equal(["S000000029", "S000000374"], @norm.match("GER4"))
|
34
33
|
end
|
35
34
|
|
36
35
|
def test_select
|
@@ -74,6 +74,7 @@ C2;11;22;3 3;bb
|
|
74
74
|
index = TokenTrieNER.new("test", TSV.open(file, :flat, :sep => ';'))
|
75
75
|
|
76
76
|
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
|
+
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
@@ -1,9 +1,19 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/genia/sentence_splitter'
|
3
3
|
|
4
|
-
class
|
5
|
-
def
|
6
|
-
|
4
|
+
class TestNLP < Test::Unit::TestCase
|
5
|
+
def test_sentences
|
6
|
+
text =<<-EOF
|
7
|
+
This is a sentence.
|
8
|
+
A funky character ™ in a sentence.
|
9
|
+
This is a sentence.
|
10
|
+
This is a
|
11
|
+
sentence. This is
|
12
|
+
another sentence.
|
13
|
+
EOF
|
14
|
+
|
15
|
+
assert_equal "This is a \nsentence.", NLP.geniass_sentence_splitter(text)[3]
|
7
16
|
end
|
17
|
+
|
8
18
|
end
|
9
19
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/open_nlp/sentence_splitter'
|
3
|
-
require 'rbbt/
|
3
|
+
require 'rbbt/segment'
|
4
4
|
|
5
5
|
$text=<<-EOF
|
6
6
|
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors
|
@@ -22,6 +22,22 @@ class TestClass < Test::Unit::TestCase
|
|
22
22
|
def test_sentences
|
23
23
|
text =<<-EOF
|
24
24
|
This is a sentence.
|
25
|
+
No funky character in this sentence.
|
26
|
+
This is a sentence.
|
27
|
+
This is a
|
28
|
+
sentence. This is
|
29
|
+
another sentence.
|
30
|
+
EOF
|
31
|
+
|
32
|
+
assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
|
33
|
+
|
34
|
+
assert_equal 5, OpenNLP.sentence_splitter(text).length
|
35
|
+
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_sentences_fix_utf8
|
39
|
+
text =<<-EOF
|
40
|
+
This is a sentence.
|
25
41
|
A funky character ™ in a sentence.
|
26
42
|
This is a sentence.
|
27
43
|
This is a
|
@@ -35,12 +51,12 @@ another sentence.
|
|
35
51
|
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
52
|
end
|
37
53
|
|
38
|
-
def
|
54
|
+
def test_text_sentences
|
39
55
|
Misc.benchmark(100) do
|
40
|
-
OpenNLP.sentence_splitter($text).include?
|
56
|
+
assert OpenNLP.sentence_splitter($text).include?("Our
|
41
57
|
findings highlight the role of SMARCA4 in the pathogenesis of SMARCB1-positive
|
42
58
|
AT/RT and the usefulness of antibodies directed against SMARCA4 in this
|
43
|
-
diagnostic setting."
|
59
|
+
diagnostic setting.")
|
44
60
|
end
|
45
61
|
end
|
46
62
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/annotation'
|
6
|
+
|
7
|
+
class TestAnnotation < Test::Unit::TestCase
|
8
|
+
def test_annotation
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
13
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
14
|
+
|
15
|
+
assert_equal 'verb', annotation.annotid.split(":").last
|
16
|
+
|
17
|
+
annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
|
18
|
+
assert_equal 'verb', annotation.annotid.split(":").last
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_annotid
|
22
|
+
text = "This is a document"
|
23
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
24
|
+
|
25
|
+
corpus = {}
|
26
|
+
corpus.extend Document::Corpus
|
27
|
+
|
28
|
+
corpus.add_document(text)
|
29
|
+
|
30
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
31
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
32
|
+
|
33
|
+
annotid = annotation.annotid(corpus)
|
34
|
+
|
35
|
+
assert_equal 'verb', annotid.type
|
36
|
+
assert_equal 'verb', annotid.annotation.type
|
37
|
+
assert_equal 'is', annotid.annotation
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/corpus'
|
6
|
+
|
7
|
+
class TestSegmentCorpus < Test::Unit::TestCase
|
8
|
+
def test_corpus
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = {}
|
13
|
+
corpus.extend Document::Corpus
|
14
|
+
|
15
|
+
corpus.add_document(text)
|
16
|
+
|
17
|
+
docid = text.docid(corpus)
|
18
|
+
|
19
|
+
assert_equal docid.document, text
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_find
|
23
|
+
text = "This is a document"
|
24
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
25
|
+
|
26
|
+
TmpFile.with_file do |path|
|
27
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
28
|
+
corpus.extend Document::Corpus
|
29
|
+
|
30
|
+
corpus.add_document(text)
|
31
|
+
|
32
|
+
assert corpus.prefix("TEST:").include?(text.docid)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/encoding'
|
3
|
+
|
4
|
+
class TestEncoding < Test::Unit::TestCase
|
5
|
+
def _test_bad_chars
|
6
|
+
text = "A funky character ™ in a sentence."
|
7
|
+
|
8
|
+
assert_equal ["™"], Segment.bad_chars(text)
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_ascii
|
12
|
+
text = "A funky character ™ in a sentence."
|
13
|
+
|
14
|
+
Segment.ascii(text) do
|
15
|
+
assert_equal "A funky character ? in a sentence.", text
|
16
|
+
end
|
17
|
+
|
18
|
+
Segment.ascii(text, "NONASCII") do
|
19
|
+
assert_equal "A funky character NONASCII in a sentence.", text
|
20
|
+
end
|
21
|
+
|
22
|
+
assert_equal "A funky character ™ in a sentence.", text
|
23
|
+
end
|
24
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
|
5
5
|
class TestClass < Test::Unit::TestCase
|
6
6
|
def test_info
|
@@ -15,11 +15,14 @@ class TestClass < Test::Unit::TestCase
|
|
15
15
|
|
16
16
|
def test_all_args
|
17
17
|
a = "test"
|
18
|
-
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
18
|
+
NamedEntity.setup a, 10, "TEST:doc1:test_type:hash", "NamedEntity", "TYPE", "CODE", "SCORE"
|
19
19
|
assert_equal 10, a.offset
|
20
|
+
assert_equal "NamedEntity", a.type
|
21
|
+
assert_equal "TYPE", a.entity_type
|
22
|
+
assert_equal "SCORE", a.score
|
20
23
|
end
|
21
24
|
|
22
|
-
def
|
25
|
+
def __test_tsv
|
23
26
|
a = "test"
|
24
27
|
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
25
28
|
assert Segment.tsv([a]).fields.include? "code"
|
@@ -27,23 +30,23 @@ class TestClass < Test::Unit::TestCase
|
|
27
30
|
assert Segment.tsv([a], "literal").fields.include? "code"
|
28
31
|
end
|
29
32
|
|
30
|
-
def
|
33
|
+
def __test_segment_brat
|
31
34
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
32
35
|
|
33
36
|
gene1 = "TP53"
|
34
37
|
gene1.extend NamedEntity
|
35
38
|
gene1.offset = a.index gene1
|
36
|
-
gene1.
|
39
|
+
gene1.entity_type = "Gene"
|
37
40
|
|
38
41
|
gene2 = "CDK5R1"
|
39
42
|
gene2.extend NamedEntity
|
40
43
|
gene2.offset = a.index gene2
|
41
|
-
gene2.
|
44
|
+
gene2.entity_type = "Gene"
|
42
45
|
|
43
46
|
gene3 = "TP53 gene"
|
44
47
|
gene3.extend NamedEntity
|
45
48
|
gene3.offset = a.index gene3
|
46
|
-
gene3.
|
49
|
+
gene3.entity_type = "Gene"
|
47
50
|
|
48
51
|
segments = [gene1, gene2, gene3]
|
49
52
|
assert segments.collect{|s| s.to_brat}.include? "Gene 27 35"
|
@@ -0,0 +1,69 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/overlaps'
|
4
|
+
|
5
|
+
class TestOverlaps < Test::Unit::TestCase
|
6
|
+
def setup
|
7
|
+
@text = <<-EOF
|
8
|
+
This is a first sentence. More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
9
|
+
EOF
|
10
|
+
|
11
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
12
|
+
Segment.setup(literal, :offset => @text.index(literal))
|
13
|
+
end
|
14
|
+
|
15
|
+
@sentences = @text.partition(".").values_at(0, 2).collect do |sentence|
|
16
|
+
Segment.setup sentence, :offset => @text.index(sentence)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_make_relative
|
21
|
+
sentence = @sentences[1]
|
22
|
+
|
23
|
+
@entities.each do |e|
|
24
|
+
assert_equal e, @text[e.range]
|
25
|
+
end
|
26
|
+
|
27
|
+
sentence.make_relative @entities do
|
28
|
+
@entities.each do |e|
|
29
|
+
assert_equal e, sentence[e.range]
|
30
|
+
end
|
31
|
+
|
32
|
+
@entities.each do |e|
|
33
|
+
assert_not_equal e, @text[e.range]
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
@entities.each do |e|
|
38
|
+
assert_equal e, @text[e.range]
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_range_in
|
43
|
+
sentence = @sentences[1]
|
44
|
+
|
45
|
+
@entities.each do |e|
|
46
|
+
assert_equal e.range_in(sentence).begin, sentence.index(e)
|
47
|
+
assert_equal e.range.begin - sentence.offset, sentence.index(e)
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def test_includes
|
52
|
+
@entities.each do |e|
|
53
|
+
assert ! @sentences[0].include?(e)
|
54
|
+
assert @sentences[1].include?(e)
|
55
|
+
assert ! e.include?(@sentences[0])
|
56
|
+
assert ! e.include?(@sentences[1])
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def test_overlaps?
|
61
|
+
@entities.each do |e|
|
62
|
+
assert ! @sentences[0].overlaps?(e)
|
63
|
+
assert @sentences[1].overlaps?(e)
|
64
|
+
assert ! e.overlaps?(@sentences[0])
|
65
|
+
assert e.overlaps?(@sentences[1])
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/range_index'
|
6
|
+
|
7
|
+
class TestRangeIndex < Test::Unit::TestCase
|
8
|
+
def test_segment_index
|
9
|
+
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = {}
|
13
|
+
corpus.extend Document::Corpus
|
14
|
+
|
15
|
+
corpus.add_document(text)
|
16
|
+
|
17
|
+
gene1 = "TP53"
|
18
|
+
gene1.extend Segment
|
19
|
+
gene1.offset = text.index gene1
|
20
|
+
gene1.docid = text.docid
|
21
|
+
|
22
|
+
gene2 = "CDK5R1"
|
23
|
+
gene2.extend Segment
|
24
|
+
gene2.offset = text.index gene2
|
25
|
+
gene2.docid = text.docid
|
26
|
+
|
27
|
+
gene3 = "TP53 gene"
|
28
|
+
gene3.extend Segment
|
29
|
+
gene3.offset = text.index gene1
|
30
|
+
gene3.docid = text.docid
|
31
|
+
|
32
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus)
|
33
|
+
assert_equal "CDK5R1", index[gene2.offset + 1].segment.first
|
34
|
+
|
35
|
+
TmpFile.with_file do |fwt|
|
36
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
37
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
38
|
+
index = Segment::RangeIndex.index([gene1, gene2, gene3], corpus, fwt)
|
39
|
+
assert_equal %w(CDK5R1), index[gene2.offset + 1].segment
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
@@ -1,10 +1,21 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/transformed'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
require 'rexml/document'
|
5
5
|
|
6
|
-
class
|
7
|
-
|
6
|
+
class TestTransformed < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@text = <<-EOF
|
10
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
11
|
+
EOF
|
12
|
+
|
13
|
+
@entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].collect do |literal|
|
14
|
+
NamedEntity.setup(literal, :offset => @text.index(literal))
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_transform
|
8
19
|
text = <<-EOF
|
9
20
|
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
10
21
|
EOF
|
@@ -13,52 +24,25 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
13
24
|
NamedEntity.setup(literal, :offset => text.index(literal))
|
14
25
|
end
|
15
26
|
|
16
|
-
Transformed.
|
17
|
-
|
18
|
-
end
|
27
|
+
Transformed.transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" })
|
28
|
+
assert text.include? "such as [IL-2]"
|
19
29
|
end
|
20
30
|
|
21
|
-
def
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
gene1 = "TP53"
|
26
|
-
gene1.extend Segment
|
27
|
-
gene1.offset = a.index gene1
|
28
|
-
|
29
|
-
gene2 = "CDK5"
|
30
|
-
gene2.extend Segment
|
31
|
-
gene2.offset = a.index gene2
|
32
|
-
|
33
|
-
assert_equal gene1, a[gene1.range]
|
34
|
-
assert_equal gene2, a[gene2.range]
|
35
|
-
|
36
|
-
c = a.dup
|
37
|
-
|
38
|
-
c[gene2.range] = "GN"
|
39
|
-
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
40
|
-
c[gene1.range] = "GN"
|
41
|
-
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
42
|
-
|
43
|
-
iii a.transformation_offset_differences
|
44
|
-
raise
|
45
|
-
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
46
|
-
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
47
|
-
|
48
|
-
|
49
|
-
gene3 = "GN gene"
|
50
|
-
gene3.extend Segment
|
51
|
-
gene3.offset = a.index gene3
|
52
|
-
|
53
|
-
assert_equal gene3, a[gene3.range]
|
31
|
+
def test_with_transform
|
32
|
+
text = <<-EOF
|
33
|
+
More recently, PPAR activators were shown to inhibit the activation of inflammatory response genes (such as IL-2, IL-6, IL-8, TNF alpha and metalloproteases) by negatively interfering with the NF-kappa B, STAT and AP-1 signalling pathways in cells of the vascular wall.
|
34
|
+
EOF
|
54
35
|
|
55
|
-
|
56
|
-
|
57
|
-
|
36
|
+
entities = ["PPAR", "IL-2", "IL-6", "IL-8", "TNF alpha", "NF-kappa B", "AP-1", "STAT"].reverse.collect do |literal|
|
37
|
+
NamedEntity.setup(literal, :offset => text.index(literal))
|
38
|
+
end
|
58
39
|
|
40
|
+
Transformed.with_transform(text, entities, Proc.new{|e| "[" + e.upcase + "]" }) do
|
41
|
+
assert text.include? "such as [IL-2]"
|
42
|
+
end
|
59
43
|
end
|
60
44
|
|
61
|
-
def
|
45
|
+
def test_with_transform_2
|
62
46
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
63
47
|
original = a.dup
|
64
48
|
|
@@ -123,12 +107,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
123
107
|
gene1 = "TP53"
|
124
108
|
gene1.extend NamedEntity
|
125
109
|
gene1.offset = a.index gene1
|
126
|
-
gene1.
|
110
|
+
gene1.entity_type = "Gene"
|
127
111
|
|
128
112
|
gene2 = "CDK5R1"
|
129
113
|
gene2.extend NamedEntity
|
130
114
|
gene2.offset = a.index gene2
|
131
|
-
gene2.
|
115
|
+
gene2.entity_type = "Protein"
|
132
116
|
|
133
117
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
134
118
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -143,13 +127,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
143
127
|
gene1.extend NamedEntity
|
144
128
|
gene1.offset = a.index gene1
|
145
129
|
gene1.offset += 10
|
146
|
-
gene1.
|
130
|
+
gene1.entity_type = "Gene"
|
147
131
|
|
148
132
|
gene2 = "CDK5R1"
|
149
133
|
gene2.extend NamedEntity
|
150
134
|
gene2.offset = a.index gene2
|
151
135
|
gene2.offset += 10
|
152
|
-
gene2.
|
136
|
+
gene2.entity_type = "Protein"
|
153
137
|
|
154
138
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
155
139
|
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
@@ -162,12 +146,12 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
162
146
|
gene1 = "TP53"
|
163
147
|
gene1.extend NamedEntity
|
164
148
|
gene1.offset = a.index gene1
|
165
|
-
gene1.
|
149
|
+
gene1.entity_type = "Gene"
|
166
150
|
|
167
151
|
gene2 = "TP53 gene"
|
168
152
|
gene2.extend NamedEntity
|
169
153
|
gene2.offset = a.index gene2
|
170
|
-
gene2.
|
154
|
+
gene2.entity_type = "Expanded Gene"
|
171
155
|
|
172
156
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
173
157
|
|
@@ -379,5 +363,46 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
|
|
379
363
|
end
|
380
364
|
end
|
381
365
|
end
|
366
|
+
|
367
|
+
def ___test_transform
|
368
|
+
a = "This sentence mentions the TP53 gene and the CDK5 protein"
|
369
|
+
original = a.dup
|
370
|
+
|
371
|
+
gene1 = "TP53"
|
372
|
+
gene1.extend Segment
|
373
|
+
gene1.offset = a.index gene1
|
374
|
+
|
375
|
+
gene2 = "CDK5"
|
376
|
+
gene2.extend Segment
|
377
|
+
gene2.offset = a.index gene2
|
378
|
+
|
379
|
+
assert_equal gene1, a[gene1.range]
|
380
|
+
assert_equal gene2, a[gene2.range]
|
381
|
+
|
382
|
+
c = a.dup
|
383
|
+
|
384
|
+
c[gene2.range] = "GN"
|
385
|
+
assert_equal c, Transformed.transform(a,[gene2], "GN")
|
386
|
+
c[gene1.range] = "GN"
|
387
|
+
assert_equal c, Transformed.transform(a,[gene1], "GN")
|
388
|
+
|
389
|
+
iii a.transformation_offset_differences
|
390
|
+
raise
|
391
|
+
assert_equal gene2.offset, a.transformation_offset_differences.first.first.first
|
392
|
+
assert_equal gene1.offset, a.transformation_offset_differences.last.first.first
|
393
|
+
|
394
|
+
|
395
|
+
gene3 = "GN gene"
|
396
|
+
gene3.extend Segment
|
397
|
+
gene3.offset = a.index gene3
|
398
|
+
|
399
|
+
assert_equal gene3, a[gene3.range]
|
400
|
+
|
401
|
+
a.restore([gene3])
|
402
|
+
assert_equal original, a
|
403
|
+
assert_equal "TP53 gene", a[gene3.range]
|
404
|
+
|
405
|
+
end
|
406
|
+
|
382
407
|
end
|
383
408
|
|