rbbt-text 1.2.0 → 1.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +55 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +63 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +26 -3
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,33 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
|
5
|
+
class TestDocumentCorpus < Test::Unit::TestCase
|
6
|
+
def test_corpus
|
7
|
+
text = "This is a document"
|
8
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
9
|
+
|
10
|
+
corpus = Document::Corpus.setup({})
|
11
|
+
|
12
|
+
corpus.add_document(text)
|
13
|
+
|
14
|
+
docid = text.docid(corpus)
|
15
|
+
|
16
|
+
assert_equal docid.document, text
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_find
|
20
|
+
text = "This is a document"
|
21
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
22
|
+
|
23
|
+
TmpFile.with_file do |path|
|
24
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
25
|
+
corpus.extend Document::Corpus
|
26
|
+
|
27
|
+
corpus.add_document(text)
|
28
|
+
|
29
|
+
assert corpus.docids("TEST:").include?(text.docid)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
@@ -8,13 +8,13 @@ require 'rbbt/sources/NCI'
|
|
8
8
|
|
9
9
|
class TestFinder < Test::Unit::TestCase
|
10
10
|
|
11
|
-
def
|
11
|
+
def _test_namespace_and_format
|
12
12
|
f = Finder.new(CMD.cmd("head -n 1000", :in => Open.open(Organism.identifiers(Organism.default_code("Hsa")).produce.find)))
|
13
13
|
assert_equal Organism.default_code("Hsa"), f.instances.first.namespace
|
14
14
|
assert_equal "Ensembl Gene ID", f.instances.first.format
|
15
15
|
end
|
16
16
|
|
17
|
-
def
|
17
|
+
def _test_find
|
18
18
|
f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["SF3B1"])
|
19
19
|
|
20
20
|
assert_equal "ENSG00000115524", f.find("SF3B1").first
|
@@ -23,7 +23,7 @@ class TestFinder < Test::Unit::TestCase
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def _test_find2
|
27
27
|
f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["RASGRF2"])
|
28
28
|
|
29
29
|
m = f.find("RAS").first
|
@@ -5,11 +5,29 @@ Log.severity = 0
|
|
5
5
|
class TestGNormPlus < Test::Unit::TestCase
|
6
6
|
def test_match
|
7
7
|
text =<<-EOF
|
8
|
-
|
8
|
+
|
9
|
+
Introduction
|
10
|
+
|
11
|
+
We found that TP53 is regulated by MDM2 in Homo
|
12
|
+
sapiens
|
9
13
|
EOF
|
10
14
|
|
11
15
|
mentions = GNormPlus.process({:file => text})
|
12
|
-
|
16
|
+
|
17
|
+
assert_equal 1, mentions.length
|
18
|
+
assert_equal 3, mentions["file"].length
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_entities
|
22
|
+
text =<<-EOF
|
23
|
+
We found that TP53 is regulated by MDM2 in Homo sapiens
|
24
|
+
EOF
|
25
|
+
|
26
|
+
mentions = GNormPlus.entities({:file => text})
|
27
|
+
assert mentions["file"].include?("TP53")
|
28
|
+
mentions["file"].each do |mention|
|
29
|
+
assert_equal mention, text[mention.range].sub("\n", ' ')
|
30
|
+
end
|
13
31
|
end
|
14
32
|
end
|
15
33
|
|
@@ -2,17 +2,17 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
|
|
2
2
|
require 'rbbt/ner/patterns'
|
3
3
|
|
4
4
|
class TestPatternRelExt < Test::Unit::TestCase
|
5
|
-
def
|
5
|
+
def _test_simple_pattern
|
6
6
|
text = "Experiments have shown that TP53 interacts with CDK5 under certain conditions"
|
7
7
|
|
8
8
|
gene1 = "TP53"
|
9
|
-
NamedEntity.setup(gene1, text.index(gene1), "Gene")
|
9
|
+
NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
|
10
10
|
|
11
11
|
gene2 = "CDK5"
|
12
|
-
NamedEntity.setup(gene2, text.index(gene2), "Gene")
|
12
|
+
NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
|
13
13
|
|
14
14
|
interaction = "interacts"
|
15
|
-
NamedEntity.setup(interaction, text.index(interaction), "Interaction")
|
15
|
+
NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
|
16
16
|
|
17
17
|
Segmented.setup(text, [gene1, gene2, interaction])
|
18
18
|
|
@@ -23,13 +23,13 @@ class TestPatternRelExt < Test::Unit::TestCase
|
|
23
23
|
text = "Experiments have shown that TP53 found in cultivated cells interacts with CDK5 under certain conditions"
|
24
24
|
|
25
25
|
gene1 = "TP53"
|
26
|
-
NamedEntity.setup(gene1, text.index(gene1), "Gene")
|
26
|
+
NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
|
27
27
|
|
28
28
|
gene2 = "CDK5"
|
29
|
-
NamedEntity.setup(gene2, text.index(gene2), "Gene")
|
29
|
+
NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
|
30
30
|
|
31
31
|
interaction = "interacts"
|
32
|
-
NamedEntity.setup(interaction, text.index(interaction), "Interaction")
|
32
|
+
NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
|
33
33
|
|
34
34
|
Segmented.setup(text, {:entities => [gene1, gene2, interaction]})
|
35
35
|
|
@@ -40,7 +40,7 @@ class TestPatternRelExt < Test::Unit::TestCase
|
|
40
40
|
PatternRelExt.new(["NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]"]).match_sentences([text]).first.first
|
41
41
|
end
|
42
42
|
|
43
|
-
def
|
43
|
+
def _test_chunk_pattern
|
44
44
|
text = "There is a concern with the use of thiazolidinediones in patients with an increased risk of colon cancer (e.g., familial colon polyposis)."
|
45
45
|
|
46
46
|
drug = "thiazolidinediones"
|
@@ -57,7 +57,7 @@ class TestPatternRelExt < Test::Unit::TestCase
|
|
57
57
|
end
|
58
58
|
|
59
59
|
|
60
|
-
def
|
60
|
+
def _test_entities_with_spaces
|
61
61
|
PatternRelExt.new("NP[entity:Gene Name]").token_trie
|
62
62
|
end
|
63
63
|
|
@@ -23,9 +23,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
23
23
|
matches = RegExpNER.match_regexp_hash(sentence, regexp_hash)
|
24
24
|
|
25
25
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
26
|
-
assert_equal "In ".length, matches.select{|m| m.
|
27
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
28
|
-
assert_equal :this, matches.select{|m| m.
|
26
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
27
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
28
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
29
29
|
end
|
30
30
|
|
31
31
|
def test_define_regexps
|
@@ -39,9 +39,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
39
39
|
|
40
40
|
matches = ner.entities(sentence)
|
41
41
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
42
|
-
assert_equal "In ".length, matches.select{|m| m.
|
43
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
44
|
-
assert_equal :this, matches.select{|m| m.
|
42
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this }[0].offset
|
43
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this }[1].offset
|
44
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this }[0].entity_type
|
45
45
|
end
|
46
46
|
|
47
47
|
|
@@ -51,9 +51,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
51
51
|
ner = RegExpNER.new({:this => /this/, :that => /that/})
|
52
52
|
matches = ner.entities(sentence)
|
53
53
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
54
|
-
assert_equal "In ".length, matches.select{|m| m.
|
55
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
56
|
-
assert_equal :this, matches.select{|m| m.
|
54
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
55
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
56
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
57
57
|
|
58
58
|
Segmented.setup(sentence)
|
59
59
|
ner_this = RegExpNER.new({:this => /this/})
|
@@ -64,9 +64,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
64
64
|
matches = sentence.segments
|
65
65
|
|
66
66
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
67
|
-
assert_equal "In ".length, matches.select{|m| m.
|
68
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
69
|
-
assert_equal :this, matches.select{|m| m.
|
67
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
68
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
69
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
70
70
|
end
|
71
71
|
|
72
72
|
def test_entities_captures
|
@@ -75,8 +75,8 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
75
75
|
ner = RegExpNER.new({:this => /this/, :that => /that/, :should => /I (should)/})
|
76
76
|
matches = ner.entities(sentence)
|
77
77
|
assert_equal ["this", "this", "that", "should"].sort, matches.sort
|
78
|
-
assert_equal "In this sentence I ".length, matches.select{|m| m.
|
79
|
-
assert_equal :should, matches.select{|m| m.
|
78
|
+
assert_equal "In this sentence I ".length, matches.select{|m| m.entity_type == :should}[0].offset
|
79
|
+
assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
|
80
80
|
end
|
81
81
|
|
82
82
|
|
data/test/rbbt/ner/test_rnorm.rb
CHANGED
@@ -27,10 +27,9 @@ S000000376 AAA GENE1 DDD
|
|
27
27
|
assert_equal(["S000000029"], @norm.match("FUN21"))
|
28
28
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN").sort)
|
29
29
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN 2").sort)
|
30
|
-
assert_equal(["
|
31
|
-
assert_equal([], @norm.match("
|
32
|
-
|
33
|
-
@norm.match("FUN21")
|
30
|
+
assert_equal(["S000000029"].sort, @norm.match("FUN 21").sort)
|
31
|
+
assert_equal([], @norm.match("Non-sense"))
|
32
|
+
assert_equal(["S000000029", "S000000374"], @norm.match("GER4"))
|
34
33
|
end
|
35
34
|
|
36
35
|
def test_select
|
@@ -74,6 +74,7 @@ C2;11;22;3 3;bb
|
|
74
74
|
index = TokenTrieNER.new("test", TSV.open(file, :flat, :sep => ';'))
|
75
75
|
|
76
76
|
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
|
+
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
@@ -1,9 +1,43 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/genia/sentence_splitter'
|
3
3
|
|
4
|
-
class
|
5
|
-
def
|
6
|
-
|
4
|
+
class TestNLP < Test::Unit::TestCase
|
5
|
+
def test_sentences
|
6
|
+
text =<<-EOF
|
7
|
+
This is a sentence.
|
8
|
+
A funky character ™ in a sentence.
|
9
|
+
This is a sentence.
|
10
|
+
This is a broken
|
11
|
+
sentence. This is
|
12
|
+
another broken sentence.
|
13
|
+
EOF
|
14
|
+
|
15
|
+
iii NLP.geniass_sentence_splitter(text)
|
16
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_sentences_2
|
20
|
+
text =<<-EOF
|
21
|
+
This is a sentence.
|
22
|
+
This is a sentence.
|
23
|
+
This is a broken
|
24
|
+
sentence. This is
|
25
|
+
another broken sentence.
|
26
|
+
EOF
|
27
|
+
|
28
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_sentences_ext
|
32
|
+
text =<<-EOF
|
33
|
+
This is a sentence.
|
34
|
+
This is a sentence.
|
35
|
+
This is a broken
|
36
|
+
sentence. This is
|
37
|
+
another broken sentence.
|
38
|
+
EOF
|
39
|
+
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
7
41
|
end
|
8
42
|
end
|
9
43
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/open_nlp/sentence_splitter'
|
3
|
-
require 'rbbt/
|
3
|
+
require 'rbbt/segment'
|
4
4
|
|
5
5
|
$text=<<-EOF
|
6
6
|
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors
|
@@ -22,6 +22,22 @@ class TestClass < Test::Unit::TestCase
|
|
22
22
|
def test_sentences
|
23
23
|
text =<<-EOF
|
24
24
|
This is a sentence.
|
25
|
+
No funky character in this sentence.
|
26
|
+
This is a sentence.
|
27
|
+
This is a
|
28
|
+
sentence. This is
|
29
|
+
another sentence.
|
30
|
+
EOF
|
31
|
+
|
32
|
+
assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
|
33
|
+
|
34
|
+
assert_equal 5, OpenNLP.sentence_splitter(text).length
|
35
|
+
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_sentences_fix_utf8
|
39
|
+
text =<<-EOF
|
40
|
+
This is a sentence.
|
25
41
|
A funky character ™ in a sentence.
|
26
42
|
This is a sentence.
|
27
43
|
This is a
|
@@ -35,12 +51,12 @@ another sentence.
|
|
35
51
|
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
52
|
end
|
37
53
|
|
38
|
-
def
|
54
|
+
def test_text_sentences
|
39
55
|
Misc.benchmark(100) do
|
40
|
-
OpenNLP.sentence_splitter($text).include?
|
56
|
+
assert OpenNLP.sentence_splitter($text).include?("Our
|
41
57
|
findings highlight the role of SMARCA4 in the pathogenesis of SMARCB1-positive
|
42
58
|
AT/RT and the usefulness of antibodies directed against SMARCA4 in this
|
43
|
-
diagnostic setting."
|
59
|
+
diagnostic setting.")
|
44
60
|
end
|
45
61
|
end
|
46
62
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/annotation'
|
6
|
+
|
7
|
+
class TestAnnotation < Test::Unit::TestCase
|
8
|
+
def test_annotation
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
13
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
14
|
+
|
15
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
16
|
+
|
17
|
+
annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
|
18
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_annotid
|
22
|
+
text = "This is a document"
|
23
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
24
|
+
|
25
|
+
corpus = Document::Corpus.setup({})
|
26
|
+
|
27
|
+
corpus.add_document(text)
|
28
|
+
|
29
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
30
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
31
|
+
|
32
|
+
annotid = annotation.annotid(corpus)
|
33
|
+
|
34
|
+
assert_equal 'verb', annotid.type
|
35
|
+
assert_equal 'verb', annotid.annotation.type
|
36
|
+
assert_equal 'is', annotid.annotation
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/corpus'
|
6
|
+
|
7
|
+
class TestSegmentCorpus < Test::Unit::TestCase
|
8
|
+
def test_corpus
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = {}
|
13
|
+
corpus.extend Document::Corpus
|
14
|
+
|
15
|
+
corpus.add_document(text)
|
16
|
+
|
17
|
+
docid = text.docid(corpus)
|
18
|
+
|
19
|
+
assert_equal docid.document, text
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_find
|
23
|
+
text = "This is a document"
|
24
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
25
|
+
|
26
|
+
TmpFile.with_file do |path|
|
27
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
28
|
+
corpus.extend Document::Corpus
|
29
|
+
|
30
|
+
corpus.add_document(text)
|
31
|
+
|
32
|
+
assert corpus.prefix("TEST:").include?(text.docid)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/encoding'
|
3
|
+
|
4
|
+
class TestEncoding < Test::Unit::TestCase
|
5
|
+
def test_bad_chars
|
6
|
+
text = "A funky character ™ in a sentence."
|
7
|
+
|
8
|
+
assert_equal ["™"], Segment.bad_chars(text)
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_ascii
|
12
|
+
text = "A funky character ™ in a sentence."
|
13
|
+
|
14
|
+
Segment.ascii(text) do
|
15
|
+
assert_equal "A funky character ? in a sentence.", text
|
16
|
+
end
|
17
|
+
|
18
|
+
Segment.ascii(text, "NONASCII") do
|
19
|
+
assert_equal "A funky character NONASCII in a sentence.", text
|
20
|
+
end
|
21
|
+
|
22
|
+
assert_equal "A funky character ™ in a sentence.", text
|
23
|
+
end
|
24
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
|
5
5
|
class TestClass < Test::Unit::TestCase
|
6
6
|
def test_info
|
@@ -15,35 +15,39 @@ class TestClass < Test::Unit::TestCase
|
|
15
15
|
|
16
16
|
def test_all_args
|
17
17
|
a = "test"
|
18
|
-
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
18
|
+
NamedEntity.setup a, 10, "TEST:doc1:test_type:hash", "NamedEntity", "TYPE", "CODE", "SCORE"
|
19
19
|
assert_equal 10, a.offset
|
20
|
+
assert_equal "NamedEntity", a.type
|
21
|
+
assert_equal "TYPE", a.entity_type
|
22
|
+
assert_equal "SCORE", a.score
|
20
23
|
end
|
21
24
|
|
22
25
|
def test_tsv
|
23
26
|
a = "test"
|
24
27
|
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
25
|
-
assert
|
26
|
-
assert
|
27
|
-
assert
|
28
|
+
assert Annotated.tsv([a]).fields.include? "code"
|
29
|
+
assert Annotated.tsv([a], nil).fields.include? "code"
|
30
|
+
assert Annotated.tsv([a], :all).fields.include? "code"
|
31
|
+
assert Annotated.tsv([a], :all).fields.include? "literal"
|
28
32
|
end
|
29
33
|
|
30
|
-
def
|
34
|
+
def __test_segment_brat
|
31
35
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
32
36
|
|
33
37
|
gene1 = "TP53"
|
34
38
|
gene1.extend NamedEntity
|
35
39
|
gene1.offset = a.index gene1
|
36
|
-
gene1.
|
40
|
+
gene1.entity_type = "Gene"
|
37
41
|
|
38
42
|
gene2 = "CDK5R1"
|
39
43
|
gene2.extend NamedEntity
|
40
44
|
gene2.offset = a.index gene2
|
41
|
-
gene2.
|
45
|
+
gene2.entity_type = "Gene"
|
42
46
|
|
43
47
|
gene3 = "TP53 gene"
|
44
48
|
gene3.extend NamedEntity
|
45
49
|
gene3.offset = a.index gene3
|
46
|
-
gene3.
|
50
|
+
gene3.entity_type = "Gene"
|
47
51
|
|
48
52
|
segments = [gene1, gene2, gene3]
|
49
53
|
assert segments.collect{|s| s.to_brat}.include? "Gene 27 35"
|