rbbt-text 1.2.0 → 1.3.4
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +55 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +63 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +26 -3
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,33 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
|
5
|
+
class TestDocumentCorpus < Test::Unit::TestCase
|
6
|
+
def test_corpus
|
7
|
+
text = "This is a document"
|
8
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
9
|
+
|
10
|
+
corpus = Document::Corpus.setup({})
|
11
|
+
|
12
|
+
corpus.add_document(text)
|
13
|
+
|
14
|
+
docid = text.docid(corpus)
|
15
|
+
|
16
|
+
assert_equal docid.document, text
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_find
|
20
|
+
text = "This is a document"
|
21
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
22
|
+
|
23
|
+
TmpFile.with_file do |path|
|
24
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
25
|
+
corpus.extend Document::Corpus
|
26
|
+
|
27
|
+
corpus.add_document(text)
|
28
|
+
|
29
|
+
assert corpus.docids("TEST:").include?(text.docid)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
@@ -8,13 +8,13 @@ require 'rbbt/sources/NCI'
|
|
8
8
|
|
9
9
|
class TestFinder < Test::Unit::TestCase
|
10
10
|
|
11
|
-
def
|
11
|
+
def _test_namespace_and_format
|
12
12
|
f = Finder.new(CMD.cmd("head -n 1000", :in => Open.open(Organism.identifiers(Organism.default_code("Hsa")).produce.find)))
|
13
13
|
assert_equal Organism.default_code("Hsa"), f.instances.first.namespace
|
14
14
|
assert_equal "Ensembl Gene ID", f.instances.first.format
|
15
15
|
end
|
16
16
|
|
17
|
-
def
|
17
|
+
def _test_find
|
18
18
|
f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["SF3B1"])
|
19
19
|
|
20
20
|
assert_equal "ENSG00000115524", f.find("SF3B1").first
|
@@ -23,7 +23,7 @@ class TestFinder < Test::Unit::TestCase
|
|
23
23
|
end
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def _test_find2
|
27
27
|
f = Finder.new(Organism.lexicon(Organism.default_code("Hsa")), :grep => ["RASGRF2"])
|
28
28
|
|
29
29
|
m = f.find("RAS").first
|
@@ -5,11 +5,29 @@ Log.severity = 0
|
|
5
5
|
class TestGNormPlus < Test::Unit::TestCase
|
6
6
|
def test_match
|
7
7
|
text =<<-EOF
|
8
|
-
|
8
|
+
|
9
|
+
Introduction
|
10
|
+
|
11
|
+
We found that TP53 is regulated by MDM2 in Homo
|
12
|
+
sapiens
|
9
13
|
EOF
|
10
14
|
|
11
15
|
mentions = GNormPlus.process({:file => text})
|
12
|
-
|
16
|
+
|
17
|
+
assert_equal 1, mentions.length
|
18
|
+
assert_equal 3, mentions["file"].length
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_entities
|
22
|
+
text =<<-EOF
|
23
|
+
We found that TP53 is regulated by MDM2 in Homo sapiens
|
24
|
+
EOF
|
25
|
+
|
26
|
+
mentions = GNormPlus.entities({:file => text})
|
27
|
+
assert mentions["file"].include?("TP53")
|
28
|
+
mentions["file"].each do |mention|
|
29
|
+
assert_equal mention, text[mention.range].sub("\n", ' ')
|
30
|
+
end
|
13
31
|
end
|
14
32
|
end
|
15
33
|
|
@@ -2,17 +2,17 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
|
|
2
2
|
require 'rbbt/ner/patterns'
|
3
3
|
|
4
4
|
class TestPatternRelExt < Test::Unit::TestCase
|
5
|
-
def
|
5
|
+
def _test_simple_pattern
|
6
6
|
text = "Experiments have shown that TP53 interacts with CDK5 under certain conditions"
|
7
7
|
|
8
8
|
gene1 = "TP53"
|
9
|
-
NamedEntity.setup(gene1, text.index(gene1), "Gene")
|
9
|
+
NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
|
10
10
|
|
11
11
|
gene2 = "CDK5"
|
12
|
-
NamedEntity.setup(gene2, text.index(gene2), "Gene")
|
12
|
+
NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
|
13
13
|
|
14
14
|
interaction = "interacts"
|
15
|
-
NamedEntity.setup(interaction, text.index(interaction), "Interaction")
|
15
|
+
NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
|
16
16
|
|
17
17
|
Segmented.setup(text, [gene1, gene2, interaction])
|
18
18
|
|
@@ -23,13 +23,13 @@ class TestPatternRelExt < Test::Unit::TestCase
|
|
23
23
|
text = "Experiments have shown that TP53 found in cultivated cells interacts with CDK5 under certain conditions"
|
24
24
|
|
25
25
|
gene1 = "TP53"
|
26
|
-
NamedEntity.setup(gene1, text.index(gene1), "Gene")
|
26
|
+
NamedEntity.setup(gene1, :offset => text.index(gene1), :entity_type => "Gene")
|
27
27
|
|
28
28
|
gene2 = "CDK5"
|
29
|
-
NamedEntity.setup(gene2, text.index(gene2), "Gene")
|
29
|
+
NamedEntity.setup(gene2, :offset => text.index(gene2), :entity_type => "Gene")
|
30
30
|
|
31
31
|
interaction = "interacts"
|
32
|
-
NamedEntity.setup(interaction, text.index(interaction), "Interaction")
|
32
|
+
NamedEntity.setup(interaction, :offset => text.index(interaction), :entity_type => "Interaction")
|
33
33
|
|
34
34
|
Segmented.setup(text, {:entities => [gene1, gene2, interaction]})
|
35
35
|
|
@@ -40,7 +40,7 @@ class TestPatternRelExt < Test::Unit::TestCase
|
|
40
40
|
PatternRelExt.new(["NP[entity:Gene] VP[stem:interacts] with NP[entity:Gene]"]).match_sentences([text]).first.first
|
41
41
|
end
|
42
42
|
|
43
|
-
def
|
43
|
+
def _test_chunk_pattern
|
44
44
|
text = "There is a concern with the use of thiazolidinediones in patients with an increased risk of colon cancer (e.g., familial colon polyposis)."
|
45
45
|
|
46
46
|
drug = "thiazolidinediones"
|
@@ -57,7 +57,7 @@ class TestPatternRelExt < Test::Unit::TestCase
|
|
57
57
|
end
|
58
58
|
|
59
59
|
|
60
|
-
def
|
60
|
+
def _test_entities_with_spaces
|
61
61
|
PatternRelExt.new("NP[entity:Gene Name]").token_trie
|
62
62
|
end
|
63
63
|
|
@@ -23,9 +23,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
23
23
|
matches = RegExpNER.match_regexp_hash(sentence, regexp_hash)
|
24
24
|
|
25
25
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
26
|
-
assert_equal "In ".length, matches.select{|m| m.
|
27
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
28
|
-
assert_equal :this, matches.select{|m| m.
|
26
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
27
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
28
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
29
29
|
end
|
30
30
|
|
31
31
|
def test_define_regexps
|
@@ -39,9 +39,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
39
39
|
|
40
40
|
matches = ner.entities(sentence)
|
41
41
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
42
|
-
assert_equal "In ".length, matches.select{|m| m.
|
43
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
44
|
-
assert_equal :this, matches.select{|m| m.
|
42
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this }[0].offset
|
43
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this }[1].offset
|
44
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this }[0].entity_type
|
45
45
|
end
|
46
46
|
|
47
47
|
|
@@ -51,9 +51,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
51
51
|
ner = RegExpNER.new({:this => /this/, :that => /that/})
|
52
52
|
matches = ner.entities(sentence)
|
53
53
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
54
|
-
assert_equal "In ".length, matches.select{|m| m.
|
55
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
56
|
-
assert_equal :this, matches.select{|m| m.
|
54
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
55
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
56
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
57
57
|
|
58
58
|
Segmented.setup(sentence)
|
59
59
|
ner_this = RegExpNER.new({:this => /this/})
|
@@ -64,9 +64,9 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
64
64
|
matches = sentence.segments
|
65
65
|
|
66
66
|
assert_equal ["this", "this", "that"].sort, matches.sort
|
67
|
-
assert_equal "In ".length, matches.select{|m| m.
|
68
|
-
assert_equal "In this sentence I should find ".length, matches.select{|m| m.
|
69
|
-
assert_equal :this, matches.select{|m| m.
|
67
|
+
assert_equal "In ".length, matches.select{|m| m.entity_type == :this}[0].offset
|
68
|
+
assert_equal "In this sentence I should find ".length, matches.select{|m| m.entity_type == :this}[1].offset
|
69
|
+
assert_equal :this, matches.select{|m| m.entity_type == :this}[0].entity_type
|
70
70
|
end
|
71
71
|
|
72
72
|
def test_entities_captures
|
@@ -75,8 +75,8 @@ class TestRegExpNER < Test::Unit::TestCase
|
|
75
75
|
ner = RegExpNER.new({:this => /this/, :that => /that/, :should => /I (should)/})
|
76
76
|
matches = ner.entities(sentence)
|
77
77
|
assert_equal ["this", "this", "that", "should"].sort, matches.sort
|
78
|
-
assert_equal "In this sentence I ".length, matches.select{|m| m.
|
79
|
-
assert_equal :should, matches.select{|m| m.
|
78
|
+
assert_equal "In this sentence I ".length, matches.select{|m| m.entity_type == :should}[0].offset
|
79
|
+
assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
|
80
80
|
end
|
81
81
|
|
82
82
|
|
data/test/rbbt/ner/test_rnorm.rb
CHANGED
@@ -27,10 +27,9 @@ S000000376 AAA GENE1 DDD
|
|
27
27
|
assert_equal(["S000000029"], @norm.match("FUN21"))
|
28
28
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN").sort)
|
29
29
|
assert_equal(["S000000030", "S000000029", "S000000031"].sort, @norm.match("FUN 2").sort)
|
30
|
-
assert_equal(["
|
31
|
-
assert_equal([], @norm.match("
|
32
|
-
|
33
|
-
@norm.match("FUN21")
|
30
|
+
assert_equal(["S000000029"].sort, @norm.match("FUN 21").sort)
|
31
|
+
assert_equal([], @norm.match("Non-sense"))
|
32
|
+
assert_equal(["S000000029", "S000000374"], @norm.match("GER4"))
|
34
33
|
end
|
35
34
|
|
36
35
|
def test_select
|
@@ -74,6 +74,7 @@ C2;11;22;3 3;bb
|
|
74
74
|
index = TokenTrieNER.new("test", TSV.open(file, :flat, :sep => ';'))
|
75
75
|
|
76
76
|
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
|
+
assert index.match(' asdfa dsf asdf aa asdfasdf ').select{|m| m.code.include? 'C1'}.any?
|
77
78
|
end
|
78
79
|
end
|
79
80
|
|
@@ -1,9 +1,43 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/genia/sentence_splitter'
|
3
3
|
|
4
|
-
class
|
5
|
-
def
|
6
|
-
|
4
|
+
class TestNLP < Test::Unit::TestCase
|
5
|
+
def test_sentences
|
6
|
+
text =<<-EOF
|
7
|
+
This is a sentence.
|
8
|
+
A funky character ™ in a sentence.
|
9
|
+
This is a sentence.
|
10
|
+
This is a broken
|
11
|
+
sentence. This is
|
12
|
+
another broken sentence.
|
13
|
+
EOF
|
14
|
+
|
15
|
+
iii NLP.geniass_sentence_splitter(text)
|
16
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
17
|
+
end
|
18
|
+
|
19
|
+
def test_sentences_2
|
20
|
+
text =<<-EOF
|
21
|
+
This is a sentence.
|
22
|
+
This is a sentence.
|
23
|
+
This is a broken
|
24
|
+
sentence. This is
|
25
|
+
another broken sentence.
|
26
|
+
EOF
|
27
|
+
|
28
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_sentences_ext
|
32
|
+
text =<<-EOF
|
33
|
+
This is a sentence.
|
34
|
+
This is a sentence.
|
35
|
+
This is a broken
|
36
|
+
sentence. This is
|
37
|
+
another broken sentence.
|
38
|
+
EOF
|
39
|
+
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
7
41
|
end
|
8
42
|
end
|
9
43
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/nlp/open_nlp/sentence_splitter'
|
3
|
-
require 'rbbt/
|
3
|
+
require 'rbbt/segment'
|
4
4
|
|
5
5
|
$text=<<-EOF
|
6
6
|
Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors
|
@@ -22,6 +22,22 @@ class TestClass < Test::Unit::TestCase
|
|
22
22
|
def test_sentences
|
23
23
|
text =<<-EOF
|
24
24
|
This is a sentence.
|
25
|
+
No funky character in this sentence.
|
26
|
+
This is a sentence.
|
27
|
+
This is a
|
28
|
+
sentence. This is
|
29
|
+
another sentence.
|
30
|
+
EOF
|
31
|
+
|
32
|
+
assert_equal 5, OpenNLP.sentence_split_detector.sentDetect(text).length
|
33
|
+
|
34
|
+
assert_equal 5, OpenNLP.sentence_splitter(text).length
|
35
|
+
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
|
+
end
|
37
|
+
|
38
|
+
def test_sentences_fix_utf8
|
39
|
+
text =<<-EOF
|
40
|
+
This is a sentence.
|
25
41
|
A funky character ™ in a sentence.
|
26
42
|
This is a sentence.
|
27
43
|
This is a
|
@@ -35,12 +51,12 @@ another sentence.
|
|
35
51
|
assert_equal "This is a \nsentence.", OpenNLP.sentence_splitter(text)[3]
|
36
52
|
end
|
37
53
|
|
38
|
-
def
|
54
|
+
def test_text_sentences
|
39
55
|
Misc.benchmark(100) do
|
40
|
-
OpenNLP.sentence_splitter($text).include?
|
56
|
+
assert OpenNLP.sentence_splitter($text).include?("Our
|
41
57
|
findings highlight the role of SMARCA4 in the pathogenesis of SMARCB1-positive
|
42
58
|
AT/RT and the usefulness of antibodies directed against SMARCA4 in this
|
43
|
-
diagnostic setting."
|
59
|
+
diagnostic setting.")
|
44
60
|
end
|
45
61
|
end
|
46
62
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/annotation'
|
6
|
+
|
7
|
+
class TestAnnotation < Test::Unit::TestCase
|
8
|
+
def test_annotation
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
13
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
14
|
+
|
15
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
16
|
+
|
17
|
+
annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
|
18
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_annotid
|
22
|
+
text = "This is a document"
|
23
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
24
|
+
|
25
|
+
corpus = Document::Corpus.setup({})
|
26
|
+
|
27
|
+
corpus.add_document(text)
|
28
|
+
|
29
|
+
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
30
|
+
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
31
|
+
|
32
|
+
annotid = annotation.annotid(corpus)
|
33
|
+
|
34
|
+
assert_equal 'verb', annotid.type
|
35
|
+
assert_equal 'verb', annotid.annotation.type
|
36
|
+
assert_equal 'is', annotid.annotation
|
37
|
+
end
|
38
|
+
end
|
39
|
+
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/segment/corpus'
|
6
|
+
|
7
|
+
class TestSegmentCorpus < Test::Unit::TestCase
|
8
|
+
def test_corpus
|
9
|
+
text = "This is a document"
|
10
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
11
|
+
|
12
|
+
corpus = {}
|
13
|
+
corpus.extend Document::Corpus
|
14
|
+
|
15
|
+
corpus.add_document(text)
|
16
|
+
|
17
|
+
docid = text.docid(corpus)
|
18
|
+
|
19
|
+
assert_equal docid.document, text
|
20
|
+
end
|
21
|
+
|
22
|
+
def test_find
|
23
|
+
text = "This is a document"
|
24
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
25
|
+
|
26
|
+
TmpFile.with_file do |path|
|
27
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
28
|
+
corpus.extend Document::Corpus
|
29
|
+
|
30
|
+
corpus.add_document(text)
|
31
|
+
|
32
|
+
assert corpus.prefix("TEST:").include?(text.docid)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment/encoding'
|
3
|
+
|
4
|
+
class TestEncoding < Test::Unit::TestCase
|
5
|
+
def test_bad_chars
|
6
|
+
text = "A funky character ™ in a sentence."
|
7
|
+
|
8
|
+
assert_equal ["™"], Segment.bad_chars(text)
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_ascii
|
12
|
+
text = "A funky character ™ in a sentence."
|
13
|
+
|
14
|
+
Segment.ascii(text) do
|
15
|
+
assert_equal "A funky character ? in a sentence.", text
|
16
|
+
end
|
17
|
+
|
18
|
+
Segment.ascii(text, "NONASCII") do
|
19
|
+
assert_equal "A funky character NONASCII in a sentence.", text
|
20
|
+
end
|
21
|
+
|
22
|
+
assert_equal "A funky character ™ in a sentence.", text
|
23
|
+
end
|
24
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
|
-
require File.join(File.expand_path(File.dirname(__FILE__)), '
|
2
|
-
require 'rbbt/
|
3
|
-
require 'rbbt/
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/segment/named_entity'
|
4
4
|
|
5
5
|
class TestClass < Test::Unit::TestCase
|
6
6
|
def test_info
|
@@ -15,35 +15,39 @@ class TestClass < Test::Unit::TestCase
|
|
15
15
|
|
16
16
|
def test_all_args
|
17
17
|
a = "test"
|
18
|
-
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
18
|
+
NamedEntity.setup a, 10, "TEST:doc1:test_type:hash", "NamedEntity", "TYPE", "CODE", "SCORE"
|
19
19
|
assert_equal 10, a.offset
|
20
|
+
assert_equal "NamedEntity", a.type
|
21
|
+
assert_equal "TYPE", a.entity_type
|
22
|
+
assert_equal "SCORE", a.score
|
20
23
|
end
|
21
24
|
|
22
25
|
def test_tsv
|
23
26
|
a = "test"
|
24
27
|
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
25
|
-
assert
|
26
|
-
assert
|
27
|
-
assert
|
28
|
+
assert Annotated.tsv([a]).fields.include? "code"
|
29
|
+
assert Annotated.tsv([a], nil).fields.include? "code"
|
30
|
+
assert Annotated.tsv([a], :all).fields.include? "code"
|
31
|
+
assert Annotated.tsv([a], :all).fields.include? "literal"
|
28
32
|
end
|
29
33
|
|
30
|
-
def
|
34
|
+
def __test_segment_brat
|
31
35
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
32
36
|
|
33
37
|
gene1 = "TP53"
|
34
38
|
gene1.extend NamedEntity
|
35
39
|
gene1.offset = a.index gene1
|
36
|
-
gene1.
|
40
|
+
gene1.entity_type = "Gene"
|
37
41
|
|
38
42
|
gene2 = "CDK5R1"
|
39
43
|
gene2.extend NamedEntity
|
40
44
|
gene2.offset = a.index gene2
|
41
|
-
gene2.
|
45
|
+
gene2.entity_type = "Gene"
|
42
46
|
|
43
47
|
gene3 = "TP53 gene"
|
44
48
|
gene3.extend NamedEntity
|
45
49
|
gene3.offset = a.index gene3
|
46
|
-
gene3.
|
50
|
+
gene3.entity_type = "Gene"
|
47
51
|
|
48
52
|
segments = [gene1, gene2, gene3]
|
49
53
|
assert segments.collect{|s| s.to_brat}.include? "Gene 27 35"
|