rbbt-text 1.3.0 → 1.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +20 -5
- data/lib/rbbt/document/annotation.rb +7 -4
- data/lib/rbbt/document/corpus.rb +30 -3
- data/lib/rbbt/document/corpus/pubmed.rb +2 -1
- data/lib/rbbt/ner/abner.rb +3 -2
- data/lib/rbbt/ner/banner.rb +3 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/g_norm_plus.rb +7 -1
- data/lib/rbbt/ner/linnaeus.rb +2 -1
- data/lib/rbbt/ner/patterns.rb +0 -1
- data/lib/rbbt/ner/rner.rb +229 -0
- data/lib/rbbt/ner/token_trieNER.rb +32 -18
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +195 -0
- data/lib/rbbt/relationship.rb +24 -0
- data/lib/rbbt/segment.rb +9 -4
- data/lib/rbbt/segment/annotation.rb +3 -3
- data/lib/rbbt/segment/named_entity.rb +7 -0
- data/lib/rbbt/segment/range_index.rb +1 -1
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/segment/transformed.rb +5 -1
- data/share/install/software/OpenNLP +1 -1
- data/share/rner/config.rb +51 -0
- data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
- data/test/rbbt/document/test_annotation.rb +15 -6
- data/test/rbbt/document/test_corpus.rb +15 -1
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
- data/test/rbbt/ner/test_rner.rb +132 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
- data/test/rbbt/segment/test_annotation.rb +3 -4
- data/test/rbbt/segment/test_encoding.rb +1 -1
- data/test/rbbt/segment/test_named_entity.rb +7 -5
- data/test/rbbt/segment/test_range_index.rb +1 -2
- data/test/rbbt/segment/test_transformed.rb +33 -4
- data/test/rbbt/test_segment.rb +5 -10
- data/test/test_spaCy.rb +144 -0
- metadata +12 -3
@@ -6,7 +6,7 @@ module Segment::RangeIndex
|
|
6
6
|
SegID.setup(res, :corpus => corpus)
|
7
7
|
end
|
8
8
|
|
9
|
-
def self.index(segments, corpus, persist_file = :memory)
|
9
|
+
def self.index(segments, corpus = nil, persist_file = :memory)
|
10
10
|
segments = segments.values.flatten if Hash === segments
|
11
11
|
|
12
12
|
annotation_index =
|
@@ -68,6 +68,10 @@ module Transformed
|
|
68
68
|
|
69
69
|
segments = [segments] unless Array === segments
|
70
70
|
orig_length = self.length
|
71
|
+
|
72
|
+
offset = self.respond_to?(:offset) ? self.offset.to_i : 0
|
73
|
+
segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
|
74
|
+
|
71
75
|
Segment.clean_sort(segments).each do |segment|
|
72
76
|
next if segment.offset.nil?
|
73
77
|
|
@@ -86,7 +90,7 @@ module Transformed
|
|
86
90
|
|
87
91
|
updated_text = self[updated_begin..updated_end]
|
88
92
|
if updated_text.nil?
|
89
|
-
Log.warn "Range outside of segment: #{self.length} #{segment.
|
93
|
+
Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
|
90
94
|
next
|
91
95
|
end
|
92
96
|
|
@@ -0,0 +1,51 @@
|
|
1
|
+
isLetters /^[A-Z]+$/i
|
2
|
+
isUpper /^[A-Z]+$/
|
3
|
+
isLower /^[a-z]+$/
|
4
|
+
isDigits /^[0-9]+$/i
|
5
|
+
isRoman /^[IVX]+$/
|
6
|
+
isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
|
7
|
+
isPunctuation /^[,.;]$/
|
8
|
+
isDelim /^[\/()\[\]{}\-]$/
|
9
|
+
isNonWord /^[^\w]+$/
|
10
|
+
isConjunction /^and|or|&|,$/
|
11
|
+
|
12
|
+
hasLetters /[A-Z]/i
|
13
|
+
hasUpper /.[A-Z]/
|
14
|
+
hasLower /[a-z]/
|
15
|
+
hasDigits /[0-9]/i
|
16
|
+
hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
|
17
|
+
hasPunctuation /[,.;]/
|
18
|
+
hasDelim /[\/()\[\]{}\-]/
|
19
|
+
hasNonWord /[^\w]/
|
20
|
+
caspMix /[a-z].[A-Z]/
|
21
|
+
keywords /(?:protein|gene|domain|ase)s?$/
|
22
|
+
hasSuffix /[a-z][A-Z0-9]$/
|
23
|
+
|
24
|
+
numLetters do |w| w.scan(/[A-Z]/i).length end
|
25
|
+
numDigits do |w| w.scan(/[0-9]/).length end
|
26
|
+
#
|
27
|
+
prefix_3 /^(...)/
|
28
|
+
prefix_4 /^(....)/
|
29
|
+
suffix_3 /(...)$/
|
30
|
+
suffix_4 /(....)$/
|
31
|
+
|
32
|
+
|
33
|
+
token1 do |w|
|
34
|
+
w.sub(/[A-Z]/,'A').
|
35
|
+
sub(/[a-z]/,'a').
|
36
|
+
sub(/[0-9]/,'0').
|
37
|
+
sub(/[^0-9a-z]/i,'x')
|
38
|
+
end
|
39
|
+
token2 do |w|
|
40
|
+
w.sub(/[A-Z]+/,'A').
|
41
|
+
sub(/[a-z]+/,'a').
|
42
|
+
sub(/[0-9]+/,'0').
|
43
|
+
sub(/[^0-9a-z]+/i,'x')
|
44
|
+
end
|
45
|
+
token3 do |w| w.downcase end
|
46
|
+
special do |w| w.is_special? end
|
47
|
+
|
48
|
+
context %w(special token2 isPunctuation isDelim)
|
49
|
+
window %w(1 2 3 -1 -2 -3)
|
50
|
+
#direction :reverse
|
51
|
+
|
@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
|
|
7
7
|
def test_add_pmid
|
8
8
|
corpus = Document::Corpus.setup({})
|
9
9
|
|
10
|
-
document = corpus.add_pmid("
|
10
|
+
document = corpus.add_pmid("33359141", :abstract).first
|
11
|
+
iii document.docid
|
11
12
|
title = document.to(:title)
|
12
13
|
assert title.include?("COVID-19")
|
13
14
|
end
|
@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
|
|
4
4
|
require 'rbbt/segment'
|
5
5
|
require 'rbbt/document/annotation'
|
6
6
|
require 'rbbt/segment/named_entity'
|
7
|
+
require 'rbbt/ner/abner'
|
7
8
|
|
8
9
|
class TestAnnotation < Test::Unit::TestCase
|
9
10
|
class CalledOnce < Exception; end
|
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
|
|
28
29
|
self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
|
29
30
|
end
|
30
31
|
|
32
|
+
Document.define :abner do
|
33
|
+
$called_once = true
|
34
|
+
Abner.new.match(self)
|
35
|
+
end
|
36
|
+
|
37
|
+
|
31
38
|
Document.persist :ner
|
32
39
|
end
|
33
40
|
|
@@ -36,7 +43,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
36
43
|
Document.setup(text, "TEST", "test_doc1", nil)
|
37
44
|
|
38
45
|
corpus = {}
|
39
|
-
|
46
|
+
Document::Corpus.setup corpus
|
40
47
|
|
41
48
|
corpus.add_document(text)
|
42
49
|
|
@@ -50,7 +57,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
50
57
|
Document.setup(text2, "TEST", "test_doc2", nil)
|
51
58
|
|
52
59
|
corpus = {}
|
53
|
-
|
60
|
+
Document::Corpus.setup corpus
|
54
61
|
|
55
62
|
corpus.add_document(text1)
|
56
63
|
corpus.add_document(text2)
|
@@ -68,7 +75,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
68
75
|
Document.setup(text, "TEST", "test_doc1", nil)
|
69
76
|
|
70
77
|
corpus = {}
|
71
|
-
|
78
|
+
Document::Corpus.setup corpus
|
72
79
|
|
73
80
|
corpus.add_document(text)
|
74
81
|
|
@@ -95,7 +102,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
95
102
|
Document.setup(text, "TEST", "test_doc1", nil)
|
96
103
|
|
97
104
|
corpus = {}
|
98
|
-
|
105
|
+
Document::Corpus.setup corpus
|
99
106
|
|
100
107
|
corpus.add_document(text)
|
101
108
|
|
@@ -122,7 +129,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
122
129
|
Document.setup(text, "TEST", "test_doc1", nil)
|
123
130
|
|
124
131
|
corpus = {}
|
125
|
-
|
132
|
+
Document::Corpus.setup corpus
|
126
133
|
|
127
134
|
corpus.add_document(text)
|
128
135
|
|
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
|
|
133
140
|
text.ner
|
134
141
|
|
135
142
|
assert ! $called_once
|
136
|
-
|
143
|
+
|
144
|
+
assert_equal text.abner.first.docid, text.docid
|
145
|
+
|
137
146
|
assert text.ner.first.segid.include?("TEST:")
|
138
147
|
end
|
139
148
|
end
|
@@ -26,7 +26,21 @@ class TestDocumentCorpus < Test::Unit::TestCase
|
|
26
26
|
|
27
27
|
corpus.add_document(text)
|
28
28
|
|
29
|
-
assert corpus.
|
29
|
+
assert corpus.docids("TEST:").include?(text.docid)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_load
|
34
|
+
text = "This is a document"
|
35
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
36
|
+
|
37
|
+
TmpFile.with_file do |path|
|
38
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
39
|
+
corpus.extend Document::Corpus
|
40
|
+
|
41
|
+
corpus.add_document(text)
|
42
|
+
|
43
|
+
assert corpus.docids("TEST:").include?(text.docid)
|
30
44
|
end
|
31
45
|
end
|
32
46
|
end
|
@@ -5,12 +5,17 @@ Log.severity = 0
|
|
5
5
|
class TestGNormPlus < Test::Unit::TestCase
|
6
6
|
def test_match
|
7
7
|
text =<<-EOF
|
8
|
-
|
8
|
+
|
9
|
+
Introduction
|
10
|
+
|
11
|
+
We found that TP53 is regulated by MDM2 in Homo
|
12
|
+
sapiens
|
9
13
|
EOF
|
10
14
|
|
11
15
|
mentions = GNormPlus.process({:file => text})
|
16
|
+
|
12
17
|
assert_equal 1, mentions.length
|
13
|
-
assert_equal
|
18
|
+
assert_equal 3, mentions["file"].length
|
14
19
|
end
|
15
20
|
|
16
21
|
def test_entities
|
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
|
|
19
24
|
EOF
|
20
25
|
|
21
26
|
mentions = GNormPlus.entities({:file => text})
|
22
|
-
mentions["file"].include?
|
27
|
+
assert mentions["file"].include?("TP53")
|
28
|
+
mentions["file"].each do |mention|
|
29
|
+
assert_equal mention, text[mention.range].sub("\n", ' ')
|
30
|
+
end
|
23
31
|
end
|
24
32
|
end
|
25
33
|
|
@@ -0,0 +1,132 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt'
|
3
|
+
require 'rbbt/ner/rner'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
class TestRNer < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@parser = NERFeatures.new() do
|
10
|
+
isLetters /^[A-Z]+$/i
|
11
|
+
context prefix_3 /^(...)/
|
12
|
+
downcase do |w| w.downcase end
|
13
|
+
|
14
|
+
context %w(downcase)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_config
|
19
|
+
config = <<-EOC
|
20
|
+
isLetters /^[A-Z]+$/i
|
21
|
+
context prefix_3 /^(...)/
|
22
|
+
downcase do |w| w.downcase end
|
23
|
+
|
24
|
+
context %w(downcase)
|
25
|
+
EOC
|
26
|
+
|
27
|
+
assert_equal config.strip, @parser.config.strip
|
28
|
+
end
|
29
|
+
|
30
|
+
def test_reverse
|
31
|
+
assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
|
32
|
+
assert_equal(
|
33
|
+
". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
|
34
|
+
NERFeatures.reverse(
|
35
|
+
"A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
|
36
|
+
))
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_features
|
40
|
+
assert_equal @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_template
|
44
|
+
template =<<-EOT
|
45
|
+
UisLetters: %x[0,1]
|
46
|
+
Uprefix_3: %x[0,2]
|
47
|
+
Uprefix_3#1: %x[1,2]
|
48
|
+
Uprefix_3#-1: %x[-1,2]
|
49
|
+
Udowncase: %x[0,3]
|
50
|
+
Udowncase#1: %x[1,3]
|
51
|
+
Udowncase#-1: %x[-1,3]
|
52
|
+
B
|
53
|
+
EOT
|
54
|
+
|
55
|
+
assert(@parser.template == template)
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_tokens
|
59
|
+
assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
|
60
|
+
["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
|
61
|
+
|
62
|
+
|
63
|
+
end
|
64
|
+
def test_text_features
|
65
|
+
|
66
|
+
assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
|
67
|
+
assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
|
68
|
+
assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
|
69
|
+
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_tagged_features
|
73
|
+
assert_equal(
|
74
|
+
[["phosphorilation",true, "pho", "phosphorilation", 0],
|
75
|
+
["of",true, false, "of", 0],
|
76
|
+
["GENE1",false, "GEN", "gene1", 1],
|
77
|
+
[".", false, false, ".", 0]],
|
78
|
+
@parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
|
79
|
+
|
80
|
+
assert_equal(
|
81
|
+
[["GENE1",false, "GEN", "gene1", 1],
|
82
|
+
["phosphorilation",true, "pho", "phosphorilation", 0]],
|
83
|
+
@parser.tagged_features("GENE1 phosphorilation",['GENE1']))
|
84
|
+
|
85
|
+
|
86
|
+
assert_equal(
|
87
|
+
[["phosphorilation",true, "pho", "phosphorilation", 0],
|
88
|
+
["of",true, false, "of", 0],
|
89
|
+
["GENE",true, "GEN", "gene", 1],
|
90
|
+
["1",false, false, "1", 2],
|
91
|
+
[".", false, false, ".", 0]],
|
92
|
+
@parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
|
93
|
+
end
|
94
|
+
|
95
|
+
def test_tagged_features_reverse
|
96
|
+
@parser.reverse = true
|
97
|
+
assert_equal(
|
98
|
+
[
|
99
|
+
["GENE1",false, "GEN", "gene1", 1],
|
100
|
+
["of",true, false, "of", 0],
|
101
|
+
["phosphorilation",true, "pho", "phosphorilation", 0]
|
102
|
+
],
|
103
|
+
@parser.tagged_features("phosphorilation of GENE1",['GENE1']))
|
104
|
+
|
105
|
+
assert_equal(
|
106
|
+
[
|
107
|
+
[".", false, false, ".", 0],
|
108
|
+
["1",false, false, "1", 1],
|
109
|
+
["GENE",true, "GEN", "gene", 2],
|
110
|
+
["of",true, false, "of", 0],
|
111
|
+
["phosphorilation",true, "pho", "phosphorilation", 0]
|
112
|
+
],
|
113
|
+
@parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
|
114
|
+
end
|
115
|
+
|
116
|
+
def test_default_config
|
117
|
+
require 'rbbt/bow/misc'
|
118
|
+
text =<<-EOF
|
119
|
+
This text explains how MDM2 interacts with TP53.
|
120
|
+
EOF
|
121
|
+
@parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
|
122
|
+
features = @parser.tagged_features text, %w(TP53 MDM2)
|
123
|
+
assert features.first.first == "This"
|
124
|
+
end
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
def __test_CRFPP_install
|
129
|
+
assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
|
130
|
+
end
|
131
|
+
|
132
|
+
end
|
@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
|
|
7
7
|
This is a sentence.
|
8
8
|
A funky character ™ in a sentence.
|
9
9
|
This is a sentence.
|
10
|
-
This is a
|
10
|
+
This is a broken
|
11
11
|
sentence. This is
|
12
|
-
another sentence.
|
12
|
+
another broken sentence.
|
13
13
|
EOF
|
14
14
|
|
15
|
-
|
15
|
+
iii NLP.geniass_sentence_splitter(text)
|
16
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
16
17
|
end
|
17
18
|
|
19
|
+
def test_sentences_2
|
20
|
+
text =<<-EOF
|
21
|
+
This is a sentence.
|
22
|
+
This is a sentence.
|
23
|
+
This is a broken
|
24
|
+
sentence. This is
|
25
|
+
another broken sentence.
|
26
|
+
EOF
|
27
|
+
|
28
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_sentences_ext
|
32
|
+
text =<<-EOF
|
33
|
+
This is a sentence.
|
34
|
+
This is a sentence.
|
35
|
+
This is a broken
|
36
|
+
sentence. This is
|
37
|
+
another broken sentence.
|
38
|
+
EOF
|
39
|
+
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
41
|
+
end
|
18
42
|
end
|
19
43
|
|
@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
|
|
12
12
|
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
13
13
|
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
14
14
|
|
15
|
-
assert_equal 'verb', annotation.annotid.split(":")
|
15
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
16
16
|
|
17
17
|
annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
|
18
|
-
assert_equal 'verb', annotation.annotid.split(":")
|
18
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
19
19
|
end
|
20
20
|
|
21
21
|
def test_annotid
|
22
22
|
text = "This is a document"
|
23
23
|
Document.setup(text, "TEST", "test_doc1", nil)
|
24
24
|
|
25
|
-
corpus = {}
|
26
|
-
corpus.extend Document::Corpus
|
25
|
+
corpus = Document::Corpus.setup({})
|
27
26
|
|
28
27
|
corpus.add_document(text)
|
29
28
|
|
@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
|
|
2
2
|
require 'rbbt/segment/encoding'
|
3
3
|
|
4
4
|
class TestEncoding < Test::Unit::TestCase
|
5
|
-
def
|
5
|
+
def test_bad_chars
|
6
6
|
text = "A funky character ™ in a sentence."
|
7
7
|
|
8
8
|
assert_equal ["™"], Segment.bad_chars(text)
|