rbbt-text 1.3.0 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +20 -5
- data/lib/rbbt/document/annotation.rb +7 -4
- data/lib/rbbt/document/corpus.rb +30 -3
- data/lib/rbbt/document/corpus/pubmed.rb +2 -1
- data/lib/rbbt/ner/abner.rb +3 -2
- data/lib/rbbt/ner/banner.rb +3 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/g_norm_plus.rb +7 -1
- data/lib/rbbt/ner/linnaeus.rb +2 -1
- data/lib/rbbt/ner/patterns.rb +0 -1
- data/lib/rbbt/ner/rner.rb +229 -0
- data/lib/rbbt/ner/token_trieNER.rb +32 -18
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +195 -0
- data/lib/rbbt/relationship.rb +24 -0
- data/lib/rbbt/segment.rb +9 -4
- data/lib/rbbt/segment/annotation.rb +3 -3
- data/lib/rbbt/segment/named_entity.rb +7 -0
- data/lib/rbbt/segment/range_index.rb +1 -1
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/segment/transformed.rb +5 -1
- data/share/install/software/OpenNLP +1 -1
- data/share/rner/config.rb +51 -0
- data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
- data/test/rbbt/document/test_annotation.rb +15 -6
- data/test/rbbt/document/test_corpus.rb +15 -1
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
- data/test/rbbt/ner/test_rner.rb +132 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
- data/test/rbbt/segment/test_annotation.rb +3 -4
- data/test/rbbt/segment/test_encoding.rb +1 -1
- data/test/rbbt/segment/test_named_entity.rb +7 -5
- data/test/rbbt/segment/test_range_index.rb +1 -2
- data/test/rbbt/segment/test_transformed.rb +33 -4
- data/test/rbbt/test_segment.rb +5 -10
- data/test/test_spaCy.rb +144 -0
- metadata +12 -3
@@ -6,7 +6,7 @@ module Segment::RangeIndex
|
|
6
6
|
SegID.setup(res, :corpus => corpus)
|
7
7
|
end
|
8
8
|
|
9
|
-
def self.index(segments, corpus, persist_file = :memory)
|
9
|
+
def self.index(segments, corpus = nil, persist_file = :memory)
|
10
10
|
segments = segments.values.flatten if Hash === segments
|
11
11
|
|
12
12
|
annotation_index =
|
@@ -68,6 +68,10 @@ module Transformed
|
|
68
68
|
|
69
69
|
segments = [segments] unless Array === segments
|
70
70
|
orig_length = self.length
|
71
|
+
|
72
|
+
offset = self.respond_to?(:offset) ? self.offset.to_i : 0
|
73
|
+
segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
|
74
|
+
|
71
75
|
Segment.clean_sort(segments).each do |segment|
|
72
76
|
next if segment.offset.nil?
|
73
77
|
|
@@ -86,7 +90,7 @@ module Transformed
|
|
86
90
|
|
87
91
|
updated_text = self[updated_begin..updated_end]
|
88
92
|
if updated_text.nil?
|
89
|
-
Log.warn "Range outside of segment: #{self.length} #{segment.
|
93
|
+
Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
|
90
94
|
next
|
91
95
|
end
|
92
96
|
|
@@ -0,0 +1,51 @@
|
|
1
|
+
isLetters /^[A-Z]+$/i
|
2
|
+
isUpper /^[A-Z]+$/
|
3
|
+
isLower /^[a-z]+$/
|
4
|
+
isDigits /^[0-9]+$/i
|
5
|
+
isRoman /^[IVX]+$/
|
6
|
+
isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
|
7
|
+
isPunctuation /^[,.;]$/
|
8
|
+
isDelim /^[\/()\[\]{}\-]$/
|
9
|
+
isNonWord /^[^\w]+$/
|
10
|
+
isConjunction /^and|or|&|,$/
|
11
|
+
|
12
|
+
hasLetters /[A-Z]/i
|
13
|
+
hasUpper /.[A-Z]/
|
14
|
+
hasLower /[a-z]/
|
15
|
+
hasDigits /[0-9]/i
|
16
|
+
hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
|
17
|
+
hasPunctuation /[,.;]/
|
18
|
+
hasDelim /[\/()\[\]{}\-]/
|
19
|
+
hasNonWord /[^\w]/
|
20
|
+
caspMix /[a-z].[A-Z]/
|
21
|
+
keywords /(?:protein|gene|domain|ase)s?$/
|
22
|
+
hasSuffix /[a-z][A-Z0-9]$/
|
23
|
+
|
24
|
+
numLetters do |w| w.scan(/[A-Z]/i).length end
|
25
|
+
numDigits do |w| w.scan(/[0-9]/).length end
|
26
|
+
#
|
27
|
+
prefix_3 /^(...)/
|
28
|
+
prefix_4 /^(....)/
|
29
|
+
suffix_3 /(...)$/
|
30
|
+
suffix_4 /(....)$/
|
31
|
+
|
32
|
+
|
33
|
+
token1 do |w|
|
34
|
+
w.sub(/[A-Z]/,'A').
|
35
|
+
sub(/[a-z]/,'a').
|
36
|
+
sub(/[0-9]/,'0').
|
37
|
+
sub(/[^0-9a-z]/i,'x')
|
38
|
+
end
|
39
|
+
token2 do |w|
|
40
|
+
w.sub(/[A-Z]+/,'A').
|
41
|
+
sub(/[a-z]+/,'a').
|
42
|
+
sub(/[0-9]+/,'0').
|
43
|
+
sub(/[^0-9a-z]+/i,'x')
|
44
|
+
end
|
45
|
+
token3 do |w| w.downcase end
|
46
|
+
special do |w| w.is_special? end
|
47
|
+
|
48
|
+
context %w(special token2 isPunctuation isDelim)
|
49
|
+
window %w(1 2 3 -1 -2 -3)
|
50
|
+
#direction :reverse
|
51
|
+
|
@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
|
|
7
7
|
def test_add_pmid
|
8
8
|
corpus = Document::Corpus.setup({})
|
9
9
|
|
10
|
-
document = corpus.add_pmid("
|
10
|
+
document = corpus.add_pmid("33359141", :abstract).first
|
11
|
+
iii document.docid
|
11
12
|
title = document.to(:title)
|
12
13
|
assert title.include?("COVID-19")
|
13
14
|
end
|
@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
|
|
4
4
|
require 'rbbt/segment'
|
5
5
|
require 'rbbt/document/annotation'
|
6
6
|
require 'rbbt/segment/named_entity'
|
7
|
+
require 'rbbt/ner/abner'
|
7
8
|
|
8
9
|
class TestAnnotation < Test::Unit::TestCase
|
9
10
|
class CalledOnce < Exception; end
|
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
|
|
28
29
|
self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
|
29
30
|
end
|
30
31
|
|
32
|
+
Document.define :abner do
|
33
|
+
$called_once = true
|
34
|
+
Abner.new.match(self)
|
35
|
+
end
|
36
|
+
|
37
|
+
|
31
38
|
Document.persist :ner
|
32
39
|
end
|
33
40
|
|
@@ -36,7 +43,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
36
43
|
Document.setup(text, "TEST", "test_doc1", nil)
|
37
44
|
|
38
45
|
corpus = {}
|
39
|
-
|
46
|
+
Document::Corpus.setup corpus
|
40
47
|
|
41
48
|
corpus.add_document(text)
|
42
49
|
|
@@ -50,7 +57,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
50
57
|
Document.setup(text2, "TEST", "test_doc2", nil)
|
51
58
|
|
52
59
|
corpus = {}
|
53
|
-
|
60
|
+
Document::Corpus.setup corpus
|
54
61
|
|
55
62
|
corpus.add_document(text1)
|
56
63
|
corpus.add_document(text2)
|
@@ -68,7 +75,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
68
75
|
Document.setup(text, "TEST", "test_doc1", nil)
|
69
76
|
|
70
77
|
corpus = {}
|
71
|
-
|
78
|
+
Document::Corpus.setup corpus
|
72
79
|
|
73
80
|
corpus.add_document(text)
|
74
81
|
|
@@ -95,7 +102,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
95
102
|
Document.setup(text, "TEST", "test_doc1", nil)
|
96
103
|
|
97
104
|
corpus = {}
|
98
|
-
|
105
|
+
Document::Corpus.setup corpus
|
99
106
|
|
100
107
|
corpus.add_document(text)
|
101
108
|
|
@@ -122,7 +129,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
122
129
|
Document.setup(text, "TEST", "test_doc1", nil)
|
123
130
|
|
124
131
|
corpus = {}
|
125
|
-
|
132
|
+
Document::Corpus.setup corpus
|
126
133
|
|
127
134
|
corpus.add_document(text)
|
128
135
|
|
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
|
|
133
140
|
text.ner
|
134
141
|
|
135
142
|
assert ! $called_once
|
136
|
-
|
143
|
+
|
144
|
+
assert_equal text.abner.first.docid, text.docid
|
145
|
+
|
137
146
|
assert text.ner.first.segid.include?("TEST:")
|
138
147
|
end
|
139
148
|
end
|
@@ -26,7 +26,21 @@ class TestDocumentCorpus < Test::Unit::TestCase
|
|
26
26
|
|
27
27
|
corpus.add_document(text)
|
28
28
|
|
29
|
-
assert corpus.
|
29
|
+
assert corpus.docids("TEST:").include?(text.docid)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_load
|
34
|
+
text = "This is a document"
|
35
|
+
Document.setup(text, "TEST", "test_doc1", nil)
|
36
|
+
|
37
|
+
TmpFile.with_file do |path|
|
38
|
+
corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
|
39
|
+
corpus.extend Document::Corpus
|
40
|
+
|
41
|
+
corpus.add_document(text)
|
42
|
+
|
43
|
+
assert corpus.docids("TEST:").include?(text.docid)
|
30
44
|
end
|
31
45
|
end
|
32
46
|
end
|
@@ -5,12 +5,17 @@ Log.severity = 0
|
|
5
5
|
class TestGNormPlus < Test::Unit::TestCase
|
6
6
|
def test_match
|
7
7
|
text =<<-EOF
|
8
|
-
|
8
|
+
|
9
|
+
Introduction
|
10
|
+
|
11
|
+
We found that TP53 is regulated by MDM2 in Homo
|
12
|
+
sapiens
|
9
13
|
EOF
|
10
14
|
|
11
15
|
mentions = GNormPlus.process({:file => text})
|
16
|
+
|
12
17
|
assert_equal 1, mentions.length
|
13
|
-
assert_equal
|
18
|
+
assert_equal 3, mentions["file"].length
|
14
19
|
end
|
15
20
|
|
16
21
|
def test_entities
|
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
|
|
19
24
|
EOF
|
20
25
|
|
21
26
|
mentions = GNormPlus.entities({:file => text})
|
22
|
-
mentions["file"].include?
|
27
|
+
assert mentions["file"].include?("TP53")
|
28
|
+
mentions["file"].each do |mention|
|
29
|
+
assert_equal mention, text[mention.range].sub("\n", ' ')
|
30
|
+
end
|
23
31
|
end
|
24
32
|
end
|
25
33
|
|
@@ -0,0 +1,132 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/../../test_helper'
|
2
|
+
require 'rbbt'
|
3
|
+
require 'rbbt/ner/rner'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
class TestRNer < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
@parser = NERFeatures.new() do
|
10
|
+
isLetters /^[A-Z]+$/i
|
11
|
+
context prefix_3 /^(...)/
|
12
|
+
downcase do |w| w.downcase end
|
13
|
+
|
14
|
+
context %w(downcase)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_config
|
19
|
+
config = <<-EOC
|
20
|
+
isLetters /^[A-Z]+$/i
|
21
|
+
context prefix_3 /^(...)/
|
22
|
+
downcase do |w| w.downcase end
|
23
|
+
|
24
|
+
context %w(downcase)
|
25
|
+
EOC
|
26
|
+
|
27
|
+
assert_equal config.strip, @parser.config.strip
|
28
|
+
end
|
29
|
+
|
30
|
+
def test_reverse
|
31
|
+
assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
|
32
|
+
assert_equal(
|
33
|
+
". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
|
34
|
+
NERFeatures.reverse(
|
35
|
+
"A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
|
36
|
+
))
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_features
|
40
|
+
assert_equal @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
|
41
|
+
end
|
42
|
+
|
43
|
+
def test_template
|
44
|
+
template =<<-EOT
|
45
|
+
UisLetters: %x[0,1]
|
46
|
+
Uprefix_3: %x[0,2]
|
47
|
+
Uprefix_3#1: %x[1,2]
|
48
|
+
Uprefix_3#-1: %x[-1,2]
|
49
|
+
Udowncase: %x[0,3]
|
50
|
+
Udowncase#1: %x[1,3]
|
51
|
+
Udowncase#-1: %x[-1,3]
|
52
|
+
B
|
53
|
+
EOT
|
54
|
+
|
55
|
+
assert(@parser.template == template)
|
56
|
+
end
|
57
|
+
|
58
|
+
def test_tokens
|
59
|
+
assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
|
60
|
+
["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
|
61
|
+
|
62
|
+
|
63
|
+
end
|
64
|
+
def test_text_features
|
65
|
+
|
66
|
+
assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
|
67
|
+
assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
|
68
|
+
assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
|
69
|
+
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_tagged_features
|
73
|
+
assert_equal(
|
74
|
+
[["phosphorilation",true, "pho", "phosphorilation", 0],
|
75
|
+
["of",true, false, "of", 0],
|
76
|
+
["GENE1",false, "GEN", "gene1", 1],
|
77
|
+
[".", false, false, ".", 0]],
|
78
|
+
@parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
|
79
|
+
|
80
|
+
assert_equal(
|
81
|
+
[["GENE1",false, "GEN", "gene1", 1],
|
82
|
+
["phosphorilation",true, "pho", "phosphorilation", 0]],
|
83
|
+
@parser.tagged_features("GENE1 phosphorilation",['GENE1']))
|
84
|
+
|
85
|
+
|
86
|
+
assert_equal(
|
87
|
+
[["phosphorilation",true, "pho", "phosphorilation", 0],
|
88
|
+
["of",true, false, "of", 0],
|
89
|
+
["GENE",true, "GEN", "gene", 1],
|
90
|
+
["1",false, false, "1", 2],
|
91
|
+
[".", false, false, ".", 0]],
|
92
|
+
@parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
|
93
|
+
end
|
94
|
+
|
95
|
+
def test_tagged_features_reverse
|
96
|
+
@parser.reverse = true
|
97
|
+
assert_equal(
|
98
|
+
[
|
99
|
+
["GENE1",false, "GEN", "gene1", 1],
|
100
|
+
["of",true, false, "of", 0],
|
101
|
+
["phosphorilation",true, "pho", "phosphorilation", 0]
|
102
|
+
],
|
103
|
+
@parser.tagged_features("phosphorilation of GENE1",['GENE1']))
|
104
|
+
|
105
|
+
assert_equal(
|
106
|
+
[
|
107
|
+
[".", false, false, ".", 0],
|
108
|
+
["1",false, false, "1", 1],
|
109
|
+
["GENE",true, "GEN", "gene", 2],
|
110
|
+
["of",true, false, "of", 0],
|
111
|
+
["phosphorilation",true, "pho", "phosphorilation", 0]
|
112
|
+
],
|
113
|
+
@parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
|
114
|
+
end
|
115
|
+
|
116
|
+
def test_default_config
|
117
|
+
require 'rbbt/bow/misc'
|
118
|
+
text =<<-EOF
|
119
|
+
This text explains how MDM2 interacts with TP53.
|
120
|
+
EOF
|
121
|
+
@parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
|
122
|
+
features = @parser.tagged_features text, %w(TP53 MDM2)
|
123
|
+
assert features.first.first == "This"
|
124
|
+
end
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
def __test_CRFPP_install
|
129
|
+
assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
|
130
|
+
end
|
131
|
+
|
132
|
+
end
|
@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
|
|
7
7
|
This is a sentence.
|
8
8
|
A funky character ™ in a sentence.
|
9
9
|
This is a sentence.
|
10
|
-
This is a
|
10
|
+
This is a broken
|
11
11
|
sentence. This is
|
12
|
-
another sentence.
|
12
|
+
another broken sentence.
|
13
13
|
EOF
|
14
14
|
|
15
|
-
|
15
|
+
iii NLP.geniass_sentence_splitter(text)
|
16
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
16
17
|
end
|
17
18
|
|
19
|
+
def test_sentences_2
|
20
|
+
text =<<-EOF
|
21
|
+
This is a sentence.
|
22
|
+
This is a sentence.
|
23
|
+
This is a broken
|
24
|
+
sentence. This is
|
25
|
+
another broken sentence.
|
26
|
+
EOF
|
27
|
+
|
28
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_sentences_ext
|
32
|
+
text =<<-EOF
|
33
|
+
This is a sentence.
|
34
|
+
This is a sentence.
|
35
|
+
This is a broken
|
36
|
+
sentence. This is
|
37
|
+
another broken sentence.
|
38
|
+
EOF
|
39
|
+
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
41
|
+
end
|
18
42
|
end
|
19
43
|
|
@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
|
|
12
12
|
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
13
13
|
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
14
14
|
|
15
|
-
assert_equal 'verb', annotation.annotid.split(":")
|
15
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
16
16
|
|
17
17
|
annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
|
18
|
-
assert_equal 'verb', annotation.annotid.split(":")
|
18
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
19
19
|
end
|
20
20
|
|
21
21
|
def test_annotid
|
22
22
|
text = "This is a document"
|
23
23
|
Document.setup(text, "TEST", "test_doc1", nil)
|
24
24
|
|
25
|
-
corpus = {}
|
26
|
-
corpus.extend Document::Corpus
|
25
|
+
corpus = Document::Corpus.setup({})
|
27
26
|
|
28
27
|
corpus.add_document(text)
|
29
28
|
|
@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
|
|
2
2
|
require 'rbbt/segment/encoding'
|
3
3
|
|
4
4
|
class TestEncoding < Test::Unit::TestCase
|
5
|
-
def
|
5
|
+
def test_bad_chars
|
6
6
|
text = "A funky character ™ in a sentence."
|
7
7
|
|
8
8
|
assert_equal ["™"], Segment.bad_chars(text)
|