rbbt-text 1.3.0 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +20 -5
- data/lib/rbbt/document/annotation.rb +7 -4
- data/lib/rbbt/document/corpus.rb +30 -3
- data/lib/rbbt/document/corpus/pubmed.rb +2 -1
- data/lib/rbbt/ner/abner.rb +3 -2
- data/lib/rbbt/ner/banner.rb +3 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/g_norm_plus.rb +7 -1
- data/lib/rbbt/ner/linnaeus.rb +2 -1
- data/lib/rbbt/ner/patterns.rb +0 -1
- data/lib/rbbt/ner/rner.rb +229 -0
- data/lib/rbbt/ner/token_trieNER.rb +32 -18
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +195 -0
- data/lib/rbbt/relationship.rb +24 -0
- data/lib/rbbt/segment.rb +9 -4
- data/lib/rbbt/segment/annotation.rb +3 -3
- data/lib/rbbt/segment/named_entity.rb +7 -0
- data/lib/rbbt/segment/range_index.rb +1 -1
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/segment/transformed.rb +5 -1
- data/share/install/software/OpenNLP +1 -1
- data/share/rner/config.rb +51 -0
- data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
- data/test/rbbt/document/test_annotation.rb +15 -6
- data/test/rbbt/document/test_corpus.rb +15 -1
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
- data/test/rbbt/ner/test_rner.rb +132 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
- data/test/rbbt/segment/test_annotation.rb +3 -4
- data/test/rbbt/segment/test_encoding.rb +1 -1
- data/test/rbbt/segment/test_named_entity.rb +7 -5
- data/test/rbbt/segment/test_range_index.rb +1 -2
- data/test/rbbt/segment/test_transformed.rb +33 -4
- data/test/rbbt/test_segment.rb +5 -10
- data/test/test_spaCy.rb +144 -0
- metadata +12 -3
@@ -22,12 +22,14 @@ class TestClass < Test::Unit::TestCase
|
|
22
22
|
assert_equal "SCORE", a.score
|
23
23
|
end
|
24
24
|
|
25
|
-
def
|
25
|
+
def test_tsv
|
26
26
|
a = "test"
|
27
|
-
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
28
|
-
|
29
|
-
assert
|
30
|
-
assert
|
27
|
+
NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
|
28
|
+
ppp Annotated.tsv([a,a])
|
29
|
+
assert Annotated.tsv([a]).fields.include? "code"
|
30
|
+
assert Annotated.tsv([a], nil).fields.include? "code"
|
31
|
+
assert Annotated.tsv([a], :all).fields.include? "code"
|
32
|
+
assert Annotated.tsv([a], :all).fields.include? "literal"
|
31
33
|
end
|
32
34
|
|
33
35
|
def __test_segment_brat
|
@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
|
|
9
9
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
10
10
|
Document.setup(text, "TEST", "test_doc1", nil)
|
11
11
|
|
12
|
-
corpus = {}
|
13
|
-
corpus.extend Document::Corpus
|
12
|
+
corpus = Document::Corpus.setup({})
|
14
13
|
|
15
14
|
corpus.add_document(text)
|
16
15
|
|
@@ -101,6 +101,35 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
101
101
|
assert_equal "CDK5R1 protein", exp2
|
102
102
|
end
|
103
103
|
|
104
|
+
def test_with_transform_sentences
|
105
|
+
a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
|
+
original = a.dup
|
107
|
+
|
108
|
+
gene1 = "TP53"
|
109
|
+
gene1.extend NamedEntity
|
110
|
+
gene1.offset = a.index gene1
|
111
|
+
|
112
|
+
gene2 = "CDK5R1"
|
113
|
+
gene2.extend NamedEntity
|
114
|
+
gene2.offset = a.index gene2
|
115
|
+
|
116
|
+
bread = "Bread"
|
117
|
+
bread.extend NamedEntity
|
118
|
+
bread.offset = a.index bread
|
119
|
+
|
120
|
+
sentences = Segment.align(a, a.split(". "))
|
121
|
+
|
122
|
+
Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
|
123
|
+
assert sentences[1].include?("GN gene and the GN protein")
|
124
|
+
end
|
125
|
+
|
126
|
+
Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
|
127
|
+
assert sentences[0].include?("first sentence mentions BR")
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
end
|
132
|
+
|
104
133
|
def test_html
|
105
134
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
135
|
|
@@ -115,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
115
144
|
gene2.entity_type = "Protein"
|
116
145
|
|
117
146
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
118
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
147
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
119
148
|
end
|
120
149
|
end
|
121
150
|
|
@@ -136,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
136
165
|
gene2.entity_type = "Protein"
|
137
166
|
|
138
167
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
139
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
168
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
140
169
|
end
|
141
170
|
end
|
142
171
|
|
@@ -156,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
156
185
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
157
186
|
|
158
187
|
Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
|
159
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
188
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
160
189
|
Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
|
161
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
190
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
162
191
|
end
|
163
192
|
end
|
164
193
|
end
|
data/test/rbbt/test_segment.rb
CHANGED
@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
|
|
17
17
|
text = "This is a document"
|
18
18
|
Document.setup(text, "TEST", "test_doc1", nil)
|
19
19
|
|
20
|
-
corpus = {}
|
21
|
-
corpus.extend Document::Corpus
|
20
|
+
corpus = Document::Corpus.setup({})
|
22
21
|
|
23
22
|
corpus.add_document(text)
|
24
23
|
|
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
|
|
41
40
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
42
41
|
Document.setup(text, "TEST", "test_doc1", nil)
|
43
42
|
|
44
|
-
corpus = {}
|
45
|
-
corpus.extend Document::Corpus
|
43
|
+
corpus = Document::Corpus.setup({})
|
46
44
|
|
47
45
|
corpus.add_document(text)
|
48
46
|
|
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
|
|
65
63
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
66
64
|
Document.setup(text, "TEST", "test_doc1", nil)
|
67
65
|
|
68
|
-
corpus = {}
|
69
|
-
corpus.extend Document::Corpus
|
66
|
+
corpus = Document::Corpus.setup({})
|
70
67
|
|
71
68
|
corpus.add_document(text)
|
72
69
|
|
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
|
|
94
91
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
95
92
|
Document.setup(text, "TEST", "test_doc1", nil)
|
96
93
|
|
97
|
-
corpus = {}
|
98
|
-
corpus.extend Document::Corpus
|
94
|
+
corpus = Document::Corpus.setup({})
|
99
95
|
|
100
96
|
corpus.add_document(text)
|
101
97
|
|
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
|
|
142
138
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
143
139
|
Document.setup(text, "TEST", "test_doc1", nil)
|
144
140
|
|
145
|
-
corpus = {}
|
146
|
-
corpus.extend Document::Corpus
|
141
|
+
corpus = Document::Corpus.setup({})
|
147
142
|
|
148
143
|
corpus.add_document(text)
|
149
144
|
|
data/test/test_spaCy.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
|
2
|
+
require 'rbbt/nlp/spaCy'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
|
5
|
+
class TestSpaCy < Test::Unit::TestCase
|
6
|
+
def test_tokens
|
7
|
+
text = "I tell a story"
|
8
|
+
|
9
|
+
tokens = SpaCy.tokens(text)
|
10
|
+
|
11
|
+
assert_equal 4, tokens.length
|
12
|
+
assert_equal "tell", tokens[1].to_s
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_chunks
|
16
|
+
text = "Miguel Vazquez tell a good story"
|
17
|
+
|
18
|
+
tokens = SpaCy.chunks(text)
|
19
|
+
|
20
|
+
assert_equal 2, tokens.length
|
21
|
+
assert_equal "Miguel Vazquez", tokens[0].to_s
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
def test_segments
|
26
|
+
text = "I tell a story. It's a very good story."
|
27
|
+
|
28
|
+
corpus = Document::Corpus.setup({})
|
29
|
+
|
30
|
+
Document.setup(text, "TEST", "test_doc1", "simple_sentence")
|
31
|
+
|
32
|
+
corpus.add_document text
|
33
|
+
text.corpus = corpus
|
34
|
+
|
35
|
+
segments = SpaCy.segments(text)
|
36
|
+
|
37
|
+
segments.each do |segment|
|
38
|
+
assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_chunk_segments
|
43
|
+
text = "I tell a story. It's a very good story."
|
44
|
+
|
45
|
+
corpus = Document::Corpus.setup({})
|
46
|
+
|
47
|
+
Document.setup(text, "TEST", "test_doc1", "simple_sentence")
|
48
|
+
|
49
|
+
corpus.add_document text
|
50
|
+
text.corpus = corpus
|
51
|
+
|
52
|
+
segments = SpaCy.chunk_segments(text)
|
53
|
+
|
54
|
+
segments.each do |segment|
|
55
|
+
assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_dep_graph
|
60
|
+
text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
|
61
|
+
graph = SpaCy.dep_graph(text, true)
|
62
|
+
|
63
|
+
tokens = SpaCy.segments(text)
|
64
|
+
index = Segment.index tokens
|
65
|
+
tf_s = tokens.select{|t| t == "TF" }.first
|
66
|
+
tg_s = tokens.select{|t| t == "TG" }.first
|
67
|
+
|
68
|
+
require 'rbbt/network/paths'
|
69
|
+
|
70
|
+
path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
|
71
|
+
path_tokens = path.collect do |segid|
|
72
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
73
|
+
text[range]
|
74
|
+
end
|
75
|
+
|
76
|
+
assert path_tokens.include? 'increase'
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_chunk_dep_graph
|
81
|
+
text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
|
82
|
+
graph = SpaCy.chunk_dep_graph(text, true)
|
83
|
+
|
84
|
+
tokens = SpaCy.chunk_segments(text)
|
85
|
+
index = Segment.index tokens
|
86
|
+
tf_s = tokens.select{|t| t.include? "TF" }.first
|
87
|
+
tg_s = tokens.select{|t| t.include? "TG" }.first
|
88
|
+
|
89
|
+
|
90
|
+
require 'rbbt/network/paths'
|
91
|
+
|
92
|
+
path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
|
93
|
+
path_tokens = path.collect do |segid|
|
94
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
95
|
+
text[range]
|
96
|
+
end
|
97
|
+
|
98
|
+
assert path_tokens.include? 'increase'
|
99
|
+
end
|
100
|
+
|
101
|
+
def test_paths
|
102
|
+
text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
|
103
|
+
path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
|
104
|
+
|
105
|
+
|
106
|
+
path_tokens = path.collect do |segid|
|
107
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
108
|
+
text[range]
|
109
|
+
end
|
110
|
+
|
111
|
+
ppp text
|
112
|
+
iii path_tokens
|
113
|
+
|
114
|
+
assert path_tokens.include? 'increase'
|
115
|
+
end
|
116
|
+
|
117
|
+
def test_paths2
|
118
|
+
text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
|
119
|
+
path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
|
120
|
+
|
121
|
+
|
122
|
+
path_tokens = path.collect do |segid|
|
123
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
124
|
+
text[range]
|
125
|
+
end
|
126
|
+
|
127
|
+
iii path_tokens
|
128
|
+
|
129
|
+
|
130
|
+
assert path_tokens.include? 'regulation'
|
131
|
+
end
|
132
|
+
|
133
|
+
def test_paths3
|
134
|
+
text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
|
135
|
+
path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
|
136
|
+
|
137
|
+
path_tokens = path.collect do |segid|
|
138
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
139
|
+
text[range]
|
140
|
+
end
|
141
|
+
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-06-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -95,6 +95,7 @@ files:
|
|
95
95
|
- lib/rbbt/ner/oscar4.rb
|
96
96
|
- lib/rbbt/ner/patterns.rb
|
97
97
|
- lib/rbbt/ner/regexpNER.rb
|
98
|
+
- lib/rbbt/ner/rner.rb
|
98
99
|
- lib/rbbt/ner/rnorm.rb
|
99
100
|
- lib/rbbt/ner/rnorm/cue_index.rb
|
100
101
|
- lib/rbbt/ner/rnorm/tokens.rb
|
@@ -102,12 +103,15 @@ files:
|
|
102
103
|
- lib/rbbt/nlp/genia/sentence_splitter.rb
|
103
104
|
- lib/rbbt/nlp/nlp.rb
|
104
105
|
- lib/rbbt/nlp/open_nlp/sentence_splitter.rb
|
106
|
+
- lib/rbbt/nlp/spaCy.rb
|
107
|
+
- lib/rbbt/relationship.rb
|
105
108
|
- lib/rbbt/segment.rb
|
106
109
|
- lib/rbbt/segment/annotation.rb
|
107
110
|
- lib/rbbt/segment/encoding.rb
|
108
111
|
- lib/rbbt/segment/named_entity.rb
|
109
112
|
- lib/rbbt/segment/overlaps.rb
|
110
113
|
- lib/rbbt/segment/range_index.rb
|
114
|
+
- lib/rbbt/segment/relationship.rb
|
111
115
|
- lib/rbbt/segment/segmented.rb
|
112
116
|
- lib/rbbt/segment/token.rb
|
113
117
|
- lib/rbbt/segment/transformed.rb
|
@@ -124,6 +128,7 @@ files:
|
|
124
128
|
- share/install/software/OpenNLP
|
125
129
|
- share/install/software/StanfordParser
|
126
130
|
- share/patterns/drug_induce_disease
|
131
|
+
- share/rner/config.rb
|
127
132
|
- share/rnorm/cue_default
|
128
133
|
- share/rnorm/tokens_default
|
129
134
|
- share/wordlists/stopwords
|
@@ -146,6 +151,7 @@ files:
|
|
146
151
|
- test/rbbt/ner/test_oscar4.rb
|
147
152
|
- test/rbbt/ner/test_patterns.rb
|
148
153
|
- test/rbbt/ner/test_regexpNER.rb
|
154
|
+
- test/rbbt/ner/test_rner.rb
|
149
155
|
- test/rbbt/ner/test_rnorm.rb
|
150
156
|
- test/rbbt/ner/test_token_trieNER.rb
|
151
157
|
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|
@@ -161,6 +167,7 @@ files:
|
|
161
167
|
- test/rbbt/test_document.rb
|
162
168
|
- test/rbbt/test_segment.rb
|
163
169
|
- test/test_helper.rb
|
170
|
+
- test/test_spaCy.rb
|
164
171
|
homepage: http://github.com/mikisvaz/rbbt-util
|
165
172
|
licenses: []
|
166
173
|
metadata: {}
|
@@ -179,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
179
186
|
- !ruby/object:Gem::Version
|
180
187
|
version: '0'
|
181
188
|
requirements: []
|
182
|
-
rubygems_version: 3.
|
189
|
+
rubygems_version: 3.1.4
|
183
190
|
signing_key:
|
184
191
|
specification_version: 4
|
185
192
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|
@@ -207,6 +214,7 @@ test_files:
|
|
207
214
|
- test/rbbt/ner/test_banner.rb
|
208
215
|
- test/rbbt/ner/test_token_trieNER.rb
|
209
216
|
- test/rbbt/ner/test_finder.rb
|
217
|
+
- test/rbbt/ner/test_rner.rb
|
210
218
|
- test/rbbt/ner/test_linnaeus.rb
|
211
219
|
- test/rbbt/ner/test_oscar4.rb
|
212
220
|
- test/rbbt/test_segment.rb
|
@@ -217,4 +225,5 @@ test_files:
|
|
217
225
|
- test/rbbt/segment/test_encoding.rb
|
218
226
|
- test/rbbt/segment/test_range_index.rb
|
219
227
|
- test/rbbt/segment/test_corpus.rb
|
228
|
+
- test/test_spaCy.rb
|
220
229
|
- test/test_helper.rb
|