rbbt-text 1.3.0 → 1.3.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +20 -5
- data/lib/rbbt/document/annotation.rb +7 -4
- data/lib/rbbt/document/corpus.rb +30 -3
- data/lib/rbbt/document/corpus/pubmed.rb +2 -1
- data/lib/rbbt/ner/abner.rb +3 -2
- data/lib/rbbt/ner/banner.rb +3 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/g_norm_plus.rb +7 -1
- data/lib/rbbt/ner/linnaeus.rb +2 -1
- data/lib/rbbt/ner/patterns.rb +0 -1
- data/lib/rbbt/ner/rner.rb +229 -0
- data/lib/rbbt/ner/token_trieNER.rb +32 -18
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +195 -0
- data/lib/rbbt/relationship.rb +24 -0
- data/lib/rbbt/segment.rb +9 -4
- data/lib/rbbt/segment/annotation.rb +3 -3
- data/lib/rbbt/segment/named_entity.rb +7 -0
- data/lib/rbbt/segment/range_index.rb +1 -1
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/segment/transformed.rb +5 -1
- data/share/install/software/OpenNLP +1 -1
- data/share/rner/config.rb +51 -0
- data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
- data/test/rbbt/document/test_annotation.rb +15 -6
- data/test/rbbt/document/test_corpus.rb +15 -1
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
- data/test/rbbt/ner/test_rner.rb +132 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
- data/test/rbbt/segment/test_annotation.rb +3 -4
- data/test/rbbt/segment/test_encoding.rb +1 -1
- data/test/rbbt/segment/test_named_entity.rb +7 -5
- data/test/rbbt/segment/test_range_index.rb +1 -2
- data/test/rbbt/segment/test_transformed.rb +33 -4
- data/test/rbbt/test_segment.rb +5 -10
- data/test/test_spaCy.rb +144 -0
- metadata +12 -3
@@ -22,12 +22,14 @@ class TestClass < Test::Unit::TestCase
|
|
22
22
|
assert_equal "SCORE", a.score
|
23
23
|
end
|
24
24
|
|
25
|
-
def
|
25
|
+
def test_tsv
|
26
26
|
a = "test"
|
27
|
-
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
28
|
-
|
29
|
-
assert
|
30
|
-
assert
|
27
|
+
NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
|
28
|
+
ppp Annotated.tsv([a,a])
|
29
|
+
assert Annotated.tsv([a]).fields.include? "code"
|
30
|
+
assert Annotated.tsv([a], nil).fields.include? "code"
|
31
|
+
assert Annotated.tsv([a], :all).fields.include? "code"
|
32
|
+
assert Annotated.tsv([a], :all).fields.include? "literal"
|
31
33
|
end
|
32
34
|
|
33
35
|
def __test_segment_brat
|
@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
|
|
9
9
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
10
10
|
Document.setup(text, "TEST", "test_doc1", nil)
|
11
11
|
|
12
|
-
corpus = {}
|
13
|
-
corpus.extend Document::Corpus
|
12
|
+
corpus = Document::Corpus.setup({})
|
14
13
|
|
15
14
|
corpus.add_document(text)
|
16
15
|
|
@@ -101,6 +101,35 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
101
101
|
assert_equal "CDK5R1 protein", exp2
|
102
102
|
end
|
103
103
|
|
104
|
+
def test_with_transform_sentences
|
105
|
+
a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
|
+
original = a.dup
|
107
|
+
|
108
|
+
gene1 = "TP53"
|
109
|
+
gene1.extend NamedEntity
|
110
|
+
gene1.offset = a.index gene1
|
111
|
+
|
112
|
+
gene2 = "CDK5R1"
|
113
|
+
gene2.extend NamedEntity
|
114
|
+
gene2.offset = a.index gene2
|
115
|
+
|
116
|
+
bread = "Bread"
|
117
|
+
bread.extend NamedEntity
|
118
|
+
bread.offset = a.index bread
|
119
|
+
|
120
|
+
sentences = Segment.align(a, a.split(". "))
|
121
|
+
|
122
|
+
Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
|
123
|
+
assert sentences[1].include?("GN gene and the GN protein")
|
124
|
+
end
|
125
|
+
|
126
|
+
Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
|
127
|
+
assert sentences[0].include?("first sentence mentions BR")
|
128
|
+
end
|
129
|
+
|
130
|
+
|
131
|
+
end
|
132
|
+
|
104
133
|
def test_html
|
105
134
|
a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
106
135
|
|
@@ -115,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
115
144
|
gene2.entity_type = "Protein"
|
116
145
|
|
117
146
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
118
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
147
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
119
148
|
end
|
120
149
|
end
|
121
150
|
|
@@ -136,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
136
165
|
gene2.entity_type = "Protein"
|
137
166
|
|
138
167
|
Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
|
139
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
|
168
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
|
140
169
|
end
|
141
170
|
end
|
142
171
|
|
@@ -156,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
|
|
156
185
|
assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
|
157
186
|
|
158
187
|
Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
|
159
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
188
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
|
160
189
|
Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
|
161
|
-
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
190
|
+
assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
|
162
191
|
end
|
163
192
|
end
|
164
193
|
end
|
data/test/rbbt/test_segment.rb
CHANGED
@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
|
|
17
17
|
text = "This is a document"
|
18
18
|
Document.setup(text, "TEST", "test_doc1", nil)
|
19
19
|
|
20
|
-
corpus = {}
|
21
|
-
corpus.extend Document::Corpus
|
20
|
+
corpus = Document::Corpus.setup({})
|
22
21
|
|
23
22
|
corpus.add_document(text)
|
24
23
|
|
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
|
|
41
40
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
42
41
|
Document.setup(text, "TEST", "test_doc1", nil)
|
43
42
|
|
44
|
-
corpus = {}
|
45
|
-
corpus.extend Document::Corpus
|
43
|
+
corpus = Document::Corpus.setup({})
|
46
44
|
|
47
45
|
corpus.add_document(text)
|
48
46
|
|
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
|
|
65
63
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
66
64
|
Document.setup(text, "TEST", "test_doc1", nil)
|
67
65
|
|
68
|
-
corpus = {}
|
69
|
-
corpus.extend Document::Corpus
|
66
|
+
corpus = Document::Corpus.setup({})
|
70
67
|
|
71
68
|
corpus.add_document(text)
|
72
69
|
|
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
|
|
94
91
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
95
92
|
Document.setup(text, "TEST", "test_doc1", nil)
|
96
93
|
|
97
|
-
corpus = {}
|
98
|
-
corpus.extend Document::Corpus
|
94
|
+
corpus = Document::Corpus.setup({})
|
99
95
|
|
100
96
|
corpus.add_document(text)
|
101
97
|
|
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
|
|
142
138
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
143
139
|
Document.setup(text, "TEST", "test_doc1", nil)
|
144
140
|
|
145
|
-
corpus = {}
|
146
|
-
corpus.extend Document::Corpus
|
141
|
+
corpus = Document::Corpus.setup({})
|
147
142
|
|
148
143
|
corpus.add_document(text)
|
149
144
|
|
data/test/test_spaCy.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
|
2
|
+
require 'rbbt/nlp/spaCy'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
|
5
|
+
class TestSpaCy < Test::Unit::TestCase
|
6
|
+
def test_tokens
|
7
|
+
text = "I tell a story"
|
8
|
+
|
9
|
+
tokens = SpaCy.tokens(text)
|
10
|
+
|
11
|
+
assert_equal 4, tokens.length
|
12
|
+
assert_equal "tell", tokens[1].to_s
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_chunks
|
16
|
+
text = "Miguel Vazquez tell a good story"
|
17
|
+
|
18
|
+
tokens = SpaCy.chunks(text)
|
19
|
+
|
20
|
+
assert_equal 2, tokens.length
|
21
|
+
assert_equal "Miguel Vazquez", tokens[0].to_s
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
def test_segments
|
26
|
+
text = "I tell a story. It's a very good story."
|
27
|
+
|
28
|
+
corpus = Document::Corpus.setup({})
|
29
|
+
|
30
|
+
Document.setup(text, "TEST", "test_doc1", "simple_sentence")
|
31
|
+
|
32
|
+
corpus.add_document text
|
33
|
+
text.corpus = corpus
|
34
|
+
|
35
|
+
segments = SpaCy.segments(text)
|
36
|
+
|
37
|
+
segments.each do |segment|
|
38
|
+
assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
def test_chunk_segments
|
43
|
+
text = "I tell a story. It's a very good story."
|
44
|
+
|
45
|
+
corpus = Document::Corpus.setup({})
|
46
|
+
|
47
|
+
Document.setup(text, "TEST", "test_doc1", "simple_sentence")
|
48
|
+
|
49
|
+
corpus.add_document text
|
50
|
+
text.corpus = corpus
|
51
|
+
|
52
|
+
segments = SpaCy.chunk_segments(text)
|
53
|
+
|
54
|
+
segments.each do |segment|
|
55
|
+
assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_dep_graph
|
60
|
+
text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
|
61
|
+
graph = SpaCy.dep_graph(text, true)
|
62
|
+
|
63
|
+
tokens = SpaCy.segments(text)
|
64
|
+
index = Segment.index tokens
|
65
|
+
tf_s = tokens.select{|t| t == "TF" }.first
|
66
|
+
tg_s = tokens.select{|t| t == "TG" }.first
|
67
|
+
|
68
|
+
require 'rbbt/network/paths'
|
69
|
+
|
70
|
+
path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
|
71
|
+
path_tokens = path.collect do |segid|
|
72
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
73
|
+
text[range]
|
74
|
+
end
|
75
|
+
|
76
|
+
assert path_tokens.include? 'increase'
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_chunk_dep_graph
|
81
|
+
text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
|
82
|
+
graph = SpaCy.chunk_dep_graph(text, true)
|
83
|
+
|
84
|
+
tokens = SpaCy.chunk_segments(text)
|
85
|
+
index = Segment.index tokens
|
86
|
+
tf_s = tokens.select{|t| t.include? "TF" }.first
|
87
|
+
tg_s = tokens.select{|t| t.include? "TG" }.first
|
88
|
+
|
89
|
+
|
90
|
+
require 'rbbt/network/paths'
|
91
|
+
|
92
|
+
path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
|
93
|
+
path_tokens = path.collect do |segid|
|
94
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
95
|
+
text[range]
|
96
|
+
end
|
97
|
+
|
98
|
+
assert path_tokens.include? 'increase'
|
99
|
+
end
|
100
|
+
|
101
|
+
def test_paths
|
102
|
+
text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
|
103
|
+
path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
|
104
|
+
|
105
|
+
|
106
|
+
path_tokens = path.collect do |segid|
|
107
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
108
|
+
text[range]
|
109
|
+
end
|
110
|
+
|
111
|
+
ppp text
|
112
|
+
iii path_tokens
|
113
|
+
|
114
|
+
assert path_tokens.include? 'increase'
|
115
|
+
end
|
116
|
+
|
117
|
+
def test_paths2
|
118
|
+
text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
|
119
|
+
path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
|
120
|
+
|
121
|
+
|
122
|
+
path_tokens = path.collect do |segid|
|
123
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
124
|
+
text[range]
|
125
|
+
end
|
126
|
+
|
127
|
+
iii path_tokens
|
128
|
+
|
129
|
+
|
130
|
+
assert path_tokens.include? 'regulation'
|
131
|
+
end
|
132
|
+
|
133
|
+
def test_paths3
|
134
|
+
text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
|
135
|
+
path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
|
136
|
+
|
137
|
+
path_tokens = path.collect do |segid|
|
138
|
+
range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
|
139
|
+
text[range]
|
140
|
+
end
|
141
|
+
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.0
|
4
|
+
version: 1.3.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-06-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -95,6 +95,7 @@ files:
|
|
95
95
|
- lib/rbbt/ner/oscar4.rb
|
96
96
|
- lib/rbbt/ner/patterns.rb
|
97
97
|
- lib/rbbt/ner/regexpNER.rb
|
98
|
+
- lib/rbbt/ner/rner.rb
|
98
99
|
- lib/rbbt/ner/rnorm.rb
|
99
100
|
- lib/rbbt/ner/rnorm/cue_index.rb
|
100
101
|
- lib/rbbt/ner/rnorm/tokens.rb
|
@@ -102,12 +103,15 @@ files:
|
|
102
103
|
- lib/rbbt/nlp/genia/sentence_splitter.rb
|
103
104
|
- lib/rbbt/nlp/nlp.rb
|
104
105
|
- lib/rbbt/nlp/open_nlp/sentence_splitter.rb
|
106
|
+
- lib/rbbt/nlp/spaCy.rb
|
107
|
+
- lib/rbbt/relationship.rb
|
105
108
|
- lib/rbbt/segment.rb
|
106
109
|
- lib/rbbt/segment/annotation.rb
|
107
110
|
- lib/rbbt/segment/encoding.rb
|
108
111
|
- lib/rbbt/segment/named_entity.rb
|
109
112
|
- lib/rbbt/segment/overlaps.rb
|
110
113
|
- lib/rbbt/segment/range_index.rb
|
114
|
+
- lib/rbbt/segment/relationship.rb
|
111
115
|
- lib/rbbt/segment/segmented.rb
|
112
116
|
- lib/rbbt/segment/token.rb
|
113
117
|
- lib/rbbt/segment/transformed.rb
|
@@ -124,6 +128,7 @@ files:
|
|
124
128
|
- share/install/software/OpenNLP
|
125
129
|
- share/install/software/StanfordParser
|
126
130
|
- share/patterns/drug_induce_disease
|
131
|
+
- share/rner/config.rb
|
127
132
|
- share/rnorm/cue_default
|
128
133
|
- share/rnorm/tokens_default
|
129
134
|
- share/wordlists/stopwords
|
@@ -146,6 +151,7 @@ files:
|
|
146
151
|
- test/rbbt/ner/test_oscar4.rb
|
147
152
|
- test/rbbt/ner/test_patterns.rb
|
148
153
|
- test/rbbt/ner/test_regexpNER.rb
|
154
|
+
- test/rbbt/ner/test_rner.rb
|
149
155
|
- test/rbbt/ner/test_rnorm.rb
|
150
156
|
- test/rbbt/ner/test_token_trieNER.rb
|
151
157
|
- test/rbbt/nlp/genia/test_sentence_splitter.rb
|
@@ -161,6 +167,7 @@ files:
|
|
161
167
|
- test/rbbt/test_document.rb
|
162
168
|
- test/rbbt/test_segment.rb
|
163
169
|
- test/test_helper.rb
|
170
|
+
- test/test_spaCy.rb
|
164
171
|
homepage: http://github.com/mikisvaz/rbbt-util
|
165
172
|
licenses: []
|
166
173
|
metadata: {}
|
@@ -179,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
179
186
|
- !ruby/object:Gem::Version
|
180
187
|
version: '0'
|
181
188
|
requirements: []
|
182
|
-
rubygems_version: 3.
|
189
|
+
rubygems_version: 3.1.4
|
183
190
|
signing_key:
|
184
191
|
specification_version: 4
|
185
192
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|
@@ -207,6 +214,7 @@ test_files:
|
|
207
214
|
- test/rbbt/ner/test_banner.rb
|
208
215
|
- test/rbbt/ner/test_token_trieNER.rb
|
209
216
|
- test/rbbt/ner/test_finder.rb
|
217
|
+
- test/rbbt/ner/test_rner.rb
|
210
218
|
- test/rbbt/ner/test_linnaeus.rb
|
211
219
|
- test/rbbt/ner/test_oscar4.rb
|
212
220
|
- test/rbbt/test_segment.rb
|
@@ -217,4 +225,5 @@ test_files:
|
|
217
225
|
- test/rbbt/segment/test_encoding.rb
|
218
226
|
- test/rbbt/segment/test_range_index.rb
|
219
227
|
- test/rbbt/segment/test_corpus.rb
|
228
|
+
- test/test_spaCy.rb
|
220
229
|
- test/test_helper.rb
|