rbbt-text 1.3.0 → 1.3.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +20 -5
  5. data/lib/rbbt/document/annotation.rb +7 -4
  6. data/lib/rbbt/document/corpus.rb +30 -3
  7. data/lib/rbbt/document/corpus/pubmed.rb +2 -1
  8. data/lib/rbbt/ner/abner.rb +3 -2
  9. data/lib/rbbt/ner/banner.rb +3 -1
  10. data/lib/rbbt/ner/brat.rb +1 -1
  11. data/lib/rbbt/ner/g_norm_plus.rb +7 -1
  12. data/lib/rbbt/ner/linnaeus.rb +2 -1
  13. data/lib/rbbt/ner/patterns.rb +0 -1
  14. data/lib/rbbt/ner/rner.rb +229 -0
  15. data/lib/rbbt/ner/token_trieNER.rb +32 -18
  16. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  17. data/lib/rbbt/nlp/spaCy.rb +195 -0
  18. data/lib/rbbt/relationship.rb +24 -0
  19. data/lib/rbbt/segment.rb +9 -4
  20. data/lib/rbbt/segment/annotation.rb +3 -3
  21. data/lib/rbbt/segment/named_entity.rb +7 -0
  22. data/lib/rbbt/segment/range_index.rb +1 -1
  23. data/lib/rbbt/segment/relationship.rb +7 -0
  24. data/lib/rbbt/segment/transformed.rb +5 -1
  25. data/share/install/software/OpenNLP +1 -1
  26. data/share/rner/config.rb +51 -0
  27. data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
  28. data/test/rbbt/document/test_annotation.rb +15 -6
  29. data/test/rbbt/document/test_corpus.rb +15 -1
  30. data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
  31. data/test/rbbt/ner/test_rner.rb +132 -0
  32. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
  33. data/test/rbbt/segment/test_annotation.rb +3 -4
  34. data/test/rbbt/segment/test_encoding.rb +1 -1
  35. data/test/rbbt/segment/test_named_entity.rb +7 -5
  36. data/test/rbbt/segment/test_range_index.rb +1 -2
  37. data/test/rbbt/segment/test_transformed.rb +33 -4
  38. data/test/rbbt/test_segment.rb +5 -10
  39. data/test/test_spaCy.rb +144 -0
  40. metadata +12 -3
@@ -22,12 +22,14 @@ class TestClass < Test::Unit::TestCase
22
22
  assert_equal "SCORE", a.score
23
23
  end
24
24
 
25
- def __test_tsv
25
+ def test_tsv
26
26
  a = "test"
27
- NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
28
- assert Segment.tsv([a]).fields.include? "code"
29
- assert Segment.tsv([a], nil).fields.include? "code"
30
- assert Segment.tsv([a], "literal").fields.include? "code"
27
+ NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
28
+ ppp Annotated.tsv([a,a])
29
+ assert Annotated.tsv([a]).fields.include? "code"
30
+ assert Annotated.tsv([a], nil).fields.include? "code"
31
+ assert Annotated.tsv([a], :all).fields.include? "code"
32
+ assert Annotated.tsv([a], :all).fields.include? "literal"
31
33
  end
32
34
 
33
35
  def __test_segment_brat
@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
9
9
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
10
10
  Document.setup(text, "TEST", "test_doc1", nil)
11
11
 
12
- corpus = {}
13
- corpus.extend Document::Corpus
12
+ corpus = Document::Corpus.setup({})
14
13
 
15
14
  corpus.add_document(text)
16
15
 
@@ -101,6 +101,35 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
101
101
  assert_equal "CDK5R1 protein", exp2
102
102
  end
103
103
 
104
+ def test_with_transform_sentences
105
+ a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
106
+ original = a.dup
107
+
108
+ gene1 = "TP53"
109
+ gene1.extend NamedEntity
110
+ gene1.offset = a.index gene1
111
+
112
+ gene2 = "CDK5R1"
113
+ gene2.extend NamedEntity
114
+ gene2.offset = a.index gene2
115
+
116
+ bread = "Bread"
117
+ bread.extend NamedEntity
118
+ bread.offset = a.index bread
119
+
120
+ sentences = Segment.align(a, a.split(". "))
121
+
122
+ Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
123
+ assert sentences[1].include?("GN gene and the GN protein")
124
+ end
125
+
126
+ Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
127
+ assert sentences[0].include?("first sentence mentions BR")
128
+ end
129
+
130
+
131
+ end
132
+
104
133
  def test_html
105
134
  a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
106
135
 
@@ -115,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
115
144
  gene2.entity_type = "Protein"
116
145
 
117
146
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
118
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
147
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
119
148
  end
120
149
  end
121
150
 
@@ -136,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
136
165
  gene2.entity_type = "Protein"
137
166
 
138
167
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
139
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
168
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
140
169
  end
141
170
  end
142
171
 
@@ -156,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
156
185
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
157
186
 
158
187
  Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
159
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
188
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
160
189
  Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
161
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
190
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
162
191
  end
163
192
  end
164
193
  end
@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
17
17
  text = "This is a document"
18
18
  Document.setup(text, "TEST", "test_doc1", nil)
19
19
 
20
- corpus = {}
21
- corpus.extend Document::Corpus
20
+ corpus = Document::Corpus.setup({})
22
21
 
23
22
  corpus.add_document(text)
24
23
 
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
41
40
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
42
41
  Document.setup(text, "TEST", "test_doc1", nil)
43
42
 
44
- corpus = {}
45
- corpus.extend Document::Corpus
43
+ corpus = Document::Corpus.setup({})
46
44
 
47
45
  corpus.add_document(text)
48
46
 
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
65
63
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
66
64
  Document.setup(text, "TEST", "test_doc1", nil)
67
65
 
68
- corpus = {}
69
- corpus.extend Document::Corpus
66
+ corpus = Document::Corpus.setup({})
70
67
 
71
68
  corpus.add_document(text)
72
69
 
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
94
91
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
95
92
  Document.setup(text, "TEST", "test_doc1", nil)
96
93
 
97
- corpus = {}
98
- corpus.extend Document::Corpus
94
+ corpus = Document::Corpus.setup({})
99
95
 
100
96
  corpus.add_document(text)
101
97
 
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
142
138
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
143
139
  Document.setup(text, "TEST", "test_doc1", nil)
144
140
 
145
- corpus = {}
146
- corpus.extend Document::Corpus
141
+ corpus = Document::Corpus.setup({})
147
142
 
148
143
  corpus.add_document(text)
149
144
 
@@ -0,0 +1,144 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
2
+ require 'rbbt/nlp/spaCy'
3
+ require 'rbbt/document/corpus'
4
+
5
+ class TestSpaCy < Test::Unit::TestCase
6
+ def test_tokens
7
+ text = "I tell a story"
8
+
9
+ tokens = SpaCy.tokens(text)
10
+
11
+ assert_equal 4, tokens.length
12
+ assert_equal "tell", tokens[1].to_s
13
+ end
14
+
15
+ def test_chunks
16
+ text = "Miguel Vazquez tell a good story"
17
+
18
+ tokens = SpaCy.chunks(text)
19
+
20
+ assert_equal 2, tokens.length
21
+ assert_equal "Miguel Vazquez", tokens[0].to_s
22
+ end
23
+
24
+
25
+ def test_segments
26
+ text = "I tell a story. It's a very good story."
27
+
28
+ corpus = Document::Corpus.setup({})
29
+
30
+ Document.setup(text, "TEST", "test_doc1", "simple_sentence")
31
+
32
+ corpus.add_document text
33
+ text.corpus = corpus
34
+
35
+ segments = SpaCy.segments(text)
36
+
37
+ segments.each do |segment|
38
+ assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
39
+ end
40
+ end
41
+
42
+ def test_chunk_segments
43
+ text = "I tell a story. It's a very good story."
44
+
45
+ corpus = Document::Corpus.setup({})
46
+
47
+ Document.setup(text, "TEST", "test_doc1", "simple_sentence")
48
+
49
+ corpus.add_document text
50
+ text.corpus = corpus
51
+
52
+ segments = SpaCy.chunk_segments(text)
53
+
54
+ segments.each do |segment|
55
+ assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
56
+ end
57
+ end
58
+
59
+ def test_dep_graph
60
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
61
+ graph = SpaCy.dep_graph(text, true)
62
+
63
+ tokens = SpaCy.segments(text)
64
+ index = Segment.index tokens
65
+ tf_s = tokens.select{|t| t == "TF" }.first
66
+ tg_s = tokens.select{|t| t == "TG" }.first
67
+
68
+ require 'rbbt/network/paths'
69
+
70
+ path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
71
+ path_tokens = path.collect do |segid|
72
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
73
+ text[range]
74
+ end
75
+
76
+ assert path_tokens.include? 'increase'
77
+
78
+ end
79
+
80
+ def test_chunk_dep_graph
81
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
82
+ graph = SpaCy.chunk_dep_graph(text, true)
83
+
84
+ tokens = SpaCy.chunk_segments(text)
85
+ index = Segment.index tokens
86
+ tf_s = tokens.select{|t| t.include? "TF" }.first
87
+ tg_s = tokens.select{|t| t.include? "TG" }.first
88
+
89
+
90
+ require 'rbbt/network/paths'
91
+
92
+ path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
93
+ path_tokens = path.collect do |segid|
94
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
95
+ text[range]
96
+ end
97
+
98
+ assert path_tokens.include? 'increase'
99
+ end
100
+
101
+ def test_paths
102
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
103
+ path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
104
+
105
+
106
+ path_tokens = path.collect do |segid|
107
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
108
+ text[range]
109
+ end
110
+
111
+ ppp text
112
+ iii path_tokens
113
+
114
+ assert path_tokens.include? 'increase'
115
+ end
116
+
117
+ def test_paths2
118
+ text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
119
+ path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
120
+
121
+
122
+ path_tokens = path.collect do |segid|
123
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
124
+ text[range]
125
+ end
126
+
127
+ iii path_tokens
128
+
129
+
130
+ assert path_tokens.include? 'regulation'
131
+ end
132
+
133
+ def test_paths3
134
+ text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
135
+ path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
136
+
137
+ path_tokens = path.collect do |segid|
138
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
139
+ text[range]
140
+ end
141
+
142
+ end
143
+ end
144
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-18 00:00:00.000000000 Z
11
+ date: 2021-06-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -95,6 +95,7 @@ files:
95
95
  - lib/rbbt/ner/oscar4.rb
96
96
  - lib/rbbt/ner/patterns.rb
97
97
  - lib/rbbt/ner/regexpNER.rb
98
+ - lib/rbbt/ner/rner.rb
98
99
  - lib/rbbt/ner/rnorm.rb
99
100
  - lib/rbbt/ner/rnorm/cue_index.rb
100
101
  - lib/rbbt/ner/rnorm/tokens.rb
@@ -102,12 +103,15 @@ files:
102
103
  - lib/rbbt/nlp/genia/sentence_splitter.rb
103
104
  - lib/rbbt/nlp/nlp.rb
104
105
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
106
+ - lib/rbbt/nlp/spaCy.rb
107
+ - lib/rbbt/relationship.rb
105
108
  - lib/rbbt/segment.rb
106
109
  - lib/rbbt/segment/annotation.rb
107
110
  - lib/rbbt/segment/encoding.rb
108
111
  - lib/rbbt/segment/named_entity.rb
109
112
  - lib/rbbt/segment/overlaps.rb
110
113
  - lib/rbbt/segment/range_index.rb
114
+ - lib/rbbt/segment/relationship.rb
111
115
  - lib/rbbt/segment/segmented.rb
112
116
  - lib/rbbt/segment/token.rb
113
117
  - lib/rbbt/segment/transformed.rb
@@ -124,6 +128,7 @@ files:
124
128
  - share/install/software/OpenNLP
125
129
  - share/install/software/StanfordParser
126
130
  - share/patterns/drug_induce_disease
131
+ - share/rner/config.rb
127
132
  - share/rnorm/cue_default
128
133
  - share/rnorm/tokens_default
129
134
  - share/wordlists/stopwords
@@ -146,6 +151,7 @@ files:
146
151
  - test/rbbt/ner/test_oscar4.rb
147
152
  - test/rbbt/ner/test_patterns.rb
148
153
  - test/rbbt/ner/test_regexpNER.rb
154
+ - test/rbbt/ner/test_rner.rb
149
155
  - test/rbbt/ner/test_rnorm.rb
150
156
  - test/rbbt/ner/test_token_trieNER.rb
151
157
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
@@ -161,6 +167,7 @@ files:
161
167
  - test/rbbt/test_document.rb
162
168
  - test/rbbt/test_segment.rb
163
169
  - test/test_helper.rb
170
+ - test/test_spaCy.rb
164
171
  homepage: http://github.com/mikisvaz/rbbt-util
165
172
  licenses: []
166
173
  metadata: {}
@@ -179,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
179
186
  - !ruby/object:Gem::Version
180
187
  version: '0'
181
188
  requirements: []
182
- rubygems_version: 3.0.6
189
+ rubygems_version: 3.1.4
183
190
  signing_key:
184
191
  specification_version: 4
185
192
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -207,6 +214,7 @@ test_files:
207
214
  - test/rbbt/ner/test_banner.rb
208
215
  - test/rbbt/ner/test_token_trieNER.rb
209
216
  - test/rbbt/ner/test_finder.rb
217
+ - test/rbbt/ner/test_rner.rb
210
218
  - test/rbbt/ner/test_linnaeus.rb
211
219
  - test/rbbt/ner/test_oscar4.rb
212
220
  - test/rbbt/test_segment.rb
@@ -217,4 +225,5 @@ test_files:
217
225
  - test/rbbt/segment/test_encoding.rb
218
226
  - test/rbbt/segment/test_range_index.rb
219
227
  - test/rbbt/segment/test_corpus.rb
228
+ - test/test_spaCy.rb
220
229
  - test/test_helper.rb