rbbt-text 1.3.0 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +20 -5
  5. data/lib/rbbt/document/annotation.rb +7 -4
  6. data/lib/rbbt/document/corpus.rb +30 -3
  7. data/lib/rbbt/document/corpus/pubmed.rb +2 -1
  8. data/lib/rbbt/ner/abner.rb +3 -2
  9. data/lib/rbbt/ner/banner.rb +3 -1
  10. data/lib/rbbt/ner/brat.rb +1 -1
  11. data/lib/rbbt/ner/g_norm_plus.rb +7 -1
  12. data/lib/rbbt/ner/linnaeus.rb +2 -1
  13. data/lib/rbbt/ner/patterns.rb +0 -1
  14. data/lib/rbbt/ner/rner.rb +229 -0
  15. data/lib/rbbt/ner/token_trieNER.rb +32 -18
  16. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  17. data/lib/rbbt/nlp/spaCy.rb +195 -0
  18. data/lib/rbbt/relationship.rb +24 -0
  19. data/lib/rbbt/segment.rb +9 -4
  20. data/lib/rbbt/segment/annotation.rb +3 -3
  21. data/lib/rbbt/segment/named_entity.rb +7 -0
  22. data/lib/rbbt/segment/range_index.rb +1 -1
  23. data/lib/rbbt/segment/relationship.rb +7 -0
  24. data/lib/rbbt/segment/transformed.rb +5 -1
  25. data/share/install/software/OpenNLP +1 -1
  26. data/share/rner/config.rb +51 -0
  27. data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
  28. data/test/rbbt/document/test_annotation.rb +15 -6
  29. data/test/rbbt/document/test_corpus.rb +15 -1
  30. data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
  31. data/test/rbbt/ner/test_rner.rb +132 -0
  32. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
  33. data/test/rbbt/segment/test_annotation.rb +3 -4
  34. data/test/rbbt/segment/test_encoding.rb +1 -1
  35. data/test/rbbt/segment/test_named_entity.rb +7 -5
  36. data/test/rbbt/segment/test_range_index.rb +1 -2
  37. data/test/rbbt/segment/test_transformed.rb +33 -4
  38. data/test/rbbt/test_segment.rb +5 -10
  39. data/test/test_spaCy.rb +144 -0
  40. metadata +12 -3
@@ -22,12 +22,14 @@ class TestClass < Test::Unit::TestCase
22
22
  assert_equal "SCORE", a.score
23
23
  end
24
24
 
25
- def __test_tsv
25
+ def test_tsv
26
26
  a = "test"
27
- NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
28
- assert Segment.tsv([a]).fields.include? "code"
29
- assert Segment.tsv([a], nil).fields.include? "code"
30
- assert Segment.tsv([a], "literal").fields.include? "code"
27
+ NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
28
+ ppp Annotated.tsv([a,a])
29
+ assert Annotated.tsv([a]).fields.include? "code"
30
+ assert Annotated.tsv([a], nil).fields.include? "code"
31
+ assert Annotated.tsv([a], :all).fields.include? "code"
32
+ assert Annotated.tsv([a], :all).fields.include? "literal"
31
33
  end
32
34
 
33
35
  def __test_segment_brat
@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
9
9
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
10
10
  Document.setup(text, "TEST", "test_doc1", nil)
11
11
 
12
- corpus = {}
13
- corpus.extend Document::Corpus
12
+ corpus = Document::Corpus.setup({})
14
13
 
15
14
  corpus.add_document(text)
16
15
 
@@ -101,6 +101,35 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
101
101
  assert_equal "CDK5R1 protein", exp2
102
102
  end
103
103
 
104
+ def test_with_transform_sentences
105
+ a = "This first sentence mentions Bread. This sentence mentions the TP53 gene and the CDK5R1 protein"
106
+ original = a.dup
107
+
108
+ gene1 = "TP53"
109
+ gene1.extend NamedEntity
110
+ gene1.offset = a.index gene1
111
+
112
+ gene2 = "CDK5R1"
113
+ gene2.extend NamedEntity
114
+ gene2.offset = a.index gene2
115
+
116
+ bread = "Bread"
117
+ bread.extend NamedEntity
118
+ bread.offset = a.index bread
119
+
120
+ sentences = Segment.align(a, a.split(". "))
121
+
122
+ Transformed.with_transform(sentences[1], [gene1, gene2, bread], "GN") do
123
+ assert sentences[1].include?("GN gene and the GN protein")
124
+ end
125
+
126
+ Transformed.with_transform(sentences[0], [gene1, gene2, bread], "BR") do
127
+ assert sentences[0].include?("first sentence mentions BR")
128
+ end
129
+
130
+
131
+ end
132
+
104
133
  def test_html
105
134
  a = "This sentence mentions the TP53 gene and the CDK5R1 protein"
106
135
 
@@ -115,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
115
144
  gene2.entity_type = "Protein"
116
145
 
117
146
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
118
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
147
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
119
148
  end
120
149
  end
121
150
 
@@ -136,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
136
165
  gene2.entity_type = "Protein"
137
166
 
138
167
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
139
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
168
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
140
169
  end
141
170
  end
142
171
 
@@ -156,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
156
185
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
157
186
 
158
187
  Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
159
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
188
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
160
189
  Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
161
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
190
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
162
191
  end
163
192
  end
164
193
  end
@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
17
17
  text = "This is a document"
18
18
  Document.setup(text, "TEST", "test_doc1", nil)
19
19
 
20
- corpus = {}
21
- corpus.extend Document::Corpus
20
+ corpus = Document::Corpus.setup({})
22
21
 
23
22
  corpus.add_document(text)
24
23
 
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
41
40
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
42
41
  Document.setup(text, "TEST", "test_doc1", nil)
43
42
 
44
- corpus = {}
45
- corpus.extend Document::Corpus
43
+ corpus = Document::Corpus.setup({})
46
44
 
47
45
  corpus.add_document(text)
48
46
 
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
65
63
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
66
64
  Document.setup(text, "TEST", "test_doc1", nil)
67
65
 
68
- corpus = {}
69
- corpus.extend Document::Corpus
66
+ corpus = Document::Corpus.setup({})
70
67
 
71
68
  corpus.add_document(text)
72
69
 
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
94
91
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
95
92
  Document.setup(text, "TEST", "test_doc1", nil)
96
93
 
97
- corpus = {}
98
- corpus.extend Document::Corpus
94
+ corpus = Document::Corpus.setup({})
99
95
 
100
96
  corpus.add_document(text)
101
97
 
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
142
138
  text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
143
139
  Document.setup(text, "TEST", "test_doc1", nil)
144
140
 
145
- corpus = {}
146
- corpus.extend Document::Corpus
141
+ corpus = Document::Corpus.setup({})
147
142
 
148
143
  corpus.add_document(text)
149
144
 
@@ -0,0 +1,144 @@
1
+ require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
2
+ require 'rbbt/nlp/spaCy'
3
+ require 'rbbt/document/corpus'
4
+
5
+ class TestSpaCy < Test::Unit::TestCase
6
+ def test_tokens
7
+ text = "I tell a story"
8
+
9
+ tokens = SpaCy.tokens(text)
10
+
11
+ assert_equal 4, tokens.length
12
+ assert_equal "tell", tokens[1].to_s
13
+ end
14
+
15
+ def test_chunks
16
+ text = "Miguel Vazquez tell a good story"
17
+
18
+ tokens = SpaCy.chunks(text)
19
+
20
+ assert_equal 2, tokens.length
21
+ assert_equal "Miguel Vazquez", tokens[0].to_s
22
+ end
23
+
24
+
25
+ def test_segments
26
+ text = "I tell a story. It's a very good story."
27
+
28
+ corpus = Document::Corpus.setup({})
29
+
30
+ Document.setup(text, "TEST", "test_doc1", "simple_sentence")
31
+
32
+ corpus.add_document text
33
+ text.corpus = corpus
34
+
35
+ segments = SpaCy.segments(text)
36
+
37
+ segments.each do |segment|
38
+ assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
39
+ end
40
+ end
41
+
42
+ def test_chunk_segments
43
+ text = "I tell a story. It's a very good story."
44
+
45
+ corpus = Document::Corpus.setup({})
46
+
47
+ Document.setup(text, "TEST", "test_doc1", "simple_sentence")
48
+
49
+ corpus.add_document text
50
+ text.corpus = corpus
51
+
52
+ segments = SpaCy.chunk_segments(text)
53
+
54
+ segments.each do |segment|
55
+ assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
56
+ end
57
+ end
58
+
59
+ def test_dep_graph
60
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
61
+ graph = SpaCy.dep_graph(text, true)
62
+
63
+ tokens = SpaCy.segments(text)
64
+ index = Segment.index tokens
65
+ tf_s = tokens.select{|t| t == "TF" }.first
66
+ tg_s = tokens.select{|t| t == "TG" }.first
67
+
68
+ require 'rbbt/network/paths'
69
+
70
+ path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
71
+ path_tokens = path.collect do |segid|
72
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
73
+ text[range]
74
+ end
75
+
76
+ assert path_tokens.include? 'increase'
77
+
78
+ end
79
+
80
+ def test_chunk_dep_graph
81
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
82
+ graph = SpaCy.chunk_dep_graph(text, true)
83
+
84
+ tokens = SpaCy.chunk_segments(text)
85
+ index = Segment.index tokens
86
+ tf_s = tokens.select{|t| t.include? "TF" }.first
87
+ tg_s = tokens.select{|t| t.include? "TG" }.first
88
+
89
+
90
+ require 'rbbt/network/paths'
91
+
92
+ path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
93
+ path_tokens = path.collect do |segid|
94
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
95
+ text[range]
96
+ end
97
+
98
+ assert path_tokens.include? 'increase'
99
+ end
100
+
101
+ def test_paths
102
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
103
+ path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
104
+
105
+
106
+ path_tokens = path.collect do |segid|
107
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
108
+ text[range]
109
+ end
110
+
111
+ ppp text
112
+ iii path_tokens
113
+
114
+ assert path_tokens.include? 'increase'
115
+ end
116
+
117
+ def test_paths2
118
+ text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
119
+ path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
120
+
121
+
122
+ path_tokens = path.collect do |segid|
123
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
124
+ text[range]
125
+ end
126
+
127
+ iii path_tokens
128
+
129
+
130
+ assert path_tokens.include? 'regulation'
131
+ end
132
+
133
+ def test_paths3
134
+ text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
135
+ path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
136
+
137
+ path_tokens = path.collect do |segid|
138
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
139
+ text[range]
140
+ end
141
+
142
+ end
143
+ end
144
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-04-18 00:00:00.000000000 Z
11
+ date: 2021-06-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -95,6 +95,7 @@ files:
95
95
  - lib/rbbt/ner/oscar4.rb
96
96
  - lib/rbbt/ner/patterns.rb
97
97
  - lib/rbbt/ner/regexpNER.rb
98
+ - lib/rbbt/ner/rner.rb
98
99
  - lib/rbbt/ner/rnorm.rb
99
100
  - lib/rbbt/ner/rnorm/cue_index.rb
100
101
  - lib/rbbt/ner/rnorm/tokens.rb
@@ -102,12 +103,15 @@ files:
102
103
  - lib/rbbt/nlp/genia/sentence_splitter.rb
103
104
  - lib/rbbt/nlp/nlp.rb
104
105
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
106
+ - lib/rbbt/nlp/spaCy.rb
107
+ - lib/rbbt/relationship.rb
105
108
  - lib/rbbt/segment.rb
106
109
  - lib/rbbt/segment/annotation.rb
107
110
  - lib/rbbt/segment/encoding.rb
108
111
  - lib/rbbt/segment/named_entity.rb
109
112
  - lib/rbbt/segment/overlaps.rb
110
113
  - lib/rbbt/segment/range_index.rb
114
+ - lib/rbbt/segment/relationship.rb
111
115
  - lib/rbbt/segment/segmented.rb
112
116
  - lib/rbbt/segment/token.rb
113
117
  - lib/rbbt/segment/transformed.rb
@@ -124,6 +128,7 @@ files:
124
128
  - share/install/software/OpenNLP
125
129
  - share/install/software/StanfordParser
126
130
  - share/patterns/drug_induce_disease
131
+ - share/rner/config.rb
127
132
  - share/rnorm/cue_default
128
133
  - share/rnorm/tokens_default
129
134
  - share/wordlists/stopwords
@@ -146,6 +151,7 @@ files:
146
151
  - test/rbbt/ner/test_oscar4.rb
147
152
  - test/rbbt/ner/test_patterns.rb
148
153
  - test/rbbt/ner/test_regexpNER.rb
154
+ - test/rbbt/ner/test_rner.rb
149
155
  - test/rbbt/ner/test_rnorm.rb
150
156
  - test/rbbt/ner/test_token_trieNER.rb
151
157
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
@@ -161,6 +167,7 @@ files:
161
167
  - test/rbbt/test_document.rb
162
168
  - test/rbbt/test_segment.rb
163
169
  - test/test_helper.rb
170
+ - test/test_spaCy.rb
164
171
  homepage: http://github.com/mikisvaz/rbbt-util
165
172
  licenses: []
166
173
  metadata: {}
@@ -179,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
179
186
  - !ruby/object:Gem::Version
180
187
  version: '0'
181
188
  requirements: []
182
- rubygems_version: 3.0.6
189
+ rubygems_version: 3.1.4
183
190
  signing_key:
184
191
  specification_version: 4
185
192
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -207,6 +214,7 @@ test_files:
207
214
  - test/rbbt/ner/test_banner.rb
208
215
  - test/rbbt/ner/test_token_trieNER.rb
209
216
  - test/rbbt/ner/test_finder.rb
217
+ - test/rbbt/ner/test_rner.rb
210
218
  - test/rbbt/ner/test_linnaeus.rb
211
219
  - test/rbbt/ner/test_oscar4.rb
212
220
  - test/rbbt/test_segment.rb
@@ -217,4 +225,5 @@ test_files:
217
225
  - test/rbbt/segment/test_encoding.rb
218
226
  - test/rbbt/segment/test_range_index.rb
219
227
  - test/rbbt/segment/test_corpus.rb
228
+ - test/test_spaCy.rb
220
229
  - test/test_helper.rb