rbbt-text 1.3.8 → 1.3.10

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1b07b2646ecdc51599e2a2356fd18708e88d819944910a07930f67ec3fc012d
4
- data.tar.gz: 03bcbe61f41d830668b50fcfc253fa2b43285774040f61fb3fb0a58f80e9dfd3
3
+ metadata.gz: 2f10312d9b6598ddc9b6fa98b38909afdd575b33a497ae1ff3f17c7a9c6e37bf
4
+ data.tar.gz: f79c61c7e34dd113a2c5002342c0c2df92a4a28c770394bf2c456a34a2730cc7
5
5
  SHA512:
6
- metadata.gz: ae6de2dd809642ca38276ff82e243efeb193cf432bc78aea92e772ab21ff489f23224b9e93de726dcacdb06910716f1107171433cc39e7b022ba14ee4ed284f6
7
- data.tar.gz: 82768060a28248d459031030b6ba49b500b63a9d3ae2199ccdf1417fd3b1f66ce0d962db17875615ee36bb3b5879d8ccbbdec892942f544fa08481b4551a1003
6
+ metadata.gz: a9fb4dc49c538a58a8aa04e81947df212668c5ef9097434fa7d3eff54dd17a8657f581451b64e6b247cb64428436823a305dd64ae6a5fed2126b92285c02ad81
7
+ data.tar.gz: 0d31423660cd232102aa2b9914dab61ff929cf02a37b5094bd58481cac733c167d0e4fcdb4b3025e41a4775bd8033566ed3f402f66c317b3955406d1a3d3eb6f
@@ -6,7 +6,9 @@ module Document
6
6
  send :property, type do
7
7
  segments = self.instance_exec &block
8
8
 
9
- Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
9
+ Segment.align(self, segments) unless segments.empty? ||
10
+ (Segment === segments && segments.offset) ||
11
+ (Array === segments && Segment === segments.first && segments.first.offset)
10
12
 
11
13
  segments.each do |segment|
12
14
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -17,6 +19,36 @@ module Document
17
19
 
18
20
  segments
19
21
  end
22
+
23
+ DocID.property type do
24
+ self.document.send(type)
25
+ end
26
+
27
+ SegID.property type do
28
+ self.overlaps(self.docid.send(type))
29
+ end
30
+
31
+ Segment.property type do
32
+ self.overlaps(self.docid.send(type))
33
+ end
34
+
35
+ seg_type = "segids_for_" + type.to_s
36
+
37
+ send :property, seg_type do
38
+ SegID.setup(self.send(type).collect{|s| s.segid })
39
+ end
40
+
41
+ DocID.property seg_type do
42
+ self.document.send(seg_type)
43
+ end
44
+
45
+ SegID.property seg_type do
46
+ self.overlaps(self.docid.send(seg_type))
47
+ end
48
+
49
+ Segment.property seg_type do
50
+ self.overlaps(self.docid.send(seg_type))
51
+ end
20
52
  end
21
53
 
22
54
  def self.define_multiple(type, &block)
@@ -28,7 +60,10 @@ module Document
28
60
  doc_segments.each_with_index do |segments,i|
29
61
  next if segments.nil?
30
62
  document = list[i]
31
- Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
63
+ Segment.align(document, segments) unless segments.nil? ||
64
+ segments.empty? ||
65
+ (Segment === segments && segments.offset) ||
66
+ (Array === segments && Segment === segments.first && segments.first.offset)
32
67
 
33
68
  segments.each do |segment|
34
69
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -41,5 +76,35 @@ module Document
41
76
  segments
42
77
  end
43
78
  end
79
+
80
+ DocID.property type do
81
+ self.document.send(type)
82
+ end
83
+
84
+ SegID.property type do
85
+ self.overlaps(self.docid.send(type))
86
+ end
87
+
88
+ Segment.property type do
89
+ self.overlaps(self.docid.send(type))
90
+ end
91
+
92
+ seg_type = "segids_for_" + type.to_s
93
+
94
+ send :property, seg_type do
95
+ SegID.setup(self.send(type).collect{|s| s.segid })
96
+ end
97
+
98
+ DocID.property seg_type do
99
+ self.document.send(seg_type)
100
+ end
101
+
102
+ SegID.property seg_type do
103
+ self.overlaps(self.docid.send(seg_type))
104
+ end
105
+
106
+ Segment.property seg_type do
107
+ self.overlaps(self.docid.send(seg_type))
108
+ end
44
109
  end
45
110
  end
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
2
2
 
3
3
  module Document::Corpus
4
4
  PUBMED_NAMESPACE="PMID"
5
- def add_pmid(pmid, type = nil, update = false)
6
- type = :abstract if type.nil?
5
+ def add_pmid(pmid, type = :title_and_abstract, update = false)
6
+ type = :title_and_abstract if type.nil?
7
7
 
8
8
  if ! (update || Array === pmid)
9
9
  id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
@@ -16,9 +16,11 @@ module Document::Corpus
16
16
 
17
17
  res = PubMed.get_article(pmids).collect do |pmid, article|
18
18
  document = if type.to_sym == :abstract
19
- Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, :abstract, self, :corpus => self)
19
+ Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
20
20
  elsif type.to_sym == :title
21
- Document.setup(article.title, PUBMED_NAMESPACE, pmid, :title, self)
21
+ Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
+ elsif type.to_sym == :title_and_abstract
23
+ Document.setup((article.title || "") + "\n\n" + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
24
  else
23
25
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
24
26
  Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
@@ -3,7 +3,7 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
- corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
6
+ corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
7
7
  corpus.extend Document::Corpus unless Document::Corpus === corpus
8
8
  corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
9
9
  corpus.close
data/lib/rbbt/document.rb CHANGED
@@ -9,6 +9,10 @@ module DocID
9
9
  attr_accessor :default_corpus
10
10
  end
11
11
 
12
+ def id
13
+ self
14
+ end
15
+
12
16
  def corpus
13
17
  annotation_values[:corpus] || DocID.default_corpus
14
18
  end
@@ -66,7 +66,8 @@ EOF
66
66
  end
67
67
 
68
68
  Open.write('config', CONFIG)
69
- CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
69
+ mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
70
+ CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
70
71
 
71
72
  if texts.respond_to? :key_field
72
73
  key_field = texts.key_field
@@ -10,34 +10,41 @@ class RegExpNER < NER
10
10
  while matchdata = text.match(regexp)
11
11
  pre = matchdata.pre_match
12
12
  post = matchdata.post_match
13
- match = matchdata[0]
14
13
 
15
- if matchdata.captures.any?
16
- capture = matchdata.captures.first
17
- more_pre, more_post = match.split(/#{capture}/)
18
-
19
- match = capture
20
- pre << more_pre if more_pre
21
- post = more_post << post if more_post
22
- end
23
-
24
- if match and not match.empty?
25
- NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
14
+ if matchdata.named_captures.any?
15
+ match = matchdata[0]
16
+ code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
17
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
26
18
  matches << match
19
+ eend = match.length + pre.length
20
+ text = text[eend..-1]
21
+ start += match.length + pre.length
22
+ elsif matchdata.captures.any?
23
+ match = matchdata.captures.first
24
+ offset, eend = matchdata.offset(1)
25
+ NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
26
+ matches << match
27
+ start += offset + match.length
28
+ text = text[eend..-1]
29
+ else
30
+ match = matchdata[0]
31
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
32
+ matches << match
33
+ eend = match.length + pre.length
34
+ text = text[eend..-1]
35
+ start += match.length + pre.length
27
36
  end
28
-
29
- start += pre.length + match.length
30
- text = post
31
37
  end
32
38
 
33
39
  matches
34
40
  end
35
41
 
36
- def self.match_regexp_list(text, regexp_list, type = nil)
42
+ def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
37
43
  matches = []
38
44
 
39
45
  regexp_list.each do |regexp|
40
- chunks = Segment.split(text, matches)
46
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
47
+ chunks = Segment.split(text, [])
41
48
  chunks.each do |chunk|
42
49
  new_matches = match_regexp(chunk, regexp, type)
43
50
  new_matches.each do |match| match.offset += chunk.offset; matches << match end
@@ -47,15 +54,15 @@ class RegExpNER < NER
47
54
  matches
48
55
  end
49
56
 
50
- def self.match_regexp_hash(text, regexp_hash)
57
+ def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
51
58
  matches = []
52
59
 
53
60
  regexp_hash.each do |type, regexp_list|
54
61
  regexp_list = [regexp_list] unless Array === regexp_list
55
- chunks = Segment.split(text, matches)
62
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
56
63
  chunks.each do |chunk|
57
64
  chunk_offset = chunk.offset
58
- match_regexp_list(chunk, regexp_list, type).each do |match|
65
+ match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
59
66
  match.offset = match.offset + chunk_offset;
60
67
  matches << match
61
68
  end
@@ -65,7 +72,7 @@ class RegExpNER < NER
65
72
  matches
66
73
  end
67
74
 
68
- attr_accessor :regexps
75
+ attr_accessor :regexps, :split_on_matches
69
76
  def initialize(regexps = {})
70
77
  @regexps = regexps.collect{|p| p }
71
78
  end
@@ -87,9 +94,9 @@ class RegExpNER < NER
87
94
  end
88
95
 
89
96
  def match(text)
90
- matches = RegExpNER.match_regexp_hash(text, @regexps)
97
+ matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
91
98
  matches.collect do |m|
92
- NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
99
+ NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
93
100
  end
94
101
  end
95
102
 
@@ -1,6 +1,7 @@
1
1
  require 'rbbt/nlp/nlp'
2
2
  require 'rbbt/segment'
3
3
  module NLP
4
+
4
5
  Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
6
 
6
7
  def self.returnFeatures(prevWord, delimiter, nextWord)
@@ -136,6 +137,7 @@ module NLP
136
137
  end
137
138
 
138
139
  def self.process_labels(marked_text, labels)
140
+ return "" if marked_text.empty? || labels.empty?
139
141
  out = ""
140
142
 
141
143
  count = 0
@@ -171,8 +173,17 @@ module NLP
171
173
  end
172
174
 
173
175
  def self.geniass_sentence_splitter_extension(text)
176
+ cleaned = text.gsub("\n",NEW_LINE_MASK)
177
+ events, marks = event_extraction(cleaned)
178
+
174
179
  Rbbt.software.opt.Geniass.produce
175
- require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
180
+ begin
181
+ ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
182
+ require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
183
+ rescue LoadError
184
+ raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
185
+ end
186
+
176
187
  geniass = Geniass.new
177
188
  if not geniass.geniass_is_loaded
178
189
  Misc.in_dir Rbbt.software.opt.Geniass.find do
@@ -180,9 +191,6 @@ module NLP
180
191
  end
181
192
  end
182
193
 
183
- cleaned = text.gsub("\n",NEW_LINE_MASK)
184
- events, marks = event_extraction(cleaned)
185
-
186
194
  labels = events.split(/\n/).collect{|line|
187
195
  geniass.label(line)
188
196
  }
@@ -4,6 +4,7 @@ require 'rbbt/entity'
4
4
 
5
5
  module AnnotID
6
6
  extend Entity
7
+ include SegID
7
8
  self.annotation :corpus
8
9
 
9
10
  def _parts
@@ -23,13 +23,14 @@ Score: #{score.inspect}
23
23
  end
24
24
 
25
25
  def html
26
- title = code.nil? ? entity_type : [entity_type, code].compact * ":"
26
+ title = code.nil? ? entity_type : [entity_type, code].compact * " - "
27
27
 
28
28
  text = <<-EOF
29
29
  <span class='Entity'\
30
30
  #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
31
31
  #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
32
32
  #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
33
+ #{segid.nil? ? "" : " attr-segid='#{segid}'"}\
33
34
  #{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
34
35
  >#{ self }</span>
35
36
  EOF
@@ -1,4 +1,4 @@
1
- module Segment
1
+ module SegmentRanges
2
2
  def pull(offset)
3
3
  if self.offset.nil? or offset.nil?
4
4
  self.offset = nil
@@ -61,3 +61,11 @@ module Segment
61
61
  end
62
62
  end
63
63
  end
64
+
65
+ module Segment
66
+ include SegmentRanges
67
+ end
68
+
69
+ module SegID
70
+ include SegmentRanges
71
+ end
@@ -17,7 +17,7 @@ module Transformed
17
17
 
18
18
  segments = yield text
19
19
 
20
- segments = nil unless Array === segments && Segment === segments.first
20
+ segments = [] unless Array === segments && Segment === segments.first
21
21
 
22
22
  text.restore(segments, true)
23
23
  end
data/lib/rbbt/segment.rb CHANGED
@@ -22,6 +22,10 @@ module SegID
22
22
  range.begin
23
23
  end
24
24
 
25
+ def eend
26
+ offset.to_i + length - 1
27
+ end
28
+
25
29
  def segment_length
26
30
  range.end - range.begin + 1
27
31
  end
@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
27
27
  make geniass
28
28
  make libgeniass.so
29
29
 
30
+ move_opt "$name" "$extra"
31
+
30
32
  mkdir lib
31
33
  mv libgeniass.so lib
32
34
 
33
- build "$name" "$extra"
34
-
35
35
  cd "$OPT_BUILD_DIR/$name"
36
36
 
37
37
  mkdir ruby
@@ -43,8 +43,7 @@ create_makefile('Geniass')
43
43
  EOF
44
44
 
45
45
  cat > ruby/Geniass.cpp <<'EOF'
46
- #include "rice/Class.hpp"
47
- #include "rice/String.hpp"
46
+ #include "rice/rice.hpp"
48
47
 
49
48
  #include <iostream>
50
49
  #include <iomanip>
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
59
58
  using namespace Rice;
60
59
  using namespace std;
61
60
 
62
- ME_Model model;
63
61
  bool geniass_loaded = false;
64
62
 
65
- void load_geniass(){
63
+ bool geniass_is_loaded(Object self){ return(geniass_loaded); };
64
+
65
+ ME_Model model;
66
+ void load_geniass(Object self){
66
67
  printf("loading model");
67
68
  string modelFile = "model1-1.0";
68
69
  model.load_from_file(modelFile.c_str());
@@ -70,8 +71,6 @@ void load_geniass(){
70
71
  printf("..done\n");
71
72
  }
72
73
 
73
- bool geniass_is_loaded(){ return(geniass_loaded); };
74
-
75
74
  void split(string& str, vector<string>& tokens)
76
75
  {
77
76
  istringstream in(str);
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
85
84
  }
86
85
  }
87
86
 
88
- string label(string line){
87
+ Object label(Object self, String rb_line){
89
88
  vector<string> tokens;
90
- split(line, tokens);
91
89
  ME_Sample s;
92
90
 
91
+ string line = rb_line.c_str();
92
+ split(line, tokens);
93
+
93
94
  for(vector<string>::const_iterator token = tokens.begin() + 1;
94
95
  token != tokens.end(); ++token){
95
96
  s.add_feature(*token);
96
97
  }
98
+
97
99
  (void) model.classify(s);
98
- return(s.label);
100
+ string label = s.label;
101
+ VALUE x;
102
+ x = rb_str_new_cstr(label.c_str());
103
+ return(x);
99
104
  }
100
105
 
101
106
  extern "C"
@@ -103,16 +108,20 @@ void Init_Geniass()
103
108
  {
104
109
  Class rb_cGeniass =
105
110
  define_class("Geniass")
111
+ .define_method("geniass_is_loaded", &geniass_is_loaded)
106
112
  .define_method("load_geniass", &load_geniass)
107
113
  .define_method("label", &label)
108
- .define_method("geniass_is_loaded", &geniass_is_loaded);
114
+ ;
109
115
  }
110
116
 
117
+
111
118
  EOF
112
119
 
113
120
  cd ruby
114
121
  ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
115
122
  make
116
123
 
124
+ setup "$name" "$extra"
125
+
117
126
 
118
127
 
@@ -6,6 +6,7 @@ tokens do
6
6
 
7
7
  # Some (possible) single letters first
8
8
  receptor /^(?:receptor|r)s?$/i
9
+ activator /^(?:activator|p)s?$/i
9
10
  protein /^(?:protein|p)s?$/i
10
11
  roman /^[IV]+$/
11
12
  greek_letter do |w| $inverse_greek[w.downcase] != nil end
@@ -58,6 +59,8 @@ comparisons do
58
59
 
59
60
  diff.promoter -10
60
61
  diff.receptor -10
62
+ diff.activator -10
63
+
61
64
  diff.similar -10
62
65
  diff.capital -10
63
66
 
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
13
13
  self.split(" ")
14
14
  end
15
15
 
16
+ Document.define :lines do
17
+ self.split("\n")
18
+ end
19
+
16
20
  $called_once = false
17
21
  Document.define :persisted_words do
18
22
  raise CalledOnce if $called_once
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
145
149
 
146
150
  assert text.ner.first.segid.include?("TEST:")
147
151
  end
152
+
153
+ def test_sentence_words
154
+ text =<<-EOF
155
+ This is sentence 1
156
+ This is sentence 2
157
+ EOF
158
+
159
+ Document.setup(text)
160
+
161
+ words = text.words
162
+ numbers = words.select{|w| w =~ /\d/}
163
+ text.lines.each do |sentence|
164
+ Transformed.with_transform(sentence, numbers, "[NUM]") do
165
+ puts sentence
166
+ end
167
+ end
168
+ end
148
169
  end
149
170
 
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
12
12
  sapiens
13
13
  EOF
14
14
 
15
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
15
16
  mentions = GNormPlus.process({:file => text})
16
17
 
17
18
  assert_equal 1, mentions.length
@@ -23,6 +24,7 @@ sapiens
23
24
  We found that TP53 is regulated by MDM2 in Homo sapiens
24
25
  EOF
25
26
 
27
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
26
28
  mentions = GNormPlus.entities({:file => text})
27
29
  assert mentions["file"].include?("TP53")
28
30
  mentions["file"].each do |mention|
@@ -79,6 +79,23 @@ class TestRegExpNER < Test::Unit::TestCase
79
79
  assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
80
80
  end
81
81
 
82
+ def test_entities_captures_repeat
83
+ sentence = "In a sentence I should find not this but this"
84
+
85
+ ner = RegExpNER.new({:this => /not this but (this)/})
86
+ matches = ner.entities(sentence)
87
+ assert sentence[0..matches.first.offset-1].include?('this')
88
+ end
89
+
90
+
91
+ def test_entities_named_captures
92
+ sentence = "In a sentence I should find not this but this"
93
+
94
+ ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
95
+ matches = ner.entities(sentence)
96
+ end
97
+
98
+
82
99
 
83
100
  def test_regexp_order
84
101
  text =<<-EOF
@@ -12,7 +12,6 @@ sentence. This is
12
12
  another broken sentence.
13
13
  EOF
14
14
 
15
- iii NLP.geniass_sentence_splitter(text)
16
15
  assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
17
16
  end
18
17
 
@@ -37,7 +36,17 @@ sentence. This is
37
36
  another broken sentence.
38
37
  EOF
39
38
 
40
- assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
39
+ Log.with_severity 0 do
40
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
41
+ end
42
+ end
43
+
44
+ def test_sentence_cmi
45
+ text =<<-EOF
46
+ The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
47
+ EOF
48
+
49
+ iii NLP.geniass_sentence_splitter(text)
41
50
  end
42
51
  end
43
52
 
@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
73
73
  assert_equal original.gsub(/TP53/, 'GN'), a
74
74
  end
75
75
 
76
+ Transformed.with_transform(a, [gene2], "GN") do
77
+ Transformed.with_transform(a, [gene1], "GN") do
78
+ assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
79
+ end
80
+ assert_equal original.gsub(/CDK5R1/, 'GN'), a
81
+ end
82
+
76
83
  Transformed.with_transform(a, [gene1], "GN") do
77
84
  Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
78
85
  assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
144
151
  gene2.entity_type = "Protein"
145
152
 
146
153
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
147
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
154
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
148
155
  end
149
156
  end
150
157
 
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
165
172
  gene2.entity_type = "Protein"
166
173
 
167
174
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
168
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
175
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
169
176
  end
170
177
  end
171
178
 
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
185
192
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
186
193
 
187
194
  Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
188
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
195
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
189
196
  Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
190
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
197
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
191
198
  end
192
199
  end
193
200
  end
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
415
422
  end
416
423
 
417
424
  end
418
-
419
425
  end
420
426
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.8
4
+ version: 1.3.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-11-08 00:00:00.000000000 Z
11
+ date: 2023-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
175
175
  - !ruby/object:Gem::Version
176
176
  version: '0'
177
177
  requirements: []
178
- rubygems_version: 3.1.4
178
+ rubygems_version: 3.1.2
179
179
  signing_key:
180
180
  specification_version: 4
181
181
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
182
182
  test_files:
183
- - test/rbbt/nlp/test_nlp.rb
184
- - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
185
- - test/rbbt/nlp/genia/test_sentence_splitter.rb
183
+ - test/test_spaCy.rb
184
+ - test/test_helper.rb
185
+ - test/rbbt/bow/test_dictionary.rb
186
186
  - test/rbbt/bow/test_bow.rb
187
187
  - test/rbbt/bow/test_misc.rb
188
- - test/rbbt/bow/test_dictionary.rb
189
- - test/rbbt/test_document.rb
190
- - test/rbbt/document/test_annotation.rb
188
+ - test/rbbt/segment/test_encoding.rb
189
+ - test/rbbt/segment/test_transformed.rb
190
+ - test/rbbt/segment/test_overlaps.rb
191
+ - test/rbbt/segment/test_named_entity.rb
192
+ - test/rbbt/segment/test_corpus.rb
193
+ - test/rbbt/segment/test_range_index.rb
194
+ - test/rbbt/segment/test_annotation.rb
195
+ - test/rbbt/entity/test_document.rb
191
196
  - test/rbbt/document/corpus/test_pubmed.rb
192
197
  - test/rbbt/document/test_corpus.rb
193
- - test/rbbt/entity/test_document.rb
198
+ - test/rbbt/document/test_annotation.rb
199
+ - test/rbbt/test_document.rb
194
200
  - test/rbbt/ner/test_patterns.rb
195
- - test/rbbt/ner/test_NER.rb
196
- - test/rbbt/ner/test_abner.rb
197
201
  - test/rbbt/ner/rnorm/test_tokens.rb
198
- - test/rbbt/ner/test_rnorm.rb
199
- - test/rbbt/ner/test_regexpNER.rb
200
202
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
203
+ - test/rbbt/ner/test_token_trieNER.rb
204
+ - test/rbbt/ner/test_finder.rb
201
205
  - test/rbbt/ner/test_brat.rb
206
+ - test/rbbt/ner/test_regexpNER.rb
202
207
  - test/rbbt/ner/test_g_norm_plus.rb
208
+ - test/rbbt/ner/test_rnorm.rb
209
+ - test/rbbt/ner/test_linnaeus.rb
203
210
  - test/rbbt/ner/test_chemical_tagger.rb
204
- - test/rbbt/ner/test_banner.rb
205
- - test/rbbt/ner/test_token_trieNER.rb
206
- - test/rbbt/ner/test_finder.rb
211
+ - test/rbbt/ner/test_NER.rb
212
+ - test/rbbt/ner/test_abner.rb
207
213
  - test/rbbt/ner/test_rner.rb
208
- - test/rbbt/ner/test_linnaeus.rb
209
214
  - test/rbbt/ner/test_oscar4.rb
215
+ - test/rbbt/ner/test_banner.rb
210
216
  - test/rbbt/test_segment.rb
211
- - test/rbbt/segment/test_transformed.rb
212
- - test/rbbt/segment/test_overlaps.rb
213
- - test/rbbt/segment/test_annotation.rb
214
- - test/rbbt/segment/test_named_entity.rb
215
- - test/rbbt/segment/test_encoding.rb
216
- - test/rbbt/segment/test_range_index.rb
217
- - test/rbbt/segment/test_corpus.rb
218
- - test/test_spaCy.rb
219
- - test/test_helper.rb
217
+ - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
218
+ - test/rbbt/nlp/test_nlp.rb
219
+ - test/rbbt/nlp/genia/test_sentence_splitter.rb