rbbt-text 1.3.8 → 1.3.10

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1b07b2646ecdc51599e2a2356fd18708e88d819944910a07930f67ec3fc012d
4
- data.tar.gz: 03bcbe61f41d830668b50fcfc253fa2b43285774040f61fb3fb0a58f80e9dfd3
3
+ metadata.gz: 2f10312d9b6598ddc9b6fa98b38909afdd575b33a497ae1ff3f17c7a9c6e37bf
4
+ data.tar.gz: f79c61c7e34dd113a2c5002342c0c2df92a4a28c770394bf2c456a34a2730cc7
5
5
  SHA512:
6
- metadata.gz: ae6de2dd809642ca38276ff82e243efeb193cf432bc78aea92e772ab21ff489f23224b9e93de726dcacdb06910716f1107171433cc39e7b022ba14ee4ed284f6
7
- data.tar.gz: 82768060a28248d459031030b6ba49b500b63a9d3ae2199ccdf1417fd3b1f66ce0d962db17875615ee36bb3b5879d8ccbbdec892942f544fa08481b4551a1003
6
+ metadata.gz: a9fb4dc49c538a58a8aa04e81947df212668c5ef9097434fa7d3eff54dd17a8657f581451b64e6b247cb64428436823a305dd64ae6a5fed2126b92285c02ad81
7
+ data.tar.gz: 0d31423660cd232102aa2b9914dab61ff929cf02a37b5094bd58481cac733c167d0e4fcdb4b3025e41a4775bd8033566ed3f402f66c317b3955406d1a3d3eb6f
@@ -6,7 +6,9 @@ module Document
6
6
  send :property, type do
7
7
  segments = self.instance_exec &block
8
8
 
9
- Segment.align(self, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
9
+ Segment.align(self, segments) unless segments.empty? ||
10
+ (Segment === segments && segments.offset) ||
11
+ (Array === segments && Segment === segments.first && segments.first.offset)
10
12
 
11
13
  segments.each do |segment|
12
14
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -17,6 +19,36 @@ module Document
17
19
 
18
20
  segments
19
21
  end
22
+
23
+ DocID.property type do
24
+ self.document.send(type)
25
+ end
26
+
27
+ SegID.property type do
28
+ self.overlaps(self.docid.send(type))
29
+ end
30
+
31
+ Segment.property type do
32
+ self.overlaps(self.docid.send(type))
33
+ end
34
+
35
+ seg_type = "segids_for_" + type.to_s
36
+
37
+ send :property, seg_type do
38
+ SegID.setup(self.send(type).collect{|s| s.segid })
39
+ end
40
+
41
+ DocID.property seg_type do
42
+ self.document.send(seg_type)
43
+ end
44
+
45
+ SegID.property seg_type do
46
+ self.overlaps(self.docid.send(seg_type))
47
+ end
48
+
49
+ Segment.property seg_type do
50
+ self.overlaps(self.docid.send(seg_type))
51
+ end
20
52
  end
21
53
 
22
54
  def self.define_multiple(type, &block)
@@ -28,7 +60,10 @@ module Document
28
60
  doc_segments.each_with_index do |segments,i|
29
61
  next if segments.nil?
30
62
  document = list[i]
31
- Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
63
+ Segment.align(document, segments) unless segments.nil? ||
64
+ segments.empty? ||
65
+ (Segment === segments && segments.offset) ||
66
+ (Array === segments && Segment === segments.first && segments.first.offset)
32
67
 
33
68
  segments.each do |segment|
34
69
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
@@ -41,5 +76,35 @@ module Document
41
76
  segments
42
77
  end
43
78
  end
79
+
80
+ DocID.property type do
81
+ self.document.send(type)
82
+ end
83
+
84
+ SegID.property type do
85
+ self.overlaps(self.docid.send(type))
86
+ end
87
+
88
+ Segment.property type do
89
+ self.overlaps(self.docid.send(type))
90
+ end
91
+
92
+ seg_type = "segids_for_" + type.to_s
93
+
94
+ send :property, seg_type do
95
+ SegID.setup(self.send(type).collect{|s| s.segid })
96
+ end
97
+
98
+ DocID.property seg_type do
99
+ self.document.send(seg_type)
100
+ end
101
+
102
+ SegID.property seg_type do
103
+ self.overlaps(self.docid.send(seg_type))
104
+ end
105
+
106
+ Segment.property seg_type do
107
+ self.overlaps(self.docid.send(seg_type))
108
+ end
44
109
  end
45
110
  end
@@ -2,8 +2,8 @@ require 'rbbt/sources/pubmed'
2
2
 
3
3
  module Document::Corpus
4
4
  PUBMED_NAMESPACE="PMID"
5
- def add_pmid(pmid, type = nil, update = false)
6
- type = :abstract if type.nil?
5
+ def add_pmid(pmid, type = :title_and_abstract, update = false)
6
+ type = :title_and_abstract if type.nil?
7
7
 
8
8
  if ! (update || Array === pmid)
9
9
  id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
@@ -16,9 +16,11 @@ module Document::Corpus
16
16
 
17
17
  res = PubMed.get_article(pmids).collect do |pmid, article|
18
18
  document = if type.to_sym == :abstract
19
- Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, :abstract, self, :corpus => self)
19
+ Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, type.to_sym , self, :corpus => self)
20
20
  elsif type.to_sym == :title
21
- Document.setup(article.title, PUBMED_NAMESPACE, pmid, :title, self)
21
+ Document.setup(article.title || "", PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
+ elsif type.to_sym == :title_and_abstract
23
+ Document.setup((article.title || "") + "\n\n" + (article.abstract || ""), PUBMED_NAMESPACE, pmid, type.to_sym, self)
22
24
  else
23
25
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
24
26
  Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
@@ -3,7 +3,7 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
- corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
6
+ corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
7
7
  corpus.extend Document::Corpus unless Document::Corpus === corpus
8
8
  corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
9
9
  corpus.close
data/lib/rbbt/document.rb CHANGED
@@ -9,6 +9,10 @@ module DocID
9
9
  attr_accessor :default_corpus
10
10
  end
11
11
 
12
+ def id
13
+ self
14
+ end
15
+
12
16
  def corpus
13
17
  annotation_values[:corpus] || DocID.default_corpus
14
18
  end
@@ -66,7 +66,8 @@ EOF
66
66
  end
67
67
 
68
68
  Open.write('config', CONFIG)
69
- CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
69
+ mem = Rbbt::Config.get(:java_mem, :GNormPlus, :g_norm_plus, :gnormplus, :gnp, :default => "2G")
70
+ CMD.cmd_log("java -Xmx#{mem} -Xms#{mem} -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
70
71
 
71
72
  if texts.respond_to? :key_field
72
73
  key_field = texts.key_field
@@ -10,34 +10,41 @@ class RegExpNER < NER
10
10
  while matchdata = text.match(regexp)
11
11
  pre = matchdata.pre_match
12
12
  post = matchdata.post_match
13
- match = matchdata[0]
14
13
 
15
- if matchdata.captures.any?
16
- capture = matchdata.captures.first
17
- more_pre, more_post = match.split(/#{capture}/)
18
-
19
- match = capture
20
- pre << more_pre if more_pre
21
- post = more_post << post if more_post
22
- end
23
-
24
- if match and not match.empty?
25
- NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
14
+ if matchdata.named_captures.any?
15
+ match = matchdata[0]
16
+ code = matchdata.named_captures.collect{|k,v| [k,v] * "=" } * ";"
17
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type, :code => code)
26
18
  matches << match
19
+ eend = match.length + pre.length
20
+ text = text[eend..-1]
21
+ start += match.length + pre.length
22
+ elsif matchdata.captures.any?
23
+ match = matchdata.captures.first
24
+ offset, eend = matchdata.offset(1)
25
+ NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
26
+ matches << match
27
+ start += offset + match.length
28
+ text = text[eend..-1]
29
+ else
30
+ match = matchdata[0]
31
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
32
+ matches << match
33
+ eend = match.length + pre.length
34
+ text = text[eend..-1]
35
+ start += match.length + pre.length
27
36
  end
28
-
29
- start += pre.length + match.length
30
- text = post
31
37
  end
32
38
 
33
39
  matches
34
40
  end
35
41
 
36
- def self.match_regexp_list(text, regexp_list, type = nil)
42
+ def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
37
43
  matches = []
38
44
 
39
45
  regexp_list.each do |regexp|
40
- chunks = Segment.split(text, matches)
46
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
47
+ chunks = Segment.split(text, [])
41
48
  chunks.each do |chunk|
42
49
  new_matches = match_regexp(chunk, regexp, type)
43
50
  new_matches.each do |match| match.offset += chunk.offset; matches << match end
@@ -47,15 +54,15 @@ class RegExpNER < NER
47
54
  matches
48
55
  end
49
56
 
50
- def self.match_regexp_hash(text, regexp_hash)
57
+ def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
51
58
  matches = []
52
59
 
53
60
  regexp_hash.each do |type, regexp_list|
54
61
  regexp_list = [regexp_list] unless Array === regexp_list
55
- chunks = Segment.split(text, matches)
62
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
56
63
  chunks.each do |chunk|
57
64
  chunk_offset = chunk.offset
58
- match_regexp_list(chunk, regexp_list, type).each do |match|
65
+ match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
59
66
  match.offset = match.offset + chunk_offset;
60
67
  matches << match
61
68
  end
@@ -65,7 +72,7 @@ class RegExpNER < NER
65
72
  matches
66
73
  end
67
74
 
68
- attr_accessor :regexps
75
+ attr_accessor :regexps, :split_on_matches
69
76
  def initialize(regexps = {})
70
77
  @regexps = regexps.collect{|p| p }
71
78
  end
@@ -87,9 +94,9 @@ class RegExpNER < NER
87
94
  end
88
95
 
89
96
  def match(text)
90
- matches = RegExpNER.match_regexp_hash(text, @regexps)
97
+ matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
91
98
  matches.collect do |m|
92
- NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
99
+ NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m.code || m)
93
100
  end
94
101
  end
95
102
 
@@ -1,6 +1,7 @@
1
1
  require 'rbbt/nlp/nlp'
2
2
  require 'rbbt/segment'
3
3
  module NLP
4
+
4
5
  Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
6
 
6
7
  def self.returnFeatures(prevWord, delimiter, nextWord)
@@ -136,6 +137,7 @@ module NLP
136
137
  end
137
138
 
138
139
  def self.process_labels(marked_text, labels)
140
+ return "" if marked_text.empty? || labels.empty?
139
141
  out = ""
140
142
 
141
143
  count = 0
@@ -171,8 +173,17 @@ module NLP
171
173
  end
172
174
 
173
175
  def self.geniass_sentence_splitter_extension(text)
176
+ cleaned = text.gsub("\n",NEW_LINE_MASK)
177
+ events, marks = event_extraction(cleaned)
178
+
174
179
  Rbbt.software.opt.Geniass.produce
175
- require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
180
+ begin
181
+ ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
182
+ require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
183
+ rescue LoadError
184
+ raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
185
+ end
186
+
176
187
  geniass = Geniass.new
177
188
  if not geniass.geniass_is_loaded
178
189
  Misc.in_dir Rbbt.software.opt.Geniass.find do
@@ -180,9 +191,6 @@ module NLP
180
191
  end
181
192
  end
182
193
 
183
- cleaned = text.gsub("\n",NEW_LINE_MASK)
184
- events, marks = event_extraction(cleaned)
185
-
186
194
  labels = events.split(/\n/).collect{|line|
187
195
  geniass.label(line)
188
196
  }
@@ -4,6 +4,7 @@ require 'rbbt/entity'
4
4
 
5
5
  module AnnotID
6
6
  extend Entity
7
+ include SegID
7
8
  self.annotation :corpus
8
9
 
9
10
  def _parts
@@ -23,13 +23,14 @@ Score: #{score.inspect}
23
23
  end
24
24
 
25
25
  def html
26
- title = code.nil? ? entity_type : [entity_type, code].compact * ":"
26
+ title = code.nil? ? entity_type : [entity_type, code].compact * " - "
27
27
 
28
28
  text = <<-EOF
29
29
  <span class='Entity'\
30
30
  #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
31
31
  #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
32
32
  #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
33
+ #{segid.nil? ? "" : " attr-segid='#{segid}'"}\
33
34
  #{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
34
35
  >#{ self }</span>
35
36
  EOF
@@ -1,4 +1,4 @@
1
- module Segment
1
+ module SegmentRanges
2
2
  def pull(offset)
3
3
  if self.offset.nil? or offset.nil?
4
4
  self.offset = nil
@@ -61,3 +61,11 @@ module Segment
61
61
  end
62
62
  end
63
63
  end
64
+
65
+ module Segment
66
+ include SegmentRanges
67
+ end
68
+
69
+ module SegID
70
+ include SegmentRanges
71
+ end
@@ -17,7 +17,7 @@ module Transformed
17
17
 
18
18
  segments = yield text
19
19
 
20
- segments = nil unless Array === segments && Segment === segments.first
20
+ segments = [] unless Array === segments && Segment === segments.first
21
21
 
22
22
  text.restore(segments, true)
23
23
  end
data/lib/rbbt/segment.rb CHANGED
@@ -22,6 +22,10 @@ module SegID
22
22
  range.begin
23
23
  end
24
24
 
25
+ def eend
26
+ offset.to_i + length - 1
27
+ end
28
+
25
29
  def segment_length
26
30
  range.end - range.begin + 1
27
31
  end
@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
27
27
  make geniass
28
28
  make libgeniass.so
29
29
 
30
+ move_opt "$name" "$extra"
31
+
30
32
  mkdir lib
31
33
  mv libgeniass.so lib
32
34
 
33
- build "$name" "$extra"
34
-
35
35
  cd "$OPT_BUILD_DIR/$name"
36
36
 
37
37
  mkdir ruby
@@ -43,8 +43,7 @@ create_makefile('Geniass')
43
43
  EOF
44
44
 
45
45
  cat > ruby/Geniass.cpp <<'EOF'
46
- #include "rice/Class.hpp"
47
- #include "rice/String.hpp"
46
+ #include "rice/rice.hpp"
48
47
 
49
48
  #include <iostream>
50
49
  #include <iomanip>
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
59
58
  using namespace Rice;
60
59
  using namespace std;
61
60
 
62
- ME_Model model;
63
61
  bool geniass_loaded = false;
64
62
 
65
- void load_geniass(){
63
+ bool geniass_is_loaded(Object self){ return(geniass_loaded); };
64
+
65
+ ME_Model model;
66
+ void load_geniass(Object self){
66
67
  printf("loading model");
67
68
  string modelFile = "model1-1.0";
68
69
  model.load_from_file(modelFile.c_str());
@@ -70,8 +71,6 @@ void load_geniass(){
70
71
  printf("..done\n");
71
72
  }
72
73
 
73
- bool geniass_is_loaded(){ return(geniass_loaded); };
74
-
75
74
  void split(string& str, vector<string>& tokens)
76
75
  {
77
76
  istringstream in(str);
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
85
84
  }
86
85
  }
87
86
 
88
- string label(string line){
87
+ Object label(Object self, String rb_line){
89
88
  vector<string> tokens;
90
- split(line, tokens);
91
89
  ME_Sample s;
92
90
 
91
+ string line = rb_line.c_str();
92
+ split(line, tokens);
93
+
93
94
  for(vector<string>::const_iterator token = tokens.begin() + 1;
94
95
  token != tokens.end(); ++token){
95
96
  s.add_feature(*token);
96
97
  }
98
+
97
99
  (void) model.classify(s);
98
- return(s.label);
100
+ string label = s.label;
101
+ VALUE x;
102
+ x = rb_str_new_cstr(label.c_str());
103
+ return(x);
99
104
  }
100
105
 
101
106
  extern "C"
@@ -103,16 +108,20 @@ void Init_Geniass()
103
108
  {
104
109
  Class rb_cGeniass =
105
110
  define_class("Geniass")
111
+ .define_method("geniass_is_loaded", &geniass_is_loaded)
106
112
  .define_method("load_geniass", &load_geniass)
107
113
  .define_method("label", &label)
108
- .define_method("geniass_is_loaded", &geniass_is_loaded);
114
+ ;
109
115
  }
110
116
 
117
+
111
118
  EOF
112
119
 
113
120
  cd ruby
114
121
  ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
115
122
  make
116
123
 
124
+ setup "$name" "$extra"
125
+
117
126
 
118
127
 
@@ -6,6 +6,7 @@ tokens do
6
6
 
7
7
  # Some (possible) single letters first
8
8
  receptor /^(?:receptor|r)s?$/i
9
+ activator /^(?:activator|p)s?$/i
9
10
  protein /^(?:protein|p)s?$/i
10
11
  roman /^[IV]+$/
11
12
  greek_letter do |w| $inverse_greek[w.downcase] != nil end
@@ -58,6 +59,8 @@ comparisons do
58
59
 
59
60
  diff.promoter -10
60
61
  diff.receptor -10
62
+ diff.activator -10
63
+
61
64
  diff.similar -10
62
65
  diff.capital -10
63
66
 
@@ -13,6 +13,10 @@ class TestAnnotation < Test::Unit::TestCase
13
13
  self.split(" ")
14
14
  end
15
15
 
16
+ Document.define :lines do
17
+ self.split("\n")
18
+ end
19
+
16
20
  $called_once = false
17
21
  Document.define :persisted_words do
18
22
  raise CalledOnce if $called_once
@@ -145,5 +149,22 @@ class TestAnnotation < Test::Unit::TestCase
145
149
 
146
150
  assert text.ner.first.segid.include?("TEST:")
147
151
  end
152
+
153
+ def test_sentence_words
154
+ text =<<-EOF
155
+ This is sentence 1
156
+ This is sentence 2
157
+ EOF
158
+
159
+ Document.setup(text)
160
+
161
+ words = text.words
162
+ numbers = words.select{|w| w =~ /\d/}
163
+ text.lines.each do |sentence|
164
+ Transformed.with_transform(sentence, numbers, "[NUM]") do
165
+ puts sentence
166
+ end
167
+ end
168
+ end
148
169
  end
149
170
 
@@ -12,6 +12,7 @@ We found that TP53 is regulated by MDM2 in Homo
12
12
  sapiens
13
13
  EOF
14
14
 
15
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
15
16
  mentions = GNormPlus.process({:file => text})
16
17
 
17
18
  assert_equal 1, mentions.length
@@ -23,6 +24,7 @@ sapiens
23
24
  We found that TP53 is regulated by MDM2 in Homo sapiens
24
25
  EOF
25
26
 
27
+ Rbbt::Config.add_entry :java_mem, "2G", :gnp
26
28
  mentions = GNormPlus.entities({:file => text})
27
29
  assert mentions["file"].include?("TP53")
28
30
  mentions["file"].each do |mention|
@@ -79,6 +79,23 @@ class TestRegExpNER < Test::Unit::TestCase
79
79
  assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
80
80
  end
81
81
 
82
+ def test_entities_captures_repeat
83
+ sentence = "In a sentence I should find not this but this"
84
+
85
+ ner = RegExpNER.new({:this => /not this but (this)/})
86
+ matches = ner.entities(sentence)
87
+ assert sentence[0..matches.first.offset-1].include?('this')
88
+ end
89
+
90
+
91
+ def test_entities_named_captures
92
+ sentence = "In a sentence I should find not this but this"
93
+
94
+ ner = RegExpNER.new({:this => /(?<who>I) should find not this but (this)/})
95
+ matches = ner.entities(sentence)
96
+ end
97
+
98
+
82
99
 
83
100
  def test_regexp_order
84
101
  text =<<-EOF
@@ -12,7 +12,6 @@ sentence. This is
12
12
  another broken sentence.
13
13
  EOF
14
14
 
15
- iii NLP.geniass_sentence_splitter(text)
16
15
  assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
17
16
  end
18
17
 
@@ -37,7 +36,17 @@ sentence. This is
37
36
  another broken sentence.
38
37
  EOF
39
38
 
40
- assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
39
+ Log.with_severity 0 do
40
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
41
+ end
42
+ end
43
+
44
+ def test_sentence_cmi
45
+ text =<<-EOF
46
+ The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
47
+ EOF
48
+
49
+ iii NLP.geniass_sentence_splitter(text)
41
50
  end
42
51
  end
43
52
 
@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
73
73
  assert_equal original.gsub(/TP53/, 'GN'), a
74
74
  end
75
75
 
76
+ Transformed.with_transform(a, [gene2], "GN") do
77
+ Transformed.with_transform(a, [gene1], "GN") do
78
+ assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
79
+ end
80
+ assert_equal original.gsub(/CDK5R1/, 'GN'), a
81
+ end
82
+
76
83
  Transformed.with_transform(a, [gene1], "GN") do
77
84
  Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
78
85
  assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
144
151
  gene2.entity_type = "Protein"
145
152
 
146
153
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
147
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
154
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
148
155
  end
149
156
  end
150
157
 
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
165
172
  gene2.entity_type = "Protein"
166
173
 
167
174
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
168
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
175
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
169
176
  end
170
177
  end
171
178
 
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
185
192
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
186
193
 
187
194
  Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
188
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
195
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
189
196
  Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
190
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
197
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
191
198
  end
192
199
  end
193
200
  end
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
415
422
  end
416
423
 
417
424
  end
418
-
419
425
  end
420
426
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.8
4
+ version: 1.3.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-11-08 00:00:00.000000000 Z
11
+ date: 2023-02-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -175,45 +175,45 @@ required_rubygems_version: !ruby/object:Gem::Requirement
175
175
  - !ruby/object:Gem::Version
176
176
  version: '0'
177
177
  requirements: []
178
- rubygems_version: 3.1.4
178
+ rubygems_version: 3.1.2
179
179
  signing_key:
180
180
  specification_version: 4
181
181
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
182
182
  test_files:
183
- - test/rbbt/nlp/test_nlp.rb
184
- - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
185
- - test/rbbt/nlp/genia/test_sentence_splitter.rb
183
+ - test/test_spaCy.rb
184
+ - test/test_helper.rb
185
+ - test/rbbt/bow/test_dictionary.rb
186
186
  - test/rbbt/bow/test_bow.rb
187
187
  - test/rbbt/bow/test_misc.rb
188
- - test/rbbt/bow/test_dictionary.rb
189
- - test/rbbt/test_document.rb
190
- - test/rbbt/document/test_annotation.rb
188
+ - test/rbbt/segment/test_encoding.rb
189
+ - test/rbbt/segment/test_transformed.rb
190
+ - test/rbbt/segment/test_overlaps.rb
191
+ - test/rbbt/segment/test_named_entity.rb
192
+ - test/rbbt/segment/test_corpus.rb
193
+ - test/rbbt/segment/test_range_index.rb
194
+ - test/rbbt/segment/test_annotation.rb
195
+ - test/rbbt/entity/test_document.rb
191
196
  - test/rbbt/document/corpus/test_pubmed.rb
192
197
  - test/rbbt/document/test_corpus.rb
193
- - test/rbbt/entity/test_document.rb
198
+ - test/rbbt/document/test_annotation.rb
199
+ - test/rbbt/test_document.rb
194
200
  - test/rbbt/ner/test_patterns.rb
195
- - test/rbbt/ner/test_NER.rb
196
- - test/rbbt/ner/test_abner.rb
197
201
  - test/rbbt/ner/rnorm/test_tokens.rb
198
- - test/rbbt/ner/test_rnorm.rb
199
- - test/rbbt/ner/test_regexpNER.rb
200
202
  - test/rbbt/ner/test_ngram_prefix_dictionary.rb
203
+ - test/rbbt/ner/test_token_trieNER.rb
204
+ - test/rbbt/ner/test_finder.rb
201
205
  - test/rbbt/ner/test_brat.rb
206
+ - test/rbbt/ner/test_regexpNER.rb
202
207
  - test/rbbt/ner/test_g_norm_plus.rb
208
+ - test/rbbt/ner/test_rnorm.rb
209
+ - test/rbbt/ner/test_linnaeus.rb
203
210
  - test/rbbt/ner/test_chemical_tagger.rb
204
- - test/rbbt/ner/test_banner.rb
205
- - test/rbbt/ner/test_token_trieNER.rb
206
- - test/rbbt/ner/test_finder.rb
211
+ - test/rbbt/ner/test_NER.rb
212
+ - test/rbbt/ner/test_abner.rb
207
213
  - test/rbbt/ner/test_rner.rb
208
- - test/rbbt/ner/test_linnaeus.rb
209
214
  - test/rbbt/ner/test_oscar4.rb
215
+ - test/rbbt/ner/test_banner.rb
210
216
  - test/rbbt/test_segment.rb
211
- - test/rbbt/segment/test_transformed.rb
212
- - test/rbbt/segment/test_overlaps.rb
213
- - test/rbbt/segment/test_annotation.rb
214
- - test/rbbt/segment/test_named_entity.rb
215
- - test/rbbt/segment/test_encoding.rb
216
- - test/rbbt/segment/test_range_index.rb
217
- - test/rbbt/segment/test_corpus.rb
218
- - test/test_spaCy.rb
219
- - test/test_helper.rb
217
+ - test/rbbt/nlp/open_nlp/test_sentence_splitter.rb
218
+ - test/rbbt/nlp/test_nlp.rb
219
+ - test/rbbt/nlp/genia/test_sentence_splitter.rb