rbbt-text 1.3.7 → 1.3.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8dfc374254fcbe88c8be6bfffd9a3cfabf6e23c953c11ecd2f61cf41027ff3d6
4
- data.tar.gz: 3d3211f41cfecea05862505d1508a4b7b76eecb3c90b3b0000194eb08033715e
3
+ metadata.gz: 92c6b4b9d3452c6b495fc9f291b551a73c8c150faee05053b7ecadc62ccbbd53
4
+ data.tar.gz: 70e341cf31466628c42b9947c882b64ff592e4703e06c8629ab56d513fe0a975
5
5
  SHA512:
6
- metadata.gz: 7ed870e46bae2c113d0885697bfbade6064732a89477833c640eaf4ee8bdb2c0fbf52f69f456af5eb30a82e56a7f0aeb37e71127f884430c3d315202a07fa3cb
7
- data.tar.gz: e31853e816321a5ead788036b5f67eecaca179c75168c0bb2804be1f18ae844031ab808a4e3c9d67e1f9a52f94ca478949798b8101e164eba32481c0182a1f58
6
+ metadata.gz: 2e8fdff40dd93072c3c377c59ff02d7374f3f81961dfc0f2596386776408c623543eb2e1f0da0112b3a8384d865c8331c659c650f2f2288a3d6282eca80e804e
7
+ data.tar.gz: d7b335d138eb48de51af8922d80d01715c4c61c025b525a9f63fcae789de329eedc2557cb10ba369f5987205ef87ffd85fc407fc3a29f6a75ef0b41951e4962b
@@ -4,10 +4,11 @@ module Document::Corpus
4
4
  PUBMED_NAMESPACE="PMID"
5
5
  def add_pmid(pmid, type = nil, update = false)
6
6
  type = :abstract if type.nil?
7
- if update == false
7
+
8
+ if ! (update || Array === pmid)
8
9
  id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
9
10
  documents = self.documents(id)
10
- return documents if documents.any?
11
+ return documents.first if documents.any?
11
12
  end
12
13
 
13
14
  pmids = Array === pmid ? pmid : [pmid]
@@ -27,7 +28,14 @@ module Document::Corpus
27
28
  document
28
29
  end
29
30
 
30
- Document.setup(res)
31
+ if Array === pmid
32
+ corpus = res.first.corpus if res.first
33
+ Document.setup(res, :corpus => corpus)
34
+ else
35
+ res = res.first
36
+ end
37
+
38
+ res
31
39
  end
32
40
 
33
41
  def add_pubmed_query(query, max = 3000, type = nil)
@@ -35,8 +43,8 @@ module Document::Corpus
35
43
  add_pmid(pmids, type)
36
44
  end
37
45
 
38
- self.claim "PMID" do |id, type|
46
+ self.claim "PMID" do |id,type,update|
39
47
  Log.debug "Claiming #{id}"
40
- self.add_pmid(id, type).first
48
+ self.add_pmid(id, type,update)
41
49
  end
42
50
  end
@@ -3,7 +3,7 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
- corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
6
+ corpus = Persist.open_tokyocabinet(corpus, false, :single, "BDB") if String === corpus
7
7
  corpus.extend Document::Corpus unless Document::Corpus === corpus
8
8
  corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
9
9
  corpus.close
@@ -10,34 +10,33 @@ class RegExpNER < NER
10
10
  while matchdata = text.match(regexp)
11
11
  pre = matchdata.pre_match
12
12
  post = matchdata.post_match
13
- match = matchdata[0]
14
13
 
15
14
  if matchdata.captures.any?
16
- capture = matchdata.captures.first
17
- more_pre, more_post = match.split(/#{capture}/)
18
-
19
- match = capture
20
- pre << more_pre if more_pre
21
- post = more_post << post if more_post
22
- end
23
-
24
- if match and not match.empty?
25
- NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
15
+ match = matchdata.captures.first
16
+ offset, eend = matchdata.offset(1)
17
+ NamedEntity.setup(match, :offset => start + offset, :entity_type => type)
26
18
  matches << match
19
+ start += offset + match.length
20
+ text = text[eend..-1]
21
+ else
22
+ match = matchdata[0]
23
+ NamedEntity.setup(match, :offset => pre.length + start, :entity_type => type)
24
+ matches << match
25
+ eend = match.length + pre.length
26
+ text = text[eend..-1]
27
+ start += match.length + pre.length
27
28
  end
28
-
29
- start += pre.length + match.length
30
- text = post
31
29
  end
32
30
 
33
31
  matches
34
32
  end
35
33
 
36
- def self.match_regexp_list(text, regexp_list, type = nil)
34
+ def self.match_regexp_list(text, regexp_list, type = nil, split_on_matches = false)
37
35
  matches = []
38
36
 
39
37
  regexp_list.each do |regexp|
40
- chunks = Segment.split(text, matches)
38
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
39
+ chunks = Segment.split(text, [])
41
40
  chunks.each do |chunk|
42
41
  new_matches = match_regexp(chunk, regexp, type)
43
42
  new_matches.each do |match| match.offset += chunk.offset; matches << match end
@@ -47,15 +46,15 @@ class RegExpNER < NER
47
46
  matches
48
47
  end
49
48
 
50
- def self.match_regexp_hash(text, regexp_hash)
49
+ def self.match_regexp_hash(text, regexp_hash, split_on_matches = false)
51
50
  matches = []
52
51
 
53
52
  regexp_hash.each do |type, regexp_list|
54
53
  regexp_list = [regexp_list] unless Array === regexp_list
55
- chunks = Segment.split(text, matches)
54
+ chunks = split_on_matches ? Segment.split(text, matches) : Segment.split(text, [])
56
55
  chunks.each do |chunk|
57
56
  chunk_offset = chunk.offset
58
- match_regexp_list(chunk, regexp_list, type).each do |match|
57
+ match_regexp_list(chunk, regexp_list, type, split_on_matches).each do |match|
59
58
  match.offset = match.offset + chunk_offset;
60
59
  matches << match
61
60
  end
@@ -65,7 +64,7 @@ class RegExpNER < NER
65
64
  matches
66
65
  end
67
66
 
68
- attr_accessor :regexps
67
+ attr_accessor :regexps, :split_on_matches
69
68
  def initialize(regexps = {})
70
69
  @regexps = regexps.collect{|p| p }
71
70
  end
@@ -87,7 +86,7 @@ class RegExpNER < NER
87
86
  end
88
87
 
89
88
  def match(text)
90
- matches = RegExpNER.match_regexp_hash(text, @regexps)
89
+ matches = RegExpNER.match_regexp_hash(text, @regexps, @split_on_matches)
91
90
  matches.collect do |m|
92
91
  NamedEntity.setup(m, :offset => m.offset, :type => m.type, :code => m)
93
92
  end
@@ -1,6 +1,7 @@
1
1
  require 'rbbt/nlp/nlp'
2
2
  require 'rbbt/segment'
3
3
  module NLP
4
+
4
5
  Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
5
6
 
6
7
  def self.returnFeatures(prevWord, delimiter, nextWord)
@@ -136,6 +137,7 @@ module NLP
136
137
  end
137
138
 
138
139
  def self.process_labels(marked_text, labels)
140
+ return "" if marked_text.empty? || labels.empty?
139
141
  out = ""
140
142
 
141
143
  count = 0
@@ -171,8 +173,17 @@ module NLP
171
173
  end
172
174
 
173
175
  def self.geniass_sentence_splitter_extension(text)
176
+ cleaned = text.gsub("\n",NEW_LINE_MASK)
177
+ events, marks = event_extraction(cleaned)
178
+
174
179
  Rbbt.software.opt.Geniass.produce
175
- require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
180
+ begin
181
+ ENV["LD_LIBRARY_PATH"] = Rbbt.software.opt.Geniass.lib.find + ":" + ENV["LD_LIBRARY_PATH"]
182
+ require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
183
+ rescue LoadError
184
+ raise LoadError, "Geniass ruby module needs to be able to find #{Rbbt.software.opt.Geniass.lib.find} in LD_LIBRARY_PATH"
185
+ end
186
+
176
187
  geniass = Geniass.new
177
188
  if not geniass.geniass_is_loaded
178
189
  Misc.in_dir Rbbt.software.opt.Geniass.find do
@@ -180,9 +191,6 @@ module NLP
180
191
  end
181
192
  end
182
193
 
183
- cleaned = text.gsub("\n",NEW_LINE_MASK)
184
- events, marks = event_extraction(cleaned)
185
-
186
194
  labels = events.split(/\n/).collect{|line|
187
195
  geniass.label(line)
188
196
  }
@@ -153,8 +153,8 @@ module SpaCy
153
153
 
154
154
  chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))
155
155
 
156
- source_id = chunk_index[source.offset].first || source.segid
157
- target_id = chunk_index[target.offset].first || target.segid
156
+ source_id = chunk_index[source.offset.to_i].first || source.segid
157
+ target_id = chunk_index[target.offset.to_i].first || target.segid
158
158
 
159
159
  path = Paths.dijkstra(graph, source_id, [target_id])
160
160
 
@@ -23,13 +23,14 @@ Score: #{score.inspect}
23
23
  end
24
24
 
25
25
  def html
26
- title = code.nil? ? entity_type : [entity_type, code].compact * ":"
26
+ title = code.nil? ? entity_type : [entity_type, code].compact * " - "
27
27
 
28
28
  text = <<-EOF
29
29
  <span class='Entity'\
30
30
  #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
31
31
  #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
32
32
  #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
33
+ #{segid.nil? ? "" : " attr-segid='#{segid}'"}\
33
34
  #{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
34
35
  >#{ self }</span>
35
36
  EOF
@@ -17,7 +17,7 @@ module Transformed
17
17
 
18
18
  segments = yield text
19
19
 
20
- segments = nil unless Array === segments && Segment === segments.first
20
+ segments = [] unless Array === segments && Segment === segments.first
21
21
 
22
22
  text.restore(segments, true)
23
23
  end
@@ -27,11 +27,11 @@ mv /tmp/clean.Makefile Makefile
27
27
  make geniass
28
28
  make libgeniass.so
29
29
 
30
+ move_opt "$name" "$extra"
31
+
30
32
  mkdir lib
31
33
  mv libgeniass.so lib
32
34
 
33
- build "$name" "$extra"
34
-
35
35
  cd "$OPT_BUILD_DIR/$name"
36
36
 
37
37
  mkdir ruby
@@ -43,8 +43,7 @@ create_makefile('Geniass')
43
43
  EOF
44
44
 
45
45
  cat > ruby/Geniass.cpp <<'EOF'
46
- #include "rice/Class.hpp"
47
- #include "rice/String.hpp"
46
+ #include "rice/rice.hpp"
48
47
 
49
48
  #include <iostream>
50
49
  #include <iomanip>
@@ -59,10 +58,12 @@ cat > ruby/Geniass.cpp <<'EOF'
59
58
  using namespace Rice;
60
59
  using namespace std;
61
60
 
62
- ME_Model model;
63
61
  bool geniass_loaded = false;
64
62
 
65
- void load_geniass(){
63
+ bool geniass_is_loaded(Object self){ return(geniass_loaded); };
64
+
65
+ ME_Model model;
66
+ void load_geniass(Object self){
66
67
  printf("loading model");
67
68
  string modelFile = "model1-1.0";
68
69
  model.load_from_file(modelFile.c_str());
@@ -70,8 +71,6 @@ void load_geniass(){
70
71
  printf("..done\n");
71
72
  }
72
73
 
73
- bool geniass_is_loaded(){ return(geniass_loaded); };
74
-
75
74
  void split(string& str, vector<string>& tokens)
76
75
  {
77
76
  istringstream in(str);
@@ -85,17 +84,23 @@ void split(string& str, vector<string>& tokens)
85
84
  }
86
85
  }
87
86
 
88
- string label(string line){
87
+ Object label(Object self, String rb_line){
89
88
  vector<string> tokens;
90
- split(line, tokens);
91
89
  ME_Sample s;
92
90
 
91
+ string line = rb_line.c_str();
92
+ split(line, tokens);
93
+
93
94
  for(vector<string>::const_iterator token = tokens.begin() + 1;
94
95
  token != tokens.end(); ++token){
95
96
  s.add_feature(*token);
96
97
  }
98
+
97
99
  (void) model.classify(s);
98
- return(s.label);
100
+ string label = s.label;
101
+ VALUE x;
102
+ x = rb_str_new_cstr(label.c_str());
103
+ return(x);
99
104
  }
100
105
 
101
106
  extern "C"
@@ -103,16 +108,20 @@ void Init_Geniass()
103
108
  {
104
109
  Class rb_cGeniass =
105
110
  define_class("Geniass")
111
+ .define_method("geniass_is_loaded", &geniass_is_loaded)
106
112
  .define_method("load_geniass", &load_geniass)
107
113
  .define_method("label", &label)
108
- .define_method("geniass_is_loaded", &geniass_is_loaded);
114
+ ;
109
115
  }
110
116
 
117
+
111
118
  EOF
112
119
 
113
120
  cd ruby
114
121
  ruby extconf.rb --with-geniass-dir="$OPT_DIR/$name"
115
122
  make
116
123
 
124
+ setup "$name" "$extra"
125
+
117
126
 
118
127
 
@@ -6,6 +6,7 @@ tokens do
6
6
 
7
7
  # Some (possible) single letters first
8
8
  receptor /^(?:receptor|r)s?$/i
9
+ activator /^(?:activator|p)s?$/i
9
10
  protein /^(?:protein|p)s?$/i
10
11
  roman /^[IV]+$/
11
12
  greek_letter do |w| $inverse_greek[w.downcase] != nil end
@@ -58,6 +59,8 @@ comparisons do
58
59
 
59
60
  diff.promoter -10
60
61
  diff.receptor -10
62
+ diff.activator -10
63
+
61
64
  diff.similar -10
62
65
  diff.capital -10
63
66
 
@@ -7,7 +7,7 @@ class TestCorpusPubmed < Test::Unit::TestCase
7
7
  def test_add_pmid
8
8
  corpus = Document::Corpus.setup({})
9
9
 
10
- document = corpus.add_pmid("33359141", :abstract).first
10
+ document = corpus.add_pmid("33359141", :abstract, true)
11
11
  title = document.to(:title)
12
12
  assert title.include?("COVID-19")
13
13
  end
@@ -79,6 +79,15 @@ class TestRegExpNER < Test::Unit::TestCase
79
79
  assert_equal :should, matches.select{|m| m.entity_type == :should}[0].entity_type
80
80
  end
81
81
 
82
+ def test_entities_captures_repeat
83
+ sentence = "In a sentence I should find not this but this"
84
+
85
+ ner = RegExpNER.new({:this => /not this but (this)/})
86
+ matches = ner.entities(sentence)
87
+ assert sentence[0..matches.first.offset-1].include?('this')
88
+ end
89
+
90
+
82
91
 
83
92
  def test_regexp_order
84
93
  text =<<-EOF
@@ -12,7 +12,6 @@ sentence. This is
12
12
  another broken sentence.
13
13
  EOF
14
14
 
15
- iii NLP.geniass_sentence_splitter(text)
16
15
  assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
17
16
  end
18
17
 
@@ -37,7 +36,17 @@ sentence. This is
37
36
  another broken sentence.
38
37
  EOF
39
38
 
40
- assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
39
+ Log.with_severity 0 do
40
+ assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
41
+ end
42
+ end
43
+
44
+ def test_sentence_cmi
45
+ text =<<-EOF
46
+ The COVID-19 infection was reported as the main cause of death and patients with a higher mortality risk were those aged ≥65 years [adjusted HR = 3.40 (95% CI 2.20-5.24)], with a higher disease severity [adjusted HR = 1.87 (95%CI 1.43-2.45)].
47
+ EOF
48
+
49
+ iii NLP.geniass_sentence_splitter(text)
41
50
  end
42
51
  end
43
52
 
@@ -73,6 +73,13 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
73
73
  assert_equal original.gsub(/TP53/, 'GN'), a
74
74
  end
75
75
 
76
+ Transformed.with_transform(a, [gene2], "GN") do
77
+ Transformed.with_transform(a, [gene1], "GN") do
78
+ assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
79
+ end
80
+ assert_equal original.gsub(/CDK5R1/, 'GN'), a
81
+ end
82
+
76
83
  Transformed.with_transform(a, [gene1], "GN") do
77
84
  Transformed.with_transform(a, [gene2], "LONG_GENE_PLACEHOLDER") do
78
85
  assert_equal original.gsub(/TP53/, 'GN').sub('CDK5R1', "LONG_GENE_PLACEHOLDER"), a
@@ -144,7 +151,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
144
151
  gene2.entity_type = "Protein"
145
152
 
146
153
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
147
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
154
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':45..50' title='Protein'>CDK5R1</span> protein", a
148
155
  end
149
156
  end
150
157
 
@@ -165,7 +172,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
165
172
  gene2.entity_type = "Protein"
166
173
 
167
174
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
168
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
175
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':37..40' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' attr-segid=':55..60' title='Protein'>CDK5R1</span> protein", a
169
176
  end
170
177
  end
171
178
 
@@ -185,9 +192,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
185
192
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
186
193
 
187
194
  Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
188
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
195
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
189
196
  Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
190
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
197
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' attr-segid=':27..121' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' attr-segid=':27..30' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
191
198
  end
192
199
  end
193
200
  end
@@ -415,6 +422,5 @@ This is another sentence. Among the nonstructural proteins, the leader protein (
415
422
  end
416
423
 
417
424
  end
418
-
419
425
  end
420
426
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.7
4
+ version: 1.3.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-21 00:00:00.000000000 Z
11
+ date: 2023-01-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util