rbbt-text 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 496288d7d3ff1215ded1fd210192d5887a6a071eea5f322295a669a5d648d77b
4
- data.tar.gz: 47996496009cbcdaab38a9dc9bf6efbbe7fc0145f315b0a48bfab0f543742f94
3
+ metadata.gz: '0846f900d745dd27df8006eecbc9d294f9f38a23dd76001de2a5dc0313db7e22'
4
+ data.tar.gz: 675985882a6c8b9813f620d7ef0a555efa5c148c7c2fe36e0030f84f3fd88cf0
5
5
  SHA512:
6
- metadata.gz: 36e7415ad06207066844a30001c8541865f066d1e83a4a2ddc5182c54b704cd3d442cbccce219bd2114717a83656d07558c42725eca75597fea239b6e13244ab
7
- data.tar.gz: 988eff4d242d0425910b96fac4188df079c8c53c3abea2825cc97d5af5118841680705fa33461a5b4cfa7b8d6b32a486465e44b75f20fad324e4623c6c8083d8
6
+ metadata.gz: dfd9c333b94181496134b825c63d6e93a0390f81d426526f79c00cf12556021b60004b29b57ca9b0b274141937027f7bc780552a60de007e5f790b19910354c0
7
+ data.tar.gz: 205beeb8829c8358fd29c0a18351522e566106e24220af3d7bec3676694d37d682b92243e4fd4cd495b542f9945a28cf8585e587342672d31779d0b21b53ae4e
data/lib/rbbt/document.rb CHANGED
@@ -53,3 +53,9 @@ module Document
53
53
  alias id docid
54
54
  end
55
55
 
56
+ #class String
57
+ # def docid
58
+ # digest = Misc.digest(self)
59
+ # ["STRING", digest, nil, nil] * ":"
60
+ # end
61
+ #end
@@ -13,7 +13,7 @@ module Document
13
13
  end
14
14
 
15
15
  docid = self.docid
16
- segments.each{|s| s.docid = docid if s.docid.nil? }
16
+ segments.each{|s| s.docid = docid }
17
17
 
18
18
  segments
19
19
  end
@@ -36,7 +36,7 @@ module Document
36
36
 
37
37
  docid = document.docid
38
38
 
39
- segments.each{|s| s.docid = docid if s.docid.nil? }
39
+ segments.each{|s| s.docid = docid }
40
40
 
41
41
  segments
42
42
  end
@@ -3,6 +3,7 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
+ corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
6
7
  corpus.extend Document::Corpus unless Document::Corpus === corpus
7
8
  corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
8
9
  corpus
@@ -16,7 +17,8 @@ module Document::Corpus
16
17
  end
17
18
  end
18
19
 
19
- def docids(prefix)
20
+ def docids(*prefix)
21
+ prefix = prefix * ":"
20
22
  prefix += ":" unless prefix == :all || prefix[-1] == ":"
21
23
  docids = self.read_and_close do
22
24
  prefix == :all ? self.keys : self.prefix(prefix)
@@ -24,8 +26,8 @@ module Document::Corpus
24
26
  DocID.setup(docids, :corpus => self)
25
27
  end
26
28
 
27
- def documents(prefix)
28
- self.docids(prefix).document
29
# Resolves the docids selected by the given prefix parts (same arguments as
# +docids+) and returns whatever their +document+ method yields.
def documents(*prefix)
  selected = docids(*prefix)
  selected.document
end
30
32
 
31
33
  def [](*args)
@@ -16,6 +16,7 @@ module Document::Corpus
16
16
  end
17
17
  Log.debug "Loading pmid #{pmid}"
18
18
  add_document(document)
19
+ document
19
20
  end
20
21
 
21
22
  Document.setup(res)
@@ -39,14 +39,15 @@ class Abner < NER
39
39
  types = res[1]
40
40
  strings = res[0]
41
41
 
42
+ docid = Misc.digest(text)
42
43
  global_offset = 0
43
44
  strings.zip(types).collect do |mention, type|
44
45
  mention = mention.to_s;
45
46
  offset = text.index(mention)
46
47
  if offset.nil?
47
- NamedEntity.setup(mention, nil, type.to_s)
48
+ NamedEntity.setup(mention, :docid => docid, :entity_type => type)
48
49
  else
49
- NamedEntity.setup(mention, offset + global_offset, type.to_s)
50
+ NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
50
51
  text = text[offset + mention.length..-1]
51
52
  global_offset += offset + mention.length
52
53
  end
@@ -55,6 +55,7 @@ class Banner < NER
55
55
  # text.
56
56
  def match(text)
57
57
  return [] if text.nil?
58
+ text = text.dup if text.frozen?
58
59
  text.gsub!(/\n/,' ')
59
60
  text.gsub!(/\|/,'/') # Character | gives an error
60
61
  return [] if text.strip.empty?
@@ -66,6 +67,7 @@ class Banner < NER
66
67
  @parenPP.postProcess(sentence)
67
68
  tagged = sentence.getSGML
68
69
 
70
+ docid = Misc.digest text
69
71
  res = tagged.scan(/<GENE>.*?<\/GENE>/).
70
72
  collect{|r|
71
73
  r.match(/<GENE>(.*?)<\/GENE>/)
@@ -73,7 +75,7 @@ class Banner < NER
73
75
  mention.sub!(/^\s*/,'')
74
76
  mention.sub!(/\s*$/,'')
75
77
  offset = text.index(mention)
76
- NamedEntity.setup(mention, offset, 'GENE')
78
+ NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
77
79
  mention
78
80
  }
79
81
  res
data/lib/rbbt/ner/brat.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/segment/named_entity'
2
- require 'rbbt/text/segment/relationship'
2
+ require 'rbbt/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
5
5
 
@@ -31,7 +31,8 @@ module Linnaeus
31
31
  init unless defined? @@Matcher
32
32
 
33
33
  @@Matcher.match(text).toArray().collect do |mention|
34
- NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
34
+ best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
35
+ NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
35
36
  end
36
37
  end
37
38
  end
@@ -0,0 +1,229 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/util/misc'
4
+ require 'rbbt/util/simpleDSL'
5
+
6
# Turns text into the per-token feature rows used to train and run a CRF++
# named-entity model.
#
# Feature definitions are loaded through SimpleDSL's +parse+, either from a
# config file or from a block. Each definition names a feature and supplies a
# Regexp or a block; +context+, +window+ and +direction+ are extra DSL entries
# that control template generation and token order.
class NERFeatures
  include SimpleDSL

  # When true, token order is reversed before features are computed.
  attr_accessor :reverse

  # Splits +text+ into tokens: words (optionally hyphenated or containing
  # numbers) and single punctuation/delimiter characters.
  def self.tokens(text)
    text.scan(/
      \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
      \w-\w*|
      \w+-[A-Z](?!\w)|
      \w+|
      [.,()\/\[\]{}'"+-]
    /x)
  end

  # Returns the tokens of +text+ in reverse order, joined by single spaces.
  def self.reverse(text)
    tokens(text).reverse.join(" ")
  end

  # DSL hook: registers feature +name+. The action is the first extra
  # argument, else the block, else a case-insensitive regexp built from the
  # name itself (with an optional trailing "s").
  #
  # Raises RuntimeError ("Wrong format") unless the action is a Proc or Regexp.
  # Returns the feature name as a String.
  def define(name, *args, &block)
    action = args[0] || block || /#{name.to_s}s?/i
    raise "Wrong format" unless action.is_a?(Proc) || action.is_a?(Regexp)

    key = name.to_s
    @types[key] = action
    @order.push key

    key
  end

  # file    - path of a feature config; when neither +file+ nor a block is
  #           given the config shipped under share/rner is used.
  # reverse - initial value for the reverse flag.
  def initialize(file = nil, reverse = false, &block)
    @types = {}
    @order = []
    @context = []
    @reverse = reverse

    # The gem ships its default definitions as share/rner/config.rb
    file ||= Rbbt.share.rner['config.rb'].find if !file && !block

    parse(:define, file, &block)
  end

  # The :define entry of @config (filled in outside this class, presumably by
  # SimpleDSL#parse); +train+ writes it next to the model.
  def config
    @config[:define]
  end

  # DSL hook: relative token positions used for context features in +template+.
  def window(positions)
    @window = positions
  end

  # DSL hook: marks one feature (or an Array of features) as a context feature.
  def context(name, &block)
    if name.is_a? Array
      @context += name
    else
      @context.push name

      # The block might be wrongly assigned to this function
      # instead of the actual definition, fix that.
      @types[name] = block if block
    end
  end

  # DSL hook: `direction :reverse` switches on reversed token order.
  def direction(dir)
    @reverse = true if dir.to_sym == :reverse
  end

  # Returns [word, value_1, ..., value_n] with one value per defined feature,
  # in definition order.
  def features(word)
    @order.each_with_object([word]) do |name, values|
      action = @types[name]
      values << if action.is_a?(Proc)
                  action.call(word)
                else
                  match = action.match(word)
                  # A first capture group becomes the value; otherwise the
                  # feature is a plain true/false flag.
                  match ? (match[1] || true) : false
                end
    end
  end

  # Builds the CRF++ template text: one unigram line per feature, plus one line
  # per window position for context features, closed by a "B" line.
  def template(window = nil)
    window ||= @window || [1, -1]

    lines = []
    @order.each_with_index do |feat, index|
      column = index + 1 # column 0 of every row is the token itself
      lines << "U#{feat}: %x[0,#{column}]\n"
      next unless @context.include?(feat)

      window.each { |pos| lines << "U#{feat}##{pos}: %x[#{pos},#{column}]\n" }
    end
    lines << "B\n"

    lines.join
  end

  # Feature rows for every token of +text+. When +positive+ is not nil a label
  # is appended to each row: 0 for negative text, 1 for the first token of a
  # positive chunk and 2 for its remaining tokens.
  def text_features(text, positive = nil)
    text = self.class.reverse(text) if @reverse
    first = true
    self.class.tokens(text).collect do |token|
      row = features(token)
      unless positive.nil?
        row << (positive ? (first ? 1 : 2) : 0)
        first = false
      end
      row
    end
  end

  # Labelled feature rows for +text+, where the strings in +mentions+ mark the
  # positive spans.
  def tagged_features(text, mentions)
    mentions ||= []
    mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
    # NOTE(review): Regexp.quote escapes a space as backslash-space, so the
    # trailing sub looks like it rarely (if ever) fires - confirm the intent.
    re = mentions.collect { |mention|
      Regexp.quote(mention.gsub(/\s+/, ' ')).sub(/\\s/, '\s+')
    }.join("|")

    positive = false
    rows = []
    # Splitting on a capturing group alternates non-mention / mention chunks
    text.split(/(#{re})/).each do |chunk|
      chunk_rows = text_features(chunk, positive)
      positive = !positive
      rows = @reverse ? chunk_rows + rows : rows + chunk_rows
    end
    rows
  end

  # Runs crf_learn on the +features+ file, writing the model to +model+ and the
  # feature config to "<model>.config".
  def train(features, model)
    tmp_template = TmpFile.tmp_file("template-")
    Open.write(tmp_template, template)

    crf_learn = File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')
    # Argument-vector form: no shell is involved, so paths with quotes or
    # spaces are passed through verbatim.
    system crf_learn, tmp_template.to_s, features.to_s, model.to_s
    Open.write(model + '.config', config)
  ensure
    # Remove the temporary template even when a step above raised
    FileUtils.rm_f tmp_template if tmp_template
  end
end
164
+
165
# CRF++ based entity tagger: tokens and features come from NERFeatures, labels
# from a CRFPP::Tagger loaded with the given model.
class NER

  # model - path of a CRF++ model; "<model>.config" must hold the feature
  #         config it was trained with. Defaults to ner/model/BC2 under
  #         Rbbt.datadir.
  def initialize(model = nil)
    begin
      require 'CRFPP'
    rescue LoadError
      # Only a failed load should trigger the bundled fallback; rescuing
      # Exception would also swallow interrupts and exits.
      require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
    end

    model ||= File.join(Rbbt.datadir, 'ner/model/BC2')

    @parser = NERFeatures.new(model + '.config')
    @reverse = @parser.reverse
    @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
  end

  # Tags +text+ and returns the found mentions as Strings (tokens joined by a
  # single space, restored to reading order when the model runs reversed).
  def extract(text)
    features = @parser.text_features(text)

    @tagger.clear
    features.each { |feats| @tagger.add(feats.join(" ")) }
    @tagger.parse

    found = []
    mention = []

    @tagger.size.times do |i|
      label = @tagger.y(i)
      word = @tagger.x(i, 0)

      # A closing parenthesis is only kept when the current mention opened one
      if word == ')'
        mention.push(')') if mention.join =~ /\(/
        next
      end

      case label
      when 1
        # Label 1 starts a mention; close the running one first when it (or any
        # of its tokens) is_special?.
        # NOTE(review): is_special? is not defined in this file - presumably a
        # String extension loaded elsewhere.
        if mention.any? && (mention.join(" ").is_special? || mention.any? { |m| m.is_special? })
          found.push(mention)
          mention = []
        end
        mention.push(word)
      when 2
        mention.push(word)
      when 0
        found.push(mention) if mention.any?
        mention = []
      end
    end

    found << mention if mention.any?

    found.collect do |list|
      list = list.reverse if @reverse
      list.join(" ")
    end
  end

end
227
+
228
+
229
+
@@ -249,7 +249,8 @@ class TokenTrieNER < NER
249
249
  match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
250
250
  }
251
251
 
252
- NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
252
+ type = type.first
253
+ NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
253
254
  end
254
255
 
255
256
  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
@@ -2,30 +2,55 @@ require 'rbbt/segment'
2
2
  require 'rbbt/document'
3
3
  require 'rbbt/segment/annotation'
4
4
  require 'rbbt/util/python'
5
+ require 'rbbt/network/paths'
5
6
 
6
7
  module SpaCy
7
8
 
8
- PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
9
+ TOKEN_PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
10
+ CHUNK_PROPERTIES = %w(lemma_)
9
11
 
10
- def self.tokens(text, lang = 'en')
12
# Returns the spaCy pipeline for +lang+, loading it through RbbtPython on first
# use and keeping it in a per-language cache afterwards.
def self.nlp(lang = 'en_core_web_md')
  @@nlp ||= {}
  cached = @@nlp[lang]
  return cached if cached

  @@nlp[lang] = RbbtPython.run(:spacy) { spacy.load(lang) }
end
18
+
19
# Runs the +lang+ pipeline over +text+ and returns its tokens as an Array, in
# document order.
def self.tokens(text, lang = 'en_core_web_sm')
  pipeline = nlp(lang)
  doc = pipeline.call(text)

  doc.__len__.times.map { |position| doc.__getitem__(position) }
end
32
+
33
# Runs the +lang+ pipeline over +text+ and collects the items yielded by the
# document's noun_chunks iterator into an Array.
def self.chunks(text, lang = 'en_core_web_sm')
  pipeline = nlp(lang)
  doc = pipeline.call(text)
  chunk_iterator = doc.noun_chunks.__iter__

  collected = []
  RbbtPython.iterate(chunk_iterator) { |item| collected << item }
  collected
end
22
47
 
23
- def self.segments(text, lang = 'en')
24
- docid = text.docid if Document === text
48
+ def self.segments(text, lang = 'en_core_web_sm')
49
+ docid = text.docid if Document === text
25
50
  corpus = text.corpus if Document === text
26
51
  tokens = self.tokens(text, lang).collect do |token|
27
52
  info = {}
28
- PROPERTIES.each do |p|
53
+ TOKEN_PROPERTIES.each do |p|
29
54
  info[p] = token.instance_eval(p.to_s)
30
55
  end
31
56
  info[:type] = "SpaCy"
@@ -35,7 +60,120 @@ module SpaCy
35
60
  info[:corpus] = corpus if corpus
36
61
  SpaCyToken.setup(token.text, info)
37
62
  end
38
- SpaCyToken.setup(tokens, :corpus => corpus)
63
+
64
+ tokens
65
+ end
66
+
67
# Converts every noun chunk of +text+ into a SpaCySpan.
#
# Each span carries the CHUNK_PROPERTIES values, type "SpaCy", the offset of
# the chunk's first token and a :dep string listing, as
# "token_idx:dep->head_idx" entries joined by ";", the tokens whose head lies
# outside the chunk. docid and corpus are copied when +text+ is a Document.
def self.chunk_segments(text, lang = 'en_core_web_sm')
  is_document = Document === text
  docid = text.docid if is_document
  corpus = text.corpus if is_document

  self.chunks(text, lang).collect do |chunk|
    info = {}
    CHUNK_PROPERTIES.each { |property| info[property] = chunk.instance_eval(property.to_s) }

    first_idx = last_idx = nil
    outside_deps = []
    RbbtPython.iterate chunk.__iter__ do |token|
      # Both bounds are fixed when the first token is visited
      first_idx = token.idx if first_idx.nil?
      last_idx = first_idx + chunk.text.length if last_idx.nil?

      head_outside = token.head.idx < first_idx || token.head.idx > last_idx
      if head_outside
        outside_deps << token.idx.to_s + ":" + token.dep_ + "->" + token.head.idx.to_s
      end
    end

    info[:type] = "SpaCy"
    info[:offset] = chunk.__iter__.__next__.idx
    info[:dep] = outside_deps * ";"
    info[:docid] = docid if docid
    info[:corpus] = corpus if corpus

    SpaCySpan.setup(chunk.text, info)
  end
end
92
+
93
# Builds a Hash from each token's segid to the Segment.index entries found at
# the position named after "->" in the token's dep annotation. With +reverse+
# every edge is also added in the opposite direction.
def self.dep_graph(text, reverse = false, lang = 'en_core_web_md')
  segments = self.segments(text, lang)
  position_index = Segment.index(segments)

  associations = {}
  segments.each do |segment|
    _kind, head_position = segment.dep.split("->")
    associations[segment.segid] = position_index[head_position.to_i]
  end

  return associations unless reverse

  # Walk a snapshot so edges added below are not revisited; += builds new
  # arrays, leaving the snapshot's lists untouched.
  associations.dup.each do |origin, destinations|
    destinations.each do |destination|
      associations[destination] ||= []
      next if associations[destination].include?(origin)
      associations[destination] += [origin]
    end
  end

  associations
end
115
+
116
# Extends the (forward) token graph from +dep_graph+ with one entry per noun
# chunk, pointing at whatever the combined token+chunk index holds over the
# ranges of the chunk's external heads. With +reverse+ every edge is also added
# in the opposite direction.
def self.chunk_dep_graph(text, reverse = false, lang = 'en_core_web_md')
  associations = dep_graph(text, false, lang)

  chunk_list = self.chunk_segments(text, lang)
  token_list = self.segments(text, lang)
  position_index = Segment.index(token_list + chunk_list)

  chunk_list.each do |chunk|
    head_ids = chunk.dep.split(";").collect { |entry|
      _kind, head_position = entry.split("->")
      position_index[head_position.to_i]
    }.flatten

    targets = head_ids.collect { |head_id|
      # The id ends in ":<start>..<end>"; look up every position in that span
      first, last = head_id.split(":").last.split("..").map(&:to_i)
      Range.new(first, last).collect { |pos| position_index[pos] }.uniq
    }.flatten

    associations[chunk.segid] = targets
  end

  return associations unless reverse

  # Walk a snapshot so edges added below are not revisited; += builds new
  # arrays, leaving the snapshot's lists untouched.
  associations.dup.each do |origin, destinations|
    destinations.each do |destination|
      associations[destination] ||= []
      next if associations[destination].include?(origin)
      associations[destination] += [origin]
    end
  end

  associations
end
150
+
151
# Finds a path between +source+ and +target+ in the chunk dependency graph of
# +text+ using Paths.dijkstra. Each endpoint is the first chunk-index entry at
# the segment's offset, falling back to the segment's own segid.
#
# Returns the reversed dijkstra result, or nil when no path was found.
def self.paths(text, source, target, reverse = true, lang = 'en_core_web_md')
  graph = SpaCy.chunk_dep_graph(text, reverse, lang)
  chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))

  endpoints = [source, target].map do |segment|
    chunk_index[segment.offset].first || segment.segid
  end
  source_id, target_id = endpoints

  route = Paths.dijkstra(graph, source_id, [target_id])
  route.nil? ? nil : route.reverse
end
165
+
166
# Runs `spacy init fill-config` on +base+ (handed to TmpFile.with_file,
# presumably as the content of the temporary base config - confirm).
#
# When +target+ is given the filled config is written there; otherwise it is
# written to a temporary file whose content is read back inside the block.
def self.config(base, target = nil)
  TmpFile.with_file(base) do |baseconfig|
    if target
      CMD.cmd(:spacy, "init fill-config #{baseconfig} #{target}")
    else
      TmpFile.with_file do |tmptarget|
        CMD.cmd(:spacy, "init fill-config #{baseconfig} #{tmptarget}")
        # Read the file the command just wrote (the previous code referenced
        # an undefined name here and raised NameError)
        Open.read(tmptarget)
      end
    end
  end
end
40
178
  end
41
179
 
@@ -43,10 +181,15 @@ module SpaCyToken
43
181
  extend Entity
44
182
  include SegmentAnnotation
45
183
 
46
- self.annotation *SpaCy::PROPERTIES
184
+ self.annotation *SpaCy::TOKEN_PROPERTIES
47
185
  self.annotation :dep
48
186
  end
49
187
 
50
- if __FILE__ == $0
51
- ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
188
# Segment annotation for spaCy noun chunks: declares the CHUNK_PROPERTIES
# annotations plus :dep.
module SpaCySpan
  extend Entity
  include SegmentAnnotation

  annotation(*SpaCy::CHUNK_PROPERTIES)
  annotation(:dep)
end
195
+
@@ -0,0 +1,24 @@
1
+ require 'rbbt/segment'
2
+
3
# Annotation for a relationship, described either by a segment or by a type
# plus a list of terms.
module Relationship
  extend Annotation
  self.annotation :segment
  self.annotation :terms
  self.annotation :type

  # The segment when there is one, otherwise "<type>: <term>, <term>, ...".
  def text
    return segment if segment

    # Interpolation and Array() keep this working when type is a Symbol or nil
    # and when terms is nil or a single value instead of an Array.
    "#{type}: #{Array(terms) * ', '}"
  end

  # The relationship text wrapped in a <span class='Relationship'> element.
  def html
    # Built directly rather than through a heredoc, so the markup does not
    # depend on how this source file is indented.
    "<span class='Relationship'>#{ text }</span>"
  end
end
data/lib/rbbt/segment.rb CHANGED
@@ -49,10 +49,13 @@ module Segment
49
49
  length
50
50
  end
51
51
 
52
+
52
53
  def eend
53
54
  offset.to_i + length - 1
54
55
  end
55
56
 
57
+ alias end eend
58
+
56
59
  def range
57
60
  (offset.to_i..eend)
58
61
  end
@@ -8,6 +8,10 @@ module NamedEntity
8
8
 
9
9
  self.annotation :entity_type, :code, :score
10
10
 
11
# The :entity_type annotation value, falling back to :type when it is unset.
def entity_type
  values = annotation_values
  values[:entity_type] || values[:type]
end
14
+
11
15
  def report
12
16
  <<-EOF
13
17
  String: #{ self }
@@ -6,7 +6,7 @@ module Segment::RangeIndex
6
6
  SegID.setup(res, :corpus => corpus)
7
7
  end
8
8
 
9
- def self.index(segments, corpus, persist_file = :memory)
9
+ def self.index(segments, corpus = nil, persist_file = :memory)
10
10
  segments = segments.values.flatten if Hash === segments
11
11
 
12
12
  annotation_index =
@@ -0,0 +1,51 @@
1
# Feature definitions in the DSL read by NERFeatures: each entry is
# `name /regexp/` or `name do |w| ... end`. For a regexp, the first capture
# group (when there is one) is the feature value; otherwise the feature is
# true/false depending on whether the token matches.

# Whole-token character classes
isLetters     /^[A-Z]+$/i
isUpper       /^[A-Z]+$/
isLower       /^[a-z]+$/
isDigits      /^[0-9]+$/i
isRoman       /^[IVX]+$/
isGreek       /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
isPunctuation /^[,.;]$/
isDelim       /^[\/()\[\]{}\-]$/
isNonWord     /^[^\w]+$/
# The anchors must wrap the whole alternation; without the group any token
# containing "or" or "&" matched. The group is non-capturing so the feature
# stays a boolean.
isConjunction /^(?:and|or|&|,)$/

# Partial matches
hasLetters     /[A-Z]/i
hasUpper       /.[A-Z]/
hasLower       /[a-z]/
hasDigits      /[0-9]/i
hasGreek       /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
hasPunctuation /[,.;]/
hasDelim       /[\/()\[\]{}\-]/
hasNonWord     /[^\w]/
caspMix        /[a-z].[A-Z]/
keywords       /(?:protein|gene|domain|ase)s?$/
hasSuffix      /[a-z][A-Z0-9]$/

numLetters do |w| w.scan(/[A-Z]/i).length end
numDigits  do |w| w.scan(/[0-9]/).length end
#
# Leading / trailing characters (the capture group is the feature value)
prefix_3 /^(...)/
prefix_4 /^(....)/
suffix_3 /(...)$/
suffix_4 /(....)$/


# NOTE(review): sub replaces only the first match of each class; gsub may have
# been intended, but changing it would alter the features of trained models.
token1 do |w|
  w.sub(/[A-Z]/,'A').
    sub(/[a-z]/,'a').
    sub(/[0-9]/,'0').
    sub(/[^0-9a-z]/i,'x')
end
token2 do |w|
  w.sub(/[A-Z]+/,'A').
    sub(/[a-z]+/,'a').
    sub(/[0-9]+/,'0').
    sub(/[^0-9a-z]+/i,'x')
end
token3 do |w| w.downcase end
special do |w| w.is_special? end

# Features that also get window lines in the template, and the window positions
context %w(special token2 isPunctuation isDelim)
window %w(1 2 3 -1 -2 -3)
#direction :reverse
+
@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
7
7
  def test_add_pmid
8
8
  corpus = Document::Corpus.setup({})
9
9
 
10
- document = corpus.add_pmid("32299157", :abstract).first
10
+ document = corpus.add_pmid("33359141", :abstract).first
11
+ iii document.docid
11
12
  title = document.to(:title)
12
13
  assert title.include?("COVID-19")
13
14
  end
@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
4
4
  require 'rbbt/segment'
5
5
  require 'rbbt/document/annotation'
6
6
  require 'rbbt/segment/named_entity'
7
+ require 'rbbt/ner/abner'
7
8
 
8
9
  class TestAnnotation < Test::Unit::TestCase
9
10
  class CalledOnce < Exception; end
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
28
29
  self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
29
30
  end
30
31
 
32
+ Document.define :abner do
33
+ $called_once = true
34
+ Abner.new.match(self)
35
+ end
36
+
37
+
31
38
  Document.persist :ner
32
39
  end
33
40
 
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
133
140
  text.ner
134
141
 
135
142
  assert ! $called_once
136
-
143
+
144
+ assert_equal text.abner.first.docid, text.docid
145
+
137
146
  assert text.ner.first.segid.include?("TEST:")
138
147
  end
139
148
  end
@@ -29,5 +29,19 @@ class TestDocumentCorpus < Test::Unit::TestCase
29
29
  assert corpus.docids("TEST:").include?(text.docid)
30
30
  end
31
31
  end
32
+
33
+ def test_load
34
+ text = "This is a document"
35
+ Document.setup(text, "TEST", "test_doc1", nil)
36
+
37
+ TmpFile.with_file do |path|
38
+ corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
39
+ corpus.extend Document::Corpus
40
+
41
+ corpus.add_document(text)
42
+
43
+ assert corpus.docids("TEST:").include?(text.docid)
44
+ end
45
+ end
32
46
  end
33
47
 
@@ -0,0 +1,132 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/ner/rner'
4
+ require 'test/unit'
5
+
6
+ class TestRNer < Test::Unit::TestCase
7
+
8
+ def setup
9
+ @parser = NERFeatures.new() do
10
+ isLetters /^[A-Z]+$/i
11
+ context prefix_3 /^(...)/
12
+ downcase do |w| w.downcase end
13
+
14
+ context %w(downcase)
15
+ end
16
+ end
17
+
18
+ def test_config
19
+ config = <<-EOC
20
+ isLetters /^[A-Z]+$/i
21
+ context prefix_3 /^(...)/
22
+ downcase do |w| w.downcase end
23
+
24
+ context %w(downcase)
25
+ EOC
26
+
27
+ assert_equal config.strip, @parser.config.strip
28
+ end
29
+
30
+ def test_reverse
31
+ assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
32
+ assert_equal(
33
+ ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
34
+ NERFeatures.reverse(
35
+ "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
36
+ ))
37
+ end
38
+
39
+ def test_features
40
+ assert_equal @parser.features("abCdE"), ["abCdE",true,'abC','abcde']
41
+ end
42
+
43
+ def test_template
44
+ template =<<-EOT
45
+ UisLetters: %x[0,1]
46
+ Uprefix_3: %x[0,2]
47
+ Uprefix_3#1: %x[1,2]
48
+ Uprefix_3#-1: %x[-1,2]
49
+ Udowncase: %x[0,3]
50
+ Udowncase#1: %x[1,3]
51
+ Udowncase#-1: %x[-1,3]
52
+ B
53
+ EOT
54
+
55
+ assert(@parser.template == template)
56
+ end
57
+
58
+ def test_tokens
59
+ assert( NERFeatures.tokens("A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH.")==
60
+ ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."])
61
+
62
+
63
+ end
64
+ def test_text_features
65
+
66
+ assert(@parser.text_features("abCdE 1234") == [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]])
67
+ assert(@parser.text_features("abCdE 1234",true) == [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]])
68
+ assert(@parser.text_features("abCdE 1234",false) == [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]])
69
+
70
+ end
71
+
72
+ def test_tagged_features
73
+ assert_equal(
74
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
75
+ ["of",true, false, "of", 0],
76
+ ["GENE1",false, "GEN", "gene1", 1],
77
+ [".", false, false, ".", 0]],
78
+ @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
79
+
80
+ assert_equal(
81
+ [["GENE1",false, "GEN", "gene1", 1],
82
+ ["phosphorilation",true, "pho", "phosphorilation", 0]],
83
+ @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
84
+
85
+
86
+ assert_equal(
87
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
88
+ ["of",true, false, "of", 0],
89
+ ["GENE",true, "GEN", "gene", 1],
90
+ ["1",false, false, "1", 2],
91
+ [".", false, false, ".", 0]],
92
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
93
+ end
94
+
95
+ def test_tagged_features_reverse
96
+ @parser.reverse = true
97
+ assert_equal(
98
+ [
99
+ ["GENE1",false, "GEN", "gene1", 1],
100
+ ["of",true, false, "of", 0],
101
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
102
+ ],
103
+ @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
104
+
105
+ assert_equal(
106
+ [
107
+ [".", false, false, ".", 0],
108
+ ["1",false, false, "1", 1],
109
+ ["GENE",true, "GEN", "gene", 2],
110
+ ["of",true, false, "of", 0],
111
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
112
+ ],
113
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
114
+ end
115
+
116
+ def test_default_config
117
+ require 'rbbt/bow/misc'
118
+ text =<<-EOF
119
+ This text explains how MDM2 interacts with TP53.
120
+ EOF
121
+ @parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
122
+ features = @parser.tagged_features text, %w(TP53 MDM2)
123
+ assert features.first.first == "This"
124
+ end
125
+
126
+
127
+
128
+ def __test_CRFPP_install
129
+ assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
130
+ end
131
+
132
+ end
@@ -24,7 +24,8 @@ class TestClass < Test::Unit::TestCase
24
24
 
25
25
  def test_tsv
26
26
  a = "test"
27
- NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
27
+ NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
28
+ ppp Annotated.tsv([a,a])
28
29
  assert Annotated.tsv([a]).fields.include? "code"
29
30
  assert Annotated.tsv([a], nil).fields.include? "code"
30
31
  assert Annotated.tsv([a], :all).fields.include? "code"
@@ -144,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
144
144
  gene2.entity_type = "Protein"
145
145
 
146
146
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
147
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
147
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
148
148
  end
149
149
  end
150
150
 
@@ -165,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
165
165
  gene2.entity_type = "Protein"
166
166
 
167
167
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
168
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
168
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
169
169
  end
170
170
  end
171
171
 
@@ -185,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
185
185
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
186
186
 
187
187
  Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
188
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
188
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
189
189
  Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
190
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
190
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
191
191
  end
192
192
  end
193
193
  end
data/test/test_spaCy.rb CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/nlp/spaCy'
3
3
  require 'rbbt/document/corpus'
4
4
 
5
5
  class TestSpaCy < Test::Unit::TestCase
6
- def _test_tokens
6
+ def test_tokens
7
7
  text = "I tell a story"
8
8
 
9
9
  tokens = SpaCy.tokens(text)
@@ -12,6 +12,16 @@ class TestSpaCy < Test::Unit::TestCase
12
12
  assert_equal "tell", tokens[1].to_s
13
13
  end
14
14
 
15
+ def test_chunks
16
+ text = "Miguel Vazquez tell a good story"
17
+
18
+ tokens = SpaCy.chunks(text)
19
+
20
+ assert_equal 2, tokens.length
21
+ assert_equal "Miguel Vazquez", tokens[0].to_s
22
+ end
23
+
24
+
15
25
  def test_segments
16
26
  text = "I tell a story. It's a very good story."
17
27
 
@@ -28,5 +38,107 @@ class TestSpaCy < Test::Unit::TestCase
28
38
  assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
29
39
  end
30
40
  end
41
+
42
+ def test_chunk_segments
43
+ text = "I tell a story. It's a very good story."
44
+
45
+ corpus = Document::Corpus.setup({})
46
+
47
+ Document.setup(text, "TEST", "test_doc1", "simple_sentence")
48
+
49
+ corpus.add_document text
50
+ text.corpus = corpus
51
+
52
+ segments = SpaCy.chunk_segments(text)
53
+
54
+ segments.each do |segment|
55
+ assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
56
+ end
57
+ end
58
+
59
+ def test_dep_graph
60
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
61
+ graph = SpaCy.dep_graph(text, true)
62
+
63
+ tokens = SpaCy.segments(text)
64
+ index = Segment.index tokens
65
+ tf_s = tokens.select{|t| t == "TF" }.first
66
+ tg_s = tokens.select{|t| t == "TG" }.first
67
+
68
+ require 'rbbt/network/paths'
69
+
70
+ path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
71
+ path_tokens = path.collect do |segid|
72
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
73
+ text[range]
74
+ end
75
+
76
+ assert path_tokens.include? 'increase'
77
+
78
+ end
79
+
80
+ def test_chunk_dep_graph
81
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
82
+ graph = SpaCy.chunk_dep_graph(text, true)
83
+
84
+ tokens = SpaCy.chunk_segments(text)
85
+ index = Segment.index tokens
86
+ tf_s = tokens.select{|t| t.include? "TF" }.first
87
+ tg_s = tokens.select{|t| t.include? "TG" }.first
88
+
89
+
90
+ require 'rbbt/network/paths'
91
+
92
+ path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
93
+ path_tokens = path.collect do |segid|
94
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
95
+ text[range]
96
+ end
97
+
98
+ assert path_tokens.include? 'increase'
99
+ end
100
+
101
+ def test_paths
102
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
103
+ path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
104
+
105
+
106
+ path_tokens = path.collect do |segid|
107
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
108
+ text[range]
109
+ end
110
+
111
+ ppp text
112
+ iii path_tokens
113
+
114
+ assert path_tokens.include? 'increase'
115
+ end
116
+
117
+ def test_paths2
118
+ text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
119
+ path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
120
+
121
+
122
+ path_tokens = path.collect do |segid|
123
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
124
+ text[range]
125
+ end
126
+
127
+ iii path_tokens
128
+
129
+
130
+ assert path_tokens.include? 'regulation'
131
+ end
132
+
133
+ def test_paths3
134
+ text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
135
+ path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
136
+
137
+ path_tokens = path.collect do |segid|
138
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
139
+ text[range]
140
+ end
141
+
142
+ end
31
143
  end
32
144
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4
4
+ version: 1.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-23 00:00:00.000000000 Z
11
+ date: 2021-06-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -95,6 +95,7 @@ files:
95
95
  - lib/rbbt/ner/oscar4.rb
96
96
  - lib/rbbt/ner/patterns.rb
97
97
  - lib/rbbt/ner/regexpNER.rb
98
+ - lib/rbbt/ner/rner.rb
98
99
  - lib/rbbt/ner/rnorm.rb
99
100
  - lib/rbbt/ner/rnorm/cue_index.rb
100
101
  - lib/rbbt/ner/rnorm/tokens.rb
@@ -103,6 +104,7 @@ files:
103
104
  - lib/rbbt/nlp/nlp.rb
104
105
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
105
106
  - lib/rbbt/nlp/spaCy.rb
107
+ - lib/rbbt/relationship.rb
106
108
  - lib/rbbt/segment.rb
107
109
  - lib/rbbt/segment/annotation.rb
108
110
  - lib/rbbt/segment/encoding.rb
@@ -126,6 +128,7 @@ files:
126
128
  - share/install/software/OpenNLP
127
129
  - share/install/software/StanfordParser
128
130
  - share/patterns/drug_induce_disease
131
+ - share/rner/config.rb
129
132
  - share/rnorm/cue_default
130
133
  - share/rnorm/tokens_default
131
134
  - share/wordlists/stopwords
@@ -148,6 +151,7 @@ files:
148
151
  - test/rbbt/ner/test_oscar4.rb
149
152
  - test/rbbt/ner/test_patterns.rb
150
153
  - test/rbbt/ner/test_regexpNER.rb
154
+ - test/rbbt/ner/test_rner.rb
151
155
  - test/rbbt/ner/test_rnorm.rb
152
156
  - test/rbbt/ner/test_token_trieNER.rb
153
157
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
@@ -182,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
182
186
  - !ruby/object:Gem::Version
183
187
  version: '0'
184
188
  requirements: []
185
- rubygems_version: 3.0.6
189
+ rubygems_version: 3.1.4
186
190
  signing_key:
187
191
  specification_version: 4
188
192
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -210,6 +214,7 @@ test_files:
210
214
  - test/rbbt/ner/test_banner.rb
211
215
  - test/rbbt/ner/test_token_trieNER.rb
212
216
  - test/rbbt/ner/test_finder.rb
217
+ - test/rbbt/ner/test_rner.rb
213
218
  - test/rbbt/ner/test_linnaeus.rb
214
219
  - test/rbbt/ner/test_oscar4.rb
215
220
  - test/rbbt/test_segment.rb