rbbt-text 1.3.0 → 1.3.5

Files changed (40)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +20 -5
  5. data/lib/rbbt/document/annotation.rb +7 -4
  6. data/lib/rbbt/document/corpus.rb +30 -3
  7. data/lib/rbbt/document/corpus/pubmed.rb +2 -1
  8. data/lib/rbbt/ner/abner.rb +3 -2
  9. data/lib/rbbt/ner/banner.rb +3 -1
  10. data/lib/rbbt/ner/brat.rb +1 -1
  11. data/lib/rbbt/ner/g_norm_plus.rb +7 -1
  12. data/lib/rbbt/ner/linnaeus.rb +2 -1
  13. data/lib/rbbt/ner/patterns.rb +0 -1
  14. data/lib/rbbt/ner/rner.rb +229 -0
  15. data/lib/rbbt/ner/token_trieNER.rb +32 -18
  16. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  17. data/lib/rbbt/nlp/spaCy.rb +195 -0
  18. data/lib/rbbt/relationship.rb +24 -0
  19. data/lib/rbbt/segment.rb +9 -4
  20. data/lib/rbbt/segment/annotation.rb +3 -3
  21. data/lib/rbbt/segment/named_entity.rb +7 -0
  22. data/lib/rbbt/segment/range_index.rb +1 -1
  23. data/lib/rbbt/segment/relationship.rb +7 -0
  24. data/lib/rbbt/segment/transformed.rb +5 -1
  25. data/share/install/software/OpenNLP +1 -1
  26. data/share/rner/config.rb +51 -0
  27. data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
  28. data/test/rbbt/document/test_annotation.rb +15 -6
  29. data/test/rbbt/document/test_corpus.rb +15 -1
  30. data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
  31. data/test/rbbt/ner/test_rner.rb +132 -0
  32. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
  33. data/test/rbbt/segment/test_annotation.rb +3 -4
  34. data/test/rbbt/segment/test_encoding.rb +1 -1
  35. data/test/rbbt/segment/test_named_entity.rb +7 -5
  36. data/test/rbbt/segment/test_range_index.rb +1 -2
  37. data/test/rbbt/segment/test_transformed.rb +33 -4
  38. data/test/rbbt/test_segment.rb +5 -10
  39. data/test/test_spaCy.rb +144 -0
  40. metadata +12 -3
data/lib/rbbt/ner/token_trieNER.rb CHANGED
@@ -5,15 +5,27 @@ require 'rbbt/ner/NER'
 require 'rbbt/segment/token'
 
 class TokenTrieNER < NER
-  def self.clean(token)
+  def self.clean(token, stem = false)
     if token.length > 3
-      token.downcase.sub(/-/,'')
+      upcase = token !~ /[a-z]/
+      token = token.downcase.sub(/-/,'')
+
+      if stem && ! upcase
+        require 'stemmer'
+        if stem == :double
+          token = token.stem.stem
+        else
+          token = token.stem
+        end
+      end
+
+      token
     else
       token
     end
   end
 
-  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
     if no_clean
       if extend_to_token
         Token.setup(token, :offset => start, :original => token)
@@ -22,25 +34,25 @@ class TokenTrieNER < NER
       end
     else
       if extend_to_token
-        Token.setup(clean(token), :offset => start, :original => token)
+        Token.setup(clean(token, stem), :offset => start, :original => token)
       else
-        clean(token)
+        clean(token, stem)
       end
     end
   end
 
-  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
-    split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,;])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
 
-    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+    tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
 
     tokens
   end
@@ -130,7 +142,7 @@ class TokenTrieNER < NER
     index1
   end
 
-  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
 
     chunk_size = hash.size / 100
     items_in_chunk = 0
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
     names.each do |name|
       next if name.empty? or (String === name and name.length < 2)
 
-      tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+      tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
       tokens.extend EnumeratedArray
 
       token_index = index_for_tokens(tokens, code, type, slack)
@@ -237,10 +249,11 @@ class TokenTrieNER < NER
       match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
 
-    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
+    type = type.first
+    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
   end
 
-  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
       :persist => false
@@ -248,6 +261,7 @@ class TokenTrieNER < NER
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
     @no_clean = options.delete :no_clean
+    @stem = options.delete :stem
 
     file = [] if file.nil?
     file = [file] unless Array === file
@@ -273,7 +287,7 @@ class TokenTrieNER < NER
         Log.debug "TokenTrieNER merging TSV"
         new.with_unnamed do
           new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
           end
         end
       when Hash === new
@@ -284,18 +298,18 @@ class TokenTrieNER < NER
       new = TSV.open(new, :flat)
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
        end
      end
    end
  end
 
  def match(text)
-    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
 
    tokens.extend EnumeratedArray
    tokens.pos = 0
-
+
    matches = []
    while tokens.left?
      new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
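
The new :stem option is threaded consistently through tokenize, prepare_token, and clean, so dictionary names and query text are stemmed the same way before trie lookup. A minimal usage sketch, assuming a hypothetical flat TSV dictionary file genes.tsv and the stemmer gem:

    require 'rbbt/ner/token_trieNER'

    # Tokens longer than 3 characters are stemmed unless they are all
    # uppercase (acronyms), so "kinases" can match a dictionary "kinase"
    ner = TokenTrieNER.new "Gene", 'genes.tsv', :stem => true
    matches = ner.match("Protein kinases phosphorylate their substrates")

    # The same flag works on the class-level tokenizer (fifth positional)
    TokenTrieNER.tokenize("Protein kinases", true, nil, false, true)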
data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED
@@ -239,6 +239,7 @@ module NLP
   end
 
   def self.geniass_sentence_splitter(text)
+    Rbbt.software.opt.Geniass.produce
     offsets = []
 
     cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      #sentence.gsub!(NEW_LINE_MASK, "\n")
+      sentence.gsub!(NEW_LINE_MASK, "\n")
       Segment.setup sentence, s
       sentence
     end
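
Two behavior changes for the Geniass-based splitter: the external dependency is now produced on demand, and the masked newlines are restored in the returned sentences. A minimal usage sketch (the input text is hypothetical):

    require 'rbbt/nlp/genia/sentence_splitter'

    text = "TP53 is a tumor suppressor.\nIt regulates the cell cycle."
    NLP.geniass_sentence_splitter(text).each do |sentence|
      # Each sentence is a Segment that remembers its offset in the text
      puts "#{sentence.offset}: #{sentence}"
    end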
data/lib/rbbt/nlp/spaCy.rb ADDED
@@ -0,0 +1,195 @@
+require 'rbbt/segment'
+require 'rbbt/document'
+require 'rbbt/segment/annotation'
+require 'rbbt/util/python'
+require 'rbbt/network/paths'
+
+module SpaCy
+
+  TOKEN_PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+  CHUNK_PROPERTIES = %w(lemma_)
+
+  def self.nlp(lang = 'en_core_web_md')
+    @@nlp ||= {}
+    @@nlp[lang] ||= RbbtPython.run :spacy do
+      spacy.load(lang)
+    end
+  end
+
+  def self.tokens(text, lang = 'en_core_web_sm')
+
+    tokens = []
+
+    nlp = nlp(lang)
+    doc = nlp.call(text)
+
+    doc.__len__.times do |i|
+      tokens << doc.__getitem__(i)
+    end
+
+    tokens
+  end
+
+  def self.chunks(text, lang = 'en_core_web_sm')
+
+    tokens = []
+    nlp = nlp(lang)
+
+    doc = nlp.call(text)
+    chunks = doc.noun_chunks.__iter__
+
+    RbbtPython.iterate chunks do |item|
+      tokens << item
+    end
+
+    tokens
+  end
+
+  def self.segments(text, lang = 'en_core_web_sm')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    tokens = self.tokens(text, lang).collect do |token|
+      info = {}
+      TOKEN_PROPERTIES.each do |p|
+        info[p] = token.instance_eval(p.to_s)
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = token.idx
+      info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCyToken.setup(token.text, info)
+    end
+
+    tokens
+  end
+
+  def self.chunk_segments(text, lang = 'en_core_web_sm')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    chunks = self.chunks(text, lang).collect do |chunk|
+      info = {}
+      CHUNK_PROPERTIES.each do |p|
+        info[p] = chunk.instance_eval(p.to_s)
+      end
+      start = eend = nil
+      deps = []
+      RbbtPython.iterate chunk.__iter__ do |token|
+        start = token.idx if start.nil?
+        eend = start + chunk.text.length if eend.nil?
+        deps << token.idx.to_s + ":" + token.dep_ + "->" + token.head.idx.to_s if token.head.idx < start || token.head.idx > eend
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = chunk.__iter__.__next__.idx
+      info[:dep] = deps * ";"
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCySpan.setup(chunk.text, info)
+    end
+
+    chunks
+  end
+
+  def self.dep_graph(text, reverse = false, lang = 'en_core_web_md')
+    tokens = self.segments(text, lang)
+    index = Segment.index(tokens)
+    associations = {}
+    tokens.each do |token|
+      type, target_pos = token.dep.split("->")
+      target_tokens = index[target_pos.to_i]
+      associations[token.segid] = target_tokens
+    end
+
+    if reverse
+      old = associations.dup
+      old.each do |s,ts|
+        ts.each do |t|
+          associations[t] ||= []
+          associations[t] += [s] unless associations[t].include?(s)
+        end
+      end
+    end
+
+    associations
+  end
+
+  def self.chunk_dep_graph(text, reverse = false, lang = 'en_core_web_md')
+    associations = dep_graph(text, false, lang)
+
+    chunks = self.chunk_segments(text, lang)
+    tokens = self.segments(text, lang)
+    index = Segment.index(tokens + chunks)
+
+    chunks.each do |chunk|
+      target_token_ids = chunk.dep.split(";").collect do |dep|
+        type, target_pos = dep.split("->")
+        index[target_pos.to_i]
+      end.flatten
+
+      target_tokens = target_token_ids.collect do |target_token_id|
+        range = Range.new(*target_token_id.split(":").last.split("..").map(&:to_i))
+        range.collect do |pos|
+          index[pos]
+        end.uniq
+      end.flatten
+      associations[chunk.segid] = target_tokens
+    end
+
+    if reverse
+      old = associations.dup
+      old.each do |s,ts|
+        ts.each do |t|
+          associations[t] ||= []
+          associations[t] += [s] unless associations[t].include?(s)
+        end
+      end
+    end
+
+    associations
+  end
+
+  def self.paths(text, source, target, reverse = true, lang = 'en_core_web_md')
+    graph = SpaCy.chunk_dep_graph(text, reverse, lang)
+
+    chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))
+
+    source_id = chunk_index[source.offset].first || source.segid
+    target_id = chunk_index[target.offset].first || target.segid
+
+    path = Paths.dijkstra(graph, source_id, [target_id])
+
+    return nil if path.nil?
+
+    path.reverse
+  end
+
+  def self.config(base, target = nil)
+    TmpFile.with_file(base) do |baseconfig|
+      if target
+        CMD.cmd(:spacy, "init fill-config #{baseconfig} #{target}")
+      else
+        TmpFile.with_file do |tmptarget|
+          CMD.cmd(:spacy, "init fill-config #{baseconfig} #{tmptarget}")
+          # read back the filled config from the temporary target file
+          Open.read(tmptarget)
+        end
+      end
+    end
+  end
+end
+
+module SpaCyToken
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::TOKEN_PROPERTIES
+  self.annotation :dep
+end
+
+module SpaCySpan
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::CHUNK_PROPERTIES
+  self.annotation :dep
+end
+
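The new SpaCy module drives spaCy through the RbbtPython bridge and wraps tokens and noun chunks as rbbt segments, encoding dependency edges as "dep->head_idx" strings that dep_graph and chunk_dep_graph later resolve into an association graph. A minimal usage sketch, assuming spacy and the en_core_web_sm model are installed in the bridged Python environment (the sample sentence is hypothetical):

    require 'rbbt/nlp/spaCy'

    text = "The TP53 protein regulates the cell cycle."

    # SpaCyToken segments carry the token properties plus the dep edge
    SpaCy.segments(text).each do |token|
      puts [token, token.offset, token.pos_, token.dep] * "\t"
    end

    # Noun chunks as SpaCySpan segments, and the dependency graph
    SpaCy.chunk_segments(text).each { |chunk| puts chunk }
    graph = SpaCy.dep_graph(text, true, 'en_core_web_sm')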
data/lib/rbbt/relationship.rb ADDED
@@ -0,0 +1,24 @@
+require 'rbbt/segment'
+
+module Relationship
+  extend Annotation
+  self.annotation :segment
+  self.annotation :terms
+  self.annotation :type
+
+  def text
+    if segment
+      segment
+    else
+      type + ": " + terms * ", "
+    end
+  end
+
+  def html
+    text = <<-EOF
+<span class='Relationship'\
+>#{ self.text }</span>
+    EOF
+    text.chomp
+  end
+end
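
A minimal sketch of the new Relationship annotation, assuming the usual rbbt Annotation setup interface (the terms and type are hypothetical):

    require 'rbbt/relationship'

    # With no :segment, text is rendered from the type and terms
    rel = Relationship.setup("rel1", :terms => ["TP53", "MDM2"], :type => "interaction")
    rel.text  #=> "interaction: TP53, MDM2"
    puts rel.html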
data/lib/rbbt/segment.rb CHANGED
@@ -1,5 +1,6 @@
 require 'rbbt-util'
 require 'rbbt/entity'
+require 'rbbt/document'
 
 module SegID
   extend Entity
@@ -10,11 +11,11 @@ module SegID
   end
 
   def range
-    @range ||= Range.new(*_parts.last.split("..").map(&:to_i))
+    @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
   end
 
   def docid
-    @docid ||= _parts[0..3] * ":"
+    @docid ||= DocID.setup(_parts[0..3] * ":")
   end
 
   def offset
@@ -25,12 +26,13 @@ module SegID
     range.end - range.begin + 1
   end
 
-  property :segment do
+  property :segment => :single do
+    docid = self.docid
     document = DocID.setup(docid, :corpus => corpus).document
 
     text = document[range]
 
-    Segment.setup(text, docid)
+    Segment.setup(text, :docid => docid, :offset => offset)
   end
 
   property :segid do
@@ -47,10 +49,13 @@ module Segment
     length
   end
 
+
   def eend
     offset.to_i + length - 1
   end
 
+  alias end eend
+
   def range
     (offset.to_i..eend)
   end
data/lib/rbbt/segment/annotation.rb CHANGED
@@ -1,6 +1,6 @@
 require 'rbbt-util'
-require 'rbbt/entity'
 require 'rbbt/segment'
+require 'rbbt/entity'
 
 module AnnotID
   extend Entity
@@ -32,7 +32,7 @@ end
 
 module SegmentAnnotation
   extend Entity
-  include Segment
+  include Object::Segment
   self.annotation :type
 
   property :segid do
@@ -47,7 +47,7 @@ module SegmentAnnotation
   end
 
   property :annotid do |corpus=nil|
-    AnnotID.setup([segid, type] * ":", :corpus => corpus)
+    AnnotID.setup([segid, type, Misc.obj2digest(self.info)] * ":", :corpus => corpus)
   end
 
   alias id annotid
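
Digesting the info hash into the ID makes annotation IDs unique per annotation content, not just per segment and type, so two different annotations over the same span no longer collide. A sketch of the resulting ID shape, assuming rbbt-util's Misc.obj2digest:

    require 'rbbt-util'

    # annotid is now "<segid>:<type>:<digest of the info hash>"
    Misc.obj2digest(:code => "7157")  # short digest string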
data/lib/rbbt/segment/named_entity.rb CHANGED
@@ -8,6 +8,10 @@ module NamedEntity
 
   self.annotation :entity_type, :code, :score
 
+  def entity_type
+    annotation_values[:entity_type] || annotation_values[:type]
+  end
+
   def report
     <<-EOF
 String: #{ self }
@@ -19,11 +23,14 @@ Score: #{score.inspect}
   end
 
   def html
+    title = code.nil? ? entity_type : [entity_type, code].compact * ":"
+
     text = <<-EOF
 <span class='Entity'\
 #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
 #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
 #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
+#{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
 >#{ self }</span>
 EOF
     text.chomp
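
A minimal sketch of the entity_type fallback and the richer html output (the entity values are hypothetical):

    require 'rbbt/segment/named_entity'

    ne = NamedEntity.setup("TP53", :offset => 0, :entity_type => "Gene", :code => "7157")
    ne.entity_type  #=> "Gene"; falls back to the :type annotation when unset
    puts ne.html    # <span class='Entity' ... title='Gene:7157'>TP53</span>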