rbbt-text 1.3.0 → 1.3.5

Files changed (40)
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +20 -5
  5. data/lib/rbbt/document/annotation.rb +7 -4
  6. data/lib/rbbt/document/corpus.rb +30 -3
  7. data/lib/rbbt/document/corpus/pubmed.rb +2 -1
  8. data/lib/rbbt/ner/abner.rb +3 -2
  9. data/lib/rbbt/ner/banner.rb +3 -1
  10. data/lib/rbbt/ner/brat.rb +1 -1
  11. data/lib/rbbt/ner/g_norm_plus.rb +7 -1
  12. data/lib/rbbt/ner/linnaeus.rb +2 -1
  13. data/lib/rbbt/ner/patterns.rb +0 -1
  14. data/lib/rbbt/ner/rner.rb +229 -0
  15. data/lib/rbbt/ner/token_trieNER.rb +32 -18
  16. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  17. data/lib/rbbt/nlp/spaCy.rb +195 -0
  18. data/lib/rbbt/relationship.rb +24 -0
  19. data/lib/rbbt/segment.rb +9 -4
  20. data/lib/rbbt/segment/annotation.rb +3 -3
  21. data/lib/rbbt/segment/named_entity.rb +7 -0
  22. data/lib/rbbt/segment/range_index.rb +1 -1
  23. data/lib/rbbt/segment/relationship.rb +7 -0
  24. data/lib/rbbt/segment/transformed.rb +5 -1
  25. data/share/install/software/OpenNLP +1 -1
  26. data/share/rner/config.rb +51 -0
  27. data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
  28. data/test/rbbt/document/test_annotation.rb +15 -6
  29. data/test/rbbt/document/test_corpus.rb +15 -1
  30. data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
  31. data/test/rbbt/ner/test_rner.rb +132 -0
  32. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
  33. data/test/rbbt/segment/test_annotation.rb +3 -4
  34. data/test/rbbt/segment/test_encoding.rb +1 -1
  35. data/test/rbbt/segment/test_named_entity.rb +7 -5
  36. data/test/rbbt/segment/test_range_index.rb +1 -2
  37. data/test/rbbt/segment/test_transformed.rb +33 -4
  38. data/test/rbbt/test_segment.rb +5 -10
  39. data/test/test_spaCy.rb +144 -0
  40. metadata +12 -3
data/lib/rbbt/ner/token_trieNER.rb CHANGED

```diff
@@ -5,15 +5,27 @@ require 'rbbt/ner/NER'
 require 'rbbt/segment/token'
 
 class TokenTrieNER < NER
-  def self.clean(token)
+  def self.clean(token, stem = false)
     if token.length > 3
-      token.downcase.sub(/-/,'')
+      upcase = token !~ /[a-z]/
+      token = token.downcase.sub(/-/,'')
+
+      if stem && ! upcase
+        require 'stemmer'
+        if stem == :double
+          token = token.stem.stem
+        else
+          token = token.stem
+        end
+      end
+
+      token
     else
       token
     end
   end
 
-  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
     if no_clean
       if extend_to_token
         Token.setup(token, :offset => start, :original => token)
@@ -22,25 +34,25 @@ class TokenTrieNER < NER
       end
     else
       if extend_to_token
-        Token.setup(clean(token), :offset => start, :original => token)
+        Token.setup(clean(token, stem), :offset => start, :original => token)
       else
-        clean(token)
+        clean(token, stem)
       end
     end
   end
 
-  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
-    split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,;])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
 
-    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+    tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
 
     tokens
   end
@@ -130,7 +142,7 @@ class TokenTrieNER < NER
     index1
   end
 
-  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
 
     chunk_size = hash.size / 100
     items_in_chunk = 0
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
     names.each do |name|
       next if name.empty? or (String === name and name.length < 2)
 
-      tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+      tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
       tokens.extend EnumeratedArray
 
       token_index = index_for_tokens(tokens, code, type, slack)
@@ -237,10 +249,11 @@ class TokenTrieNER < NER
       match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
 
-    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
+    type = type.first
+    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
   end
 
-  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
       :persist => false
@@ -248,6 +261,7 @@ class TokenTrieNER < NER
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
     @no_clean = options.delete :no_clean
+    @stem = options.delete :stem
 
     file = [] if file.nil?
     file = [file] unless Array === file
@@ -273,7 +287,7 @@ class TokenTrieNER < NER
       Log.debug "TokenTrieNER merging TSV"
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     when Hash === new
@@ -284,18 +298,18 @@ class TokenTrieNER < NER
       new = TSV.open(new, :flat)
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
        end
      end
    end
  end
 
   def match(text)
-    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
 
     tokens.extend EnumeratedArray
     tokens.pos = 0
-
+
     matches = []
     while tokens.left?
       new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
```
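
The new `stem` flag threads from the constructor through `tokenize`, `prepare_token`, and `clean`, so dictionary names and query text are normalized the same way. A minimal sketch of the effect, using only the methods shown above (outputs assume the Porter stemmer from the `stemmer` gem):

```ruby
require 'rbbt/ner/token_trieNER'

# Without stemming, morphological variants stay distinct.
TokenTrieNER.clean("proteins")        # => "proteins"

# With stemming (tokens longer than 3 chars that are not all-uppercase),
# variants collapse to a common stem; :double applies the stemmer twice.
TokenTrieNER.clean("proteins", true)  # => "protein"

# tokenize now forwards the flag, and the default splitter also breaks on ';'.
tokens = TokenTrieNER.tokenize("tumor proteins; p53", true, nil, false, true)
tokens.collect{|t| [t, t.offset] }
```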
data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED

```diff
@@ -239,6 +239,7 @@ module NLP
   end
 
   def self.geniass_sentence_splitter(text)
+    Rbbt.software.opt.Geniass.produce
     offsets = []
 
     cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      #sentence.gsub!(NEW_LINE_MASK, "\n")
+      sentence.gsub!(NEW_LINE_MASK, "\n")
       Segment.setup sentence, s
       sentence
     end
```
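
Two fixes here: the Geniass tool is now produced (installed) on demand before splitting, and the newline mask is once again stripped from the returned sentences. A sketch of typical use, assuming Geniass can be produced in this environment:

```ruby
require 'rbbt/nlp/genia/sentence_splitter'

text = "Simple sentence. Another one,\nsplit across lines."

# Each sentence comes back as a Segment carrying its character offset,
# with real newlines restored instead of the internal mask.
NLP.geniass_sentence_splitter(text).collect{|s| [s, s.offset] }
```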
data/lib/rbbt/nlp/spaCy.rb ADDED

```diff
@@ -0,0 +1,195 @@
+require 'rbbt/segment'
+require 'rbbt/document'
+require 'rbbt/segment/annotation'
+require 'rbbt/util/python'
+require 'rbbt/network/paths'
+
+module SpaCy
+
+  TOKEN_PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+  CHUNK_PROPERTIES = %w(lemma_)
+
+  def self.nlp(lang = 'en_core_web_md')
+    @@nlp ||= {}
+    @@nlp[lang] ||= RbbtPython.run :spacy do
+      spacy.load(lang)
+    end
+  end
+
+  def self.tokens(text, lang = 'en_core_web_sm')
+
+    tokens = []
+
+    nlp = nlp(lang)
+    doc = nlp.call(text)
+
+    doc.__len__.times do |i|
+      tokens << doc.__getitem__(i)
+    end
+
+    tokens
+  end
+
+  def self.chunks(text, lang = 'en_core_web_sm')
+
+    tokens = []
+    nlp = nlp(lang)
+
+    doc = nlp.call(text)
+    chunks = doc.noun_chunks.__iter__
+
+    RbbtPython.iterate chunks do |item|
+      tokens << item
+    end
+
+    tokens
+  end
+
+  def self.segments(text, lang = 'en_core_web_sm')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    tokens = self.tokens(text, lang).collect do |token|
+      info = {}
+      TOKEN_PROPERTIES.each do |p|
+        info[p] = token.instance_eval(p.to_s)
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = token.idx
+      info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCyToken.setup(token.text, info)
+    end
+
+    tokens
+  end
+
+  def self.chunk_segments(text, lang = 'en_core_web_sm')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    chunks = self.chunks(text, lang).collect do |chunk|
+      info = {}
+      CHUNK_PROPERTIES.each do |p|
+        info[p] = chunk.instance_eval(p.to_s)
+      end
+      start = eend = nil
+      deps = []
+      RbbtPython.iterate chunk.__iter__ do |token|
+        start = token.idx if start.nil?
+        eend = start + chunk.text.length if eend.nil?
+        deps << token.idx.to_s + ":" + token.dep_ + "->" + token.head.idx.to_s if token.head.idx < start || token.head.idx > eend
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = chunk.__iter__.__next__.idx
+      info[:dep] = deps * ";"
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCySpan.setup(chunk.text, info)
+    end
+
+    chunks
+  end
+
+  def self.dep_graph(text, reverse = false, lang = 'en_core_web_md')
+    tokens = self.segments(text, lang)
+    index = Segment.index(tokens)
+    associations = {}
+    tokens.each do |token|
+      type, target_pos = token.dep.split("->")
+      target_tokens = index[target_pos.to_i]
+      associations[token.segid] = target_tokens
+    end
+
+    if reverse
+      old = associations.dup
+      old.each do |s,ts|
+        ts.each do |t|
+          associations[t] ||= []
+          associations[t] += [s] unless associations[t].include?(s)
+        end
+      end
+    end
+
+    associations
+  end
+
+  def self.chunk_dep_graph(text, reverse = false, lang = 'en_core_web_md')
+    associations = dep_graph(text, false, lang)
+
+    chunks = self.chunk_segments(text, lang)
+    tokens = self.segments(text, lang)
+    index = Segment.index(tokens + chunks)
+
+    chunks.each do |chunk|
+      target_token_ids = chunk.dep.split(";").collect do |dep|
+        type, target_pos = dep.split("->")
+        index[target_pos.to_i]
+      end.flatten
+
+      target_tokens = target_token_ids.collect do |target_token_id|
+        range = Range.new(*target_token_id.split(":").last.split("..").map(&:to_i))
+        range.collect do |pos|
+          index[pos]
+        end.uniq
+      end.flatten
+      associations[chunk.segid] = target_tokens
+    end
+
+    if reverse
+      old = associations.dup
+      old.each do |s,ts|
+        ts.each do |t|
+          associations[t] ||= []
+          associations[t] += [s] unless associations[t].include?(s)
+        end
+      end
+    end
+
+    associations
+  end
+
+  def self.paths(text, source, target, reverse = true, lang = 'en_core_web_md')
+    graph = SpaCy.chunk_dep_graph(text, reverse, lang)
+
+    chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))
+
+    source_id = chunk_index[source.offset].first || source.segid
+    target_id = chunk_index[target.offset].first || target.segid
+
+    path = Paths.dijkstra(graph, source_id, [target_id])
+
+    return nil if path.nil?
+
+    path.reverse
+  end
+
+  def self.config(base, target = nil)
+    TmpFile.with_file(base) do |baseconfig|
+      if target
+        CMD.cmd(:spacy, "init fill-config #{baseconfig} #{target}")
+      else
+        TmpFile.with_file do |tmptarget|
+          CMD.cmd(:spacy, "init fill-config #{baseconfig} #{tmptarget}")
+          Open.read(tmptarget)
+        end
+      end
+    end
+  end
+end
+
+module SpaCyToken
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::TOKEN_PROPERTIES
+  self.annotation :dep
+end
+
+module SpaCySpan
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::CHUNK_PROPERTIES
+  self.annotation :dep
+end
```

(In `config`, the original read `Open.read(targetconfig)`, referencing an undefined variable; `tmptarget` is the file the command just wrote.)
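
The module drives spaCy through RbbtPython and wraps the results as rbbt segments. A sketch of typical use, assuming the `en_core_web_sm` model is installed and that the annotation accessors mirror the spaCy property names:

```ruby
require 'rbbt/nlp/spaCy'

text = "Mutant TP53 binds MDM2 in the nucleus"

# Tokens become SpaCyToken segments with lemma, POS, character offset,
# and a "dep->head_offset" dependency string.
tokens = SpaCy.segments(text)
tokens.collect{|t| [t, t.offset, t.pos_, t.dep] }

# Noun chunks become SpaCySpan segments over the same text.
chunks = SpaCy.chunk_segments(text)
chunks.collect{|c| [c, c.offset] }
```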
data/lib/rbbt/relationship.rb ADDED

```diff
@@ -0,0 +1,24 @@
+require 'rbbt/segment'
+
+module Relationship
+  extend Annotation
+  self.annotation :segment
+  self.annotation :terms
+  self.annotation :type
+
+  def text
+    if segment
+      segment
+    else
+      type + ": " + terms * ", "
+    end
+  end
+
+  def html
+    text = <<-EOF
+<span class='Relationship'\
+>#{ self.text }</span>
+    EOF
+    text.chomp
+  end
+end
```
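
A sketch of how a Relationship annotation might be set up over a plain string; the terms and type here are illustrative:

```ruby
require 'rbbt/relationship'

rel = Relationship.setup("", :segment => nil,
                         :terms => %w(TP53 MDM2), :type => "interaction")

# Without an underlying text segment, text renders from type and terms.
rel.text   # => "interaction: TP53, MDM2"
rel.html   # => "<span class='Relationship'>interaction: TP53, MDM2</span>"
```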
data/lib/rbbt/segment.rb CHANGED
```diff
@@ -1,5 +1,6 @@
 require 'rbbt-util'
 require 'rbbt/entity'
+require 'rbbt/document'
 
 module SegID
   extend Entity
@@ -10,11 +11,11 @@ module SegID
   end
 
   def range
-    @range ||= Range.new(*_parts.last.split("..").map(&:to_i))
+    @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
   end
 
   def docid
-    @docid ||= _parts[0..3] * ":"
+    @docid ||= DocID.setup(_parts[0..3] * ":")
   end
 
   def offset
@@ -25,12 +26,13 @@ module SegID
     range.end - range.begin + 1
   end
 
-  property :segment do
+  property :segment => :single do
+    docid = self.docid
     document = DocID.setup(docid, :corpus => corpus).document
 
     text = document[range]
 
-    Segment.setup(text, docid)
+    Segment.setup(text, :docid => docid, :offset => offset)
   end
 
   property :segid do
@@ -47,10 +49,13 @@ module Segment
     length
   end
 
+
   def eend
     offset.to_i + length - 1
   end
 
+  alias end eend
+
   def range
     (offset.to_i..eend)
   end
```
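
SegID strings now take their range from the fifth field explicitly, so trailing fields cannot corrupt it, and `docid` returns a proper DocID entity. A sketch, with an illustrative ID layout of four DocID fields plus a character range:

```ruby
require 'rbbt/segment'

segid = SegID.setup("TEST:PMID:123:abstract:10..25")

segid.range   # => 10..25, read from the fifth field
segid.docid   # => "TEST:PMID:123:abstract", now set up as a DocID
```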
data/lib/rbbt/segment/annotation.rb CHANGED

```diff
@@ -1,6 +1,6 @@
 require 'rbbt-util'
-require 'rbbt/entity'
 require 'rbbt/segment'
+require 'rbbt/entity'
 
 module AnnotID
   extend Entity
@@ -32,7 +32,7 @@ end
 
 module SegmentAnnotation
   extend Entity
-  include Segment
+  include Object::Segment
   self.annotation :type
 
   property :segid do
@@ -47,7 +47,7 @@ module SegmentAnnotation
   end
 
   property :annotid do |corpus=nil|
-    AnnotID.setup([segid, type] * ":", :corpus => corpus)
+    AnnotID.setup([segid, type, Misc.obj2digest(self.info)] * ":", :corpus => corpus)
   end
 
   alias id annotid
```
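
Annotation IDs now embed a digest of the annotation's info, so two annotations of the same type over the same segment no longer collide. A sketch using NamedEntity, which includes SegmentAnnotation (the DocID string is illustrative):

```ruby
require 'rbbt/segment/named_entity'

docid = "TEST:PMID:123:abstract"
ne1 = NamedEntity.setup("TP53", :offset => 10, :docid => docid,
                        :entity_type => "Gene", :code => "ENSG00000141510")
ne2 = NamedEntity.setup("TP53", :offset => 10, :docid => docid,
                        :entity_type => "Gene", :code => "P04637")

# Same segment and type, but different info => different annotids.
ne1.annotid == ne2.annotid  # => false
```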
data/lib/rbbt/segment/named_entity.rb CHANGED

```diff
@@ -8,6 +8,10 @@ module NamedEntity
 
   self.annotation :entity_type, :code, :score
 
+  def entity_type
+    annotation_values[:entity_type] || annotation_values[:type]
+  end
+
   def report
     <<-EOF
 String: #{ self }
@@ -19,11 +23,14 @@ Score: #{score.inspect}
   end
 
   def html
+    title = code.nil? ? entity_type : [entity_type, code].compact * ":"
+
     text = <<-EOF
 <span class='Entity'\
 #{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
 #{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
 #{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
+#{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
 >#{ self }</span>
     EOF
     text.chomp
```
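
`entity_type` now falls back to the generic `:type` annotation, and the rendered span gains a human-readable `title` attribute built from the type and code. Roughly:

```ruby
require 'rbbt/segment/named_entity'

ne = NamedEntity.setup("TP53", :offset => 0,
                       :entity_type => "Gene", :code => "ENSG00000141510")

ne.html
# => "<span class='Entity' attr-entity-type='Gene'
#     attr-entity-code='ENSG00000141510' title='Gene:ENSG00000141510'>TP53</span>"
#    (a single line; wrapped here for readability)
```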