rbbt-text 0.6.3 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,14 +8,17 @@ require 'rbbt/util/log'
8
8
  class OSCAR3 < NER
9
9
  Rbbt.claim Rbbt.software.opt.OSCAR3, :install, Rbbt.share.install.software.OSCAR3.find
10
10
 
11
- @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
12
- @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
13
- @@MEMMSingleton = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
14
- @@DFANEFinder = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
15
- @@MEMM = @@MEMMSingleton.getInstance();
16
- @@DFA = @@DFANEFinder.getInstance();
11
+ def self.init
12
+ @@TextToSciXML ||= Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
13
+ @@ProcessingDocumentFactory ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
14
+ @@MEMMSingleton ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
15
+ @@DFANEFinder ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
16
+ @@MEMM ||= @@MEMMSingleton.getInstance();
17
+ @@DFA ||= @@DFANEFinder.getInstance();
18
+ end
17
19
 
18
20
  def self.match(text, type = nil, memm = false)
21
+ self.init
19
22
  doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
20
23
  mentions = []
21
24
  it = doc.getTokenSequences().iterator
@@ -8,15 +8,25 @@ require 'rbbt/util/log'
8
8
  class OSCAR4 < NER
9
9
  Rbbt.claim Rbbt.software.opt.OSCAR4, :install, Rbbt.share.install.software.OSCAR4.find
10
10
 
11
- Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
12
- @@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
11
+ def self.init
12
+ Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
13
13
 
14
- def self.match(text, type = nil, memm = false)
14
+ @@OSCAR ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
15
+ @@FormatType ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.FormatType')
16
+ end
17
+
18
+ def self.tagger
19
+ @@tagger ||= @@OSCAR.new()
20
+ end
21
+
22
+ def self.match(text, type = nil)
23
+ self.init
15
24
 
16
25
  return [] if text.nil? or text.strip.empty?
17
26
 
18
- oscar = @@OSCAR.new();
19
- entities = oscar.findAndResolveNamedEntities(text);
27
+ oscar = tagger
28
+ #entities = oscar.findAndResolveNamedEntities(text);
29
+ entities = oscar.findNamedEntities(text);
20
30
  it = entities.iterator
21
31
 
22
32
  result = []
@@ -24,9 +34,13 @@ class OSCAR4 < NER
24
34
  while it.hasNext
25
35
  entity = it.next
26
36
  mention = entity.getSurface
27
- result << mention
37
+ #inchi = entity.getFirstChemicalStructure(@@FormatType.INCHI)
38
+ #inchi = inchi.getValue() unless inchi.nil?
39
+ inchi = nil
40
+
41
+ NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
28
42
 
29
- NamedEntity.setup mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
43
+ result << mention
30
44
  end
31
45
 
32
46
  result
@@ -13,30 +13,28 @@ class Normalizer
13
13
  # score above the minimum. Otherwise it return an empty list.
14
14
  def self.get_best(values, min)
15
15
  return [] if values.empty?
16
- best = values.collect{|p| p[1]}.max
16
+ best = values.collect{|p| p[1] }.max
17
17
  return [] if best < min
18
18
  values.select{|p| p[1] == best}
19
19
  end
20
20
 
21
21
  # Compares the tokens and gives each candidate a score based on the
22
22
  # commonalities and differences amongst the tokens.
23
- def token_score(candidates, mention)
24
- candidates.collect{|code|
25
- next if @synonyms[code].nil?
26
- value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
27
- case
28
- when mention == name
29
- 100
30
- when mention.downcase == name.downcase
31
- 90
32
- when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
33
- 80
34
- else
35
- @tokens.evaluate(mention, name)
36
- end
37
- }.max
38
- [code, value]
39
- }.compact
23
+ def token_score(code, mention)
24
+ return nil if @synonyms[code].nil?
25
+ @synonyms[code].select{|name| name =~ /[a-zA-Z]/ }.collect{|name|
26
+ value = case
27
+ when mention == name
28
+ 100
29
+ when mention.downcase == name.downcase
30
+ 90
31
+ when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
32
+ 80
33
+ else
34
+ @tokens.evaluate(mention, name)
35
+ end
36
+ [value, name]
37
+ }.sort_by{|value, name| value }.last
40
38
  end
41
39
 
42
40
  # Order candidates with the number of words in common between the text
@@ -46,7 +44,7 @@ class Normalizer
46
44
  # been a Proc or a Hash.
47
45
  def entrez_score(candidates, text, to_entrez = nil)
48
46
  code2entrez = {}
49
- candidates.each{|code|
47
+ candidates.each{|code, score|
50
48
  if to_entrez.is_a? Proc
51
49
  entrez = to_entrez.call(code)
52
50
  elsif to_entrez.is_a? Hash
@@ -72,24 +70,37 @@ class Normalizer
72
70
  # positions. This is based on the idea that synonym lists order their
73
71
  # synonyms by importance.
74
72
  def appearence_order(candidates, mention)
75
- positions = candidates.collect{|code|
73
+ positions = candidates.collect{|code, score, name|
76
74
  next unless @synonyms[code]
77
75
  pos = nil
78
76
  @synonyms[code].each_with_index{|list,i|
79
77
  next if pos
80
78
  pos = i if list.include? mention
81
79
  }
82
- pos
80
+ pos
83
81
  }
82
+
84
83
  return nil if positions.compact.empty?
85
- best = candidates.zip(positions).sort{|a,b| a[1] <=> b[1]}.first[1]
86
- candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
84
+
85
+ best = candidates.zip(positions).sort{|a,b|
86
+ case
87
+ when (a[1].nil? and b[1].nil?)
88
+ 0
89
+ when b[1].nil?
90
+ 1
91
+ when a[1].nil?
92
+ -1
93
+ else
94
+ a[1] <=> b[1]
95
+ end
96
+ }.first[1]
97
+ candidates.zip(positions).select{|p| p[1] == best}
87
98
  end
88
99
 
89
100
 
90
101
 
91
102
  def initialize(lexicon, options = {})
92
- @synonyms = TSV.open(lexicon, :flat)
103
+ @synonyms = TSV === lexicon ? lexicon : TSV.open(lexicon, :type => :flat, :unnamed => true)
93
104
 
94
105
  @index = CueIndex.new
95
106
  @index.load(lexicon, options[:max_candidates])
@@ -103,30 +114,43 @@ class Normalizer
103
114
  end
104
115
 
105
116
  def select(candidates, mention, text = nil, options = {})
106
- threshold = options[:threshold] || 0
107
- max_candidates = options[:max_candidates] || 200
108
- max_entrez = options[:max_entrez] || 10
117
+ options = Misc.add_defaults options, :threshold => 0, :max_candidates => 20, :max_entrez => 10, :keep_matches => false
118
+ threshold, max_candidates, max_entrez, keep_matches = Misc.process_options options, :threshold, :max_candidates, :max_entrez, :keep_matches
109
119
 
110
120
  # Abort if too ambigous
111
121
  return [] if candidates.empty?
112
122
  return [] if candidates.length > max_candidates
113
123
 
114
- scores = token_score(candidates, mention)
115
- best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
124
+ scores = candidates.zip(candidates.collect{|candidate| token_score(candidate, mention)}).collect{|v| v.flatten}
125
+ scores.delete_if{|candidate, score, name| score.nil? or score <= threshold}
126
+
127
+ best_codes = Normalizer::get_best(scores, threshold)
116
128
 
117
129
  # Abort if too ambigous
118
130
  return [] if best_codes.length > max_entrez
119
131
 
120
132
  if best_codes.length > 1 and text
121
- scores = entrez_score(best_codes, text, @to_entrez)
133
+ scores = entrez_score(best_codes.collect{|c| c.first}, text, @to_entrez)
122
134
 
123
- Normalizer::get_best(scores, 0).collect{|p| p[0]}
135
+ if keep_matches
136
+ Normalizer::get_best(scores, 0)
137
+ else
138
+ Normalizer::get_best(scores, 0).collect{|p| p[0]}
139
+ end
124
140
  else
125
141
  orders = appearence_order(best_codes, mention)
126
142
  if orders
127
- orders
143
+ if keep_matches
144
+ orders.collect{|p| p[0]}
145
+ else
146
+ orders.collect{|p| p[0][0]}
147
+ end
128
148
  else
129
- best_codes
149
+ if keep_matches
150
+ best_codes
151
+ else
152
+ best_codes.collect{|p| p[0]}
153
+ end
130
154
  end
131
155
  end
132
156
 
@@ -17,6 +17,7 @@ class CueIndex
17
17
 
18
18
  file ||= Rbbt.share.rnorm.cue_default.produce if !file && !block
19
19
 
20
+ file = file.find if file.respond_to? :find
20
21
  load_config(:define, file, &block)
21
22
  end
22
23
 
@@ -47,12 +48,12 @@ class CueIndex
47
48
 
48
49
  def load(file, max_candidates = 50)
49
50
  @indexes = Array.new(@rules.size){Hash.new}
50
- data = TSV.open(file, :flat)
51
+ data = TSV === file ? file : TSV.open(file, :type => :flat, :unnamed => true)
51
52
  data.each{|code, values|
52
53
  values.each{|value|
53
54
  cues(value).each_with_index{|cue_list,i|
54
55
  cue_list.each{|cue|
55
- @indexes[i][cue] ||= []
56
+ @indexes[i][cue] ||= Set.new
56
57
  @indexes[i][cue] << code unless @indexes[i][cue].include? code
57
58
  }
58
59
  }
@@ -69,7 +70,7 @@ class CueIndex
69
70
  @indexes.each_with_index{|index,i|
70
71
  best = []
71
72
  cues[i].each{|cue|
72
- best << index[cue] if index[cue]
73
+ best << index[cue].to_a if index[cue]
73
74
  }
74
75
  return best.flatten if best.any?
75
76
  }
@@ -33,7 +33,8 @@ class Tokenizer
33
33
 
34
34
  def method_missing(name, *args, &bloc)
35
35
  @token = name.to_sym
36
- @value = *args.first
36
+ value = args.first
37
+ @value = value
37
38
  self
38
39
  end
39
40
 
@@ -123,8 +124,12 @@ class Tokenizer
123
124
 
124
125
  #{{{ Metaprogramming hooks
125
126
  def define_tokens(name, *args, &block)
126
- action = *args[0] || block || /#{name.to_s}s?/i
127
- raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
127
+ action = args[0] || block || /#{name.to_s}s?/i
128
+
129
+ #HACK: Misterious error where *args[0] returns an array [/regexp/i] for
130
+ #example
131
+ #action = action.first if Array === action
132
+ raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
128
133
 
129
134
  @types[name.to_sym] = action
130
135
  @order.push name.to_sym
@@ -160,6 +165,7 @@ class Tokenizer
160
165
  @transforms = []
161
166
 
162
167
  file ||= Rbbt.share.rnorm.tokens_default.produce if !file && !block
168
+ file = file.find if file.respond_to? :find
163
169
  load_config :main, file, &block
164
170
  end
165
171
 
@@ -200,7 +206,7 @@ class Tokenizer
200
206
  #{{{ Comparisons
201
207
 
202
208
  def evaluate_tokens(list1, list2)
203
- @operations.inject(0){| acc, o|
209
+ @operations.inject(0){|acc, o|
204
210
  acc + o.eval(list1, list2)
205
211
  }
206
212
  end
@@ -9,11 +9,16 @@ module Segment
9
9
  @offset = offset.nil? ? nil : offset.to_i
10
10
  end
11
11
 
12
+
13
+ def segment_length
14
+ self.length
15
+ end
16
+
12
17
  #{{{ Ranges
13
18
 
14
19
  def end
15
20
  return nil if offset.nil?
16
- offset + length - 1
21
+ offset + segment_length - 1
17
22
  end
18
23
 
19
24
  def range
@@ -41,8 +46,14 @@ module Segment
41
46
  self
42
47
  end
43
48
 
44
- def make_relative(segments)
45
- segments.collect{|s| s.push offset}
49
+ def make_relative(segments, &block)
50
+ if block_given?
51
+ segments.each{|s| s.push offset}
52
+ yield(segments)
53
+ segments.each{|s| s.pull offset}
54
+ else
55
+ segments.each{|s| s.push offset}
56
+ end
46
57
  end
47
58
 
48
59
  def range_in(container = nil)
@@ -72,7 +83,7 @@ module Segment
72
83
  when (not a.range.include? b.offset and not b.range.include? a.offset)
73
84
  a.offset <=> b.offset
74
85
  else
75
- a.length <=> b.length
86
+ a.segment_length <=> b.segment_length
76
87
  end
77
88
  end
78
89
  else
@@ -125,7 +136,7 @@ module Segment
125
136
  chunks << chunk
126
137
  end
127
138
 
128
- segment_end = offset + segment.length - 1
139
+ segment_end = offset + segment.segment_length - 1
129
140
 
130
141
  if not skip_segments
131
142
  chunk = text[offset..segment_end]
@@ -138,7 +149,7 @@ module Segment
138
149
 
139
150
  end
140
151
 
141
- if not text.nil? and text.any?
152
+ if not text.nil? and not text.empty?
142
153
  chunk = text.dup
143
154
  Segment.setup(chunk, text_offset)
144
155
  chunks << chunk
@@ -156,8 +167,8 @@ module Segment
156
167
  offset = text.index part
157
168
  next if offset.nil?
158
169
  Segment.setup(part, pre_offset + offset)
159
- pre_offset += offset + part.length - 1
160
- text = text[(offset + part.length - 1)..-1]
170
+ pre_offset += offset + part.segment_length - 1
171
+ text = text[(offset + part.segment_length - 1)..-1]
161
172
  end
162
173
  end
163
174
 
@@ -0,0 +1,46 @@
1
+ require 'rbbt/ner/segment'
2
+
3
+ module SegmentWithDocid
4
+ extend Annotation
5
+
6
+ self.annotation :docid
7
+
8
+ def masked?
9
+ self[0..5] == "MASKED"
10
+ end
11
+
12
+ def mask
13
+ return self if masked?
14
+ raise "Cannot mask an array of elements, they must be masked individually" if Array === self
15
+ raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
16
+ raise "Cannot mask a segment with no offset" if offset.nil?
17
+ textual_position = ["MASKED", length] * ":"
18
+ self.replace(textual_position)
19
+ self
20
+ end
21
+
22
+ def unmasked_text
23
+ return self unless masked?
24
+ tag, length = self.split(":")
25
+ Document.setup(docid).text[offset..(offset+length.to_i-1)]
26
+ end
27
+
28
+ def unmask
29
+ return self unless masked?
30
+ self.replace(unmasked_text)
31
+ self
32
+ end
33
+
34
+ def str_length
35
+ self.length
36
+ end
37
+
38
+ def masked_length
39
+ self.split(":").last.to_i
40
+ end
41
+
42
+ def segment_length
43
+ masked? ? masked_length : str_length
44
+ end
45
+ end
46
+
@@ -5,7 +5,7 @@ module NamedEntity
5
5
  extend Entity
6
6
  include Segment
7
7
 
8
- self.annotation :type, :code, :score, :docid
8
+ self.annotation :type, :code, :score
9
9
 
10
10
  def report
11
11
  <<-EOF
@@ -11,7 +11,7 @@ module Transformed
11
11
  text
12
12
  end
13
13
 
14
- def self.with_transform(text, segments, replacement)
14
+ def self.with_transform(text, segments, replacement = nil)
15
15
 
16
16
  text.extend Transformed
17
17
  text.replace(segments, replacement)
@@ -119,7 +119,7 @@ module Transformed
119
119
  new = replacement.call(segment)
120
120
  end
121
121
 
122
- diff = new.length - segment.length
122
+ diff = new.length - segment.segment_length
123
123
 
124
124
  self[updated_begin..updated_end] = new
125
125
 
@@ -146,7 +146,9 @@ module Transformed
146
146
  end
147
147
  end
148
148
 
149
- def restore(segments, first_only = false)
149
+ # Restore the sentence from all transformation. Segments that are passed as
150
+ # parameters are restored from transformed space to original space
151
+ def restore(segments = [], first_only = false)
150
152
  return segments if @transformation_stack.empty?
151
153
 
152
154
  if first_only