rbbt-text 0.6.3 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -8,14 +8,17 @@ require 'rbbt/util/log'
8
8
  class OSCAR3 < NER
9
9
  Rbbt.claim Rbbt.software.opt.OSCAR3, :install, Rbbt.share.install.software.OSCAR3.find
10
10
 
11
- @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
12
- @@ProcessingDocumentFactory = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
13
- @@MEMMSingleton = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
14
- @@DFANEFinder = Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
15
- @@MEMM = @@MEMMSingleton.getInstance();
16
- @@DFA = @@DFANEFinder.getInstance();
11
+ def self.init
12
+ @@TextToSciXML ||= Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
13
+ @@ProcessingDocumentFactory ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.document.ProcessingDocumentFactory')
14
+ @@MEMMSingleton ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.memm.MEMMSingleton')
15
+ @@DFANEFinder ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar3.recogniser.finder.DFANEFinder')
16
+ @@MEMM ||= @@MEMMSingleton.getInstance();
17
+ @@DFA ||= @@DFANEFinder.getInstance();
18
+ end
17
19
 
18
20
  def self.match(text, type = nil, memm = false)
21
+ self.init
19
22
  doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
20
23
  mentions = []
21
24
  it = doc.getTokenSequences().iterator
@@ -8,15 +8,25 @@ require 'rbbt/util/log'
8
8
  class OSCAR4 < NER
9
9
  Rbbt.claim Rbbt.software.opt.OSCAR4, :install, Rbbt.share.install.software.OSCAR4.find
10
10
 
11
- Rjb::load(nil, jvmargs = ['-Xms128m','-Xmx2048m'])
12
- @@OSCAR = Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
11
+ def self.init
12
+ Rjb::load(nil, jvmargs = ['-Xms1G','-Xmx2G']) unless Rjb.loaded?
13
13
 
14
- def self.match(text, type = nil, memm = false)
14
+ @@OSCAR ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar.Oscar')
15
+ @@FormatType ||= Rjb::import('uk.ac.cam.ch.wwmm.oscar.chemnamedict.entities.FormatType')
16
+ end
17
+
18
+ def self.tagger
19
+ @@tagger ||= @@OSCAR.new()
20
+ end
21
+
22
+ def self.match(text, type = nil)
23
+ self.init
15
24
 
16
25
  return [] if text.nil? or text.strip.empty?
17
26
 
18
- oscar = @@OSCAR.new();
19
- entities = oscar.findAndResolveNamedEntities(text);
27
+ oscar = tagger
28
+ #entities = oscar.findAndResolveNamedEntities(text);
29
+ entities = oscar.findNamedEntities(text);
20
30
  it = entities.iterator
21
31
 
22
32
  result = []
@@ -24,9 +34,13 @@ class OSCAR4 < NER
24
34
  while it.hasNext
25
35
  entity = it.next
26
36
  mention = entity.getSurface
27
- result << mention
37
+ #inchi = entity.getFirstChemicalStructure(@@FormatType.INCHI)
38
+ #inchi = inchi.getValue() unless inchi.nil?
39
+ inchi = nil
40
+
41
+ NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
28
42
 
29
- NamedEntity.setup mention, entity.getStart, entity.getType, nil, entity.getNamedEntity.getConfidence
43
+ result << mention
30
44
  end
31
45
 
32
46
  result
@@ -13,30 +13,28 @@ class Normalizer
13
13
  # score above the minimum. Otherwise it returns an empty list.
14
14
  def self.get_best(values, min)
15
15
  return [] if values.empty?
16
- best = values.collect{|p| p[1]}.max
16
+ best = values.collect{|p| p[1] }.max
17
17
  return [] if best < min
18
18
  values.select{|p| p[1] == best}
19
19
  end
20
20
 
21
21
  # Compares the tokens and gives each candidate a score based on the
22
22
  # commonalities and differences amongst the tokens.
23
- def token_score(candidates, mention)
24
- candidates.collect{|code|
25
- next if @synonyms[code].nil?
26
- value = @synonyms[code].select{|name| name =~ /\w/}.collect{|name|
27
- case
28
- when mention == name
29
- 100
30
- when mention.downcase == name.downcase
31
- 90
32
- when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
33
- 80
34
- else
35
- @tokens.evaluate(mention, name)
36
- end
37
- }.max
38
- [code, value]
39
- }.compact
23
+ def token_score(code, mention)
24
+ return nil if @synonyms[code].nil?
25
+ @synonyms[code].select{|name| name =~ /[a-zA-Z]/ }.collect{|name|
26
+ value = case
27
+ when mention == name
28
+ 100
29
+ when mention.downcase == name.downcase
30
+ 90
31
+ when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
32
+ 80
33
+ else
34
+ @tokens.evaluate(mention, name)
35
+ end
36
+ [value, name]
37
+ }.sort_by{|value, name| value }.last
40
38
  end
41
39
 
42
40
  # Order candidates with the number of words in common between the text
@@ -46,7 +44,7 @@ class Normalizer
46
44
  # been a Proc or a Hash.
47
45
  def entrez_score(candidates, text, to_entrez = nil)
48
46
  code2entrez = {}
49
- candidates.each{|code|
47
+ candidates.each{|code, score|
50
48
  if to_entrez.is_a? Proc
51
49
  entrez = to_entrez.call(code)
52
50
  elsif to_entrez.is_a? Hash
@@ -72,24 +70,37 @@ class Normalizer
72
70
  # positions. This is based on the idea that synonym lists order their
73
71
  # synonyms by importance.
74
72
  def appearence_order(candidates, mention)
75
- positions = candidates.collect{|code|
73
+ positions = candidates.collect{|code, score, name|
76
74
  next unless @synonyms[code]
77
75
  pos = nil
78
76
  @synonyms[code].each_with_index{|list,i|
79
77
  next if pos
80
78
  pos = i if list.include? mention
81
79
  }
82
- pos
80
+ pos
83
81
  }
82
+
84
83
  return nil if positions.compact.empty?
85
- best = candidates.zip(positions).sort{|a,b| a[1] <=> b[1]}.first[1]
86
- candidates.zip(positions).select{|p| p[1] == best}.collect{|p| p[0]}
84
+
85
+ best = candidates.zip(positions).sort{|a,b|
86
+ case
87
+ when (a[1].nil? and b[1].nil?)
88
+ 0
89
+ when b[1].nil?
90
+ 1
91
+ when a[1].nil?
92
+ -1
93
+ else
94
+ a[1] <=> b[1]
95
+ end
96
+ }.first[1]
97
+ candidates.zip(positions).select{|p| p[1] == best}
87
98
  end
88
99
 
89
100
 
90
101
 
91
102
  def initialize(lexicon, options = {})
92
- @synonyms = TSV.open(lexicon, :flat)
103
+ @synonyms = TSV === lexicon ? lexicon : TSV.open(lexicon, :type => :flat, :unnamed => true)
93
104
 
94
105
  @index = CueIndex.new
95
106
  @index.load(lexicon, options[:max_candidates])
@@ -103,30 +114,43 @@ class Normalizer
103
114
  end
104
115
 
105
116
  def select(candidates, mention, text = nil, options = {})
106
- threshold = options[:threshold] || 0
107
- max_candidates = options[:max_candidates] || 200
108
- max_entrez = options[:max_entrez] || 10
117
+ options = Misc.add_defaults options, :threshold => 0, :max_candidates => 20, :max_entrez => 10, :keep_matches => false
118
+ threshold, max_candidates, max_entrez, keep_matches = Misc.process_options options, :threshold, :max_candidates, :max_entrez, :keep_matches
109
119
 
110
120
  # Abort if too ambiguous
111
121
  return [] if candidates.empty?
112
122
  return [] if candidates.length > max_candidates
113
123
 
114
- scores = token_score(candidates, mention)
115
- best_codes = Normalizer::get_best(scores, threshold).collect{|p| p[0]}
124
+ scores = candidates.zip(candidates.collect{|candidate| token_score(candidate, mention)}).collect{|v| v.flatten}
125
+ scores.delete_if{|candidate, score, name| score.nil? or score <= threshold}
126
+
127
+ best_codes = Normalizer::get_best(scores, threshold)
116
128
 
117
129
  # Abort if too ambiguous
118
130
  return [] if best_codes.length > max_entrez
119
131
 
120
132
  if best_codes.length > 1 and text
121
- scores = entrez_score(best_codes, text, @to_entrez)
133
+ scores = entrez_score(best_codes.collect{|c| c.first}, text, @to_entrez)
122
134
 
123
- Normalizer::get_best(scores, 0).collect{|p| p[0]}
135
+ if keep_matches
136
+ Normalizer::get_best(scores, 0)
137
+ else
138
+ Normalizer::get_best(scores, 0).collect{|p| p[0]}
139
+ end
124
140
  else
125
141
  orders = appearence_order(best_codes, mention)
126
142
  if orders
127
- orders
143
+ if keep_matches
144
+ orders.collect{|p| p[0]}
145
+ else
146
+ orders.collect{|p| p[0][0]}
147
+ end
128
148
  else
129
- best_codes
149
+ if keep_matches
150
+ best_codes
151
+ else
152
+ best_codes.collect{|p| p[0]}
153
+ end
130
154
  end
131
155
  end
132
156
 
@@ -17,6 +17,7 @@ class CueIndex
17
17
 
18
18
  file ||= Rbbt.share.rnorm.cue_default.produce if !file && !block
19
19
 
20
+ file = file.find if file.respond_to? :find
20
21
  load_config(:define, file, &block)
21
22
  end
22
23
 
@@ -47,12 +48,12 @@ class CueIndex
47
48
 
48
49
  def load(file, max_candidates = 50)
49
50
  @indexes = Array.new(@rules.size){Hash.new}
50
- data = TSV.open(file, :flat)
51
+ data = TSV === file ? file : TSV.open(file, :type => :flat, :unnamed => true)
51
52
  data.each{|code, values|
52
53
  values.each{|value|
53
54
  cues(value).each_with_index{|cue_list,i|
54
55
  cue_list.each{|cue|
55
- @indexes[i][cue] ||= []
56
+ @indexes[i][cue] ||= Set.new
56
57
  @indexes[i][cue] << code unless @indexes[i][cue].include? code
57
58
  }
58
59
  }
@@ -69,7 +70,7 @@ class CueIndex
69
70
  @indexes.each_with_index{|index,i|
70
71
  best = []
71
72
  cues[i].each{|cue|
72
- best << index[cue] if index[cue]
73
+ best << index[cue].to_a if index[cue]
73
74
  }
74
75
  return best.flatten if best.any?
75
76
  }
@@ -33,7 +33,8 @@ class Tokenizer
33
33
 
34
34
  def method_missing(name, *args, &bloc)
35
35
  @token = name.to_sym
36
- @value = *args.first
36
+ value = args.first
37
+ @value = value
37
38
  self
38
39
  end
39
40
 
@@ -123,8 +124,12 @@ class Tokenizer
123
124
 
124
125
  #{{{ Metaprogramming hooks
125
126
  def define_tokens(name, *args, &block)
126
- action = *args[0] || block || /#{name.to_s}s?/i
127
- raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
127
+ action = args[0] || block || /#{name.to_s}s?/i
128
+
129
+ #HACK: Mysterious error where *args[0] returns an array [/regexp/i] for
130
+ #example
131
+ #action = action.first if Array === action
132
+ raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
128
133
 
129
134
  @types[name.to_sym] = action
130
135
  @order.push name.to_sym
@@ -160,6 +165,7 @@ class Tokenizer
160
165
  @transforms = []
161
166
 
162
167
  file ||= Rbbt.share.rnorm.tokens_default.produce if !file && !block
168
+ file = file.find if file.respond_to? :find
163
169
  load_config :main, file, &block
164
170
  end
165
171
 
@@ -200,7 +206,7 @@ class Tokenizer
200
206
  #{{{ Comparisons
201
207
 
202
208
  def evaluate_tokens(list1, list2)
203
- @operations.inject(0){| acc, o|
209
+ @operations.inject(0){|acc, o|
204
210
  acc + o.eval(list1, list2)
205
211
  }
206
212
  end
@@ -9,11 +9,16 @@ module Segment
9
9
  @offset = offset.nil? ? nil : offset.to_i
10
10
  end
11
11
 
12
+
13
+ def segment_length
14
+ self.length
15
+ end
16
+
12
17
  #{{{ Ranges
13
18
 
14
19
  def end
15
20
  return nil if offset.nil?
16
- offset + length - 1
21
+ offset + segment_length - 1
17
22
  end
18
23
 
19
24
  def range
@@ -41,8 +46,14 @@ module Segment
41
46
  self
42
47
  end
43
48
 
44
- def make_relative(segments)
45
- segments.collect{|s| s.push offset}
49
+ def make_relative(segments, &block)
50
+ if block_given?
51
+ segments.each{|s| s.push offset}
52
+ yield(segments)
53
+ segments.each{|s| s.pull offset}
54
+ else
55
+ segments.each{|s| s.push offset}
56
+ end
46
57
  end
47
58
 
48
59
  def range_in(container = nil)
@@ -72,7 +83,7 @@ module Segment
72
83
  when (not a.range.include? b.offset and not b.range.include? a.offset)
73
84
  a.offset <=> b.offset
74
85
  else
75
- a.length <=> b.length
86
+ a.segment_length <=> b.segment_length
76
87
  end
77
88
  end
78
89
  else
@@ -125,7 +136,7 @@ module Segment
125
136
  chunks << chunk
126
137
  end
127
138
 
128
- segment_end = offset + segment.length - 1
139
+ segment_end = offset + segment.segment_length - 1
129
140
 
130
141
  if not skip_segments
131
142
  chunk = text[offset..segment_end]
@@ -138,7 +149,7 @@ module Segment
138
149
 
139
150
  end
140
151
 
141
- if not text.nil? and text.any?
152
+ if not text.nil? and not text.empty?
142
153
  chunk = text.dup
143
154
  Segment.setup(chunk, text_offset)
144
155
  chunks << chunk
@@ -156,8 +167,8 @@ module Segment
156
167
  offset = text.index part
157
168
  next if offset.nil?
158
169
  Segment.setup(part, pre_offset + offset)
159
- pre_offset += offset + part.length - 1
160
- text = text[(offset + part.length - 1)..-1]
170
+ pre_offset += offset + part.segment_length - 1
171
+ text = text[(offset + part.segment_length - 1)..-1]
161
172
  end
162
173
  end
163
174
 
@@ -0,0 +1,46 @@
1
+ require 'rbbt/ner/segment'
2
+
3
+ module SegmentWithDocid
4
+ extend Annotation
5
+
6
+ self.annotation :docid
7
+
8
+ def masked?
9
+ self[0..5] == "MASKED"
10
+ end
11
+
12
+ def mask
13
+ return self if masked?
14
+ raise "Cannot mask an array of elements, they must be masked individually" if Array === self
15
+ raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
16
+ raise "Cannot mask a segment with no offset" if offset.nil?
17
+ textual_position = ["MASKED", length] * ":"
18
+ self.replace(textual_position)
19
+ self
20
+ end
21
+
22
+ def unmasked_text
23
+ return self unless masked?
24
+ tag, length = self.split(":")
25
+ Document.setup(docid).text[offset..(offset+length.to_i-1)]
26
+ end
27
+
28
+ def unmask
29
+ return self unless masked?
30
+ self.replace(unmasked_text)
31
+ self
32
+ end
33
+
34
+ def str_length
35
+ self.length
36
+ end
37
+
38
+ def masked_length
39
+ self.split(":").last.to_i
40
+ end
41
+
42
+ def segment_length
43
+ masked? ? masked_length : str_length
44
+ end
45
+ end
46
+
@@ -5,7 +5,7 @@ module NamedEntity
5
5
  extend Entity
6
6
  include Segment
7
7
 
8
- self.annotation :type, :code, :score, :docid
8
+ self.annotation :type, :code, :score
9
9
 
10
10
  def report
11
11
  <<-EOF
@@ -11,7 +11,7 @@ module Transformed
11
11
  text
12
12
  end
13
13
 
14
- def self.with_transform(text, segments, replacement)
14
+ def self.with_transform(text, segments, replacement = nil)
15
15
 
16
16
  text.extend Transformed
17
17
  text.replace(segments, replacement)
@@ -119,7 +119,7 @@ module Transformed
119
119
  new = replacement.call(segment)
120
120
  end
121
121
 
122
- diff = new.length - segment.length
122
+ diff = new.length - segment.segment_length
123
123
 
124
124
  self[updated_begin..updated_end] = new
125
125
 
@@ -146,7 +146,9 @@ module Transformed
146
146
  end
147
147
  end
148
148
 
149
- def restore(segments, first_only = false)
149
+ # Restore the sentence from all transformation. Segments that are passed as
150
+ # parameters are restored from transformed space to original space
151
+ def restore(segments = [], first_only = false)
150
152
  return segments if @transformation_stack.empty?
151
153
 
152
154
  if first_only