rbbt-text 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -174,7 +174,7 @@ class Dictionary::KL
174
174
  if limit
175
175
  Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
176
176
  else
177
- Hash[*best.flatten]
177
+ best
178
178
  end
179
179
  end
180
180
 
data/lib/rbbt/bow/misc.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/util/open'
3
3
 
4
# Registers the stopword list with Rbbt.
# NOTE(review): the previous call named the 'wordlists' directory and the read
# below goes through `wordlists`, but this claim passes 'wordlist' (singular) --
# confirm against the signature of Rbbt.claim.
- Rbbt.add_datafiles 'stopwords' => ['wordlists', 'stopwords']
4
+ Rbbt.claim 'stopwords', 'stopwords', 'wordlist'
5
5
 
6
# Global list of stopwords: every \w+ token found in the stopwords file.
# NOTE(review): the previous version only assigned $stopwords when the data
# file existed; the new read is unconditional -- confirm it cannot fail when
# the file is missing.
- $stopwords = Open.read(Rbbt.find_datafile 'stopwords').scan(/\w+/) if File.exists?(Rbbt.find_datafile 'stopwords')
6
+ $stopwords = Rbbt.files.wordlists.stopwords.read.scan(/\w+/)
7
7
 
@@ -0,0 +1,22 @@
1
+ require 'rbbt/ner/annotations'
2
+
3
# Base class for named entity recognizers. Subclasses implement
# +match(text, *args)+; +entities+ is the common entry point built on it.
class NER
  # Finds entities in +text+.
  #
  # text    - a String, or an Array of strings (typically chunks that carry
  #           an +offset+ into a larger text).
  # overlap - when false and +text+ is Annotated, the text is first split
  #           with +text.split+ and each resulting chunk is processed
  #           separately.
  # args    - forwarded untouched to +match+.
  #
  # For an Array, the matches of every element are collected into one flat
  # array. A match offset is shifted by the offset of the element it was
  # found in; elements without a usable offset (plain strings, or a nil
  # offset) leave the match offsets untouched instead of raising.
  #
  # Returns whatever +match+ returns for a single text, or the flattened
  # array of matches for an Array.
  def entities(text, overlap = true, *args)
    if Array === text
      text.collect do |element|
        matches = entities(element, overlap, *args)
        shift = element.respond_to?(:offset) ? element.offset : nil
        matches.each do |match|
          match.offset += shift if shift && match.offset
        end
        matches
      end.flatten
    elsif Annotated === text && !overlap
      entities(text.split, overlap, *args)
    else
      match(text, *args)
    end
  end
end
21
+
22
+
@@ -1,10 +1,11 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/named_entity'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/NER'
4
5
 
5
6
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
6
7
  # in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
7
- class Abner
8
+ class Abner < NER
8
9
 
9
10
  Rbbt.add_software "ABNER" => ['','']
10
11
 
@@ -25,13 +26,16 @@ class Abner
25
26
  # Given a chunk of text, it finds all the mentions appearing in it. It
26
27
  # returns all the mentions found, regardless of type, to be coherent
27
28
  # with the rest of NER packages in Rbbt.
28
# Runs the tagger over +text+ and returns one string per mention. Each
# mention is converted with to_s and passed to NamedEntity.annotate with a
# nil offset and its type (as a string).
- def extract(text)
29
+ def match(text)
29
30
 
30
31
  res = @tagger.getEntities(text)
31
32
# presumably res[0] and res[1] are parallel lists of mentions and their
# types -- confirm against the ABNER API
  types = res[1]
32
33
  strings = res[0]
33
34
 
34
- strings.zip(types).collect{|mention, type| mention = mention.to_s; NamedEntity mention, types.to_s; mention}
35
# The block value is the return of NamedEntity.annotate, i.e. the annotated
# mention string itself.
+ strings.zip(types).collect do |mention, type|
36
+ mention = mention.to_s;
37
+ NamedEntity.annotate(mention, nil, type.to_s)
38
+ end
35
39
  end
36
40
 
37
41
  end
@@ -0,0 +1,123 @@
1
# Mixin for strings that remember where they sit inside a larger text.
module Segment
  # Zero-based position of the segment in its source text; nil when unknown.
  attr_accessor :offset

  # Orders +segments+ by position.
  #
  # inline - when true (default) entries that are nil or have no offset are
  #          ranked lowest, non-overlapping segments are compared by offset
  #          and overlapping ones by length (longer first); the sorted list
  #          is then reversed, so the result runs from the highest offset
  #          down and entries without an offset come last. When false the
  #          result is simply ascending by offset (missing offsets count
  #          as 0).
  #
  # Nil entries and entries without an offset may be mixed freely; they no
  # longer raise while being compared.
  #
  # Returns a new Array.
  def self.sort(segments, inline = true)
    if inline
      segments.sort do |a, b|
        a_missing = a.nil? || a.offset.nil?
        b_missing = b.nil? || b.offset.nil?

        if a_missing && b_missing
          0
        elsif a_missing
          -1
        elsif b_missing
          1
        elsif !a.range.include?(b.offset) && !b.range.include?(a.offset)
          a.offset <=> b.offset
        else
          b.length <=> a.length
        end
      end.reverse
    else
      segments.sort_by { |segment| (segment && segment.offset) || 0 }
    end
  end

  # Cuts +text+ into consecutive chunks around +segments+.
  #
  # Every chunk is a new string annotated with its offset in the original
  # +text+: the text before a segment, the stretch of text the segment
  # covers, and finally whatever text remains. Segments that are nil, have
  # no offset, or start before the point already consumed are skipped.
  #
  # NOTE(review): segments are visited in the order produced by
  # Segment.sort(segments), i.e. highest offset first, so with several
  # segments every one after the first starts "before the point already
  # consumed" and is skipped. The skip-on-overlap logic reads as if ascending
  # order was intended -- confirm before relying on multi-segment splits.
  #
  # Returns the Array of chunks (empty when +text+ is nil or empty).
  def self.split(text, segments)
    chunks = []
    text_offset = 0

    sort(segments).each do |segment|
      return chunks if text.nil? || text.empty?
      next if segment.nil? || segment.offset.nil?

      offset = segment.offset - text_offset
      next if offset < 0 # starts inside text that was already emitted

      # Text sitting between the previous chunk and this segment
      chunks << Segment.annotate(text[0..offset - 1], text_offset) if offset > 0

      segment_end = offset + segment.length - 1
      chunks << Segment.annotate(text[offset..segment_end], text_offset + offset)

      text_offset += segment_end + 1
      text = text[segment_end + 1..-1]
    end

    # String is not Enumerable on current Rubies, so test emptiness directly
    chunks << Segment.annotate(text.dup, text_offset) unless text.nil? || text.empty?

    chunks
  end

  # Extends +string+ with Segment and records +offset+ on it.
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil)
    string.extend Segment
    string.offset = offset
    string
  end

  # Inclusive range of positions covered by the segment in its source text.
  def range
    (offset..offset + length - 1)
  end
end
75
+
76
# Mixin for texts that carry a list of annotations (segments).
module Annotated
  # Segments attached to this text.
  attr_accessor :annotations

  # Extends +string+ with Annotated and gives it an empty annotation list.
  #
  # Returns the same +string+ object.
  def self.annotate(string)
    string.extend(self).tap { |annotated| annotated.annotations = [] }
  end

  # Splits the text around its annotations using Segment.split.
  def split
    Segment.split(self, @annotations)
  end
end
88
+
89
# Segment that additionally carries the type, code and score of an entity.
module NamedEntity
  include Segment
  attr_accessor :type, :code, :score

  # Extends +string+ with NamedEntity and stores the given attributes on it
  # (all of them default to nil).
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
    string.extend(self).tap do |entity|
      entity.offset = offset
      entity.type   = type
      entity.code   = code
      entity.score  = score
    end
  end

  # Multi-line report with the entity text and each of its attributes.
  def to_s
    <<-EOF
String: #{ self }
Offset: #{ offset.inspect }
Type: #{type.inspect}
Code: #{code.inspect}
Score: #{score.inspect}
    EOF
  end
end
112
+
113
# Segment that also keeps a reference to an +original+ value.
module Token
  include Segment
  attr_accessor :original

  # Extends +string+ with Token and stores +offset+ and +original+ on it
  # (both default to nil).
  #
  # Returns the same +string+ object.
  def self.annotate(string, offset = nil, original = nil)
    string.extend(self).tap do |token|
      token.offset   = offset
      token.original = original
    end
  end
end
123
+
@@ -1,10 +1,11 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/named_entity'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/NER'
4
5
 
5
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
6
7
  # in Java. Banner[http://banner.sourceforge.net/].
7
- class Banner
8
+ class Banner < NER
8
9
 
9
10
  Rbbt.add_software "BANNER" => ['','']
10
11
 
@@ -48,10 +49,11 @@ class Banner
48
49
 
49
50
  # Returns an array with the mention found in the provided piece of
50
51
  # text.
51
- def extract(text)
52
+ def match(text)
52
53
  text.gsub!(/\n/,' ')
53
54
  text.gsub!(/\|/,'/') # Character | gives an error
54
55
  sentence = @@Sentence.new(text)
56
+
55
57
  @tokenizer.tokenize(sentence)
56
58
  @tagger.tag(sentence)
57
59
  @parenPP.postProcess(sentence)
@@ -63,7 +65,7 @@ class Banner
63
65
  mention = $1
64
66
  mention.sub!(/^\s*/,'')
65
67
  mention.sub!(/\s*$/,'')
66
- NamedEntity.annotate mention
68
+ NamedEntity.annotate mention, nil, 'GENE'
67
69
  mention
68
70
  }
69
71
  res
@@ -1,10 +1,11 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/ner/named_entity'
4
+ require 'rbbt/ner/annotations'
5
+ require 'rbbt/ner/NER'
5
6
  require 'rbbt/util/log'
6
7
 
7
- class OSCAR3
8
+ class OSCAR3 < NER
8
9
  Rbbt.add_software "OSCAR3" => ['','']
9
10
 
10
11
  @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
@@ -14,37 +15,52 @@ class OSCAR3
14
15
  @@MEMM = @@MEMMSingleton.getInstance();
15
16
  @@DFA = @@DFANEFinder.getInstance();
16
17
 
17
# Finds mentions in +text+.
#
# type - nil, a single type, or an Array of types; when given, mentions whose
#        type is not listed are skipped.
# memm - when true entities come from @@MEMM.findNEs and carry a float score;
#        otherwise they come from @@DFA.getNEs and the score is nil.
#
# Returns the Array of mention strings, each annotated through
# NamedEntity.annotate with an offset, its type, a nil code and the score.
- def self.extract(text, type = nil, memm = true)
18
+ def self.match(text, type = nil, memm = false)
18
19
  doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
19
20
  mentions = []
20
21
  it = doc.getTokenSequences().iterator
21
22
 
22
- reconizer = memm ? @@MEMM : @@DFA
23
23
# Normalize a single type into a one-element list
  type = [type] unless type.nil? or Array === type
24
- pos = 0
25
24
  while it.hasNext do
26
- Log.debug "OSCAR3: Finding mentions in sequence #{pos += 1}"
27
25
  sequence = it.next
28
- entities = @@MEMM.findNEs(sequence, text)
29
26
 
30
- keys = entities.keySet.iterator
27
+ # Fix sequence offset
28
+ sequence_str = sequence.getSourceString.to_s
29
+ sequence_offset = sequence.offset.to_i
30
+ offset = 0
31
# Slide forward one character at a time from the reported offset until the
# sequence string is found in text or the window runs past the end of text
+ while text[(sequence_offset + offset)..(sequence_offset + offset + sequence_str.length - 1)] != sequence_str and
32
+ not offset + sequence_offset + sequence_str.length > text.length
33
+
34
+ offset += 1
35
+ end
36
+
37
# The sequence string was not located in text: skip this sequence
+ next if offset + sequence_offset + sequence_str.length > text.length
38
+
39
+ if memm
40
+ entities = @@MEMM.findNEs(sequence, text)
41
+ keys = entities.keySet.iterator
42
+ else
43
+ entities = @@DFA.getNEs(sequence)
44
+ keys = entities.iterator
45
+ end
46
+
31
47
  while keys.hasNext do
32
48
  key = keys.next
33
49
# The entity is parsed from its string form "[NE:type:start:end:mention]"
  mention_type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
34
50
  next unless type.nil? or type.include? mention_type
35
- score = entities.get(key)
51
+ score = memm ? entities.get(key).to_string.to_f : nil
36
52
 
37
- NamedEntity.annotate mention, mention_type, score.to_string.to_f, (rstart..rend)
53
# The parsed start position is shifted by the correction found above
+ NamedEntity.annotate mention, rstart.to_i + offset, mention_type, nil, score
38
54
 
39
- mentions << mention
55
# Keep a mention only if no collected mention has the same to_s
+ mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
40
56
  end
41
57
  end
42
58
 
43
59
  mentions
44
60
  end
45
61
 
46
# Instance-level entry point: forwards every argument to OSCAR3.match and
# returns its result.
- def extract(*args)
46
- OSCAR3.extract *args
62
+ def match(*args)
63
+ OSCAR3.match *args
48
64
  end
49
65
  end
50
66
 
@@ -1,67 +1,91 @@
1
- require 'rbbt-util'
2
- require 'rbbt/bow/misc'
1
+ require 'rbbt/ner/annotations'
2
+ require 'rbbt/ner/NER'
3
+ require 'rbbt/util/simpleDSL'
3
4
 
4
# Named entity recognizer driven by lists of regular expressions grouped
# by entity type.
class RegExpNER < NER
  include SimpleDSL

  # Finds every occurrence of +regexp+ in +text+, scanning left to right.
  #
  # When the first capture group of +regexp+ takes part in a match, the
  # entity is the text of that group only; otherwise it is the whole match.
  # The group is located through the MatchData positions, so captured text
  # containing regexp metacharacters, or occurring more than once inside the
  # match, is handled correctly. A zero-width match at the head of the
  # remaining text advances the scan by one character instead of looping
  # forever.
  #
  # Returns an Array of strings annotated with NamedEntity.annotate; each
  # offset is relative to the start of +text+ and the type is +type+.
  def self.match_regexp(text, regexp, type = nil)
    matches = []
    start = 0

    while (matchdata = text.match(regexp))
      pre   = matchdata.pre_match
      post  = matchdata.post_match
      match = matchdata[0]

      if matchdata[1]
        pre   = text[0...matchdata.begin(1)]
        post  = text[matchdata.end(1)..-1]
        match = matchdata[1]
      end

      unless match.empty?
        NamedEntity.annotate(match, start + pre.length, type)
        matches << match
      end

      consumed = pre.length + match.length
      if consumed.zero?
        # Nothing was consumed: step over one character to guarantee progress
        break if text.empty?
        consumed = 1
        post = text[1..-1]
      end

      start += consumed
      text = post
    end

    matches
  end

  # Applies each regexp of +regexp_list+ in turn. Before every regexp the
  # text is cut with Segment.split around the matches found so far, and the
  # regexp is run on each resulting chunk; match offsets are shifted by the
  # offset of their chunk.
  #
  # Returns the Array of all matches.
  def self.match_regexp_list(text, regexp_list, type = nil)
    matches = []

    regexp_list.each do |regexp|
      Segment.split(text, matches).each do |chunk|
        match_regexp(chunk, regexp, type).each do |match|
          match.offset += chunk.offset
          matches << match
        end
      end
    end

    matches
  end

  # Applies every (type, regexp or list of regexps) pair of +regexp_hash+,
  # which may be a Hash or an Array of pairs. Before every pair the text is
  # cut with Segment.split around the matches found so far; match offsets
  # are shifted by the offset of their chunk.
  #
  # Returns the Array of all matches.
  def self.match_regexp_hash(text, regexp_hash)
    matches = []

    regexp_hash.each do |type, regexp_list|
      regexp_list = [regexp_list] unless Array === regexp_list

      Segment.split(text, matches).each do |chunk|
        chunk_offset = chunk.offset
        match_regexp_list(chunk, regexp_list, type).each do |match|
          match.offset += chunk_offset
          matches << match
        end
      end
    end

    matches
  end

  # Array of [type, regexp or list of regexps] pairs used by +match+.
  attr_accessor :regexps

  # regexps - Hash or Array of (type, regexp) pairs. It is copied into a
  #           fresh Array so later additions neither fail nor touch the
  #           caller's object.
  def initialize(regexps = {})
    @regexps = regexps.to_a.dup
  end

  # Hook that records one (name, regexp) pair; used by +define_regexp+.
  def __define_regexp_hook(name, regexp, *args)
    @regexps << [name, regexp]
  end

  # Loads regexp definitions through SimpleDSL's load_config, naming
  # __define_regexp_hook as the hook.
  def define_regexp(*args, &block)
    load_config("__define_regexp_hook", *args, &block)
  end

  # Appends the (type, regexp) pairs of +list+ (Hash or Array of pairs).
  def add_regexp(list = {})
    @regexps.concat list.to_a
  end

  # Returns the entities found in +text+ by the configured regexps.
  def match(text)
    RegExpNER.match_regexp_hash(text, @regexps)
  end
end