rbbt-text 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -174,7 +174,7 @@ class Dictionary::KL
174
174
  if limit
175
175
  Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
176
176
  else
177
- Hash[*best.flatten]
177
+ best
178
178
  end
179
179
  end
180
180
 
data/lib/rbbt/bow/misc.rb CHANGED
@@ -1,7 +1,7 @@
1
1
  require 'rbbt'
2
2
  require 'rbbt/util/open'
3
3
 
4
- Rbbt.add_datafiles 'stopwords' => ['wordlists', 'stopwords']
4
+ Rbbt.claim 'stopwords', 'stopwords', 'wordlist'
5
5
 
6
- $stopwords = Open.read(Rbbt.find_datafile 'stopwords').scan(/\w+/) if File.exists?(Rbbt.find_datafile 'stopwords')
6
+ $stopwords = Rbbt.files.wordlists.stopwords.read.scan(/\w+/)
7
7
 
@@ -0,0 +1,22 @@
1
+ require 'rbbt/ner/annotations'
2
+
3
+ class NER
4
+ def entities(text, overlap = true, *args)
5
+ case
6
+ when Array === text
7
+ text.collect do |element|
8
+ matches = entities(element, overlap, *args)
9
+ matches.each{|match|
10
+ match.offset += element.offset if match.offset
11
+ }
12
+ matches
13
+ end.flatten
14
+ when (Annotated === text and not overlap)
15
+ entities(text.split, overlap, *args)
16
+ else
17
+ match(text, *args)
18
+ end
19
+ end
20
+ end
21
+
22
+
@@ -1,10 +1,11 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/named_entity'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/NER'
4
5
 
5
6
  # Offers a Ruby interface to the Abner Named Entity Recognition Package
6
7
  # in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
7
- class Abner
8
+ class Abner < NER
8
9
 
9
10
  Rbbt.add_software "ABNER" => ['','']
10
11
 
@@ -25,13 +26,16 @@ class Abner
25
26
  # Given a chunk of text, it finds all the mentions appearing in it. It
26
27
  # returns all the mentions found, regardless of type, to be coherent
27
28
  # with the rest of the NER packages in Rbbt.
28
- def extract(text)
29
+ def match(text)
29
30
 
30
31
  res = @tagger.getEntities(text)
31
32
  types = res[1]
32
33
  strings = res[0]
33
34
 
34
- strings.zip(types).collect{|mention, type| mention = mention.to_s; NamedEntity mention, types.to_s; mention}
35
+ strings.zip(types).collect do |mention, type|
36
+ mention = mention.to_s;
37
+ NamedEntity.annotate(mention, nil, type.to_s)
38
+ end
35
39
  end
36
40
 
37
41
  end
@@ -0,0 +1,123 @@
1
+ module Segment
2
+ attr_accessor :offset
3
+
4
+ def self.sort(segments, inline = true)
5
+ if inline
6
+ segments.sort do |a,b|
7
+ case
8
+ when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
9
+ 0
10
+ when (a.nil? or a.offset.nil?)
11
+ -1
12
+ when (b.nil? or b.offset.nil?)
13
+ +1
14
+ when (not a.range.include? b.offset and not b.range.include? a.offset)
15
+ a.offset <=> b.offset
16
+ else
17
+ b.length <=> a.length
18
+ end
19
+ end.reverse
20
+ else
21
+ segments.sort_by do |segment| segment.offset || 0 end
22
+ end
23
+ end
24
+
25
+ def self.split(text, segments)
26
+ sorted_segments = sort segments
27
+
28
+ chunks = []
29
+ segment_end = 0
30
+ text_offset = 0
31
+ sorted_segments.each do |segment|
32
+ return chunks if text.nil? or text.empty?
33
+ next if segment.offset.nil?
34
+ offset = segment.offset - text_offset
35
+
36
+ # Consider segment offset. Save pre, or skip if overlap
37
+ case
38
+ when offset < 0 # Overlap, skip
39
+ next
40
+ when offset > 0 # Save pre
41
+ chunk = text[0..offset - 1]
42
+ Segment.annotate(chunk, text_offset)
43
+ chunks << chunk
44
+ end
45
+
46
+ segment_end = offset + segment.length - 1
47
+
48
+ chunk = text[offset..segment_end]
49
+ Segment.annotate(chunk, text_offset + offset)
50
+ chunks << chunk
51
+
52
+ text_offset += segment_end + 1
53
+ text = text[segment_end + 1..-1]
54
+ end
55
+
56
+ if not text.nil? and text.any?
57
+ chunk = text.dup
58
+ Segment.annotate(chunk, text_offset)
59
+ chunks << chunk
60
+ end
61
+
62
+ chunks
63
+ end
64
+
65
+ def self.annotate(string, offset = nil)
66
+ string.extend Segment
67
+ string.offset = offset
68
+ string
69
+ end
70
+
71
+ def range
72
+ (offset..offset + length - 1)
73
+ end
74
+ end
75
+
76
+ module Annotated
77
+ attr_accessor :annotations
78
+ def self.annotate(string)
79
+ string.extend Annotated
80
+ string.annotations = []
81
+ string
82
+ end
83
+
84
+ def split
85
+ Segment.split(self, @annotations)
86
+ end
87
+ end
88
+
89
+ module NamedEntity
90
+ include Segment
91
+ attr_accessor :type, :code, :score
92
+
93
+ def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
94
+ string.extend NamedEntity
95
+ string.offset = offset
96
+ string.type = type
97
+ string.code = code
98
+ string.score = score
99
+ string
100
+ end
101
+
102
+ def to_s
103
+ <<-EOF
104
+ String: #{ self }
105
+ Offset: #{ offset.inspect }
106
+ Type: #{type.inspect}
107
+ Code: #{code.inspect}
108
+ Score: #{score.inspect}
109
+ EOF
110
+ end
111
+ end
112
+
113
+ module Token
114
+ include Segment
115
+ attr_accessor :original
116
+ def self.annotate(string, offset = nil, original = nil)
117
+ string.extend Token
118
+ string.offset = offset
119
+ string.original = original
120
+ string
121
+ end
122
+ end
123
+
@@ -1,10 +1,11 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'rbbt/ner/named_entity'
3
+ require 'rbbt/ner/annotations'
4
+ require 'rbbt/ner/NER'
4
5
 
5
6
  # Offers a Ruby interface to the Banner Named Entity Recognition Package
6
7
  # in Java. Banner[http://banner.sourceforge.net/].
7
- class Banner
8
+ class Banner < NER
8
9
 
9
10
  Rbbt.add_software "BANNER" => ['','']
10
11
 
@@ -48,10 +49,11 @@ class Banner
48
49
 
49
50
  # Returns an array with the mentions found in the provided piece of
50
51
  # text.
51
- def extract(text)
52
+ def match(text)
52
53
  text.gsub!(/\n/,' ')
53
54
  text.gsub!(/\|/,'/') # Character | gives an error
54
55
  sentence = @@Sentence.new(text)
56
+
55
57
  @tokenizer.tokenize(sentence)
56
58
  @tagger.tag(sentence)
57
59
  @parenPP.postProcess(sentence)
@@ -63,7 +65,7 @@ class Banner
63
65
  mention = $1
64
66
  mention.sub!(/^\s*/,'')
65
67
  mention.sub!(/\s*$/,'')
66
- NamedEntity.annotate mention
68
+ NamedEntity.annotate mention, nil, 'GENE'
67
69
  mention
68
70
  }
69
71
  res
@@ -1,10 +1,11 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
3
  require 'libxml'
4
- require 'rbbt/ner/named_entity'
4
+ require 'rbbt/ner/annotations'
5
+ require 'rbbt/ner/NER'
5
6
  require 'rbbt/util/log'
6
7
 
7
- class OSCAR3
8
+ class OSCAR3 < NER
8
9
  Rbbt.add_software "OSCAR3" => ['','']
9
10
 
10
11
  @@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
@@ -14,37 +15,52 @@ class OSCAR3
14
15
  @@MEMM = @@MEMMSingleton.getInstance();
15
16
  @@DFA = @@DFANEFinder.getInstance();
16
17
 
17
- def self.extract(text, type = nil, memm = true)
18
+ def self.match(text, type = nil, memm = false)
18
19
  doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
19
20
  mentions = []
20
21
  it = doc.getTokenSequences().iterator
21
22
 
22
- reconizer = memm ? @@MEMM : @@DFA
23
23
  type = [type] unless type.nil? or Array === type
24
- pos = 0
25
24
  while it.hasNext do
26
- Log.debug "OSCAR3: Finding mentions in sequence #{pos += 1}"
27
25
  sequence = it.next
28
- entities = @@MEMM.findNEs(sequence, text)
29
26
 
30
- keys = entities.keySet.iterator
27
+ # Fix sequence offset
28
+ sequence_str = sequence.getSourceString.to_s
29
+ sequence_offset = sequence.offset.to_i
30
+ offset = 0
31
+ while text[(sequence_offset + offset)..(sequence_offset + offset + sequence_str.length - 1)] != sequence_str and
32
+ not offset + sequence_offset + sequence_str.length > text.length
33
+
34
+ offset += 1
35
+ end
36
+
37
+ next if offset + sequence_offset + sequence_str.length > text.length
38
+
39
+ if memm
40
+ entities = @@MEMM.findNEs(sequence, text)
41
+ keys = entities.keySet.iterator
42
+ else
43
+ entities = @@DFA.getNEs(sequence)
44
+ keys = entities.iterator
45
+ end
46
+
31
47
  while keys.hasNext do
32
48
  key = keys.next
33
49
  mention_type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
34
50
  next unless type.nil? or type.include? mention_type
35
- score = entities.get(key)
51
+ score = memm ? entities.get(key).to_string.to_f : nil
36
52
 
37
- NamedEntity.annotate mention, mention_type, score.to_string.to_f, (rstart..rend)
53
+ NamedEntity.annotate mention, rstart.to_i + offset, mention_type, nil, score
38
54
 
39
- mentions << mention
55
+ mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
40
56
  end
41
57
  end
42
58
 
43
59
  mentions
44
60
  end
45
61
 
46
- def extract(*args)
47
- OSCAR3.extract *args
62
+ def match(*args)
63
+ OSCAR3.match *args
48
64
  end
49
65
  end
50
66
 
@@ -1,67 +1,91 @@
1
- require 'rbbt-util'
2
- require 'rbbt/bow/misc'
1
+ require 'rbbt/ner/annotations'
2
+ require 'rbbt/ner/NER'
3
+ require 'rbbt/util/simpleDSL'
3
4
 
4
- class RegExpNER
5
- def initialize(lexicon, options = {})
6
- options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil
5
+ class RegExpNER < NER
6
+ include SimpleDSL
7
7
 
8
- if $stopwords and (options[:stopwords].nil? || options[:stopwords] == true)
9
- options[:stopwords] = $stopwords
10
- else
11
- options[:stopwords] = []
8
+ def self.match_regexp(text, regexp, type = nil)
9
+ matches = []
10
+ start = 0
11
+ while matchdata = text.match(regexp)
12
+ pre = matchdata.pre_match
13
+ post = matchdata.post_match
14
+ match = matchdata[0]
15
+
16
+ if matchdata.captures.any?
17
+ capture = matchdata.captures.first
18
+ more_pre, more_post = match.split(/#{capture}/)
19
+
20
+ match = capture
21
+ pre << more_pre if more_pre
22
+ post = more_post << post if more_post
23
+ end
24
+
25
+ if match and not match.empty?
26
+ NamedEntity.annotate(match, start + pre.length, type)
27
+ matches << match
28
+ end
29
+
30
+ start += pre.length + match.length
31
+ text = post
12
32
  end
13
33
 
14
- data = TSV.new(lexicon, options)
15
-
16
- @index = {}
17
- data.each{|code, names|
18
- next if code.nil? || code == ""
19
- names << code if names.empty?
20
-
21
-
22
- if options[:stopwords].any?
23
- names = names.select{|n|
24
- ! options[:stopwords].include?(options[:case_insensitive] ? n.downcase : n)
25
- }
34
+ matches
35
+ end
36
+
37
+ def self.match_regexp_list(text, regexp_list, type = nil)
38
+ matches = []
39
+
40
+ regexp_list.each do |regexp|
41
+ chunks = Segment.split(text, matches)
42
+ chunks.each do |chunk|
43
+ new_matches = match_regexp(chunk, regexp, type)
44
+ new_matches.each do |match| match.offset += chunk.offset; matches << match end
26
45
  end
27
- @index[code] = RegExpNER.build_re(names, options[:case_insensitive])
28
- }
46
+ end
47
+
48
+ matches
29
49
  end
30
50
 
51
+ def self.match_regexp_hash(text, regexp_hash)
52
+ matches = []
31
53
 
32
- def self.build_re(names, ignorecase=true)
33
- res = names.compact.reject{|n| n.empty? or n.length < 3}.
34
- sort_by{|a| a.length }.reverse.collect{|n| Regexp.quote(n) }
54
+ regexp_hash.each do |type, regexp_list|
55
+ regexp_list = [regexp_list] unless Array === regexp_list
56
+ chunks = Segment.split(text, matches)
57
+ chunks.each do |chunk|
58
+ chunk_offset = chunk.offset
59
+ match_regexp_list(chunk, regexp_list, type).collect do |match|
60
+ match.offset += chunk_offset;
61
+ matches << match
62
+ end
63
+ end
64
+ end
35
65
 
36
- return nil if res.empty?
66
+ matches
67
+ end
37
68
 
38
- /\b(#{ res.join("|").gsub(/\\?\s/,'\s+') })\b/i
69
+ attr_accessor :regexps
70
+ def initialize(regexps = {})
71
+ @regexps = regexps.collect
39
72
  end
40
73
 
41
- def self.match_re(text, res)
42
- res = [res] unless Array === res
43
74
 
44
- res.collect{|re|
45
- text.scan(re)
46
- }.flatten
75
+ def __define_regexp_hook(name, regexp, *args)
76
+ @regexps << [name, regexp]
47
77
  end
48
78
 
79
+ def define_regexp(*args, &block)
80
+ load_config("__define_regexp_hook", *args, &block)
81
+ end
49
82
 
50
- def match_hash(text)
51
- return {} if text.nil? or text.empty?
52
- matches = {}
53
- @index.each{|code, re|
54
- next if re.nil?
55
- RegExpNER.match_re(text, re).each{|match|
56
- matches[code] ||= []
57
- matches[code] << match
58
- }
59
- }
60
- matches
83
+ def add_regexp(list = {})
84
+ @regexps.concat list.collect
61
85
  end
62
86
 
63
87
  def match(text)
64
- match_hash(text)
88
+ matches = RegExpNER.match_regexp_hash(text, @regexps)
65
89
  end
66
90
 
67
91
  end