rbbt-text 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/bow/dictionary.rb +1 -1
- data/lib/rbbt/bow/misc.rb +2 -2
- data/lib/rbbt/ner/NER.rb +22 -0
- data/lib/rbbt/ner/abner.rb +8 -4
- data/lib/rbbt/ner/annotations.rb +123 -0
- data/lib/rbbt/ner/banner.rb +6 -4
- data/lib/rbbt/ner/oscar3.rb +29 -13
- data/lib/rbbt/ner/regexpNER.rb +69 -45
- data/lib/rbbt/ner/token_trieNER.rb +168 -0
- data/test/rbbt/ner/test_NER.rb +10 -0
- data/test/rbbt/ner/test_abner.rb +2 -2
- data/test/rbbt/ner/test_annotations.rb +8 -0
- data/test/rbbt/ner/test_banner.rb +2 -2
- data/test/rbbt/ner/test_oscar3.rb +35 -2
- data/test/rbbt/ner/test_regexpNER.rb +83 -35
- data/test/rbbt/ner/test_token_trieNER.rb +112 -0
- metadata +15 -12
- data/lib/rbbt/ner/named_entity.rb +0 -11
- data/lib/rbbt/ner/tokenNER.rb +0 -237
- data/test/rbbt/ner/test_named_entity.rb +0 -16
- data/test/rbbt/ner/test_tokenNER.rb +0 -239
data/lib/rbbt/bow/dictionary.rb
CHANGED
data/lib/rbbt/bow/misc.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/util/open'
|
3
3
|
|
4
|
-
Rbbt.
|
4
|
+
Rbbt.claim 'stopwords', 'stopwords', 'wordlist'
|
5
5
|
|
6
|
-
$stopwords =
|
6
|
+
$stopwords = Rbbt.files.wordlists.stopwords.read.scan(/\w+/)
|
7
7
|
|
data/lib/rbbt/ner/NER.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
|
3
|
+
class NER
|
4
|
+
def entities(text, overlap = true, *args)
|
5
|
+
case
|
6
|
+
when Array === text
|
7
|
+
text.collect do |element|
|
8
|
+
matches = entities(element, overlap, *args)
|
9
|
+
matches.each{|match|
|
10
|
+
match.offset += element.offset if match.offset
|
11
|
+
}
|
12
|
+
matches
|
13
|
+
end.flatten
|
14
|
+
when (Annotated === text and not overlap)
|
15
|
+
entities(text.split, overlap, *args)
|
16
|
+
else
|
17
|
+
match(text, *args)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/annotations'
|
4
|
+
require 'rbbt/ner/NER'
|
4
5
|
|
5
6
|
# Offers a Ruby interface to the Abner Named Entity Recognition Package
|
6
7
|
# in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
|
7
|
-
class Abner
|
8
|
+
class Abner < NER
|
8
9
|
|
9
10
|
Rbbt.add_software "ABNER" => ['','']
|
10
11
|
|
@@ -25,13 +26,16 @@ class Abner
|
|
25
26
|
# Given a chunk of text, it finds all the mentions appearing in it. It
|
26
27
|
# returns all the mentions found, regardless of type, to be coherent
|
27
28
|
# with the rest of NER packages in Rbbt.
|
28
|
-
def
|
29
|
+
def match(text)
|
29
30
|
|
30
31
|
res = @tagger.getEntities(text)
|
31
32
|
types = res[1]
|
32
33
|
strings = res[0]
|
33
34
|
|
34
|
-
strings.zip(types).collect
|
35
|
+
strings.zip(types).collect do |mention, type|
|
36
|
+
mention = mention.to_s;
|
37
|
+
NamedEntity.annotate(mention, nil, type.to_s)
|
38
|
+
end
|
35
39
|
end
|
36
40
|
|
37
41
|
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
module Segment
|
2
|
+
attr_accessor :offset
|
3
|
+
|
4
|
+
def self.sort(segments, inline = true)
|
5
|
+
if inline
|
6
|
+
segments.sort do |a,b|
|
7
|
+
case
|
8
|
+
when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
|
9
|
+
0
|
10
|
+
when (a.nil? or a.offset.nil?)
|
11
|
+
-1
|
12
|
+
when (b.nil? or b.offset.nil?)
|
13
|
+
+1
|
14
|
+
when (not a.range.include? b.offset and not b.range.include? a.offset)
|
15
|
+
a.offset <=> b.offset
|
16
|
+
else
|
17
|
+
b.length <=> a.length
|
18
|
+
end
|
19
|
+
end.reverse
|
20
|
+
else
|
21
|
+
segments.sort_by do |segment| segment.offset || 0 end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.split(text, segments)
|
26
|
+
sorted_segments = sort segments
|
27
|
+
|
28
|
+
chunks = []
|
29
|
+
segment_end = 0
|
30
|
+
text_offset = 0
|
31
|
+
sorted_segments.each do |segment|
|
32
|
+
return chunks if text.nil? or text.empty?
|
33
|
+
next if segment.offset.nil?
|
34
|
+
offset = segment.offset - text_offset
|
35
|
+
|
36
|
+
# Consider segment offset. Save pre, or skip if overlap
|
37
|
+
case
|
38
|
+
when offset < 0 # Overlap, skip
|
39
|
+
next
|
40
|
+
when offset > 0 # Save pre
|
41
|
+
chunk = text[0..offset - 1]
|
42
|
+
Segment.annotate(chunk, text_offset)
|
43
|
+
chunks << chunk
|
44
|
+
end
|
45
|
+
|
46
|
+
segment_end = offset + segment.length - 1
|
47
|
+
|
48
|
+
chunk = text[offset..segment_end]
|
49
|
+
Segment.annotate(chunk, text_offset + offset)
|
50
|
+
chunks << chunk
|
51
|
+
|
52
|
+
text_offset += segment_end + 1
|
53
|
+
text = text[segment_end + 1..-1]
|
54
|
+
end
|
55
|
+
|
56
|
+
if not text.nil? and text.any?
|
57
|
+
chunk = text.dup
|
58
|
+
Segment.annotate(chunk, text_offset)
|
59
|
+
chunks << chunk
|
60
|
+
end
|
61
|
+
|
62
|
+
chunks
|
63
|
+
end
|
64
|
+
|
65
|
+
def self.annotate(string, offset = nil)
|
66
|
+
string.extend Segment
|
67
|
+
string.offset = offset
|
68
|
+
string
|
69
|
+
end
|
70
|
+
|
71
|
+
def range
|
72
|
+
(offset..offset + length - 1)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
module Annotated
|
77
|
+
attr_accessor :annotations
|
78
|
+
def self.annotate(string)
|
79
|
+
string.extend Annotated
|
80
|
+
string.annotations = []
|
81
|
+
string
|
82
|
+
end
|
83
|
+
|
84
|
+
def split
|
85
|
+
Segment.split(self, @annotations)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
module NamedEntity
|
90
|
+
include Segment
|
91
|
+
attr_accessor :type, :code, :score
|
92
|
+
|
93
|
+
def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
|
94
|
+
string.extend NamedEntity
|
95
|
+
string.offset = offset
|
96
|
+
string.type = type
|
97
|
+
string.code = code
|
98
|
+
string.score = score
|
99
|
+
string
|
100
|
+
end
|
101
|
+
|
102
|
+
def to_s
|
103
|
+
<<-EOF
|
104
|
+
String: #{ self }
|
105
|
+
Offset: #{ offset.inspect }
|
106
|
+
Type: #{type.inspect}
|
107
|
+
Code: #{code.inspect}
|
108
|
+
Score: #{score.inspect}
|
109
|
+
EOF
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
module Token
|
114
|
+
include Segment
|
115
|
+
attr_accessor :original
|
116
|
+
def self.annotate(string, offset = nil, original = nil)
|
117
|
+
string.extend Token
|
118
|
+
string.offset = offset
|
119
|
+
string.original = original
|
120
|
+
string
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/annotations'
|
4
|
+
require 'rbbt/ner/NER'
|
4
5
|
|
5
6
|
# Offers a Ruby interface to the Banner Named Entity Recognition Package
|
6
7
|
# in Java. Banner[http://banner.sourceforge.net/].
|
7
|
-
class Banner
|
8
|
+
class Banner < NER
|
8
9
|
|
9
10
|
Rbbt.add_software "BANNER" => ['','']
|
10
11
|
|
@@ -48,10 +49,11 @@ class Banner
|
|
48
49
|
|
49
50
|
# Returns an array with the mention found in the provided piece of
|
50
51
|
# text.
|
51
|
-
def
|
52
|
+
def match(text)
|
52
53
|
text.gsub!(/\n/,' ')
|
53
54
|
text.gsub!(/\|/,'/') # Character | gives an error
|
54
55
|
sentence = @@Sentence.new(text)
|
56
|
+
|
55
57
|
@tokenizer.tokenize(sentence)
|
56
58
|
@tagger.tag(sentence)
|
57
59
|
@parenPP.postProcess(sentence)
|
@@ -63,7 +65,7 @@ class Banner
|
|
63
65
|
mention = $1
|
64
66
|
mention.sub!(/^\s*/,'')
|
65
67
|
mention.sub!(/\s*$/,'')
|
66
|
-
NamedEntity.annotate mention
|
68
|
+
NamedEntity.annotate mention, nil, 'GENE'
|
67
69
|
mention
|
68
70
|
}
|
69
71
|
res
|
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
3
|
require 'libxml'
|
4
|
-
require 'rbbt/ner/
|
4
|
+
require 'rbbt/ner/annotations'
|
5
|
+
require 'rbbt/ner/NER'
|
5
6
|
require 'rbbt/util/log'
|
6
7
|
|
7
|
-
class OSCAR3
|
8
|
+
class OSCAR3 < NER
|
8
9
|
Rbbt.add_software "OSCAR3" => ['','']
|
9
10
|
|
10
11
|
@@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
|
@@ -14,37 +15,52 @@ class OSCAR3
|
|
14
15
|
@@MEMM = @@MEMMSingleton.getInstance();
|
15
16
|
@@DFA = @@DFANEFinder.getInstance();
|
16
17
|
|
17
|
-
def self.
|
18
|
+
def self.match(text, type = nil, memm = false)
|
18
19
|
doc = @@ProcessingDocumentFactory.getInstance().makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false);
|
19
20
|
mentions = []
|
20
21
|
it = doc.getTokenSequences().iterator
|
21
22
|
|
22
|
-
reconizer = memm ? @@MEMM : @@DFA
|
23
23
|
type = [type] unless type.nil? or Array === type
|
24
|
-
pos = 0
|
25
24
|
while it.hasNext do
|
26
|
-
Log.debug "OSCAR3: Finding mentions in sequence #{pos += 1}"
|
27
25
|
sequence = it.next
|
28
|
-
entities = @@MEMM.findNEs(sequence, text)
|
29
26
|
|
30
|
-
|
27
|
+
# Fix sequence offset
|
28
|
+
sequence_str = sequence.getSourceString.to_s
|
29
|
+
sequence_offset = sequence.offset.to_i
|
30
|
+
offset = 0
|
31
|
+
while text[(sequence_offset + offset)..(sequence_offset + offset + sequence_str.length - 1)] != sequence_str and
|
32
|
+
not offset + sequence_offset + sequence_str.length > text.length
|
33
|
+
|
34
|
+
offset += 1
|
35
|
+
end
|
36
|
+
|
37
|
+
next if offset + sequence_offset + sequence_str.length > text.length
|
38
|
+
|
39
|
+
if memm
|
40
|
+
entities = @@MEMM.findNEs(sequence, text)
|
41
|
+
keys = entities.keySet.iterator
|
42
|
+
else
|
43
|
+
entities = @@DFA.getNEs(sequence)
|
44
|
+
keys = entities.iterator
|
45
|
+
end
|
46
|
+
|
31
47
|
while keys.hasNext do
|
32
48
|
key = keys.next
|
33
49
|
mention_type, rstart, rend, mention = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
|
34
50
|
next unless type.nil? or type.include? mention_type
|
35
|
-
score = entities.get(key)
|
51
|
+
score = memm ? entities.get(key).to_string.to_f : nil
|
36
52
|
|
37
|
-
NamedEntity.annotate mention, mention_type,
|
53
|
+
NamedEntity.annotate mention, rstart.to_i + offset, mention_type, nil, score
|
38
54
|
|
39
|
-
mentions << mention
|
55
|
+
mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
|
40
56
|
end
|
41
57
|
end
|
42
58
|
|
43
59
|
mentions
|
44
60
|
end
|
45
61
|
|
46
|
-
def
|
47
|
-
OSCAR3.
|
62
|
+
def match(*args)
|
63
|
+
OSCAR3.match *args
|
48
64
|
end
|
49
65
|
end
|
50
66
|
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -1,67 +1,91 @@
|
|
1
|
-
require 'rbbt
|
2
|
-
require 'rbbt/
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
require 'rbbt/ner/NER'
|
3
|
+
require 'rbbt/util/simpleDSL'
|
3
4
|
|
4
|
-
class RegExpNER
|
5
|
-
|
6
|
-
options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil
|
5
|
+
class RegExpNER < NER
|
6
|
+
include SimpleDSL
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
8
|
+
def self.match_regexp(text, regexp, type = nil)
|
9
|
+
matches = []
|
10
|
+
start = 0
|
11
|
+
while matchdata = text.match(regexp)
|
12
|
+
pre = matchdata.pre_match
|
13
|
+
post = matchdata.post_match
|
14
|
+
match = matchdata[0]
|
15
|
+
|
16
|
+
if matchdata.captures.any?
|
17
|
+
capture = matchdata.captures.first
|
18
|
+
more_pre, more_post = match.split(/#{capture}/)
|
19
|
+
|
20
|
+
match = capture
|
21
|
+
pre << more_pre if more_pre
|
22
|
+
post = more_post << post if more_post
|
23
|
+
end
|
24
|
+
|
25
|
+
if match and not match.empty?
|
26
|
+
NamedEntity.annotate(match, start + pre.length, type)
|
27
|
+
matches << match
|
28
|
+
end
|
29
|
+
|
30
|
+
start += pre.length + match.length
|
31
|
+
text = post
|
12
32
|
end
|
13
33
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
}
|
34
|
+
matches
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.match_regexp_list(text, regexp_list, type = nil)
|
38
|
+
matches = []
|
39
|
+
|
40
|
+
regexp_list.each do |regexp|
|
41
|
+
chunks = Segment.split(text, matches)
|
42
|
+
chunks.each do |chunk|
|
43
|
+
new_matches = match_regexp(chunk, regexp, type)
|
44
|
+
new_matches.each do |match| match.offset += chunk.offset; matches << match end
|
26
45
|
end
|
27
|
-
|
28
|
-
|
46
|
+
end
|
47
|
+
|
48
|
+
matches
|
29
49
|
end
|
30
50
|
|
51
|
+
def self.match_regexp_hash(text, regexp_hash)
|
52
|
+
matches = []
|
31
53
|
|
32
|
-
|
33
|
-
|
34
|
-
|
54
|
+
regexp_hash.each do |type, regexp_list|
|
55
|
+
regexp_list = [regexp_list] unless Array === regexp_list
|
56
|
+
chunks = Segment.split(text, matches)
|
57
|
+
chunks.each do |chunk|
|
58
|
+
chunk_offset = chunk.offset
|
59
|
+
match_regexp_list(chunk, regexp_list, type).collect do |match|
|
60
|
+
match.offset += chunk_offset;
|
61
|
+
matches << match
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
35
65
|
|
36
|
-
|
66
|
+
matches
|
67
|
+
end
|
37
68
|
|
38
|
-
|
69
|
+
attr_accessor :regexps
|
70
|
+
def initialize(regexps = {})
|
71
|
+
@regexps = regexps.collect
|
39
72
|
end
|
40
73
|
|
41
|
-
def self.match_re(text, res)
|
42
|
-
res = [res] unless Array === res
|
43
74
|
|
44
|
-
|
45
|
-
|
46
|
-
}.flatten
|
75
|
+
def __define_regexp_hook(name, regexp, *args)
|
76
|
+
@regexps << [name, regexp]
|
47
77
|
end
|
48
78
|
|
79
|
+
def define_regexp(*args, &block)
|
80
|
+
load_config("__define_regexp_hook", *args, &block)
|
81
|
+
end
|
49
82
|
|
50
|
-
def
|
51
|
-
|
52
|
-
matches = {}
|
53
|
-
@index.each{|code, re|
|
54
|
-
next if re.nil?
|
55
|
-
RegExpNER.match_re(text, re).each{|match|
|
56
|
-
matches[code] ||= []
|
57
|
-
matches[code] << match
|
58
|
-
}
|
59
|
-
}
|
60
|
-
matches
|
83
|
+
def add_regexp(list = {})
|
84
|
+
@regexps.concat list.collect
|
61
85
|
end
|
62
86
|
|
63
87
|
def match(text)
|
64
|
-
|
88
|
+
matches = RegExpNER.match_regexp_hash(text, @regexps)
|
65
89
|
end
|
66
90
|
|
67
91
|
end
|