rbbt-text 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/bow/dictionary.rb +1 -1
- data/lib/rbbt/bow/misc.rb +2 -2
- data/lib/rbbt/ner/NER.rb +22 -0
- data/lib/rbbt/ner/abner.rb +8 -4
- data/lib/rbbt/ner/annotations.rb +123 -0
- data/lib/rbbt/ner/banner.rb +6 -4
- data/lib/rbbt/ner/oscar3.rb +29 -13
- data/lib/rbbt/ner/regexpNER.rb +69 -45
- data/lib/rbbt/ner/token_trieNER.rb +168 -0
- data/test/rbbt/ner/test_NER.rb +10 -0
- data/test/rbbt/ner/test_abner.rb +2 -2
- data/test/rbbt/ner/test_annotations.rb +8 -0
- data/test/rbbt/ner/test_banner.rb +2 -2
- data/test/rbbt/ner/test_oscar3.rb +35 -2
- data/test/rbbt/ner/test_regexpNER.rb +83 -35
- data/test/rbbt/ner/test_token_trieNER.rb +112 -0
- metadata +15 -12
- data/lib/rbbt/ner/named_entity.rb +0 -11
- data/lib/rbbt/ner/tokenNER.rb +0 -237
- data/test/rbbt/ner/test_named_entity.rb +0 -16
- data/test/rbbt/ner/test_tokenNER.rb +0 -239
data/lib/rbbt/bow/dictionary.rb
CHANGED
data/lib/rbbt/bow/misc.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rbbt/util/open'
|
3
3
|
|
4
|
-
Rbbt.
|
4
|
+
Rbbt.claim 'stopwords', 'stopwords', 'wordlist'
|
5
5
|
|
6
|
-
$stopwords =
|
6
|
+
$stopwords = Rbbt.files.wordlists.stopwords.read.scan(/\w+/)
|
7
7
|
|
data/lib/rbbt/ner/NER.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
|
3
|
+
# Common base class for the named entity recognizers. Subclasses provide
# +match(text, *args)+, which finds the entities of a single piece of text;
# +entities+ builds on it to handle arrays of texts and annotated texts.
class NER
  # Finds the entities in +text+.
  #
  # text::    a single text, or an Array of texts (arrays may be nested).
  # overlap:: when false and +text+ is Annotated, the text is first divided
  #           with <tt>text.split</tt> and each piece is processed on its own.
  # args::    extra arguments, handed over untouched to +match+.
  #
  # For a single text it returns whatever +match+ returns. For an Array it
  # returns one flat Array with the matches of every element; when an
  # element knows its own +offset+, that value is added to the offset of
  # each of its matches. Elements without an offset (plain strings, nested
  # arrays) leave the offsets of their matches untouched.
  def entities(text, overlap = true, *args)
    case
    when Array === text
      text.collect do |element|
        # Plain strings and nested arrays carry no offset of their own.
        shift = element.respond_to?(:offset) ? element.offset : nil

        matches = entities(element, overlap, *args)
        matches.each do |match|
          match.offset += shift if shift && match.offset
        end
        matches
      end.flatten
    when (Annotated === text && !overlap)
      entities(text.split, overlap, *args)
    else
      match(text, *args)
    end
  end
end
|
21
|
+
|
22
|
+
|
data/lib/rbbt/ner/abner.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/annotations'
|
4
|
+
require 'rbbt/ner/NER'
|
4
5
|
|
5
6
|
# Offers a Ruby interface to the Abner Named Entity Recognition Package
|
6
7
|
# in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
|
7
|
-
class Abner
|
8
|
+
class Abner < NER
|
8
9
|
|
9
10
|
Rbbt.add_software "ABNER" => ['','']
|
10
11
|
|
@@ -25,13 +26,16 @@ class Abner
|
|
25
26
|
# Given a chunk of text, it finds all the mentions appearing in it. It
|
26
27
|
# returns all the mentions found, regardless of type, to be coherent
|
27
28
|
# with the rest of NER packages in Rbbt.
|
28
|
-
def
|
29
|
+
def match(text)
  # The tagger answers with two parallel lists: position 0 holds the
  # mention strings and position 1 the type of each mention.
  found = @tagger.getEntities(text)

  found[0].zip(found[1]).map do |mention, type|
    # No offset is available here, so it is left as nil.
    NamedEntity.annotate(mention.to_s, nil, type.to_s)
  end
end
|
36
40
|
|
37
41
|
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
# Mixin for Strings that represent a span of a larger text. +offset+ holds
# the position of the span's first character inside that text.
module Segment
  attr_accessor :offset

  # Returns +segments+ in order.
  #
  # With +inline+ (the default) segments are compared by offset when they do
  # not overlap and by length (longer first) when they do, entries that are
  # nil or have no offset go first, and the whole list is then reversed. The
  # result therefore starts with the highest offsets and ends with the
  # entries that have no offset.
  #
  # Without +inline+ segments are simply ordered by ascending offset, a
  # missing offset counting as 0.
  def self.sort(segments, inline = true)
    if inline
      segments.sort do |a, b|
        # Work out "missing" up front so a nil entry is never asked for
        # its offset.
        a_missing = a.nil? || a.offset.nil?
        b_missing = b.nil? || b.offset.nil?

        case
        when (a_missing && b_missing)
          0
        when a_missing
          -1
        when b_missing
          +1
        when (!a.range.include?(b.offset) && !b.range.include?(a.offset))
          a.offset <=> b.offset
        else
          b.length <=> a.length
        end
      end.reverse
    else
      segments.sort_by do |segment| segment.offset || 0 end
    end
  end

  # Cuts +text+ into consecutive chunks so that every segment, and every
  # stretch of text between segments, becomes a chunk of its own. Each chunk
  # is a new String annotated with its offset in +text+.
  #
  # Segments that are nil or have no offset are ignored. A segment that
  # starts inside an earlier one is skipped; when two segments start at the
  # same position the longer one wins. Segments that start at or beyond the
  # end of the text are ignored as well.
  #
  # Returns the Array of chunks, empty when +text+ is nil or empty.
  def self.split(text, segments)
    # The loop below consumes the text from left to right, so the segments
    # must come in ascending offset order.
    usable = segments.reject { |segment| segment.nil? || segment.offset.nil? }
    sorted_segments = usable.sort_by { |segment| [segment.offset, -segment.length] }

    chunks = []
    text_offset = 0 # position in the original text of the current +text+
    sorted_segments.each do |segment|
      return chunks if text.nil? || text.empty?

      # Offset of the segment relative to the text still to be processed.
      offset = segment.offset - text_offset

      next if offset < 0               # Overlaps a previous segment, skip
      break if offset >= text.length   # Starts past the end of the text

      if offset > 0                    # Save the text before the segment
        chunks << Segment.annotate(text[0..offset - 1], text_offset)
      end

      segment_end = offset + segment.length - 1
      chunks << Segment.annotate(text[offset..segment_end], text_offset + offset)

      text_offset += segment_end + 1
      text = text[segment_end + 1..-1]
    end

    # Whatever is left after the last segment.
    unless text.nil? || text.empty?
      chunks << Segment.annotate(text.dup, text_offset)
    end

    chunks
  end

  # Extends +string+ with Segment, sets its +offset+ and returns it.
  def self.annotate(string, offset = nil)
    string.extend Segment
    string.offset = offset
    string
  end

  # Inclusive range of positions covered by the segment.
  def range
    (offset..offset + length - 1)
  end
end
|
75
|
+
|
76
|
+
# Mixin for Strings that carry a list of annotations.
module Annotated
  attr_accessor :annotations

  # Extends +string+ with Annotated, gives it a fresh, empty list of
  # annotations and returns it.
  def self.annotate(string)
    string.extend(self).tap { |annotated| annotated.annotations = [] }
  end

  # Divides the string with Segment.split, using its annotations as the
  # segments. On an annotated string this takes the place of String#split.
  def split
    Segment.split(self, annotations)
  end
end
|
88
|
+
|
89
|
+
# Mixin for Strings that are entity mentions. On top of the +offset+ it gets
# from Segment, a mention has a +type+, a +code+ and a +score+.
module NamedEntity
  include Segment
  attr_accessor :type, :code, :score

  # Extends +string+ with NamedEntity, fills in the given attributes (all
  # of them default to nil) and returns the string.
  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
    string.extend(self)
    string.offset, string.type, string.code, string.score = offset, type, code, score
    string
  end

  # Multi-line description of the mention: its text followed by the
  # inspected value of each attribute.
  def to_s
    <<-EOF
String: #{ self }
Offset: #{ offset.inspect }
Type: #{type.inspect}
Code: #{code.inspect}
Score: #{score.inspect}
    EOF
  end
end
|
112
|
+
|
113
|
+
# Mixin for Strings that are tokens of a text. Besides the +offset+ it gets
# from Segment, a token keeps a reference to an +original+ value.
module Token
  include Segment
  attr_accessor :original

  # Extends +string+ with Token, sets its +offset+ and +original+ (both
  # default to nil) and returns the string.
  def self.annotate(string, offset = nil, original = nil)
    string.extend(self)
    string.offset, string.original = offset, original
    string
  end
end
|
123
|
+
|
data/lib/rbbt/ner/banner.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
|
-
require 'rbbt/ner/
|
3
|
+
require 'rbbt/ner/annotations'
|
4
|
+
require 'rbbt/ner/NER'
|
4
5
|
|
5
6
|
# Offers a Ruby interface to the Banner Named Entity Recognition Package
|
6
7
|
# in Java. Banner[http://banner.sourceforge.net/].
|
7
|
-
class Banner
|
8
|
+
class Banner < NER
|
8
9
|
|
9
10
|
Rbbt.add_software "BANNER" => ['','']
|
10
11
|
|
@@ -48,10 +49,11 @@ class Banner
|
|
48
49
|
|
49
50
|
# Returns an array with the mention found in the provided piece of
|
50
51
|
# text.
|
51
|
-
def
|
52
|
+
def match(text)
|
52
53
|
text.gsub!(/\n/,' ')
|
53
54
|
text.gsub!(/\|/,'/') # Character | gives an error
|
54
55
|
sentence = @@Sentence.new(text)
|
56
|
+
|
55
57
|
@tokenizer.tokenize(sentence)
|
56
58
|
@tagger.tag(sentence)
|
57
59
|
@parenPP.postProcess(sentence)
|
@@ -63,7 +65,7 @@ class Banner
|
|
63
65
|
mention = $1
|
64
66
|
mention.sub!(/^\s*/,'')
|
65
67
|
mention.sub!(/\s*$/,'')
|
66
|
-
NamedEntity.annotate mention
|
68
|
+
NamedEntity.annotate mention, nil, 'GENE'
|
67
69
|
mention
|
68
70
|
}
|
69
71
|
res
|
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
require 'rbbt'
|
2
2
|
require 'rjb'
|
3
3
|
require 'libxml'
|
4
|
-
require 'rbbt/ner/
|
4
|
+
require 'rbbt/ner/annotations'
|
5
|
+
require 'rbbt/ner/NER'
|
5
6
|
require 'rbbt/util/log'
|
6
7
|
|
7
|
-
class OSCAR3
|
8
|
+
class OSCAR3 < NER
|
8
9
|
Rbbt.add_software "OSCAR3" => ['','']
|
9
10
|
|
10
11
|
@@TextToSciXML = Rjb::import('uk.ac.cam.ch.wwmm.ptclib.scixml.TextToSciXML')
|
@@ -14,37 +15,52 @@ class OSCAR3
|
|
14
15
|
@@MEMM = @@MEMMSingleton.getInstance();
|
15
16
|
@@DFA = @@DFANEFinder.getInstance();
|
16
17
|
|
17
|
-
def self.
|
18
|
+
# Finds the mentions in +text+.
#
# text:: the text to process.
# type:: a mention type or an Array of them; when given, mentions of any
#        other type are discarded.
# memm:: when true the MEMM recognizer is used and every mention gets a
#        score; otherwise the DFA recognizer is used and scores are nil.
#
# Returns an Array of mentions annotated as NamedEntity. A mention whose
# +to_s+ equals that of one already collected is not added again.
def self.match(text, type = nil, memm = false)
  factory = @@ProcessingDocumentFactory.getInstance()
  doc = factory.makeTokenisedDocument(@@TextToSciXML.textToSciXML(text), true, false, false)
  sequences = doc.getTokenSequences().iterator

  type = [type] unless type.nil? || Array === type

  mentions = []
  while sequences.hasNext do
    sequence = sequences.next

    # Fix sequence offset: slide forward from the reported offset until the
    # text at that position equals the sequence's source string, giving up
    # once the candidate position no longer fits inside the text.
    source = sequence.getSourceString.to_s
    base = sequence.offset.to_i
    offset = 0
    while text[(base + offset)..(base + offset + source.length - 1)] != source &&
          offset + base + source.length <= text.length
      offset += 1
    end

    # The source string could not be located in the text; skip the sequence.
    next if offset + base + source.length > text.length

    if memm
      entities = @@MEMM.findNEs(sequence, text)
      keys = entities.keySet.iterator
    else
      entities = @@DFA.getNEs(sequence)
      keys = entities.iterator
    end

    while keys.hasNext do
      key = keys.next

      # The fields are read from the textual form of the entity:
      # [NE:<type>:<start>:<end>:<mention>]
      fields = key.to_string.match(/\[NE:(.*):(.*):(.*):(.*)\]/).values_at(1,2,3,4)
      mention_type, rstart, _rend, mention = fields
      next if type && !type.include?(mention_type)

      # Only the MEMM results come with a score attached to each entity.
      score = memm ? entities.get(key).to_string.to_f : nil

      NamedEntity.annotate(mention, rstart.to_i + offset, mention_type, nil, score)

      already_found = mentions.any? { |known| known.to_s == mention.to_s }
      mentions << mention unless already_found
    end
  end

  mentions
end
|
45
61
|
|
46
|
-
def
|
47
|
-
OSCAR3.
|
62
|
+
# Instance-level entry point: forwards every argument to OSCAR3.match and
# returns its result.
def match(*args)
  OSCAR3.match(*args)
end
|
49
65
|
end
|
50
66
|
|
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -1,67 +1,91 @@
|
|
1
|
-
require 'rbbt
|
2
|
-
require 'rbbt/
|
1
|
+
require 'rbbt/ner/annotations'
|
2
|
+
require 'rbbt/ner/NER'
|
3
|
+
require 'rbbt/util/simpleDSL'
|
3
4
|
|
4
|
-
class RegExpNER
|
5
|
-
|
6
|
-
options = Misc.add_defaults options, :flatten => true, :case_insensitive => true, :stopwords => nil
|
5
|
+
# Named entity recognizer driven by regular expressions. It holds a list of
# [type, regexp or list of regexps] pairs; text matched by a regular
# expression becomes an entity of the corresponding type.
class RegExpNER < NER
  include SimpleDSL

  # Finds every match of +regexp+ in +text+.
  #
  # When the regular expression has capture groups the entity is the text of
  # the first group instead of the complete match; only the text up to the
  # end of that group is consumed, the remainder of the complete match is
  # scanned again. If the first group did not take part in the match, no
  # entity is produced and the complete match is consumed.
  #
  # Returns an Array of Strings annotated as NamedEntity with +type+ and
  # with their offset in +text+. Empty matches produce no entity.
  def self.match_regexp(text, regexp, type = nil)
    matches = []
    start = 0 # position in the original text of the current +text+
    while matchdata = text.match(regexp)
      pre    = matchdata.pre_match
      post   = matchdata.post_match
      entity = matchdata[0]

      if matchdata.captures.any?
        whole  = matchdata[0]
        entity = matchdata[1]

        if entity
          # Locate the group by position rather than by searching for its
          # text, which would misbehave with regexp metacharacters or with
          # text repeated inside the match.
          from = matchdata.begin(1) - matchdata.begin(0)
          upto = matchdata.end(1) - matchdata.begin(0)
          pre  = pre + whole[0...from]
          post = whole[upto..-1] + post
        else
          pre = pre + whole
        end
      end

      unless entity.nil? || entity.empty?
        matches << NamedEntity.annotate(entity, start + pre.length, type)
      end

      consumed = pre.length + entity.to_s.length
      if consumed.zero?
        # Nothing was consumed (empty match at the very beginning): step
        # over one character so the loop always makes progress.
        break if post.empty?
        start += 1
        text = post[1..-1]
      else
        start += consumed
        text = post
      end
    end

    matches
  end

  # Applies the regular expressions in +regexp_list+ to +text+ one after
  # another. Before each one the text is divided with Segment.split using
  # the matches found so far, and every resulting chunk is scanned
  # separately.
  #
  # Returns the matches of all the regular expressions, annotated with
  # +type+ and with offsets relative to +text+.
  def self.match_regexp_list(text, regexp_list, type = nil)
    matches = []

    regexp_list.each do |regexp|
      chunks = Segment.split(text, matches)
      chunks.each do |chunk|
        match_regexp(chunk, regexp, type).each do |match|
          match.offset += chunk.offset
          matches << match
        end
      end
    end

    matches
  end

  # Applies a collection of (type, regexp or list of regexps) pairs to
  # +text+, in the order the collection yields them. Before each pair the
  # text is divided with Segment.split using the matches found so far.
  #
  # Returns all the matches, with offsets relative to +text+.
  def self.match_regexp_hash(text, regexp_hash)
    matches = []

    regexp_hash.each do |type, regexp_list|
      regexp_list = [regexp_list] unless Array === regexp_list
      chunks = Segment.split(text, matches)
      chunks.each do |chunk|
        chunk_offset = chunk.offset
        match_regexp_list(chunk, regexp_list, type).each do |match|
          match.offset += chunk_offset
          matches << match
        end
      end
    end

    matches
  end

  attr_accessor :regexps

  # regexps:: initial (type, regexp or list of regexps) pairs, as a Hash or
  #           as an Array of pairs. They are copied into a new Array, so the
  #           object passed in is never modified.
  def initialize(regexps = {})
    @regexps = regexps.to_a.dup
  end

  # Hook handed to +load_config+ by +define_regexp+: records one
  # [name, regexp] pair. Extra arguments are ignored.
  def __define_regexp_hook(name, regexp, *args)
    @regexps << [name, regexp]
  end

  # Passes the arguments and the block to +load_config+ (not defined in
  # this class; presumably provided by SimpleDSL) with
  # +__define_regexp_hook+ as the hook name.
  def define_regexp(*args, &block)
    load_config("__define_regexp_hook", *args, &block)
  end

  # Appends the (type, regexp or list of regexps) pairs in +list+, a Hash or
  # an Array of pairs, to the ones already defined.
  def add_regexp(list = {})
    @regexps.concat list.to_a
  end

  # Returns the entities found in +text+ with the regular expressions
  # defined so far.
  def match(text)
    RegExpNER.match_regexp_hash(text, @regexps)
  end

end
|