rbbt-text 1.2.0 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +55 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +63 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +26 -3
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
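
Most of the churn above is a namespace move: the old rbbt/text/* modules are replaced by top-level rbbt/document and rbbt/segment trees. A sketch of the require-path migration implied by the renames (illustrative, not exhaustive):

    # Before (1.2.0)
    require 'rbbt/text/segment'
    require 'rbbt/text/segment/named_entity'
    require 'rbbt/text/corpus'

    # After (1.3.4)
    require 'rbbt/segment'
    require 'rbbt/segment/named_entity'
    require 'rbbt/document/corpus'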
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -1,7 +1,6 @@
 require 'rbbt'
 require 'rjb'
 require 'libxml'
-require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'

@@ -53,7 +52,7 @@ class OSCAR3 < NER
       next unless type.nil? or type.include? mention_type
       score = memm ? entities.get(key).to_string.to_f : nil

-      NamedEntity.setup mention, rstart.to_i + offset, mention_type, score
+      NamedEntity.setup mention, :offset => rstart.to_i + offset, :entity_type => mention_type, :score => score

       mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
     end
data/lib/rbbt/ner/oscar4.rb
CHANGED
@@ -1,7 +1,7 @@
 require 'rbbt'
 require 'rjb'
 require 'libxml'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'

@@ -25,7 +25,7 @@ class OSCAR4 < NER
     @@tagger ||= @@OSCAR.new()
   end

-  def self.match(text, type = nil)
+  def self.match(text, protect = false, type = nil)
     self.init

     return [] if text.nil? or text.strip.empty?
@@ -46,7 +46,7 @@ class OSCAR4 < NER

       next unless entity.getType.toString == type unless type.nil?

-      NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
+      NamedEntity.setup mention, :offset => entity.getStart, :entity_type => entity.getType, :code => inchi, :score => entity.getConfidence

       result << mention
     end
data/lib/rbbt/ner/patterns.rb
CHANGED
@@ -1,7 +1,7 @@
-require 'rbbt/text/segment/named_entity'
-require 'rbbt/text/segment/segmented'
-require 'rbbt/text/segment/transformed'
-require 'rbbt/text/segment/relationship'
+require 'rbbt/segment/named_entity'
+require 'rbbt/segment/segmented'
+require 'rbbt/segment/transformed'
+#require 'rbbt/segment/relationship'
 require 'rbbt/ner/regexpNER'
 require 'rbbt/ner/token_trieNER'
 require 'rbbt/nlp/nlp'
@@ -14,7 +14,7 @@ class PatternRelExt
       regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
       segments = sentence.segments
       segments = segments.values.flatten if Hash === segments
-      Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
+      Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
         regexpNER.entities(sentence)
       end
     end
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -1,4 +1,3 @@
-require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/simpleDSL'

@@ -23,7 +22,7 @@ class RegExpNER < NER
       end

       if match and not match.empty?
-        NamedEntity.setup(match, start + pre.length, type)
+        NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
         matches << match
       end

data/lib/rbbt/ner/token_trieNER.rb
CHANGED
@@ -1,46 +1,58 @@
 require 'rbbt'
 require 'rbbt/tsv'
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/token'
+require 'rbbt/segment'
 require 'rbbt/ner/NER'
+require 'rbbt/segment/token'

 class TokenTrieNER < NER
-  def self.clean(token)
+  def self.clean(token, stem = false)
     if token.length > 3
-      token.downcase.sub(/-/,'')
+      upcase = token !~ /[a-z]/
+      token = token.downcase.sub(/-/,'')
+
+      if stem && ! upcase
+        require 'stemmer'
+        if stem == :double
+          token = token.stem.stem
+        else
+          token = token.stem
+        end
+      end
+
+      token
     else
       token
     end
   end

-  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
     if no_clean
       if extend_to_token
-        Token.setup(token, start, token)
+        Token.setup(token, :offset => start, :original => token)
       else
         token
       end
     else
       if extend_to_token
-        Token.setup(clean(token), start, token)
+        Token.setup(clean(token, stem), :offset => start, :original => token)
       else
-        clean(token)
+        clean(token, stem)
       end
     end
   end

-  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
-    split_at = /\s|(\(|\)|[-."'
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,;])/ if split_at.nil?

     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end

-    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+    tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?

     tokens
   end
@@ -130,14 +142,14 @@ class TokenTrieNER < NER
     index1
   end

-  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)

     chunk_size = hash.size / 100
     items_in_chunk = 0
     tmp_index = {}
     hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
       names = Array === names ? names : [names]
-      names.flatten! if Array === names.first and not
+      names.flatten! if Array === names.first and not Segment === names.first.first

       if names.empty?
         names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
       names.each do |name|
         next if name.empty? or (String === name and name.length < 2)

-        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
         tokens.extend EnumeratedArray

         token_index = index_for_tokens(tokens, code, type, slack)
@@ -237,10 +249,10 @@ class TokenTrieNER < NER
       match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }

-    NamedEntity.setup(match, match_tokens.first.offset, type, codes)
+    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
   end

-  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
       :persist => false
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
     @no_clean = options.delete :no_clean
+    @stem = options.delete :stem

     file = [] if file.nil?
     file = [file] unless Array === file
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
       Log.debug "TokenTrieNER merging TSV"
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     when Hash === new
@@ -284,18 +297,18 @@ class TokenTrieNER < NER
       new = TSV.open(new, :flat)
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     end
   end

   def match(text)
-    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)

     tokens.extend EnumeratedArray
     tokens.pos = 0
-
+
     matches = []
     while tokens.left?
       new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
data/lib/rbbt/nlp/genia/sentence_splitter.rb
CHANGED
@@ -1,5 +1,5 @@
 require 'rbbt/nlp/nlp'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 module NLP
   Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find

@@ -239,6 +239,7 @@ module NLP
   end

   def self.geniass_sentence_splitter(text)
+    Rbbt.software.opt.Geniass.produce
     offsets = []

     cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-
+      sentence.gsub!(NEW_LINE_MASK, "\n")
       Segment.setup sentence, s
       sentence
     end
data/lib/rbbt/nlp/nlp.rb
CHANGED
@@ -2,8 +2,8 @@ require 'rbbt'
 require 'rbbt/util/tmpfile'
 require 'rbbt/persist'
 require 'rbbt/resource'
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/segmented'
+require 'rbbt/segment'
+require 'rbbt/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'

@@ -101,7 +101,7 @@ module NLP
     input = sentences.collect{|sentence| sentence.gsub(/\n/, NEW_LINE_MASK)} * "\n"
     sentence_tokens = TmpFile.with_file(input) do |fin|
       out = local_persist(Digest::MD5.hexdigest(input), :Chunks, :string) do
-        CMD.cmd("cd #{Rbbt.software.opt.Gdep.find}; ./gdep #{ fin }").read
+        CMD.cmd("cd #{Rbbt.software.opt.Gdep.produce.find}; ./gdep #{ fin }").read
       end

       out.split(/^$/).collect do |sentence|
@@ -120,10 +120,10 @@ module NLP


   def self.gdep_parse_sentences_extension(sentences)
-    require Rbbt.software.opt.Gdep.ruby["Gdep.so"].find
+    require Rbbt.software.opt.Gdep.produce.ruby["Gdep.so"].find
     gdep = Gdep.new
     if not gdep.gdep_is_loaded
-      Misc.in_dir Rbbt.software.opt.Gdep.find do
+      Misc.in_dir Rbbt.software.opt.Gdep.produce.find do
         gdep.load_gdep
       end
     end
data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb
CHANGED
@@ -1,6 +1,6 @@
 require 'rbbt'
 require 'rjb'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 require 'rbbt/resource'

 module OpenNLP
@@ -33,48 +33,49 @@ module OpenNLP
   def self.sentence_splitter(text)
     return [] if text.nil? or text.empty?

-
-
-
-
-
-    sentences = nil
-    TmpFile.with_file do |tmpfile|
-      start_time = Time.now
-
-      begin
-        pid = Process.fork do
-          sent = sentence_split_detector.sentDetect(text)
-          Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
-        end
+    Segment.ascii(text, "?") do
+      last = 0
+      begin
+        sentence_split_detector = self.sentence_split_detector

-        while not Process.waitpid(pid)
-          if Time.now - start_time > MAX
-            Process.kill(9, pid)
-            raise "Taking to long (> #{MAX} seconds)"
-          end
-          sleep 0.1
-        end
+        sentences = nil
+        TmpFile.with_file do |tmpfile|
+          start_time = Time.now

           begin
-            Process.waitpid(pid)
+            pid = Process.fork do
+              sent = sentence_split_detector.sentDetect(text)
+              Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
+            end
+
+            while not Process.waitpid(pid)
+              if Time.now - start_time > MAX
+                Process.kill(9, pid)
+                raise "Taking to long (> #{MAX} seconds)"
+              end
+              sleep 0.1
+            end
+
+            begin
+              Process.waitpid(pid)
+            end
+          rescue Errno::ECHILD
           end
-
+
+          sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
         end

-      sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
+        sentences.collect{|sentence|
+          sentence = Misc.to_utf8(sentence)
+          start = text.index(sentence, last)
+          Segment.setup sentence, start
+          last = start + sentence.length - 1
+          sentence
+        }
+      rescue Exception
+        raise $!
+        raise "Sentence splitter raised exception: #{$!.message}"
      end
-
-    sentences.collect{|sentence|
-      sentence = Misc.to_utf8(sentence)
-      start = text.index(sentence, last)
-      Segment.setup sentence, start
-      last = start + sentence.length - 1
-      sentence
-    }
-  rescue Exception
-    raise $!
-    raise "Sentence splitter raised exception: #{$!.message}"
   end
 end
data/lib/rbbt/nlp/spaCy.rb
ADDED
@@ -0,0 +1,52 @@
+require 'rbbt/segment'
+require 'rbbt/document'
+require 'rbbt/segment/annotation'
+require 'rbbt/util/python'
+
+module SpaCy
+
+  PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+
+  def self.tokens(text, lang = 'en')
+
+    tokens = []
+    RbbtPython.run 'spacy' do
+      nlp = spacy.load(lang)
+      doc = nlp.call(text)
+      doc.__len__.times do |i|
+        tokens << doc.__getitem__(i)
+      end
+    end
+    tokens
+  end
+
+  def self.segments(text, lang = 'en')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    tokens = self.tokens(text, lang).collect do |token|
+      info = {}
+      PROPERTIES.each do |p|
+        info[p] = token.instance_eval(p.to_s)
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = token.idx
+      info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCyToken.setup(token.text, info)
+    end
+    SpaCyToken.setup(tokens, :corpus => corpus)
+  end
+end
+
+module SpaCyToken
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::PROPERTIES
+  self.annotation :dep
+end
+
+if __FILE__ == $0
+  ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
+end
data/lib/rbbt/segment.rb
ADDED
@@ -0,0 +1,179 @@
+require 'rbbt-util'
+require 'rbbt/entity'
+require 'rbbt/document'
+
+module SegID
+  extend Entity
+  self.annotation :corpus
+
+  def _parts
+    @parts ||= self.split(":")
+  end
+
+  def range
+    @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
+  end
+
+  def docid
+    @docid ||= DocID.setup(_parts[0..3] * ":")
+  end
+
+  def offset
+    range.begin
+  end
+
+  def segment_length
+    range.end - range.begin + 1
+  end
+
+  property :segment => :single do
+    docid = self.docid
+    document = DocID.setup(docid, :corpus => corpus).document
+
+    text = document[range]
+
+    Segment.setup(text, :docid => docid, :offset => offset)
+  end
+
+  property :segid do
+    self
+  end
+
+end
+
+module Segment
+  extend Entity
+  self.annotation :offset, :docid
+
+  def segment_length
+    length
+  end
+
+  def eend
+    offset.to_i + length - 1
+  end
+
+  def range
+    (offset.to_i..eend)
+  end
+
+  property :segid do |corpus=nil|
+    SegID.setup([docid, range] * ":", :corpus => corpus)
+  end
+
+  alias id segid
+
+  property :segment do
+    self
+  end
+
+  def self.sort(segments, inline = true)
+    if inline
+      segments.sort do |a,b|
+        case
+        when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
+          0
+        when (a.nil? or a.offset.nil?)
+          -1
+        when (b.nil? or b.offset.nil?)
+          +1
+        when (not a.range.include? b.offset.to_i and not b.range.include? a.offset.to_i)
+          a.offset.to_i <=> b.offset.to_i
+        else
+          a.segment_length <=> b.segment_length
+        end
+      end
+    else
+      segments.sort_by do |segment| segment.offset.to_i || 0 end.reverse
+    end
+  end
+
+  def self.overlaps(sorted_segments)
+    last = nil
+    overlaped = []
+
+    sorted_segments.reverse.each do |segment|
+      overlaped << segment if (not last.nil?) and segment.range.end > last
+      last = segment.range.begin
+    end
+
+    overlaped
+  end
+
+  def self.clean_sort(segments)
+    sorted = sort(segments).reject{|s| s.offset.nil?}
+    overlaps = overlaps(sorted)
+    overlaps.each do |s|
+      sorted.delete s
+    end
+
+    sorted
+  end
+
+  def self.split(text, segments, skip_segments = false)
+    sorted_segments = clean_sort segments
+
+    chunks = []
+    segment_end = 0
+    text_offset = 0
+    sorted_segments.each do |segment|
+      return chunks if text.nil? or text.empty?
+      next if segment.offset.nil?
+      offset = segment.offset - text_offset
+
+      # Consider segment offset. Save pre, or skip if overlap
+      case
+      when offset < 0 # Overlap, skip
+        next
+      when offset > 0 # Save pre
+        chunk = text[0..offset - 1]
+        Segment.setup(chunk, text_offset)
+        chunks << chunk
+      end
+
+      segment_end = offset + segment.segment_length - 1
+
+      if not skip_segments
+        chunk = text[offset..segment_end]
+        Segment.setup(chunk, text_offset + offset)
+        chunks << chunk
+      end
+
+      text_offset += segment_end + 1
+      text = text[segment_end + 1..-1]
+
+    end
+
+    if not text.nil? and not text.empty?
+      chunk = text.dup
+      Segment.setup(chunk, text_offset)
+      chunks << chunk
+    end
+
+    chunks
+  end
+
+  def self.align(text, parts)
+    pre_offset = 0
+    docid = text.respond_to?(:docid) ? text.docid : nil
+    parts.each do |part|
+      offset = text.index part
+      next if offset.nil?
+      Segment.setup(part, pre_offset + offset, docid)
+      pre_offset += offset + part.segment_length - 1
+      text = text[(offset + part.segment_length - 1)..-1]
+    end
+  end
+
+  def self.index(*args)
+    Segment::RangeIndex.index(*args)
+  end
+
+end
+
+require 'rbbt/segment/range_index'
+require 'rbbt/segment/overlaps'
+require 'rbbt/segment/transformed'
+require 'rbbt/segment/segmented'
+require 'rbbt/segment/encoding'
+
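
Segment is now a plain Entity carrying just :offset and :docid annotations; segid serializes a segment as a DOCID:start..end string that SegID#segment can resolve back against a corpus. A minimal sketch on a bare string (the docid is hypothetical):

    text = "This is a sentence about TP53."
    mention = "TP53"
    Segment.setup(mention, :offset => text.index(mention), :docid => "TEST:doc1:simple:1")

    mention.range    # => 25..28
    mention.eend     # => 28
    mention.segid    # => "TEST:doc1:simple:1:25..28"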