rbbt-text 1.2.0 → 1.3.4
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +55 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +63 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +26 -3
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
data/lib/rbbt/ner/oscar3.rb
CHANGED
@@ -1,7 +1,6 @@
 require 'rbbt'
 require 'rjb'
 require 'libxml'
-require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'
 
@@ -53,7 +52,7 @@ class OSCAR3 < NER
       next unless type.nil? or type.include? mention_type
       score = memm ? entities.get(key).to_string.to_f : nil
 
-      NamedEntity.setup mention, rstart.to_i + offset, mention_type, score
+      NamedEntity.setup mention, :offset => rstart.to_i + offset, :entity_type => mention_type, :score => score
 
       mentions << mention unless mentions.collect{|m| m.to_s}.include? mention.to_s
     end
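
Across the NER backends this release replaces positional NamedEntity.setup arguments with an options hash. A minimal sketch of the new call style (the mention and values are illustrative; NamedEntity ships in rbbt/segment/named_entity):

    require 'rbbt/segment/named_entity'

    mention = "caffeine"
    # Old style was positional: NamedEntity.setup mention, 27, "CHEMICAL", score
    NamedEntity.setup mention, :offset => 27, :entity_type => "CHEMICAL", :score => 0.9

    mention.offset       #=> 27
    mention.entity_type  #=> "CHEMICAL"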
data/lib/rbbt/ner/oscar4.rb
CHANGED
@@ -1,7 +1,7 @@
 require 'rbbt'
 require 'rjb'
 require 'libxml'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/log'
 
@@ -25,7 +25,7 @@ class OSCAR4 < NER
     @@tagger ||= @@OSCAR.new()
   end
 
-  def self.match(text, type = nil)
+  def self.match(text, protect = false, type = nil)
     self.init
 
     return [] if text.nil? or text.strip.empty?
@@ -46,7 +46,7 @@ class OSCAR4 < NER
 
       next unless entity.getType.toString == type unless type.nil?
 
-      NamedEntity.setup mention, entity.getStart, entity.getType, inchi, entity.getConfidence
+      NamedEntity.setup mention, :offset => entity.getStart, :entity_type => entity.getType, :code => inchi, :score => entity.getConfidence
 
       result << mention
     end
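
Note that the new protect flag sits between text and the type filter, so existing callers that passed a type positionally must update their argument order. A hedged sketch of the new signature (assumes the OSCAR4 jars are installed through Rbbt; the type string is illustrative):

    require 'rbbt/ner/oscar4'

    # protect defaults to false; the optional type filter is now third
    mentions = OSCAR4.match("Aspirin inhibits thromboxane synthesis.", false, "CM")
    mentions.each{|m| puts [m.offset, m.entity_type, m] * ", " }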
data/lib/rbbt/ner/patterns.rb
CHANGED
@@ -1,7 +1,7 @@
-require 'rbbt/text/segment/named_entity'
-require 'rbbt/text/segment/segmented'
-require 'rbbt/text/segment/transformed'
-require 'rbbt/text/segment/relationship'
+require 'rbbt/segment/named_entity'
+require 'rbbt/segment/segmented'
+require 'rbbt/segment/transformed'
+#require 'rbbt/segment/relationship'
 require 'rbbt/ner/regexpNER'
 require 'rbbt/ner/token_trieNER'
 require 'rbbt/nlp/nlp'
@@ -14,7 +14,7 @@ class PatternRelExt
     regexpNER = RegExpNER.new type => patterns.collect{|p| /#{p}/}
     segments = sentence.segments
     segments = segments.values.flatten if Hash === segments
-    Transformed.with_transform(sentence, segments, Proc.new{|s| s.type.to_s.upcase}) do |sentence|
+    Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
       regexpNER.entities(sentence)
     end
   end
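
The transform proc now reads entity_type, the annotation name that replaces the old type accessor. A minimal sketch of Transformed.with_transform, which temporarily rewrites annotated segments inside a string while the block runs (sentence and entities are illustrative):

    require 'rbbt/segment/named_entity'
    require 'rbbt/segment/transformed'

    sentence = "TP53 activates MDM2"
    Segment.setup(sentence, 0)

    genes = %w(TP53 MDM2).collect do |name|
      NamedEntity.setup(name.dup, :offset => sentence.index(name), :entity_type => "Gene")
    end

    # Inside the block each mention reads as its upcased entity type
    Transformed.with_transform(sentence, genes, Proc.new{|s| s.entity_type.to_s.upcase}) do |masked|
      masked #=> "GENE activates GENE"
    end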
data/lib/rbbt/ner/regexpNER.rb
CHANGED
@@ -1,4 +1,3 @@
-require 'rbbt/text/segment'
 require 'rbbt/ner/NER'
 require 'rbbt/util/simpleDSL'
 
@@ -23,7 +22,7 @@ class RegExpNER < NER
       end
 
       if match and not match.empty?
-        NamedEntity.setup(match, start + pre.length, type)
+        NamedEntity.setup(match, :offset => start + pre.length, :entity_type => type)
         matches << match
       end
 
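
RegExpNER builds its matches with the same hash-based setup, so offsets and types survive the move. A short usage sketch, following the constructor call shape seen in patterns.rb (pattern and text are illustrative):

    require 'rbbt/ner/regexpNER'

    ner = RegExpNER.new :tumor_type => [/[a-z]+omas?/]
    matches = ner.entities("Melanomas and gliomas were frequent")
    matches.collect{|m| [m.offset, m.entity_type, m] }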
data/lib/rbbt/ner/token_trieNER.rb
CHANGED
@@ -1,46 +1,58 @@
 require 'rbbt'
 require 'rbbt/tsv'
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/token'
+require 'rbbt/segment'
 require 'rbbt/ner/NER'
+require 'rbbt/segment/token'
 
 class TokenTrieNER < NER
-  def self.clean(token)
+  def self.clean(token, stem = false)
     if token.length > 3
-      token.downcase.sub(/-/,'')
+      upcase = token !~ /[a-z]/
+      token = token.downcase.sub(/-/,'')
+
+      if stem && ! upcase
+        require 'stemmer'
+        if stem == :double
+          token = token.stem.stem
+        else
+          token = token.stem
+        end
+      end
+
+      token
     else
       token
     end
   end
 
-  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
     if no_clean
       if extend_to_token
-        Token.setup(token, start, token)
+        Token.setup(token, :offset => start, :original => token)
       else
         token
       end
     else
       if extend_to_token
-        Token.setup(clean(token), start, token)
+        Token.setup(clean(token, stem), :offset => start, :original => token)
       else
-        clean(token)
+        clean(token, stem)
       end
     end
   end
 
-  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
-    split_at = /\s|(\(|\)|[-."':,;])/ if split_at.nil?
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,;])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
 
-    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+    tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
 
     tokens
   end
@@ -130,14 +142,14 @@ class TokenTrieNER < NER
     index1
   end
 
-  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
 
     chunk_size = hash.size / 100
     items_in_chunk = 0
     tmp_index = {}
     hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
       names = Array === names ? names : [names]
-      names.flatten! if Array === names.first and not Segment === names.first.first
+      names.flatten! if Array === names.first and not Segment === names.first.first
 
       if names.empty?
         names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
       names.each do |name|
         next if name.empty? or (String === name and name.length < 2)
 
-        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
         tokens.extend EnumeratedArray
 
         token_index = index_for_tokens(tokens, code, type, slack)
@@ -237,10 +249,10 @@ class TokenTrieNER < NER
       match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
 
-    NamedEntity.setup(match, match_tokens.first.offset, type, codes)
+    NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
   end
 
-  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
       :persist => false
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
     @no_clean = options.delete :no_clean
+    @stem = options.delete :stem
 
     file = [] if file.nil?
     file = [file] unless Array === file
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
       Log.debug "TokenTrieNER merging TSV"
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     when Hash === new
@@ -284,18 +297,18 @@ class TokenTrieNER < NER
       new = TSV.open(new, :flat)
       new.with_unnamed do
         new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+          TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
         end
       end
     end
   end
 
   def match(text)
-    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
 
     tokens.extend EnumeratedArray
     tokens.pos = 0
-
+
     matches = []
     while tokens.left?
       new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
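
The new :stem option relaxes dictionary lookup by stemming lowercased tokens longer than three characters (the stemmer gem is required lazily; :double applies the stemmer twice; all-caps tokens are left alone). A hedged sketch, assuming the dictionary is merged in as a plain Hash of code to names:

    require 'rbbt/ner/token_trieNER'

    ner = TokenTrieNER.new :gene, nil, :stem => true
    ner.merge "TP53" => ["tumor protein 53"]

    # With stemming, the plural surface form still reaches the code TP53
    matches = ner.match("Levels of tumor proteins 53 were elevated")
    matches.collect{|m| [m.offset, m.entity_type, m.code] }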
data/lib/rbbt/nlp/genia/sentence_splitter.rb
CHANGED
@@ -1,5 +1,5 @@
 require 'rbbt/nlp/nlp'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 module NLP
   Rbbt.claim Rbbt.software.opt.Geniass, :install, Rbbt.share.install.software.Geniass.find
 
@@ -239,6 +239,7 @@ module NLP
   end
 
   def self.geniass_sentence_splitter(text)
+    Rbbt.software.opt.Geniass.produce
     offsets = []
 
     cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-
+      sentence.gsub!(NEW_LINE_MASK, "\n")
       Segment.setup sentence, s
       sentence
     end
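
Calling produce before running the splitter installs Geniass on first use instead of failing when the claimed software is missing; the masked newlines are now also restored in each sentence. A short usage sketch (sentences come back as Segment strings that know their offsets):

    require 'rbbt/nlp/genia/sentence_splitter'

    text = "This is one sentence. This is another one."
    sentences = NLP.geniass_sentence_splitter(text)
    sentences.compact.collect{|s| [s.offset, s] }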
data/lib/rbbt/nlp/nlp.rb
CHANGED
@@ -2,8 +2,8 @@ require 'rbbt'
 require 'rbbt/util/tmpfile'
 require 'rbbt/persist'
 require 'rbbt/resource'
-require 'rbbt/text/segment'
-require 'rbbt/text/segment/segmented'
+require 'rbbt/segment'
+require 'rbbt/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'
 
@@ -101,7 +101,7 @@ module NLP
     input = sentences.collect{|sentence| sentence.gsub(/\n/, NEW_LINE_MASK)} * "\n"
     sentence_tokens = TmpFile.with_file(input) do |fin|
       out = local_persist(Digest::MD5.hexdigest(input), :Chunks, :string) do
-        CMD.cmd("cd #{Rbbt.software.opt.Gdep.find}; ./gdep #{ fin }").read
+        CMD.cmd("cd #{Rbbt.software.opt.Gdep.produce.find}; ./gdep #{ fin }").read
       end
 
       out.split(/^$/).collect do |sentence|
@@ -120,10 +120,10 @@ module NLP
 
 
   def self.gdep_parse_sentences_extension(sentences)
-    require Rbbt.software.opt.Gdep.ruby["Gdep.so"].find
+    require Rbbt.software.opt.Gdep.produce.ruby["Gdep.so"].find
     gdep = Gdep.new
     if not gdep.gdep_is_loaded
-      Misc.in_dir Rbbt.software.opt.Gdep.find do
+      Misc.in_dir Rbbt.software.opt.Gdep.produce.find do
        gdep.load_gdep
      end
    end
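
As with Geniass above, chaining produce before find materializes the Gdep installation on demand instead of assuming it is already present. The idiom in isolation:

    require 'rbbt'

    # find only resolves a path; produce.find installs the claimed
    # software first when it is not there yet
    path = Rbbt.software.opt.Gdep.produce.find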
data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb
CHANGED
@@ -1,6 +1,6 @@
 require 'rbbt'
 require 'rjb'
-require 'rbbt/text/segment'
+require 'rbbt/segment'
 require 'rbbt/resource'
 
 module OpenNLP
@@ -33,48 +33,49 @@ module OpenNLP
   def self.sentence_splitter(text)
     return [] if text.nil? or text.empty?
 
-
-
-
-
-
-    sentences = nil
-    TmpFile.with_file do |tmpfile|
-      start_time = Time.now
-
-      begin
-        pid = Process.fork do
-          sent = sentence_split_detector.sentDetect(text)
-          Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
-        end
+    Segment.ascii(text, "?") do
+      last = 0
+      begin
+        sentence_split_detector = self.sentence_split_detector
 
-        while not Process.waitpid(pid)
-          if Time.now - start_time > MAX
-            Process.kill(9, pid)
-            raise "Taking to long (> #{MAX} seconds)"
-          end
-          sleep 0.1
-        end
+        sentences = nil
+        TmpFile.with_file do |tmpfile|
+          start_time = Time.now
 
         begin
-          Process.waitpid(pid)
+          pid = Process.fork do
+            sent = sentence_split_detector.sentDetect(text)
+            Open.write(tmpfile, sent * "#OpenNLP:SENTENCE#")
+          end
+
+          while not Process.waitpid(pid)
+            if Time.now - start_time > MAX
+              Process.kill(9, pid)
+              raise "Taking to long (> #{MAX} seconds)"
+            end
+            sleep 0.1
+          end
+
+          begin
+            Process.waitpid(pid)
+          end
+        rescue Errno::ECHILD
         end
-
+
+          sentences = Open.read(tmpfile).split("#OpenNLP:SENTENCE#")
         end
 
-      sentences
+        sentences.collect{|sentence|
+          sentence = Misc.to_utf8(sentence)
+          start = text.index(sentence, last)
+          Segment.setup sentence, start
+          last = start + sentence.length - 1
+          sentence
+        }
+      rescue Exception
+        raise $!
+        raise "Sentence splitter raised exception: #{$!.message}"
       end
-
-    sentences.collect{|sentence|
-      sentence = Misc.to_utf8(sentence)
-      start = text.index(sentence, last)
-      Segment.setup sentence, start
-      last = start + sentence.length - 1
-      sentence
-    }
-    rescue Exception
-    raise $!
-    raise "Sentence splitter raised exception: #{$!.message}"
     end
   end
 end
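
The splitter body now runs inside Segment.ascii(text, "?"), which temporarily masks non-ASCII characters so the offsets OpenNLP reports line up with positions in the original string. A usage sketch (assumes the OpenNLP models are installed through the Rbbt claim):

    require 'rbbt/nlp/open_nlp/sentence_splitter'

    text = "Dr. Smith arrived. He was late."
    sentences = OpenNLP.sentence_splitter(text)
    sentences.collect{|s| [s.offset, s] }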
data/lib/rbbt/nlp/spaCy.rb
ADDED
@@ -0,0 +1,52 @@
+require 'rbbt/segment'
+require 'rbbt/document'
+require 'rbbt/segment/annotation'
+require 'rbbt/util/python'
+
+module SpaCy
+
+  PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+
+  def self.tokens(text, lang = 'en')
+
+    tokens = []
+    RbbtPython.run 'spacy' do
+      nlp = spacy.load(lang)
+      doc = nlp.call(text)
+      doc.__len__.times do |i|
+        tokens << doc.__getitem__(i)
+      end
+    end
+    tokens
+  end
+
+  def self.segments(text, lang = 'en')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    tokens = self.tokens(text, lang).collect do |token|
+      info = {}
+      PROPERTIES.each do |p|
+        info[p] = token.instance_eval(p.to_s)
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = token.idx
+      info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCyToken.setup(token.text, info)
+    end
+    SpaCyToken.setup(tokens, :corpus => corpus)
+  end
+end
+
+module SpaCyToken
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::PROPERTIES
+  self.annotation :dep
+end
+
+if __FILE__ == $0
+  ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
+end
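
SpaCy.segments returns SpaCyToken segments that carry the listed spaCy token attributes (lemma_, pos_, tag_, and so on) plus a dep arc, next to the usual offset. A hedged usage sketch (needs a Python with the spacy package and the 'en' model reachable through RbbtPython):

    require 'rbbt/nlp/spaCy'

    tokens = SpaCy.segments("I tell a story")
    tokens.collect{|t| [t.offset, t, t.pos_, t.dep] }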
data/lib/rbbt/segment.rb
ADDED
@@ -0,0 +1,179 @@
+require 'rbbt-util'
+require 'rbbt/entity'
+require 'rbbt/document'
+
+module SegID
+  extend Entity
+  self.annotation :corpus
+
+  def _parts
+    @parts ||= self.split(":")
+  end
+
+  def range
+    @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
+  end
+
+  def docid
+    @docid ||= DocID.setup(_parts[0..3] * ":")
+  end
+
+  def offset
+    range.begin
+  end
+
+  def segment_length
+    range.end - range.begin + 1
+  end
+
+  property :segment => :single do
+    docid = self.docid
+    document = DocID.setup(docid, :corpus => corpus).document
+
+    text = document[range]
+
+    Segment.setup(text, :docid => docid, :offset => offset)
+  end
+
+  property :segid do
+    self
+  end
+
+end
+
+module Segment
+  extend Entity
+  self.annotation :offset, :docid
+
+  def segment_length
+    length
+  end
+
+  def eend
+    offset.to_i + length - 1
+  end
+
+  def range
+    (offset.to_i..eend)
+  end
+
+  property :segid do |corpus=nil|
+    SegID.setup([docid, range] * ":", :corpus => corpus)
+  end
+
+  alias id segid
+
+  property :segment do
+    self
+  end
+
+  def self.sort(segments, inline = true)
+    if inline
+      segments.sort do |a,b|
+        case
+        when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
+          0
+        when (a.nil? or a.offset.nil?)
+          -1
+        when (b.nil? or b.offset.nil?)
+          +1
+        when (not a.range.include? b.offset.to_i and not b.range.include? a.offset.to_i)
+          a.offset.to_i <=> b.offset.to_i
+        else
+          a.segment_length <=> b.segment_length
+        end
+      end
+    else
+      segments.sort_by do |segment| segment.offset.to_i || 0 end.reverse
+    end
+  end
+
+  def self.overlaps(sorted_segments)
+    last = nil
+    overlaped = []
+
+    sorted_segments.reverse.each do |segment|
+      overlaped << segment if (not last.nil?) and segment.range.end > last
+      last = segment.range.begin
+    end
+
+    overlaped
+  end
+
+  def self.clean_sort(segments)
+    sorted = sort(segments).reject{|s| s.offset.nil?}
+    overlaps = overlaps(sorted)
+    overlaps.each do |s|
+      sorted.delete s
+    end
+
+    sorted
+  end
+
+  def self.split(text, segments, skip_segments = false)
+    sorted_segments = clean_sort segments
+
+    chunks = []
+    segment_end = 0
+    text_offset = 0
+    sorted_segments.each do |segment|
+      return chunks if text.nil? or text.empty?
+      next if segment.offset.nil?
+      offset = segment.offset - text_offset
+
+      # Consider segment offset. Save pre, or skip if overlap
+      case
+      when offset < 0 # Overlap, skip
+        next
+      when offset > 0 # Save pre
+        chunk = text[0..offset - 1]
+        Segment.setup(chunk, text_offset)
+        chunks << chunk
+      end
+
+      segment_end = offset + segment.segment_length - 1
+
+      if not skip_segments
+        chunk = text[offset..segment_end]
+        Segment.setup(chunk, text_offset + offset)
+        chunks << chunk
+      end
+
+      text_offset += segment_end + 1
+      text = text[segment_end + 1..-1]
+
+    end
+
+    if not text.nil? and not text.empty?
+      chunk = text.dup
+      Segment.setup(chunk, text_offset)
+      chunks << chunk
+    end
+
+    chunks
+  end
+
+  def self.align(text, parts)
+    pre_offset = 0
+    docid = text.respond_to?(:docid) ? text.docid : nil
+    parts.each do |part|
+      offset = text.index part
+      next if offset.nil?
+      Segment.setup(part, pre_offset + offset, docid)
+      pre_offset += offset + part.segment_length - 1
+      text = text[(offset + part.segment_length - 1)..-1]
+    end
+  end
+
+  def self.index(*args)
+    Segment::RangeIndex.index(*args)
+  end
+
+end
+
+require 'rbbt/segment/range_index'
+require 'rbbt/segment/overlaps'
+require 'rbbt/segment/transformed'
+require 'rbbt/segment/segmented'
+require 'rbbt/segment/encoding'
+
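
The new Segment module replaces rbbt/text/segment: any String can be annotated with an offset and document id, and the module functions handle sorting, overlap removal, splitting, and alignment. A small sketch of the core API (text and offsets are illustrative):

    require 'rbbt/segment'

    text = "This is a sentence about TP53."
    gene = "TP53"
    Segment.setup(gene, :offset => text.index(gene))

    gene.range  #=> 25..28
    gene.eend   #=> 28

    # Split the text around the annotated segment; every chunk keeps its offset
    Segment.split(text, [gene]).collect{|chunk| [chunk.offset, chunk] }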