rbbt-text 0.2.1 → 0.5.0
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
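
The bulk of this release is a rewrite of TokenTrieNER, annotated below, plus a new GENIA-based sentence splitter; the OSCAR4 and ChemicalTagger wrappers, the pattern-based relation extractor, and the rnorm normalizer are also new. As a point of reference, here is a minimal usage sketch of the 0.5.0 TokenTrieNER API as it appears in the diff; the dictionary file, its layout, and the accessors on the returned matches are assumptions, not something this page confirms:

    require 'rbbt/ner/token_trieNER'

    # Hypothetical :flat TSV dictionary: a code followed by its names,
    # e.g. "GENE0001\tTP53\tp53" (one entry per line).
    ner = TokenTrieNER.new "Genes", "genes.tsv", :longest_match => true

    ner.match("TP53 and p53 name the same gene").each do |mention|
      # NamedEntity.annotate(match, offset, type, codes) in the diff
      # suggests offset/type accessors on each returned match.
      puts [mention, mention.offset, mention.type.inspect] * "\t"
    end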
data/lib/rbbt/ner/token_trieNER.rb:

@@ -1,105 +1,214 @@
 require 'rbbt-util'
 require 'rbbt/util/tsv'
 require 'rbbt/ner/annotations'
+require 'rbbt/ner/annotations/token'
 require 'rbbt/ner/NER'
 
 class TokenTrieNER < NER
   def self.clean(token)
     if token.length > 3
-      token.downcase
+      token.downcase.sub(/-/,'')
     else
       token
     end
   end
 
-  def self.prepare_token(token, start)
-    Token.annotate(clean(token), start, token)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+    if no_clean
+      if extend_to_token
+        Token.annotate(clean(token), start, token)
+      else
+        clean(token)
+      end
+    else
+      if extend_to_token
+        Token.annotate(clean(token), start, token)
+      else
+        token
+      end
+    end
   end
 
-  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
      start += matchdata.end(0)
      text = matchdata.post_match
    end
-
+
+    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
 
     tokens
   end
 
   #{{{ Process dictionary
 
+  module EnumeratedArray
+    attr_accessor :pos
+    def self.extended(array)
+      array.pos = 0
+    end
+
+    def last?
+      @pos == length - 1
+    end
+
+    def advance
+      @pos += 1
+    end
+
+    def back
+      @pos -= 1
+    end
+
+    def next
+      e = self[@pos]
+      advance
+      e
+    end
+
+    def peek
+      self[@pos]
+    end
+
+    def left?
+      @pos < length
+    end
+
+  end
+
+
   class Code
-    attr_accessor :value, :type
-    def initialize(value, type = nil)
-      @value = value
+    attr_accessor :code, :type
+    def initialize(code, type = nil)
+      @code = code
       @type = type
     end
 
     def to_s
-      [type, value] * ":"
+      [type, code] * ":"
     end
   end
 
-  def self.index_for_tokens(tokens, code, type = nil)
-    if tokens.empty?
-      {:END => [Code.new(code, type)]}
+  def self.index_for_tokens(tokens, code, type = nil, slack = nil)
+    if not tokens.left?
+      {:END => [Code.new(code, type)]}
     else
-      {tokens.first => index_for_tokens(tokens[1..-1], code, type)}
+      head = tokens.next
+      if (slack.nil? or not slack.call(head))
+        res = {head => index_for_tokens(tokens, code, type, slack)}
+      else
+        res = {head => index_for_tokens(tokens, code, type, slack)}.merge(index_for_tokens(tokens, code, type, slack))
+      end
+      tokens.back
+      res
     end
   end
-
+
   def self.merge(index1, index2)
+    index1.write if index1.respond_to? :write
     index2.each do |key, new_index2|
       case
       when key == :END
-        index1[:END] ||= []
-        index1[:END] += new_index2
-        index1[:END].uniq!
+        end1 = index1[:END] || []
+        end1 += new_index2.reject{|new| end1.collect{|e| e.to_s }.include? new.to_s }
+        end1.uniq!
+        index1[:END] = end1
       when index1.include?(key)
-        merge(index1[key], new_index2)
+        index1[key] = merge(index1[key], new_index2)
       else
         index1[key] = new_index2
       end
     end
+    index1.read if index1.respond_to? :read
+
+    index1
   end
 
-  def self.process(hash, type = nil)
-    index = {}
-    hash.each do |code, names|
-      names.flatten.each do |name|
-        next if name.empty? or name.length < 2
-        tokens = tokenize name
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
 
-        index = merge(index, index_for_tokens(tokens, code, type)) unless tokens.empty?
+    chunk_size = hash.size / 100
+    items_in_chunk = 0
+    tmp_index = {}
+    hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
+      names = Array === names ? names : [names]
+      names.flatten! if Array === names.first and not Token === names.first.first
+      names.each do |name|
+        next if name.empty? or (String === name and name.length < 2)
+
+        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+        tokens.extend EnumeratedArray
+
+        tmp_index = merge(tmp_index, index_for_tokens(tokens, code, type, slack)) unless tokens.empty?
+        items_in_chunk += 1
+
+        if items_in_chunk > chunk_size
+          index = merge(index, tmp_index)
+          tmp_index = {}
+          items_in_chunk = 0
+        end
       end
     end
+    index = merge(index, tmp_index)
+
     index
   end
 
   #{{{ Matching
 
-  def self.find(index, tokens, longest_match = true)
-    head = tokens.shift
+  def self.follow(index, head)
+    res = nil
+
+    if index.include? head
+      return index[head]
+    end
+
+    return nil unless (not TCHash === index ) and index.include? :PROCS
 
-    next_index = index[head]
-    return nil if next_index.nil?
+    index[:PROCS].each do |key,value|
+      return value if key.call(head)
+    end
 
-    if tokens.empty?
+    nil
+  end
+
+  def self.find_fail(index, tokens, head, longest_match, slack, first)
+    if Proc === slack and not first and not head.nil? and tokens.left? and slack.call(head)
+      matches = find(index, tokens, longest_match, slack, false) # Recursion
+      if not matches.nil?
+        matches.last.unshift head
+        return matches
+      end
+    end
+
+    tokens.back
+    return nil
+  end
+
+  def self.find(index, tokens, longest_match = true, slack = nil, first = true)
+    head = tokens.next
+
+    next_index = follow(index, head)
+
+
+    return find_fail(index, tokens, head, longest_match, slack, first) if next_index.nil?
+
+    if not tokens.left?
       if next_index.include? :END
         return [next_index[:END], [head]]
       else
-        tokens.unshift head
-        return nil
+        return find_fail(index, tokens, head, longest_match, slack, first)
       end
     else
 
       return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
 
-      matches = find(next_index, tokens)
+      matches = find(next_index, tokens, longest_match, slack, false) # Recursion
+
       if not matches.nil?
         matches.last.unshift head
         return matches
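
Behind this hunk, the dictionary is compiled into a trie of nested hashes: each level is keyed by a (cleaned) token, and a terminal :END key holds the Code objects reachable at that point, while EnumeratedArray turns the token array into a rewindable cursor (next/back/left?) in place of the old destructive shift/unshift. When a slack proc accepts a token, index_for_tokens indexes the name both with and without it, making that token optional. Roughly, for hypothetical codes and names:

    index = TokenTrieNER.process({}, { "DIS001" => ["lung cancer", "cancer"] })
    # index is now approximately:
    # { "lung"   => { "cancer" => { :END => [#<Code DIS001>] } },
    #   "cancer" => { :END => [#<Code DIS001>] } }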
@@ -107,8 +216,7 @@ class TokenTrieNER < NER
 
       return [next_index[:END], [head]] if next_index.include?(:END)
 
-      tokens.unshift head
-      return nil
+      return find_fail(index, tokens, head, longest_match, slack, first)
     end
   end
 
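
find now walks the trie through follow, which also honors :PROCS entries (keys that are procs, matched by calling them on the token), and dead ends go through find_fail: if a slack proc accepts the current token, the token is skipped and the search resumes; otherwise the cursor is rewound with tokens.back so match can retry from the next position. Note that the constructor introduced below assigns `@slack = slack`, reading the still-unset accessor instead of consuming the :slack option, so a slack proc apparently has to be set after construction:

    ner = TokenTrieNER.new "Diseases", "diseases.tsv"
    # Hypothetical slack proc: let filler words be skipped inside a
    # mention, so "cancer of the lung" can reach an entry "cancer lung".
    ner.slack = Proc.new{|token| %w(of the).include? token.downcase }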
@@ -117,20 +225,35 @@ class TokenTrieNER < NER
     match_offset = match_tokens.first.offset
     match_tokens.each{|t|
       match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
-      match << t.original
+      match << (t.respond_to?(:original) ? t.original : t)
     }
 
     NamedEntity.annotate(match, match_tokens.first.offset, type, codes)
   end
 
-  attr_accessor :index, :longest_match, :type
-  def initialize(file, options = {})
-    options = Misc.add_defaults options, :longest_match => true
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  def initialize(type = nil, file = nil, options = {})
+    options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
+      :persistence => false
+    @slack = slack
     @longest_match = options.delete :longest_match
+    @split_at = options.delete :split_at
+    @no_clean = options.delete :no_clean
 
+    file = [] if file.nil?
     file = [file] unless Array === file
-    @index = {}
-    file.each do |f| merge(f, type) end
+    @index = Persistence.persist(file, :TokenTRIE, :tsv, options) do |file, options, filename, persistecen_file|
+      if persistecen_file.nil?
+        @index = {}
+      else
+        FileUtils.mkdir_p File.dirname(persistecen_file) unless File.exists? File.dirname(persistecen_file)
+        @index = TCHash.get persistecen_file, true, :marshal
+      end
+      file.each do |f|
+        merge(f, type)
+      end
+      @index
+    end
   end
 
   def merge(new, type = nil)
@@ -140,24 +263,36 @@ class TokenTrieNER < NER
     when Hash === new
       TokenTrieNER.merge(@index, new)
     when TSV === new
-      TokenTrieNER.merge(@index, TokenTrieNER.process(new, type))
+      old_unnamed = new.unnamed
+      old_monitor = new.monitor
+      new.unnamed = true
+      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
+      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+      new.unnamed = old_unnamed
+      new.monitor = old_monitor
     when String === new
-      TokenTrieNER.merge(@index, TokenTrieNER.process(TSV.new(new, :flat), type))
+      new = TSV.new(new, :flat)
+      new.unnamed = true
+      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
+      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
     end
   end
 
   def match(text)
-    tokens = TokenTrieNER.tokenize text
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+
+    tokens.extend EnumeratedArray
+    tokens.pos = 0
 
     matches = []
-    while tokens.any?
-      new_matches = TokenTrieNER.find(@index, tokens, longest_match)
+    while tokens.left?
+      new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
 
       if new_matches
         codes, match_tokens = new_matches
-        matches << TokenTrieNER.make_match(match_tokens, codes.collect{|c| c.type}, codes.collect{|c| c.value})
+        matches << TokenTrieNER.make_match(match_tokens, codes.collect{|c| c.type}, codes.collect{|c| c.code})
      else
-        tokens.shift
+        tokens.advance
      end
    end
 
@@ -165,4 +300,3 @@ class TokenTrieNER < NER
   end
 
 end
-
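
The other structural change is persistence: the compiled trie can now be stored in a TokyoCabinet TCHash through rbbt-util's Persistence.persist (the write/read calls added to merge batch updates against such a store), so a large dictionary is compiled once and reopened from disk afterwards. A sketch, assuming the :persistence option is honored the way the persist block suggests:

    # First run compiles genes.tsv into a TCHash file; later runs reopen it.
    ner = TokenTrieNER.new "Genes", "genes.tsv", :persistence => true
    ner.match("Text mentioning TP53")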
data/lib/rbbt/nlp/genia/sentence_splitter.rb (new file):

@@ -0,0 +1,214 @@
+module NLP
+  def self.returnFeatures(prevWord, delimiter, nextWord)
+    if nextWord.match(/__ss__/)
+      nw = nextWord.sub(/__ss__/, "")
+    else
+      nw = nextWord
+    end
+
+    str = ""
+    # prev. word, next word
+    str += "pw_" + prevWord.downcase
+    str += "\tnw_" + nw.downcase
+
+    # delimiter
+    str += "\td_" + delimiter
+
+    # capitalized first char in next word
+    # capital in next word excluding first char.
+    if nw[0].chr == nw[0].chr.capitalize
+      str += "\tnfc_y"
+      nwExcluginFirst = nw[1 ... -1]
+      if nwExcluginFirst == nil
+        str += "\tnwcef_n"
+      elsif nwExcluginFirst.downcase == nwExcluginFirst
+        str += "\tnwcef_n"
+      else
+        str += "\tnwcef_y"
+      end
+    else
+      if nw.downcase == nw
+        str += "\tnwcef_n"
+      else
+        str += "\tnwcef_y"
+      end
+      str += "\tnfc_n"
+    end
+
+    # prev. word capital
+    if prevWord.downcase == prevWord
+      str += "\tpwc_n"
+    else
+      str += "\tpwc_y"
+    end
+
+    # number in prev. word, in next word
+    if prevWord.match(/[0-9]/)
+      str += "\tpwn_y"
+    else
+      str += "\tpwn_n"
+    end
+    if nw.match(/[0-9]/)
+      str += "\tnwn_y"
+    else
+      str += "\tnwn_n"
+    end
+
+    # prev., next word excluding braket, camma, etc.
+    prevWordEx = prevWord.gsub(/[()'",\[\]]/, "")
+    nwEx = nw.gsub(/[()'",\[\]]/, "")
+    str += "\tpwex_" + prevWordEx.downcase
+    str += "\tnwex_" + nwEx.downcase
+
+    # bracket or quatation in prev. word
+    if prevWord.match(/()'"/)
+      str += "\tpwcbq_y"
+    else
+      str += "\tpwcbq_n"
+    end
+    # camma in prev., next word
+    if prevWord.match(/,/)
+      str += "\tpwcc_y"
+    else
+      str += "\tpwcc_n"
+    end
+    if nw.match(/,/)
+    else
+      str += "\tnwcc_n"
+    end
+
+    # prev. word + delimiter
+    str += "\tpw_" + prevWord + "_d_" + delimiter
+    # prev. word ex. + delimiter + next word ex.
+    str += "\tpwex_" + prevWordEx + "_d_" + delimiter + "_nwex_" + nwEx
+    #str +=
+    #str +=
+    #str +=
+    str += "\n"
+  end
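
returnFeatures encodes one candidate split point (previous word, sentence-final delimiter, next word) as a tab-separated line of prefixed features for the Geniass classifier: pw_/nw_ word identities, d_ the delimiter, nfc_/nwcef_ next-word capitalization, pwc_ previous-word case, pwn_/nwn_ digit presence, pwex_/nwex_ punctuation-stripped forms, pwcbq_/pwcc_/nwcc_ bracket and comma flags, plus two conjunction features. Traced by hand from the code above, so illustrative only:

    NLP.returnFeatures("proteins", ".", "The")
    # => "pw_proteins\tnw_the\td_.\tnfc_y\tnwcef_n\tpwc_n\tpwn_n\tnwn_n" +
    #    "\tpwex_proteins\tnwex_the\tpwcbq_n\tpwcc_n\tnwcc_n" +
    #    "\tpw_proteins_d_.\tpwex_proteins_d_._nwex_The\n"

The new file continues: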
+
+  def self.event_extraction(text)
+    events = ""
+    marks = ""
+
+    eventCount = 0
+
+    pat = / [^ ]+[.!\?\)\]\"]( +)[^ ]+ /
+    for line in text.split(/\n/) do
+      while line.match(pat) do
+        line.sub!(/ ([^ ]+)([.!\?\)\]\"])( +)([^ ]+) /){
+          a, b, d, c = $1, $2, $3, $4
+          events << eventCount.to_s << "\t"
+          events << returnFeatures(a, b, c)
+          (" " + a + b + "__" + eventCount.to_s + "____" + d + "__" + c + " ")
+        }
+        eventCount += 1
+      end
+      marks << line
+    end
+
+    [events, marks]
+  end
+
+  def self.process_labels(marked_text, labels)
+    out = ""
+
+    count = 0
+    text_lines = marked_text.split(/\n/)
+    line = text_lines.shift
+    for label in labels
+      pat = "__" + count.to_s + "__"
+      until(line.match(pat)) do
+        out << line
+        line = text_lines.shift
+      end
+      splitted = label.chomp.to_i
+
+      line.sub!(pat){
+        if splitted == 1
+          "__\n__"
+        else
+          "____"
+        end
+      }
+      line.sub!(/__\n____ +__/, "\n")
+      line.sub!(/______( +)__/){
+        $1
+      }
+      count += 1
+    end
+
+    out << line
+
+    out << text_lines * ""
+
+    out
+  end
+
+  def self.geniass_sentence_splitter_extension(text)
+    require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
+    geniass = Geniass.new
+    if not geniass.geniass_is_loaded
+      Misc.in_dir Rbbt.software.opt.Geniass.find do
+        geniass.load_geniass
+      end
+    end
+
+    cleaned = text.gsub("\n",NEW_LINE_MASK)
+    events, marks = event_extraction(cleaned)
+
+    labels = events.split(/\n/).collect{|line|
+      geniass.label(line)
+    }
+
+    out = process_labels(marks, labels)
+
+    offsets = []
+
+    inTxtStrict = StringIO.new text
+    inTxtNew = StringIO.new out.gsub("\n", '|').gsub(NEW_LINE_MASK, "\n")
+
+    marker = "|"[0]
+    position = 0
+    sentenceCount = 1
+    target = ''
+    targetNew = ''
+    start = 0
+    finish = 0
+
+    while(!inTxtNew.eof?) do
+      targetNew = inTxtNew.getc
+      target = inTxtStrict.getc
+      position += 1
+      if targetNew == marker
+        sentenceCount += 1
+        finish = position - 1
+        offsets << [start, finish] if finish - start > 10
+        if targetNew == target
+          start = position
+        else
+          targetNew = inTxtNew.getc
+          while targetNew != target do
+            target = inTxtStrict.getc
+            position += 1
+          end
+          start = position - 1
+        end
+      end
+    end
+
+    finish = position - 1
+    offsets << [start, finish] if finish > start
+
+    inTxtStrict.close
+    inTxtNew.close
+
+    offsets.collect do |s,e|
+      sentence = text[s..e]
+      next if sentence.nil?
+      Segment.annotate sentence, s
+      sentence
+    end
+
+  end
+end
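
geniass_sentence_splitter_extension ties these pieces together: newlines are masked, event_extraction tags every candidate boundary with a numbered __N__ placeholder while emitting its feature line, the Geniass native extension labels each event, process_labels turns positive labels back into newlines, and the final loop walks the relabeled text against the untouched original character by character to recover [start, finish] offsets (candidates whose offsets span ten characters or fewer are dropped). The result is a list of Segment-annotated substrings of the original text. A hypothetical call, assuming the Geniass software from share/install/software/Geniass is installed:

    require 'rbbt/nlp/genia/sentence_splitter'

    text = Open.read("abstract.txt")  # hypothetical input file
    NLP.geniass_sentence_splitter_extension(text).compact.each do |sentence|
      puts "#{sentence.offset}: #{sentence}"  # Segment offset into text
    end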