rbbt-text 0.2.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/get_ppis.rb +52 -0
- data/lib/rbbt/bow/dictionary.rb +9 -9
- data/lib/rbbt/bow/misc.rb +86 -2
- data/lib/rbbt/corpus/corpus.rb +55 -0
- data/lib/rbbt/corpus/document.rb +289 -0
- data/lib/rbbt/corpus/document_repo.rb +115 -0
- data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
- data/lib/rbbt/ner/NER.rb +7 -5
- data/lib/rbbt/ner/abner.rb +13 -2
- data/lib/rbbt/ner/annotations.rb +182 -51
- data/lib/rbbt/ner/annotations/annotated.rb +15 -0
- data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
- data/lib/rbbt/ner/annotations/relations.rb +25 -0
- data/lib/rbbt/ner/annotations/token.rb +28 -0
- data/lib/rbbt/ner/annotations/transformed.rb +170 -0
- data/lib/rbbt/ner/banner.rb +8 -5
- data/lib/rbbt/ner/chemical_tagger.rb +34 -0
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
- data/lib/rbbt/ner/oscar3.rb +1 -1
- data/lib/rbbt/ner/oscar4.rb +41 -0
- data/lib/rbbt/ner/patterns.rb +132 -0
- data/lib/rbbt/ner/rnorm.rb +141 -0
- data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
- data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
- data/lib/rbbt/ner/token_trieNER.rb +185 -51
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
- data/lib/rbbt/nlp/nlp.rb +235 -0
- data/share/install/software/ABNER +0 -4
- data/share/install/software/ChemicalTagger +81 -0
- data/share/install/software/Gdep +115 -0
- data/share/install/software/Geniass +118 -0
- data/share/install/software/OSCAR4 +16 -0
- data/share/install/software/StanfordParser +15 -0
- data/share/patterns/drug_induce_disease +22 -0
- data/share/rnorm/cue_default +10 -0
- data/share/rnorm/tokens_default +86 -0
- data/share/{stopwords → wordlists/stopwords} +0 -0
- data/test/rbbt/bow/test_bow.rb +1 -1
- data/test/rbbt/bow/test_dictionary.rb +1 -1
- data/test/rbbt/bow/test_misc.rb +1 -1
- data/test/rbbt/corpus/test_corpus.rb +99 -0
- data/test/rbbt/corpus/test_document.rb +222 -0
- data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
- data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
- data/test/rbbt/ner/test_abner.rb +1 -1
- data/test/rbbt/ner/test_annotations.rb +64 -2
- data/test/rbbt/ner/test_banner.rb +1 -1
- data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
- data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
- data/test/rbbt/ner/test_patterns.rb +66 -0
- data/test/rbbt/ner/test_regexpNER.rb +1 -1
- data/test/rbbt/ner/test_rnorm.rb +47 -0
- data/test/rbbt/ner/test_token_trieNER.rb +60 -35
- data/test/rbbt/nlp/test_nlp.rb +88 -0
- data/test/test_helper.rb +20 -0
- metadata +93 -20
data/lib/rbbt/ner/token_trieNER.rb

@@ -1,105 +1,214 @@
 require 'rbbt-util'
 require 'rbbt/util/tsv'
 require 'rbbt/ner/annotations'
+require 'rbbt/ner/annotations/token'
 require 'rbbt/ner/NER'
 
 class TokenTrieNER < NER
   def self.clean(token)
     if token.length > 3
-      token.downcase
+      token.downcase.sub(/-/,'')
     else
       token
     end
   end
 
-  def self.prepare_token(token, start)
-
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+    if no_clean
+      if extend_to_token
+        Token.annotate(clean(token), start, token)
+      else
+        clean(token)
+      end
+    else
+      if extend_to_token
+        Token.annotate(clean(token), start, token)
+      else
+        token
+      end
+    end
   end
 
-  def self.tokenize(text, split_at =
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
-
+
+    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
 
     tokens
   end
 
   #{{{ Process dictionary
 
+  module EnumeratedArray
+    attr_accessor :pos
+    def self.extended(array)
+      array.pos = 0
+    end
+
+    def last?
+      @pos == length - 1
+    end
+
+    def advance
+      @pos += 1
+    end
+
+    def back
+      @pos -= 1
+    end
+
+    def next
+      e = self[@pos]
+      advance
+      e
+    end
+
+    def peek
+      self[@pos]
+    end
+
+    def left?
+      @pos < length
+    end
+
+  end
+
+
   class Code
-    attr_accessor :
-    def initialize(
-      @
+    attr_accessor :code, :type
+    def initialize(code, type = nil)
+      @code = code
       @type = type
     end
 
     def to_s
-      [type,
+      [type, code] * ":"
     end
   end
 
-  def self.index_for_tokens(tokens, code, type = nil)
-    if tokens.
-      {:END => [Code.new
+  def self.index_for_tokens(tokens, code, type = nil, slack = nil)
+    if not tokens.left?
+      {:END => [Code.new(code, type)]}
     else
-
+      head = tokens.next
+      if (slack.nil? or not slack.call(head))
+        res = {head => index_for_tokens(tokens, code, type, slack)}
+      else
+        res = {head => index_for_tokens(tokens, code, type, slack)}.merge(index_for_tokens(tokens, code, type, slack))
+      end
+      tokens.back
+      res
     end
   end
-
+
   def self.merge(index1, index2)
+    index1.write if index1.respond_to? :write
     index2.each do |key, new_index2|
       case
       when key == :END
-        index1[:END]
-
-
+        end1 = index1[:END] || []
+        end1 += new_index2.reject{|new| end1.collect{|e| e.to_s }.include? new.to_s }
+        end1.uniq!
+        index1[:END] = end1
       when index1.include?(key)
-        merge(index1[key], new_index2)
+        index1[key] = merge(index1[key], new_index2)
       else
        index1[key] = new_index2
       end
     end
+    index1.read if index1.respond_to? :read
+
+    index1
   end
 
-  def self.process(hash, type = nil)
-    index = {}
-    hash.each do |code, names|
-      names.flatten.each do |name|
-        next if name.empty? or name.length < 2
-        tokens = tokenize name
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
 
-
+    chunk_size = hash.size / 100
+    items_in_chunk = 0
+    tmp_index = {}
+    hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
+      names = Array === names ? names : [names]
+      names.flatten! if Array === names.first and not Token === names.first.first
+      names.each do |name|
+        next if name.empty? or (String === name and name.length < 2)
+
+        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+        tokens.extend EnumeratedArray
+
+        tmp_index = merge(tmp_index, index_for_tokens(tokens, code, type, slack)) unless tokens.empty?
+        items_in_chunk += 1
+
+        if items_in_chunk > chunk_size
+          index = merge(index, tmp_index)
+          tmp_index = {}
+          items_in_chunk = 0
+        end
       end
     end
+    index = merge(index, tmp_index)
+
     index
   end
 
   #{{{ Matching
 
-  def self.
-
+  def self.follow(index, head)
+    res = nil
+
+    if index.include? head
+      return index[head]
+    end
+
+    return nil unless (not TCHash === index ) and index.include? :PROCS
 
-
-
+    index[:PROCS].each do |key,value|
+      return value if key.call(head)
+    end
 
-
+    nil
+  end
+
+  def self.find_fail(index, tokens, head, longest_match, slack, first)
+    if Proc === slack and not first and not head.nil? and tokens.left? and slack.call(head)
+      matches = find(index, tokens, longest_match, slack, false) # Recursion
+      if not matches.nil?
+        matches.last.unshift head
+        return matches
+      end
+    end
+
+    tokens.back
+    return nil
+  end
+
+  def self.find(index, tokens, longest_match = true, slack = nil, first = true)
+    head = tokens.next
+
+    next_index = follow(index, head)
+
+
+    return find_fail(index, tokens, head, longest_match, slack, first) if next_index.nil?
+
+    if not tokens.left?
       if next_index.include? :END
         return [next_index[:END], [head]]
       else
-        tokens
-        return nil
+        return find_fail(index, tokens, head, longest_match, slack, first)
       end
     else
 
       return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
 
-      matches = find(next_index, tokens)
+      matches = find(next_index, tokens, longest_match, slack, false) # Recursion
+
       if not matches.nil?
         matches.last.unshift head
         return matches
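The hunk above also replaces destructive shift-based traversal with a cursor: EnumeratedArray keeps a position (pos) so the trie walker can rewind after a failed lookup (tokens.back in find_fail). A minimal illustration of the cursor protocol, not part of the diff:

    list = %w(a b c)
    list.extend TokenTrieNER::EnumeratedArray
    list.next   # => "a"; cursor advances to 1
    list.peek   # => "b"
    list.back   # cursor rewinds to 0
    list.peek   # => "a"
    list.left?  # => true until the cursor passes the last element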
@@ -107,8 +216,7 @@ class TokenTrieNER < NER
 
       return [next_index[:END], [head]] if next_index.include?(:END)
 
-      tokens
-      return nil
+      return find_fail(index, tokens, head, longest_match, slack, first)
     end
   end
 
@@ -117,20 +225,35 @@ class TokenTrieNER < NER
     match_offset = match_tokens.first.offset
     match_tokens.each{|t|
       match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
-      match << t.original
+      match << (t.respond_to?(:original) ? t.original : t)
     }
 
     NamedEntity.annotate(match, match_tokens.first.offset, type, codes)
   end
 
-  attr_accessor :index, :longest_match, :type
-  def initialize(
-    options = Misc.add_defaults options, :
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  def initialize(type = nil, file = nil, options = {})
+    options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
+      :persistence => false
+    @slack = slack
     @longest_match = options.delete :longest_match
+    @split_at = options.delete :split_at
+    @no_clean = options.delete :no_clean
 
+    file = [] if file.nil?
     file = [file] unless Array === file
-    @index =
-
+    @index = Persistence.persist(file, :TokenTRIE, :tsv, options) do |file, options, filename, persistecen_file|
+      if persistecen_file.nil?
+        @index = {}
+      else
+        FileUtils.mkdir_p File.dirname(persistecen_file) unless File.exists? File.dirname(persistecen_file)
+        @index = TCHash.get persistecen_file, true, :marshal
+      end
+      file.each do |f|
+        merge(f, type)
+      end
+      @index
+    end
   end
 
   def merge(new, type = nil)
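With the new :persistence option, the constructor above can memoize the trie on disk through Persistence.persist, backing the index with a marshal-serialized TCHash instead of an in-memory Hash. A hedged sketch of building a persistent matcher, with a hypothetical dictionary path:

    # Flat TSV dictionary: one code per line followed by its names (path is hypothetical).
    ner = TokenTrieNER.new("Gene", "share/dictionaries/gene_names.tsv",
                           :longest_match => true, :persistence => true)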
@@ -140,24 +263,36 @@ class TokenTrieNER < NER
     when Hash === new
       TokenTrieNER.merge(@index, new)
     when TSV === new
-
+      old_unnamed = new.unnamed
+      old_monitor = new.monitor
+      new.unnamed = true
+      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
+      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+      new.unnamed = old_unnamed
+      new.monitor = old_monitor
     when String === new
-
+      new = TSV.new(new, :flat)
+      new.unnamed = true
+      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
+      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
     end
   end
 
   def match(text)
-    tokens = TokenTrieNER.tokenize
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+
+    tokens.extend EnumeratedArray
+    tokens.pos = 0
 
     matches = []
-    while tokens.
-      new_matches = TokenTrieNER.find(@index, tokens, longest_match)
+    while tokens.left?
+      new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
 
       if new_matches
         codes, match_tokens = new_matches
-        matches << TokenTrieNER.make_match(match_tokens, codes.collect{|c| c.type}, codes.collect{|c| c.
+        matches << TokenTrieNER.make_match(match_tokens, codes.collect{|c| c.type}, codes.collect{|c| c.code})
       else
-        tokens.
+        tokens.advance
       end
     end
 
@@ -165,4 +300,3 @@ class TokenTrieNER < NER
   end
 
 end
-
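Taken together, the reworked class builds its trie from one or more flat TSV dictionaries and returns NamedEntity annotations carrying offsets, types, and codes. A minimal usage sketch, assuming a hypothetical dictionary file:

    require 'rbbt/ner/token_trieNER'

    ner = TokenTrieNER.new("Gene", "gene_names.tsv")
    ner.match("Mutations in TP53 are common in many tumors").each do |entity|
      puts [entity, entity.offset, entity.code.inspect] * "\t"
    end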
data/lib/rbbt/nlp/genia/sentence_splitter.rb

@@ -0,0 +1,214 @@
+module NLP
+  def self.returnFeatures(prevWord, delimiter, nextWord)
+    if nextWord.match(/__ss__/)
+      nw = nextWord.sub(/__ss__/, "")
+    else
+      nw = nextWord
+    end
+
+    str = ""
+    # prev. word, next word
+    str += "pw_" + prevWord.downcase
+    str += "\tnw_" + nw.downcase
+
+    # delimiter
+    str += "\td_" + delimiter
+
+    # capitalized first char in next word
+    # capital in next word excluding first char.
+    if nw[0].chr == nw[0].chr.capitalize
+      str += "\tnfc_y"
+      nwExcluginFirst = nw[1 ... -1]
+      if nwExcluginFirst == nil
+        str += "\tnwcef_n"
+      elsif nwExcluginFirst.downcase == nwExcluginFirst
+        str += "\tnwcef_n"
+      else
+        str += "\tnwcef_y"
+      end
+    else
+      if nw.downcase == nw
+        str += "\tnwcef_n"
+      else
+        str += "\tnwcef_y"
+      end
+      str += "\tnfc_n"
+    end
+
+    # prev. word capital
+    if prevWord.downcase == prevWord
+      str += "\tpwc_n"
+    else
+      str += "\tpwc_y"
+    end
+
+    # number in prev. word, in next word
+    if prevWord.match(/[0-9]/)
+      str += "\tpwn_y"
+    else
+      str += "\tpwn_n"
+    end
+    if nw.match(/[0-9]/)
+      str += "\tnwn_y"
+    else
+      str += "\tnwn_n"
+    end
+
+    # prev., next word excluding braket, camma, etc.
+    prevWordEx = prevWord.gsub(/[()'",\[\]]/, "")
+    nwEx = nw.gsub(/[()'",\[\]]/, "")
+    str += "\tpwex_" + prevWordEx.downcase
+    str += "\tnwex_" + nwEx.downcase
+
+    # bracket or quatation in prev. word
+    if prevWord.match(/()'"/)
+      str += "\tpwcbq_y"
+    else
+      str += "\tpwcbq_n"
+    end
+    # camma in prev., next word
+    if prevWord.match(/,/)
+      str += "\tpwcc_y"
+    else
+      str += "\tpwcc_n"
+    end
+    if nw.match(/,/)
+    else
+      str += "\tnwcc_n"
+    end
+
+    # prev. word + delimiter
+    str += "\tpw_" + prevWord + "_d_" + delimiter
+    # prev. word ex. + delimiter + next word ex.
+    str += "\tpwex_" + prevWordEx + "_d_" + delimiter + "_nwex_" + nwEx
+    #str +=
+    #str +=
+    #str +=
+    str += "\n"
+  end
+
+  def self.event_extraction(text)
+    events = ""
+    marks = ""
+
+    eventCount = 0
+
+    pat = / [^ ]+[.!\?\)\]\"]( +)[^ ]+ /
+    for line in text.split(/\n/) do
+      while line.match(pat) do
+        line.sub!(/ ([^ ]+)([.!\?\)\]\"])( +)([^ ]+) /){
+          a, b, d, c = $1, $2, $3, $4
+          events << eventCount.to_s << "\t"
+          events << returnFeatures(a, b, c)
+          (" " + a + b + "__" + eventCount.to_s + "____" + d + "__" + c + " ")
+        }
+        eventCount += 1
+      end
+      marks << line
+    end
+
+    [events, marks]
+  end
+
+  def self.process_labels(marked_text, labels)
+    out = ""
+
+    count = 0
+    text_lines = marked_text.split(/\n/)
+    line = text_lines.shift
+    for label in labels
+      pat = "__" + count.to_s + "__"
+      until(line.match(pat)) do
+        out << line
+        line = text_lines.shift
+      end
+      splitted = label.chomp.to_i
+
+      line.sub!(pat){
+        if splitted == 1
+          "__\n__"
+        else
+          "____"
+        end
+      }
+      line.sub!(/__\n____ +__/, "\n")
+      line.sub!(/______( +)__/){
+        $1
+      }
+      count += 1
+    end
+
+    out << line
+
+    out << text_lines * ""
+
+    out
+  end
+
+  def self.geniass_sentence_splitter_extension(text)
+    require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
+    geniass = Geniass.new
+    if not geniass.geniass_is_loaded
+      Misc.in_dir Rbbt.software.opt.Geniass.find do
+        geniass.load_geniass
+      end
+    end
+
+    cleaned = text.gsub("\n", NEW_LINE_MASK)
+    events, marks = event_extraction(cleaned)
+
+    labels = events.split(/\n/).collect{|line|
+      geniass.label(line)
+    }
+
+    out = process_labels(marks, labels)
+
+    offsets = []
+
+    inTxtStrict = StringIO.new text
+    inTxtNew = StringIO.new out.gsub("\n", '|').gsub(NEW_LINE_MASK, "\n")
+
+    marker = "|"[0]
+    position = 0
+    sentenceCount = 1
+    target = ''
+    targetNew = ''
+    start = 0
+    finish = 0
+
+    while(!inTxtNew.eof?) do
+      targetNew = inTxtNew.getc
+      target = inTxtStrict.getc
+      position += 1
+      if targetNew == marker
+        sentenceCount += 1
+        finish = position - 1
+        offsets << [start, finish] if finish - start > 10
+        if targetNew == target
+          start = position
+        else
+          targetNew = inTxtNew.getc
+          while targetNew != target do
+            target = inTxtStrict.getc
+            position += 1
+          end
+          start = position - 1
+        end
+      end
+    end
+
+    finish = position - 1
+    offsets << [start, finish] if finish > start
+
+    inTxtStrict.close
+    inTxtNew.close
+
+    offsets.collect do |s,e|
+      sentence = text[s..e]
+      next if sentence.nil?
+      Segment.annotate sentence, s
+      sentence
+    end
+
+  end
+end
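The new NLP.geniass_sentence_splitter_extension drives the Geniass native extension (installed via data/share/install/software/Geniass) and returns Segment-annotated sentences whose offsets point back into the original text; note the collect block can yield nil for skipped spans. A hedged usage sketch:

    require 'rbbt/nlp/genia/sentence_splitter'

    text = "p53 is a tumor suppressor. It is mutated in most cancers."
    NLP.geniass_sentence_splitter_extension(text).compact.each do |sentence|
      puts "#{sentence.offset}: #{sentence}"
    end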