rbbt-text 0.2.1 → 0.5.0

Files changed (57)
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
data/lib/rbbt/ner/token_trieNER.rb
@@ -1,105 +1,214 @@
 require 'rbbt-util'
 require 'rbbt/util/tsv'
 require 'rbbt/ner/annotations'
+require 'rbbt/ner/annotations/token'
 require 'rbbt/ner/NER'
 
 class TokenTrieNER < NER
   def self.clean(token)
     if token.length > 3
-      token.downcase
+      token.downcase.sub(/-/,'')
     else
       token
     end
   end
 
-  def self.prepare_token(token, start)
-    Token.annotate(clean(token), start, token)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+    if no_clean
+      if extend_to_token
+        Token.annotate(clean(token), start, token)
+      else
+        clean(token)
+      end
+    else
+      if extend_to_token
+        Token.annotate(clean(token), start, token)
+      else
+        token
+      end
+    end
   end
 
-  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
-    tokens << prepare_token(text, start) unless text.empty?
+
+    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
 
     tokens
   end
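
Note: tokenize now takes extend_to_token, split_at and no_clean flags instead of a positional regexp. A minimal sketch of a call with the defaults spelled out (the input string is illustrative):

    require 'rbbt/ner/token_trieNER'

    # Cleaning downcases tokens longer than three characters and strips a hyphen
    tokens = TokenTrieNER.tokenize("TP53 phosphorylates MDM2", true, nil, false)
    tokens.first   # => "tp53", a Token annotated with its offset and original form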
 
   #{{{ Process dictionary
 
+  module EnumeratedArray
+    attr_accessor :pos
+
+    def self.extended(array)
+      array.pos = 0
+    end
+
+    def last?
+      @pos == length - 1
+    end
+
+    def advance
+      @pos += 1
+    end
+
+    def back
+      @pos -= 1
+    end
+
+    def next
+      e = self[@pos]
+      advance
+      e
+    end
+
+    def peek
+      self[@pos]
+    end
+
+    def left?
+      @pos < length
+    end
+  end
+
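
The EnumeratedArray module replaces the old shift/unshift consumption of the token list with a movable cursor, so the matcher can rewind after a failed lookup. A small illustration using only the module above:

    tokens = %w(protein kinase activity)
    tokens.extend TokenTrieNER::EnumeratedArray

    tokens.next   # => "protein" (cursor advances to 1)
    tokens.peek   # => "kinase"  (cursor stays)
    tokens.back   # cursor rewinds to 0
    tokens.left?  # => true while tokens remain
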
   class Code
-    attr_accessor :value, :type
-    def initialize(value, type = nil)
-      @value = value
+    attr_accessor :code, :type
+    def initialize(code, type = nil)
+      @code = code
       @type = type
     end
 
     def to_s
-      [type, value] * ":"
+      [type, code] * ":"
     end
   end
 
-  def self.index_for_tokens(tokens, code, type = nil)
-    if tokens.empty?
-      {:END => [Code.new code, type]}
+  def self.index_for_tokens(tokens, code, type = nil, slack = nil)
+    if not tokens.left?
+      {:END => [Code.new(code, type)]}
     else
-      {tokens.shift => index_for_tokens(tokens, code, type)}
+      head = tokens.next
+      if slack.nil? or not slack.call(head)
+        res = {head => index_for_tokens(tokens, code, type, slack)}
+      else
+        res = {head => index_for_tokens(tokens, code, type, slack)}.merge(index_for_tokens(tokens, code, type, slack))
+      end
+      tokens.back
+      res
     end
   end
 
   def self.merge(index1, index2)
+    index1.write if index1.respond_to? :write
     index2.each do |key, new_index2|
       case
       when key == :END
-        index1[:END] ||= []
-        index1[:END] += new_index2.reject{|new| index1[:END].collect{|e| e.to_s }.include? new.to_s }
-        index1[:END].uniq!
+        end1 = index1[:END] || []
+        end1 += new_index2.reject{|new| end1.collect{|e| e.to_s }.include? new.to_s }
+        end1.uniq!
+        index1[:END] = end1
       when index1.include?(key)
-        merge(index1[key], new_index2)
+        index1[key] = merge(index1[key], new_index2)
       else
         index1[key] = new_index2
       end
     end
+    index1.read if index1.respond_to? :read
+
+    index1
   end
 
-  def self.process(hash, type = nil)
-    index = {}
-    hash.each do |code, names|
-      names.flatten.each do |name|
-        next if name.empty? or name.length < 2
-        tokens = tokenize name
-
-        merge(index, index_for_tokens(tokens, code, type)) unless tokens.empty?
-      end
-    end
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+    chunk_size = hash.size / 100
+    items_in_chunk = 0
+    tmp_index = {}
+    hash.send(hash.respond_to?(:through) ? :through : :each) do |code, names|
+      names = Array === names ? names : [names]
+      names.flatten! if Array === names.first and not Token === names.first.first
+      names.each do |name|
+        next if name.empty? or (String === name and name.length < 2)
+
+        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+        tokens.extend EnumeratedArray
+
+        tmp_index = merge(tmp_index, index_for_tokens(tokens, code, type, slack)) unless tokens.empty?
+        items_in_chunk += 1
+
+        if items_in_chunk > chunk_size
+          index = merge(index, tmp_index)
+          tmp_index = {}
+          items_in_chunk = 0
+        end
+      end
+    end
+    index = merge(index, tmp_index)
+
     index
   end
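
The index that process builds is a token trie: each level maps a cleaned token to the next level, and the :END key holds the Code payloads of every dictionary entry that finishes there. Illustratively, for an entry CODE1 -> "protein kinase" with a hypothetical :Gene type:

    {
      "protein" => {
        "kinase" => { :END => [Code.new("CODE1", :Gene)] }
      }
    }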
 
   #{{{ Matching
 
-  def self.find(index, tokens, longest_match = true)
-    return nil unless index.include? tokens.first
-
-    head = tokens.shift
-    next_index = index[head]
-
-    if tokens.empty?
+  def self.follow(index, head)
+    res = nil
+
+    if index.include? head
+      return index[head]
+    end
+
+    return nil unless (not TCHash === index) and index.include? :PROCS
+
+    index[:PROCS].each do |key, value|
+      return value if key.call(head)
+    end
+
+    nil
+  end
+
+  def self.find_fail(index, tokens, head, longest_match, slack, first)
+    if Proc === slack and not first and not head.nil? and tokens.left? and slack.call(head)
+      matches = find(index, tokens, longest_match, slack, false) # Recursion
+      if not matches.nil?
+        matches.last.unshift head
+        return matches
+      end
+    end
+
+    tokens.back
+    return nil
+  end
+
+  def self.find(index, tokens, longest_match = true, slack = nil, first = true)
+    head = tokens.next
+
+    next_index = follow(index, head)
+
+    return find_fail(index, tokens, head, longest_match, slack, first) if next_index.nil?
+
+    if not tokens.left?
       if next_index.include? :END
         return [next_index[:END], [head]]
       else
-        tokens.unshift head
-        return nil
+        return find_fail(index, tokens, head, longest_match, slack, first)
       end
     else
 
       return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
 
-      matches = find(next_index, tokens)
+      matches = find(next_index, tokens, longest_match, slack, false) # Recursion
+
       if not matches.nil?
         matches.last.unshift head
         return matches
@@ -107,8 +216,7 @@ class TokenTrieNER < NER
 
       return [next_index[:END], [head]] if next_index.include?(:END)
 
-      tokens.unshift head
-      return nil
+      return find_fail(index, tokens, head, longest_match, slack, first)
     end
   end
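
The slack parameter threads through index_for_tokens, find and find_fail: a Proc that flags tokens the matcher may skip mid-match. A sketch with a hypothetical stopword-style callback (tokens must be extended with EnumeratedArray first):

    slack = Proc.new{|token| %w(of the and).include? token }
    codes, match_tokens = TokenTrieNER.find(index, tokens, true, slack)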
 
@@ -117,20 +225,35 @@ class TokenTrieNER < NER
     match_offset = match_tokens.first.offset
     match_tokens.each{|t|
       match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
-      match << t.original
+      match << (t.respond_to?(:original) ? t.original : t)
     }
 
     NamedEntity.annotate(match, match_tokens.first.offset, type, codes)
   end
 
-  attr_accessor :index, :longest_match, :type
-  def initialize(file, type = nil, options = {})
-    options = Misc.add_defaults options, :flatten => true, :longest_match => true
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  def initialize(type = nil, file = nil, options = {})
+    options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
+      :persistence => false
+    @slack = slack
     @longest_match = options.delete :longest_match
+    @split_at = options.delete :split_at
+    @no_clean = options.delete :no_clean
 
+    file = [] if file.nil?
     file = [file] unless Array === file
-    @index = {}
-    file.each do |f| TokenTrieNER.merge(@index, TokenTrieNER.process(TSV.new(f, options), type)) end
+    @index = Persistence.persist(file, :TokenTRIE, :tsv, options) do |file, options, filename, persistecen_file|
+      if persistecen_file.nil?
+        @index = {}
+      else
+        FileUtils.mkdir_p File.dirname(persistecen_file) unless File.exists? File.dirname(persistecen_file)
+        @index = TCHash.get persistecen_file, true, :marshal
+      end
+      file.each do |f|
+        merge(f, type)
+      end
+      @index
+    end
   end
 
   def merge(new, type = nil)
@@ -140,24 +263,36 @@ class TokenTrieNER < NER
     when Hash === new
       TokenTrieNER.merge(@index, new)
     when TSV === new
-      TokenTrieNER.merge(@index, TokenTrieNER.process(new,type))
+      old_unnamed = new.unnamed
+      old_monitor = new.monitor
+      new.unnamed = true
+      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
+      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+      new.unnamed = old_unnamed
+      new.monitor = old_monitor
     when String === new
-      TokenTrieNER.merge(@index, TokenTrieNER.process(TSV.new(new, :flatten => true), type))
+      new = TSV.new(new, :flat)
+      new.unnamed = true
+      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
+      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
     end
   end
 
   def match(text)
-    tokens = TokenTrieNER.tokenize text
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+
+    tokens.extend EnumeratedArray
+    tokens.pos = 0
 
     matches = []
-    while tokens.any?
-      new_matches = TokenTrieNER.find(@index, tokens, longest_match)
+    while tokens.left?
+      new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
 
       if new_matches
         codes, match_tokens = new_matches
-        matches << TokenTrieNER.make_match(match_tokens, codes.collect{|c| c.type}, codes.collect{|c| c.value})
+        matches << TokenTrieNER.make_match(match_tokens, codes.collect{|c| c.type}, codes.collect{|c| c.code})
       else
-        tokens.shift
+        tokens.advance
       end
     end
 
@@ -165,4 +300,3 @@ class TokenTrieNER < NER
   end
 
 end
-
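
Note the reversed constructor signature (type now comes before file) and the new :persistence option, which caches the trie in a TCHash via Persistence.persist. A usage sketch (the dictionary file and :Gene label are hypothetical):

    require 'rbbt/ner/token_trieNER'

    # genes.tsv: a flat TSV mapping codes to synonym lists
    ner = TokenTrieNER.new(:Gene, 'genes.tsv', :longest_match => true)
    ner.match("TP53 phosphorylates MDM2").each do |m|
      puts "#{m.offset}: #{m}"   # matches are NamedEntity-annotated strings
    end
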
data/lib/rbbt/nlp/genia/sentence_splitter.rb (new file)
@@ -0,0 +1,214 @@
+module NLP
+  def self.returnFeatures(prevWord, delimiter, nextWord)
+    if nextWord.match(/__ss__/)
+      nw = nextWord.sub(/__ss__/, "")
+    else
+      nw = nextWord
+    end
+
+    str = ""
+    # prev. word, next word
+    str += "pw_" + prevWord.downcase
+    str += "\tnw_" + nw.downcase
+
+    # delimiter
+    str += "\td_" + delimiter
+
+    # capitalized first char in next word
+    # capital in next word excluding first char.
+    if nw[0].chr == nw[0].chr.capitalize
+      str += "\tnfc_y"
+      nwExcluginFirst = nw[1 ... -1]
+      if nwExcluginFirst == nil
+        str += "\tnwcef_n"
+      elsif nwExcluginFirst.downcase == nwExcluginFirst
+        str += "\tnwcef_n"
+      else
+        str += "\tnwcef_y"
+      end
+    else
+      if nw.downcase == nw
+        str += "\tnwcef_n"
+      else
+        str += "\tnwcef_y"
+      end
+      str += "\tnfc_n"
+    end
+
+    # prev. word capital
+    if prevWord.downcase == prevWord
+      str += "\tpwc_n"
+    else
+      str += "\tpwc_y"
+    end
+
+    # number in prev. word, in next word
+    if prevWord.match(/[0-9]/)
+      str += "\tpwn_y"
+    else
+      str += "\tpwn_n"
+    end
+    if nw.match(/[0-9]/)
+      str += "\tnwn_y"
+    else
+      str += "\tnwn_n"
+    end
+
+    # prev., next word excluding bracket, comma, etc.
+    prevWordEx = prevWord.gsub(/[()'",\[\]]/, "")
+    nwEx = nw.gsub(/[()'",\[\]]/, "")
+    str += "\tpwex_" + prevWordEx.downcase
+    str += "\tnwex_" + nwEx.downcase
+
+    # bracket or quotation in prev. word
+    if prevWord.match(/()'"/)
+      str += "\tpwcbq_y"
+    else
+      str += "\tpwcbq_n"
+    end
+    # comma in prev., next word
+    if prevWord.match(/,/)
+      str += "\tpwcc_y"
+    else
+      str += "\tpwcc_n"
+    end
+    if nw.match(/,/)
+    else
+      str += "\tnwcc_n"
+    end
+
+    # prev. word + delimiter
+    str += "\tpw_" + prevWord + "_d_" + delimiter
+    # prev. word ex. + delimiter + next word ex.
+    str += "\tpwex_" + prevWordEx + "_d_" + delimiter + "_nwex_" + nwEx
+    str += "\n"
+  end
+
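
returnFeatures emits one tab-separated feature line per candidate boundary; these lines are what the Geniass maxent model labels. Illustratively:

    # Features for the candidate boundary "kinase. It"
    NLP.returnFeatures("kinase", ".", "It")
    # => "pw_kinase\tnw_it\td_.\tnfc_y\t..." (one line, newline-terminated)
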
+  def self.event_extraction(text)
+    events = ""
+    marks = ""
+
+    eventCount = 0
+
+    pat = / [^ ]+[.!\?\)\]\"]( +)[^ ]+ /
+    for line in text.split(/\n/) do
+      while line.match(pat) do
+        line.sub!(/ ([^ ]+)([.!\?\)\]\"])( +)([^ ]+) /){
+          a, b, d, c = $1, $2, $3, $4
+          events << eventCount.to_s << "\t"
+          events << returnFeatures(a, b, c)
+          (" " + a + b + "__" + eventCount.to_s + "____" + d + "__" + c + " ")
+        }
+        eventCount += 1
+      end
+      marks << line
+    end
+
+    [events, marks]
+  end
+
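
event_extraction replaces each candidate boundary with a numbered __N____ __ marker and collects the matching feature lines, so process_labels can later turn label 1 into a line break and label 0 back into plain spacing. Roughly:

    events, marks = NLP.event_extraction("A kinase. It binds DNA.")
    # marks  => "A kinase.__0____ __It binds DNA."
    # events => one numbered feature line per candidate
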
+  def self.process_labels(marked_text, labels)
+    out = ""
+
+    count = 0
+    text_lines = marked_text.split(/\n/)
+    line = text_lines.shift
+    for label in labels
+      pat = "__" + count.to_s + "__"
+      until(line.match(pat)) do
+        out << line
+        line = text_lines.shift
+      end
+      splitted = label.chomp.to_i
+
+      line.sub!(pat){
+        if splitted == 1
+          "__\n__"
+        else
+          "____"
+        end
+      }
+      line.sub!(/__\n____ +__/, "\n")
+      line.sub!(/______( +)__/){
+        $1
+      }
+      count += 1
+    end
+
+    out << line
+
+    out << text_lines * ""
+
+    out
+  end
+
+  def self.geniass_sentence_splitter_extension(text)
+    require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
+    geniass = Geniass.new
+    if not geniass.geniass_is_loaded
+      Misc.in_dir Rbbt.software.opt.Geniass.find do
+        geniass.load_geniass
+      end
+    end
+
+    cleaned = text.gsub("\n", NEW_LINE_MASK)
+    events, marks = event_extraction(cleaned)
+
+    labels = events.split(/\n/).collect{|line|
+      geniass.label(line)
+    }
+
+    out = process_labels(marks, labels)
+
+    offsets = []
+
+    inTxtStrict = StringIO.new text
+    inTxtNew = StringIO.new out.gsub("\n", '|').gsub(NEW_LINE_MASK, "\n")
+
+    marker = "|"[0]
+    position = 0
+    sentenceCount = 1
+    target = ''
+    targetNew = ''
+    start = 0
+    finish = 0
+
+    while(!inTxtNew.eof?) do
+      targetNew = inTxtNew.getc
+      target = inTxtStrict.getc
+      position += 1
+      if targetNew == marker
+        sentenceCount += 1
+        finish = position - 1
+        offsets << [start, finish] if finish - start > 10
+        if targetNew == target
+          start = position
+        else
+          targetNew = inTxtNew.getc
+          while targetNew != target do
+            target = inTxtStrict.getc
+            position += 1
+          end
+          start = position - 1
+        end
+      end
+    end
+
+    finish = position - 1
+    offsets << [start, finish] if finish > start
+
+    inTxtStrict.close
+    inTxtNew.close
+
+    offsets.collect do |s, e|
+      sentence = text[s..e]
+      next if sentence.nil?
+      Segment.annotate sentence, s
+      sentence
+    end
+  end
+end
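
A usage sketch for the new splitter; it requires the Geniass package (see share/install/software/Geniass above), and the input text is illustrative:

    require 'rbbt/nlp/genia/sentence_splitter'

    text = "TP53 is a tumor suppressor. It regulates the cell cycle."
    NLP.geniass_sentence_splitter_extension(text).compact.each do |sentence|
      puts "#{sentence.offset}: #{sentence}"   # sentences are Segment-annotated
    end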