rbbt-text 0.2.1 → 0.5.0

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.
Files changed (57)
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
data/lib/rbbt/ner/token_trieNER.rb
@@ -1,105 +1,214 @@
 require 'rbbt-util'
 require 'rbbt/util/tsv'
 require 'rbbt/ner/annotations'
+require 'rbbt/ner/annotations/token'
 require 'rbbt/ner/NER'
 
 class TokenTrieNER < NER
   def self.clean(token)
     if token.length > 3
-      token.downcase
+      token.downcase.sub(/-/,'')
     else
       token
     end
   end
 
-  def self.prepare_token(token, start)
-    Token.annotate(clean(token), start, token)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+    if no_clean
+      if extend_to_token
+        Token.annotate(clean(token), start, token)
+      else
+        clean(token)
+      end
+    else
+      if extend_to_token
+        Token.annotate(clean(token), start, token)
+      else
+        token
+      end
+    end
   end
 
-  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
+    split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
-    tokens << prepare_token(text, start) unless text.empty?
+
+    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
 
     tokens
   end
 
   #{{{ Process dictionary
 
+  module EnumeratedArray
+    attr_accessor :pos
+    def self.extended(array)
+      array.pos = 0
+    end
+
+    def last?
+      @pos == length - 1
+    end
+
+    def advance
+      @pos += 1
+    end
+
+    def back
+      @pos -= 1
+    end
+
+    def next
+      e = self[@pos]
+      advance
+      e
+    end
+
+    def peek
+      self[@pos]
+    end
+
+    def left?
+      @pos < length
+    end
+
+  end
+
+
   class Code
-    attr_accessor :value, :type
-    def initialize(value, type = nil)
-      @value = value
+    attr_accessor :code, :type
+    def initialize(code, type = nil)
+      @code = code
       @type = type
     end
 
     def to_s
-      [type, value] * ":"
+      [type, code] * ":"
     end
   end
 
-  def self.index_for_tokens(tokens, code, type = nil)
-    if tokens.empty?
-      {:END => [Code.new code, type]}
+  def self.index_for_tokens(tokens, code, type = nil, slack = nil)
+    if not tokens.left?
+      {:END => [Code.new(code, type)]}
     else
-      {tokens.shift => index_for_tokens(tokens, code, type)}
+      head = tokens.next
+      if (slack.nil? or not slack.call(head))
+        res = {head => index_for_tokens(tokens, code, type, slack)}
+      else
+        res = {head => index_for_tokens(tokens, code, type, slack)}.merge(index_for_tokens(tokens, code, type, slack))
+      end
+      tokens.back
+      res
     end
   end
-
+
   def self.merge(index1, index2)
+    index1.write if index1.respond_to? :write
    index2.each do |key, new_index2|
      case
      when key == :END
-        index1[:END] ||= []
-        index1[:END] += new_index2.reject{|new| index1[:END].collect{|e| e.to_s }.include? new.to_s }
-        index1[:END].uniq!
+        end1 = index1[:END] || []
+        end1 += new_index2.reject{|new| end1.collect{|e| e.to_s }.include? new.to_s }
+        end1.uniq!
+        index1[:END] = end1
      when index1.include?(key)
-        merge(index1[key], new_index2)
+        index1[key] = merge(index1[key], new_index2)
      else
        index1[key] = new_index2
      end
    end
+    index1.read if index1.respond_to? :read
+
+    index1
  end
 
-  def self.process(hash, type = nil)
-    index = {}
-    hash.each do |code, names|
-      names.flatten.each do |name|
-        next if name.empty? or name.length < 2
-        tokens = tokenize name
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
 
-        merge(index, index_for_tokens(tokens, code, type)) unless tokens.empty?
+    chunk_size = hash.size / 100
+    items_in_chunk = 0
+    tmp_index = {}
+    hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
+      names = Array === names ? names : [names]
+      names.flatten! if Array === names.first and not Token === names.first.first
+      names.each do |name|
+        next if name.empty? or (String === name and name.length < 2)
+
+        tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+        tokens.extend EnumeratedArray
+
+        tmp_index = merge(tmp_index, index_for_tokens(tokens, code, type, slack)) unless tokens.empty?
+        items_in_chunk += 1
+
+        if items_in_chunk > chunk_size
+          index = merge(index, tmp_index)
+          tmp_index = {}
+          items_in_chunk = 0
+        end
      end
    end
+    index = merge(index, tmp_index)
+
    index
  end
 
  #{{{ Matching
 
-  def self.find(index, tokens, longest_match = true)
-    return nil unless index.include? tokens.first
+  def self.follow(index, head)
+    res = nil
+
+    if index.include? head
+      return index[head]
+    end
+
+    return nil unless (not TCHash === index ) and index.include? :PROCS
 
-    head = tokens.shift
-    next_index = index[head]
+    index[:PROCS].each do |key,value|
+      return value if key.call(head)
+    end
 
-    if tokens.empty?
+    nil
+  end
+
+  def self.find_fail(index, tokens, head, longest_match, slack, first)
+    if Proc === slack and not first and not head.nil? and tokens.left? and slack.call(head)
+      matches = find(index, tokens, longest_match, slack, false) # Recursion
+      if not matches.nil?
+        matches.last.unshift head
+        return matches
+      end
+    end
+
+    tokens.back
+    return nil
+  end
+
+  def self.find(index, tokens, longest_match = true, slack = nil, first = true)
+    head = tokens.next
+
+    next_index = follow(index, head)
+
+
+    return find_fail(index, tokens, head, longest_match, slack, first) if next_index.nil?
+
+    if not tokens.left?
      if next_index.include? :END
        return [next_index[:END], [head]]
      else
-        tokens.unshift head
-        return nil
+        return find_fail(index, tokens, head, longest_match, slack, first)
      end
    else
 
      return [next_index[:END], [head]] if next_index.include?(:END) and not longest_match
 
-      matches = find(next_index, tokens)
+      matches = find(next_index, tokens, longest_match, slack, false) # Recursion
+
      if not matches.nil?
        matches.last.unshift head
        return matches
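
The heart of the hunk above: dictionary names are tokenized and folded into a nested-Hash trie, tokens are now walked through the EnumeratedArray cursor (next/back) instead of being destructively shifted, and a slack Proc can mark tokens as optional. A minimal sketch of the resulting structures, assuming rbbt-text 0.5.0 is installed (the name, code, and type are made up for illustration):

    require 'rbbt/ner/token_trieNER'

    # The trie that index_for_tokens builds for the name "breast cancer"
    # with code "C50" and type :disease: token keys nest until an :END
    # leaf, which collects one Code object per dictionary entry ending there.
    index = {
      "breast" => {
        "cancer" => { :END => [TokenTrieNER::Code.new("C50", :disease)] }
      }
    }

    # A slack Proc marks tokens that may be skipped: during indexing they
    # become optional trie branches (the .merge on the slack path above),
    # and during matching find_fail retries the lookup without them.
    slack = Proc.new{|token| %w(of the a an).include? token }
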
@@ -107,8 +216,7 @@ class TokenTrieNER < NER
 
       return [next_index[:END], [head]] if next_index.include?(:END)
 
-      tokens.unshift head
-      return nil
+      return find_fail(index, tokens, head, longest_match, slack, first)
     end
   end
 
@@ -117,20 +225,35 @@ class TokenTrieNER < NER
     match_offset = match_tokens.first.offset
     match_tokens.each{|t|
       match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
-      match << t.original
+      match << (t.respond_to?(:original) ? t.original : t)
     }
 
     NamedEntity.annotate(match, match_tokens.first.offset, type, codes)
   end
 
-  attr_accessor :index, :longest_match, :type
-  def initialize(file, type = nil, options = {})
-    options = Misc.add_defaults options, :flatten => true, :longest_match => true
+  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+  def initialize(type = nil, file = nil, options = {})
+    options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
+      :persistence => false
+    @slack = slack
     @longest_match = options.delete :longest_match
+    @split_at = options.delete :split_at
+    @no_clean = options.delete :no_clean
 
+    file = [] if file.nil?
     file = [file] unless Array === file
-    @index = {}
-    file.each do |f| TokenTrieNER.merge(@index, TokenTrieNER.process(TSV.new(f, options), type)) end
+    @index = Persistence.persist(file, :TokenTRIE, :tsv, options) do |file, options, filename, persistecen_file|
+      if persistecen_file.nil?
+        @index = {}
+      else
+        FileUtils.mkdir_p File.dirname(persistecen_file) unless File.exists? File.dirname(persistecen_file)
+        @index = TCHash.get persistecen_file, true, :marshal
+      end
+      file.each do |f|
+        merge(f, type)
+      end
+      @index
+    end
   end
 
   def merge(new, type = nil)
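
Note that the constructor's first two arguments swap places in this release: 0.2.1 took (file, type), 0.5.0 takes (type, file). The new :persistence option routes the processed trie through Persistence.persist into a TCHash (Tokyo Cabinet) file so it survives between runs. A usage sketch with a hypothetical dictionary path:

    require 'rbbt/ner/token_trieNER'

    # Type first, file second; :persistence => true would cache the
    # built trie on disk instead of rebuilding it from the TSV each run.
    ner = TokenTrieNER.new(:disease, 'dictionaries/diseases.tsv',
                           :longest_match => true,
                           :persistence   => false)
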
@@ -140,24 +263,36 @@ class TokenTrieNER < NER
     when Hash === new
       TokenTrieNER.merge(@index, new)
     when TSV === new
-      TokenTrieNER.merge(@index, TokenTrieNER.process(new,type))
+      old_unnamed = new.unnamed
+      old_monitor = new.monitor
+      new.unnamed = true
+      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
+      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+      new.unnamed = old_unnamed
+      new.monitor = old_monitor
     when String === new
-      TokenTrieNER.merge(@index, TokenTrieNER.process(TSV.new(new, :flatten => true), type))
+      new = TSV.new(new, :flat)
+      new.unnamed = true
+      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
+      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
     end
   end
 
   def match(text)
-    tokens = TokenTrieNER.tokenize text
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+
+    tokens.extend EnumeratedArray
+    tokens.pos = 0
 
     matches = []
-    while tokens.any?
-      new_matches = TokenTrieNER.find(@index, tokens, longest_match)
+    while tokens.left?
+      new_matches = TokenTrieNER.find(@index, tokens, longest_match, slack)
 
       if new_matches
         codes, match_tokens = new_matches
-        matches << TokenTrieNER.make_match(match_tokens, codes.collect{|c| c.type}, codes.collect{|c| c.value})
+        matches << TokenTrieNER.make_match(match_tokens, codes.collect{|c| c.type}, codes.collect{|c| c.code})
       else
-        tokens.shift
+        tokens.advance
       end
     end
 
@@ -165,4 +300,3 @@ class TokenTrieNER < NER
   end
 
 end
-
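
Taken together, merge now accepts an already-built trie Hash, a TSV, or a String path (loaded as a :flat TSV), and match accepts either raw text or a pre-tokenized Array, returning NamedEntity-annotated strings. A sketch of the round trip, continuing the constructor example above (sentence and dictionary contents are hypothetical; offset and type are the accessors NamedEntity.annotate attaches):

    # Fold another dictionary into the same trie, then tag a sentence.
    ner.merge('dictionaries/genes.tsv', :gene)

    ner.match("Patients with breast cancer responded").each do |m|
      # Each match is a String carrying NamedEntity annotations:
      # its character offset, entity type(s), and dictionary code(s).
      puts "#{ m } @ #{ m.offset } (#{ m.type.inspect })"
    end
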
data/lib/rbbt/nlp/genia/sentence_splitter.rb
@@ -0,0 +1,214 @@
+module NLP
+  def self.returnFeatures(prevWord, delimiter, nextWord)
+    if nextWord.match(/__ss__/)
+      nw = nextWord.sub(/__ss__/, "")
+    else
+      nw = nextWord
+    end
+
+    str = ""
+    # prev. word, next word
+    str += "pw_" + prevWord.downcase
+    str += "\tnw_" + nw.downcase
+
+    # delimiter
+    str += "\td_" + delimiter
+
+    # capitalized first char in next word
+    # capital in next word excluding first char.
+    if nw[0].chr == nw[0].chr.capitalize
+      str += "\tnfc_y"
+      nwExcluginFirst = nw[1 ... -1]
+      if nwExcluginFirst == nil
+        str += "\tnwcef_n"
+      elsif nwExcluginFirst.downcase == nwExcluginFirst
+        str += "\tnwcef_n"
+      else
+        str += "\tnwcef_y"
+      end
+    else
+      if nw.downcase == nw
+        str += "\tnwcef_n"
+      else
+        str += "\tnwcef_y"
+      end
+      str += "\tnfc_n"
+    end
+
+    # prev. word capital
+    if prevWord.downcase == prevWord
+      str += "\tpwc_n"
+    else
+      str += "\tpwc_y"
+    end
+
+    # number in prev. word, in next word
+    if prevWord.match(/[0-9]/)
+      str += "\tpwn_y"
+    else
+      str += "\tpwn_n"
+    end
+    if nw.match(/[0-9]/)
+      str += "\tnwn_y"
+    else
+      str += "\tnwn_n"
+    end
+
+    # prev., next word excluding bracket, comma, etc.
+    prevWordEx = prevWord.gsub(/[()'",\[\]]/, "")
+    nwEx = nw.gsub(/[()'",\[\]]/, "")
+    str += "\tpwex_" + prevWordEx.downcase
+    str += "\tnwex_" + nwEx.downcase
+
+    # bracket or quotation in prev. word
+    if prevWord.match(/()'"/)
+      str += "\tpwcbq_y"
+    else
+      str += "\tpwcbq_n"
+    end
+    # comma in prev., next word
+    if prevWord.match(/,/)
+      str += "\tpwcc_y"
+    else
+      str += "\tpwcc_n"
+    end
+    if nw.match(/,/)
+    else
+      str += "\tnwcc_n"
+    end
+
+    # prev. word + delimiter
+    str += "\tpw_" + prevWord + "_d_" + delimiter
+    # prev. word ex. + delimiter + next word ex.
+    str += "\tpwex_" + prevWordEx + "_d_" + delimiter + "_nwex_" + nwEx
+    #str +=
+    #str +=
+    #str +=
+    str += "\n"
+  end
+
+  def self.event_extraction(text)
+    events = ""
+    marks = ""
+
+    eventCount = 0
+
+    pat = / [^ ]+[.!\?\)\]\"]( +)[^ ]+ /
+    for line in text.split(/\n/) do
+      while line.match(pat) do
+        line.sub!(/ ([^ ]+)([.!\?\)\]\"])( +)([^ ]+) /){
+          a, b, d, c = $1, $2, $3, $4
+          events << eventCount.to_s << "\t"
+          events << returnFeatures(a, b, c)
+          (" " + a + b + "__" + eventCount.to_s + "____" + d + "__" + c + " ")
+        }
+        eventCount += 1
+      end
+      marks << line
+    end
+
+    [events, marks]
+  end
+
+  def self.process_labels(marked_text, labels)
+    out = ""
+
+    count = 0
+    text_lines = marked_text.split(/\n/)
+    line = text_lines.shift
+    for label in labels
+      pat = "__" + count.to_s + "__"
+      until(line.match(pat)) do
+        out << line
+        line = text_lines.shift
+      end
+      splitted = label.chomp.to_i
+
+      line.sub!(pat){
+        if splitted == 1
+          "__\n__"
+        else
+          "____"
+        end
+      }
+      line.sub!(/__\n____ +__/, "\n")
+      line.sub!(/______( +)__/){
+        $1
+      }
+      count += 1
+    end
+
+    out << line
+
+    out << text_lines * ""
+
+    out
+  end
+
+  def self.geniass_sentence_splitter_extension(text)
+    require Rbbt.software.opt.Geniass.ruby["Geniass.so"].find
+    geniass = Geniass.new
+    if not geniass.geniass_is_loaded
+      Misc.in_dir Rbbt.software.opt.Geniass.find do
+        geniass.load_geniass
+      end
+    end
+
+    cleaned = text.gsub("\n", NEW_LINE_MASK)
+    events, marks = event_extraction(cleaned)
+
+    labels = events.split(/\n/).collect{|line|
+      geniass.label(line)
+    }
+
+    out = process_labels(marks, labels)
+
+    offsets = []
+
+    inTxtStrict = StringIO.new text
+    inTxtNew = StringIO.new out.gsub("\n", '|').gsub(NEW_LINE_MASK, "\n")
+
+    marker = "|"[0]
+    position = 0
+    sentenceCount = 1
+    target = ''
+    targetNew = ''
+    start = 0
+    finish = 0
+
+    while(!inTxtNew.eof?) do
+      targetNew = inTxtNew.getc
+      target = inTxtStrict.getc
+      position += 1
+      if targetNew == marker
+        sentenceCount += 1
+        finish = position - 1
+        offsets << [start, finish] if finish - start > 10
+        if targetNew == target
+          start = position
+        else
+          targetNew = inTxtNew.getc
+          while targetNew != target do
+            target = inTxtStrict.getc
+            position += 1
+          end
+          start = position - 1
+        end
+      end
+    end
+
+    finish = position - 1
+    offsets << [start, finish] if finish > start
+
+    inTxtStrict.close
+    inTxtNew.close
+
+    offsets.collect do |s,e|
+      sentence = text[s..e]
+      next if sentence.nil?
+      Segment.annotate sentence, s
+      sentence
+    end
+
+  end
+end
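
This new file masks newlines, emits one feature line per candidate sentence boundary (word before, delimiter, word after), has the native Geniass model label each line through the Geniass.so extension built by share/install/software/Geniass, and projects the accepted boundaries back onto the original text as character offsets for Segment.annotate. The feature encoding is easiest to see directly; a sketch assuming the gem and its Geniass dependency are installed (output abridged):

    require 'rbbt/nlp/genia/sentence_splitter'

    # Features for the candidate boundary "cancer" / "." / "The":
    # pw_/nw_ are the lowercased neighbors, d_ the delimiter, and the
    # rest are _y/_n indicators for casing, digits, and punctuation.
    puts NLP.returnFeatures("cancer", ".", "The")
    # => pw_cancer  nw_the  d_.  nfc_y  nwcef_n  pwc_n  ... (tab-separated)
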