rbbt-text 0.2.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20
@@ -0,0 +1,115 @@
1
+ require 'rbbt/util/misc'
2
+ require 'tokyocabinet'
3
+
4
# Persistent store of document texts kept in a TokyoCabinet::BDB database.
#
# Keys are document ids of the form "namespace:id:type:hash" (see
# #fields2docid) and values are the document texts. Opened repositories are
# cached per path in CONNECTIONS; a cached connection is switched between
# read-only and writable mode by closing and reopening the database.
class DocumentRepo < TokyoCabinet::BDB
  # Raised when the underlying database file cannot be opened.
  class OpenError < StandardError; end
  class KeyFormatError < StandardError; end

  # Cache of open repositories, keyed by database path.
  CONNECTIONS = {} unless defined? CONNECTIONS

  # Returns the cached repository for +path+, opening (and creating) it when
  # the file is missing or no connection is cached yet.
  #
  # path  - location of the database file.
  # write - when true the returned repository is writable, otherwise it is
  #         read-only.
  #
  # The connection is only reopened when its current mode differs from the
  # requested one, so asking for write access on an already writable
  # repository keeps it writable.
  def self.get(path, write = false)
    if !File.exist?(path) || !CONNECTIONS.include?(path)
      CONNECTIONS[path] = new(path, true)
    end

    repo = CONNECTIONS[path]

    if write
      repo.write unless repo.write?
    elsif repo.write?
      repo.read
    end

    repo
  end

  alias original_open open

  # Opens the database at the path given to #initialize, creating its parent
  # directory when needed.
  #
  # write - true opens as writer (creating the file if absent), false opens
  #         as reader.
  #
  # Raises OpenError with the database error message when opening fails.
  def open(write = false)
    flags = if write
              TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT
            else
              TokyoCabinet::BDB::OREADER
            end

    directory = File.dirname(@path_to_db)
    FileUtils.mkdir_p(directory) unless File.exist?(directory)

    unless original_open(@path_to_db, flags)
      raise OpenError, "Open error: #{errmsg(ecode)}. Trying to open file #{@path_to_db}"
    end

    @write = write
  end

  # True when the repository was last opened in writable mode.
  def write?
    @write
  end

  # Reopens the repository in writable mode.
  def write
    close
    open(true)
  end

  # Reopens the repository in read-only mode.
  def read
    close
    open(false)
  end

  # path  - location of the database file.
  # write - open as writer; a missing file is always opened as writer so
  #         that it gets created.
  def initialize(path, write = false)
    super()

    @path_to_db = path

    if write || !File.exist?(@path_to_db)
      raise "Error setting cache" unless setcache(100000)
      open(true)
    else
      open(false)
    end
  end

  # Splits a document id into [namespace, id, type, hash]. Trailing empty
  # fields are kept as "" (split limit -1); fields that are absent
  # altogether come back as nil.
  def docid2fields(docid)
    docid.split(":", -1).values_at(0, 1, 2, 3)
  end

  # Joins the four fields into a document id; nil fields become empty
  # strings in the result.
  def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
    [namespace, id, type, hash] * ":"
  end

  # Returns the text stored under +docid+.
  def docid(docid)
    get(docid)
  end

  # Stores +text+ under the id built from the given fields, switching to
  # writable mode first if necessary. An existing entry is left untouched.
  #
  # Returns the document id.
  def add(text, namespace, id, type, hash)
    write unless write?
    docid = fields2docid(namespace, id, type, hash)
    self[docid] = text unless include?(docid)
    docid
  end

  # Lists document ids matching the given fields.
  #
  # With no namespace every key is returned. When +hash+ is given the result
  # is the single id built from the fields (with an empty type if +type+ is
  # missing) and the database is not consulted. Otherwise the keys sharing
  # the most specific prefix available (namespace, namespace:id or
  # namespace:id:type) are returned.
  def find(namespace = nil, id = nil, type = nil, hash = nil)
    return keys if namespace.nil?
    return keys_with_prefix(namespace) if id.nil?
    return [fields2docid(namespace, id, type, hash)] if type && hash
    return [fields2docid(namespace, id, "", hash)] if hash
    return keys_with_prefix(namespace, id, type) if type

    keys_with_prefix(namespace, id)
  end

  # Same as #find, taking the fields from a (possibly partial) document id.
  def find_docid(docid)
    find(*docid2fields(docid))
  end

  private

  # Returns the keys that start with the given fields followed by ':'.
  # ';' is the character right after ':' in ASCII, so the half-open key
  # range ["prefix:", "prefix;") covers exactly those keys.
  def keys_with_prefix(*fields)
    prefix = fields * ":"
    range(prefix + ':', true, prefix + ';', false)
  end
end
@@ -0,0 +1,26 @@
1
+ require 'rbbt/sources/pubmed'
2
+
3
# PubMed support for Corpus: registers the :pubmed namespace and adds
# articles as documents.
class Corpus

  NAMESPACES = {} unless defined? NAMESPACES
  NAMESPACES[:pubmed] = :add_pmid

  # Adds the articles for one PMID or an array of PMIDs.
  #
  # type - nil or "" lets each article decide: the abstract is used when the
  #        article has no pdf_url, the full text otherwise. A value whose
  #        symbol form is :abstract forces the abstract; any other value
  #        requests the full text.
  #
  # Returns the results of add_document, one per article.
  # Raises when full text is requested for an article without pdf_url.
  def add_pmid(pmid, type = nil)
    pmid_list = pmid.is_a?(Array) ? pmid : [pmid]
    type = nil if type.is_a?(String) && type.empty?

    PubMed.get_article(pmid_list).collect do |article_pmid, article|
      use_abstract =
        if type.nil?
          article.pdf_url.nil?
        else
          type.to_sym == :abstract
        end

      if use_abstract
        add_document(article.text, :pubmed, article_pmid, :abstract)
      else
        raise "No FullText available for #{ article_pmid }" if article.pdf_url.nil?
        add_document(article.full_text, :pubmed, article_pmid, :fulltext)
      end
    end
  end

  # Runs +query+ against PubMed (passing +max+ through to PubMed.query) and
  # adds the resulting PMIDs with #add_pmid.
  def add_pubmed_query(query, max, type = nil)
    add_pmid(PubMed.query(query, max), type)
  end
end
@@ -1,18 +1,20 @@
1
1
  require 'rbbt/ner/annotations'
2
+ require 'rbbt/ner/annotations/named_entity'
3
+ require 'rbbt/ner/annotations/annotated'
2
4
 
3
5
  class NER
4
- def entities(text, overlap = true, *args)
6
+ def entities(text, protect = false, *args)
5
7
  case
6
8
  when Array === text
7
9
  text.collect do |element|
8
- matches = entities(element, overlap, *args)
10
+ matches = entities(element, protect, *args)
9
11
  matches.each{|match|
10
- match.offset += element.offset if match.offset
12
+ match.offset += element.offset if match.offset and element.offset
11
13
  }
12
14
  matches
13
15
  end.flatten
14
- when (Annotated === text and not overlap)
15
- entities(text.split, overlap, *args)
16
+ when (Annotated === text and protect)
17
+ entities(text.split_segments(true), protect, *args)
16
18
  else
17
19
  match(text, *args)
18
20
  end
@@ -7,7 +7,7 @@ require 'rbbt/ner/NER'
7
7
  # in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
8
8
  class Abner < NER
9
9
 
10
- Rbbt.add_software "ABNER" => ['','']
10
+ Rbbt.software.opt.ABNER.define_as_install Rbbt.share.install.software.ABNER.find
11
11
 
12
12
  @@JFile = Rjb::import('java.io.File')
13
13
  @@Tagger = Rjb::import('abner.Tagger')
@@ -27,14 +27,25 @@ class Abner < NER
27
27
  # returns all the mentions found, regardless of type, to be coherent
28
28
  # with the rest of NER packages in Rbbt.
29
29
  def match(text)
30
+ return [] if text.nil? or text.empty?
30
31
 
31
32
  res = @tagger.getEntities(text)
32
33
  types = res[1]
33
34
  strings = res[0]
34
35
 
36
+ global_offset = 0
35
37
  strings.zip(types).collect do |mention, type|
36
38
  mention = mention.to_s;
37
- NamedEntity.annotate(mention, nil, type.to_s)
39
+ offset = text.index(mention)
40
+ if offset.nil?
41
+ NamedEntity.annotate(mention, nil, type.to_s)
42
+ else
43
+ NamedEntity.annotate(mention, offset + global_offset, type.to_s)
44
+ text = text[offset + mention.length..-1]
45
+ global_offset += offset + mention.length
46
+ end
47
+
48
+ mention
38
49
  end
39
50
  end
40
51
 
@@ -1,5 +1,63 @@
1
1
  module Segment
2
- attr_accessor :offset
2
+ attr_accessor :offset, :docid
3
+
4
+ def self.included(base)
5
+ if base.instance_methods.include? "segment_types"
6
+ class << base
7
+ self.module_eval do
8
+ define_method "extended" do |object|
9
+ object.segment_types ||= []
10
+ object.segment_types << self.to_s unless object.segment_types.include? self.to_s
11
+ end
12
+ end
13
+ end
14
+ end
15
+ end
16
+
17
+ def self.annotate(string, offset = nil, docid = nil)
18
+ string.extend Segment
19
+ string.offset = offset
20
+ string.docid = docid
21
+ string
22
+ end
23
+
24
+ def id
25
+ new = info.dup
26
+ Digest::MD5.hexdigest(Misc.hash2string(new) << self << (offset || 0).to_s)
27
+ end
28
+
29
+ SKIP = %w(docid offset)
30
+ def info
31
+ equal_ascii = "="[0]
32
+ info = {}
33
+ singleton_methods.select{|method| method[-1] == equal_ascii}.
34
+ collect{|m| m[(0..-2)]}.each{|m| info[m] = self.send(m) if self.respond_to?(m) and not SKIP.include? m.to_s}
35
+ info
36
+ info.delete_if{|k,v| v.nil?}
37
+ info
38
+ end
39
+
40
+ def self.load(text, start, eend, info, docid = nil)
41
+ string = text[start.to_i..eend.to_i] if start and eend
42
+ string ||= info[:literal]
43
+ string.extend Segment
44
+
45
+ # add types
46
+ types = info.delete("segment_types")|| info.delete(:segment_types) || []
47
+ types.each do |type| string.extend Misc.string2const(type) end
48
+
49
+ # set info data
50
+ info.each do |key,value|
51
+ string.send key + '=', value if string.respond_to? key.to_sym
52
+ end
53
+
54
+ string.docid = docid
55
+ string.offset = start.to_i
56
+
57
+ string
58
+ end
59
+
60
+ # {{{ Sorting and splitting
3
61
 
4
62
  def self.sort(segments, inline = true)
5
63
  if inline
@@ -14,21 +72,43 @@ module Segment
14
72
  when (not a.range.include? b.offset and not b.range.include? a.offset)
15
73
  a.offset <=> b.offset
16
74
  else
17
- b.length <=> a.length
75
+ a.length <=> b.length
18
76
  end
19
- end.reverse
77
+ end
20
78
  else
21
- segments.sort_by do |segment| segment.offset || 0 end
79
+ segments.sort_by do |segment| segment.offset || 0 end.reverse
80
+ end
81
+ end
82
+
83
+ def self.overlaps(sorted_segments)
84
+
85
+ last = nil
86
+ overlaped = []
87
+ sorted_segments.reverse.each do |segment|
88
+ overlaped << segment if (not last.nil?) and segment.range.end > last
89
+ last = segment.range.begin
22
90
  end
91
+
92
+ overlaped
93
+ end
94
+
95
+ def self.clean_sort(segments)
96
+ sorted = sort(segments).reject{|s| s.offset.nil?}
97
+ overlaps = overlaps(sorted)
98
+ overlaps.each do |s|
99
+ sorted.delete s
100
+ end
101
+
102
+ sorted
23
103
  end
24
104
 
25
- def self.split(text, segments)
26
- sorted_segments = sort segments
105
+ def self.split(text, segments, skip_segments = false)
106
+ sorted_segments = clean_sort segments
27
107
 
28
108
  chunks = []
29
109
  segment_end = 0
30
110
  text_offset = 0
31
- sorted_segments.each do |segment|
111
+ sorted_segments.reverse.each do |segment|
32
112
  return chunks if text.nil? or text.empty?
33
113
  next if segment.offset.nil?
34
114
  offset = segment.offset - text_offset
@@ -45,12 +125,15 @@ module Segment
45
125
 
46
126
  segment_end = offset + segment.length - 1
47
127
 
48
- chunk = text[offset..segment_end]
49
- Segment.annotate(chunk, text_offset + offset)
50
- chunks << chunk
128
+ if not skip_segments
129
+ chunk = text[offset..segment_end]
130
+ Segment.annotate(chunk, text_offset + offset)
131
+ chunks << chunk
132
+ end
51
133
 
52
134
  text_offset += segment_end + 1
53
135
  text = text[segment_end + 1..-1]
136
+
54
137
  end
55
138
 
56
139
  if not text.nil? and text.any?
@@ -62,62 +145,110 @@ module Segment
62
145
  chunks
63
146
  end
64
147
 
65
- def self.annotate(string, offset = nil)
66
- string.extend Segment
67
- string.offset = offset
68
- string
148
+ # {{{ Ranges and manipulation
149
+
150
+ def pull(offset)
151
+ if self.offset.nil? or offset.nil?
152
+ self.offset = nil
153
+ else
154
+ self.offset += offset
155
+ end
156
+
157
+ self
158
+ end
159
+
160
+ def push(offset)
161
+ if self.offset.nil? or offset.nil?
162
+ self.offset = nil
163
+ else
164
+ self.offset -= offset
165
+ end
166
+
167
+ self
168
+ end
169
+
170
+ def make_relative(segments)
171
+ segments.collect{|s| s.push offset}
172
+ end
173
+
174
+ def end
175
+ return nil if offset.nil?
176
+ offset + length - 1
69
177
  end
70
178
 
71
179
  def range
72
- (offset..offset + length - 1)
180
+ raise "No offset specified" if offset.nil?
181
+ (offset..self.end)
73
182
  end
74
- end
75
183
 
76
- module Annotated
77
- attr_accessor :annotations
78
- def self.annotate(string)
79
- string.extend Annotated
80
- string.annotations = []
81
- string
184
+ def range_in(container = nil)
185
+ raise "No offset specified" if offset.nil?
186
+ case
187
+ when (Segment === container and not container.offset.nil?)
188
+ ((offset - container.offset)..(self.end - container.offset))
189
+ when Integer === container
190
+ ((offset - container)..(self.end - container))
191
+ else
192
+ range
193
+ end
82
194
  end
83
195
 
84
- def split
85
- Segment.split(self, @annotations)
196
+ def self.align(text, parts)
197
+ pre_offset = 0
198
+ parts.each do |part|
199
+ offset = text.index part
200
+ next if offset.nil?
201
+ Segment.annotate(part, pre_offset + offset)
202
+ pre_offset += offset + part.length - 1
203
+ text = text[(offset + part.length - 1)..-1]
204
+ end
86
205
  end
87
- end
88
206
 
89
- module NamedEntity
90
- include Segment
91
- attr_accessor :type, :code, :score
207
+ class Index
208
+ attr_accessor :index, :data
209
+ def initialize(index, data)
210
+ @index = index
211
+ @data = data
212
+ end
92
213
 
93
- def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
94
- string.extend NamedEntity
95
- string.offset = offset
96
- string.type = type
97
- string.code = code
98
- string.score = score
99
- string
214
+ def [](pos)
215
+ index[pos].collect{|id| data[id]}
216
+ end
100
217
  end
101
218
 
102
- def to_s
103
- <<-EOF
104
- String: #{ self }
105
- Offset: #{ offset.inspect }
106
- Type: #{type.inspect}
107
- Code: #{code.inspect}
108
- Score: #{score.inspect}
109
- EOF
219
+ def self.index(segments, persistence_file = :memory)
220
+
221
+ segments = segments.values.flatten if Hash === segments
222
+
223
+ annotation_index =
224
+ Persistence.persist("Index", :Index, :fwt, :persistence => (! (persistence_file.nil? or persistence_file == :memory)), :persistence_file => persistence_file, :range => true) do
225
+
226
+ value_size = 0
227
+ index_data = segments.collect{|segment|
228
+ next if segment.offset.nil?
229
+ range = segment.range
230
+ value_size = [segment.id.length, value_size].max
231
+ [segment.id, [range.begin, range.end]]
232
+ }.compact
233
+
234
+ fwt = FixWidthTable.get :memory, value_size, true
235
+ fwt.add_range index_data
236
+ fwt
237
+ end
238
+
239
+ data = {}
240
+ segments.each do |segment| data[segment.id] = segment end
241
+ Index.new annotation_index, data
110
242
  end
243
+
111
244
  end
112
245
 
113
- module Token
246
+ module Comment
114
247
  include Segment
115
- attr_accessor :original
116
- def self.annotate(string, offset = nil, original = nil)
117
- string.extend Token
118
- string.offset = offset
119
- string.original = original
120
- string
248
+ attr_accessor :comment
249
+ def self.annotate(text, comment = nil)
250
+ text.extend Comment
251
+ text.comment = (comment.nil? ? text : comment)
252
+ text
121
253
  end
122
254
  end
123
-