rbbt-text 0.2.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. data/bin/get_ppis.rb +52 -0
  2. data/lib/rbbt/bow/dictionary.rb +9 -9
  3. data/lib/rbbt/bow/misc.rb +86 -2
  4. data/lib/rbbt/corpus/corpus.rb +55 -0
  5. data/lib/rbbt/corpus/document.rb +289 -0
  6. data/lib/rbbt/corpus/document_repo.rb +115 -0
  7. data/lib/rbbt/corpus/sources/pubmed.rb +26 -0
  8. data/lib/rbbt/ner/NER.rb +7 -5
  9. data/lib/rbbt/ner/abner.rb +13 -2
  10. data/lib/rbbt/ner/annotations.rb +182 -51
  11. data/lib/rbbt/ner/annotations/annotated.rb +15 -0
  12. data/lib/rbbt/ner/annotations/named_entity.rb +37 -0
  13. data/lib/rbbt/ner/annotations/relations.rb +25 -0
  14. data/lib/rbbt/ner/annotations/token.rb +28 -0
  15. data/lib/rbbt/ner/annotations/transformed.rb +170 -0
  16. data/lib/rbbt/ner/banner.rb +8 -5
  17. data/lib/rbbt/ner/chemical_tagger.rb +34 -0
  18. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +136 -0
  19. data/lib/rbbt/ner/oscar3.rb +1 -1
  20. data/lib/rbbt/ner/oscar4.rb +41 -0
  21. data/lib/rbbt/ner/patterns.rb +132 -0
  22. data/lib/rbbt/ner/rnorm.rb +141 -0
  23. data/lib/rbbt/ner/rnorm/cue_index.rb +80 -0
  24. data/lib/rbbt/ner/rnorm/tokens.rb +218 -0
  25. data/lib/rbbt/ner/token_trieNER.rb +185 -51
  26. data/lib/rbbt/nlp/genia/sentence_splitter.rb +214 -0
  27. data/lib/rbbt/nlp/nlp.rb +235 -0
  28. data/share/install/software/ABNER +0 -4
  29. data/share/install/software/ChemicalTagger +81 -0
  30. data/share/install/software/Gdep +115 -0
  31. data/share/install/software/Geniass +118 -0
  32. data/share/install/software/OSCAR4 +16 -0
  33. data/share/install/software/StanfordParser +15 -0
  34. data/share/patterns/drug_induce_disease +22 -0
  35. data/share/rnorm/cue_default +10 -0
  36. data/share/rnorm/tokens_default +86 -0
  37. data/share/{stopwords → wordlists/stopwords} +0 -0
  38. data/test/rbbt/bow/test_bow.rb +1 -1
  39. data/test/rbbt/bow/test_dictionary.rb +1 -1
  40. data/test/rbbt/bow/test_misc.rb +1 -1
  41. data/test/rbbt/corpus/test_corpus.rb +99 -0
  42. data/test/rbbt/corpus/test_document.rb +222 -0
  43. data/test/rbbt/ner/annotations/test_named_entity.rb +14 -0
  44. data/test/rbbt/ner/annotations/test_transformed.rb +175 -0
  45. data/test/rbbt/ner/test_abner.rb +1 -1
  46. data/test/rbbt/ner/test_annotations.rb +64 -2
  47. data/test/rbbt/ner/test_banner.rb +1 -1
  48. data/test/rbbt/ner/test_chemical_tagger.rb +56 -0
  49. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +20 -0
  50. data/test/rbbt/ner/{test_oscar3.rb → test_oscar4.rb} +12 -13
  51. data/test/rbbt/ner/test_patterns.rb +66 -0
  52. data/test/rbbt/ner/test_regexpNER.rb +1 -1
  53. data/test/rbbt/ner/test_rnorm.rb +47 -0
  54. data/test/rbbt/ner/test_token_trieNER.rb +60 -35
  55. data/test/rbbt/nlp/test_nlp.rb +88 -0
  56. data/test/test_helper.rb +20 -0
  57. metadata +93 -20

data/lib/rbbt/corpus/document_repo.rb
@@ -0,0 +1,115 @@
+ require 'rbbt/util/misc'
+ require 'tokyocabinet'
+
+ class DocumentRepo < TokyoCabinet::BDB
+   class OpenError < StandardError; end
+   class KeyFormatError < StandardError; end
+
+   CONNECTIONS = {} unless defined? CONNECTIONS
+
+   def self.get(path, write = false)
+
+     if !File.exists?(path) or not CONNECTIONS.include? path
+       CONNECTIONS[path] = self.new(path, true)
+     end
+
+     d = CONNECTIONS[path]
+
+     if write and not d.write?
+       d.write
+     else
+       d.read if d.write?
+     end
+
+     d
+   end
+
+
+   alias original_open open
+   def open(write = false)
+     flags = (write ? TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT : TokyoCabinet::BDB::OREADER)
+
+     FileUtils.mkdir_p File.dirname(@path_to_db) unless File.exists?(File.dirname(@path_to_db))
+     if !self.original_open(@path_to_db, flags)
+       ecode = self.ecode
+       raise OpenError, "Open error: #{self.errmsg(ecode)}. Trying to open file #{@path_to_db}"
+     end
+
+     @write = write
+
+   end
+
+   def write?
+     @write
+   end
+
+   def write
+     self.close
+     self.open(true)
+   end
+
+   def read
+     self.close
+     self.open(false)
+   end
+
+   def initialize(path, write = false)
+     super()
+
+     @path_to_db = path
+
+     if write || ! File.exists?(@path_to_db)
+       self.setcache(100000) or raise "Error setting cache"
+       self.open(true)
+     else
+       self.open(false)
+     end
+   end
+
+   def docid2fields(docid)
+     docid.split(":", -1).values_at 0,1,2,3
+   end
+
+   def fields2docid(namespace = nil, id = nil, type = nil, hash = nil)
+     [namespace, id, type, hash] * ":"
+   end
+
+   def docid(docid)
+     get(docid)
+   end
+
+   def add(text, namespace, id, type, hash)
+     write unless write?
+     docid = fields2docid(namespace, id, type, hash)
+     self[docid] = text unless self.include? docid
+     docid
+   end
+
+   def find(namespace=nil, id = nil, type = nil, hash = nil)
+     case
+     when namespace.nil?
+       self.keys
+     when id.nil?
+       range_start = [namespace] * ":" + ':'
+       range_end = [namespace] * ":" + ';'
+       self.range(range_start, true, range_end, false)
+     when (type and hash)
+       [[namespace, id, type, hash] * ":"]
+     when hash
+       [[namespace, id, "", hash] * ":"]
+     when type
+       range_start = [namespace, id, type] * ":" + ':'
+       range_end = [namespace, id, type] * ":" + ';'
+       self.range(range_start, true, range_end, false)
+     else
+       range_start = [namespace, id] * ":" + ':'
+       range_end = [namespace, id] * ":" + ';'
+       self.range(range_start, true, range_end, false)
+     end
+   end
+
+   def find_docid(docid)
+     find(*docid2fields(docid))
+   end
+
+ end
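
For orientation, a minimal usage sketch of the new DocumentRepo (the path and document values below are invented for illustration; the class stores each document under a namespace:id:type:hash docid and caches one connection per path):

    repo  = DocumentRepo.get('/tmp/documents.tcb', true)      # hypothetical path
    docid = repo.add("Some abstract text", :pubmed, "12345", :abstract, nil)
    repo.docid(docid)                                          # => "Some abstract text"
    repo.find(:pubmed, "12345")                                # => docids stored for that PMID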

data/lib/rbbt/corpus/sources/pubmed.rb
@@ -0,0 +1,26 @@
+ require 'rbbt/sources/pubmed'
+
+ class Corpus
+
+   NAMESPACES = {} unless defined? NAMESPACES
+   NAMESPACES[:pubmed] = :add_pmid
+
+   def add_pmid(pmid, type = nil)
+     pmids = Array === pmid ? pmid : [pmid]
+     type = nil if String === type and type.empty?
+
+     PubMed.get_article(pmids).collect do |pmid, article|
+       if (type.nil? and article.pdf_url.nil?) or (not type.nil? and type.to_sym === :abstract)
+         add_document(article.text, :pubmed, pmid, :abstract)
+       else
+         raise "No FullText available for #{ pmid }" if article.pdf_url.nil?
+         add_document(article.full_text, :pubmed, pmid, :fulltext)
+       end
+     end
+   end
+
+   def add_pubmed_query(query, max, type = nil)
+     pmids = PubMed.query(query, max)
+     add_pmid(pmids, type)
+   end
+ end
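
A rough sketch of how the new PubMed helpers are meant to be driven. It assumes a Corpus instance that responds to add_document (defined in data/lib/rbbt/corpus/corpus.rb); the no-argument constructor, the PMID and the query below are placeholders and may not match the actual Corpus initializer:

    corpus = Corpus.new                            # assumed no-argument constructor
    corpus.add_pmid("21904853", :abstract)         # fetch a single abstract and store it
    corpus.add_pubmed_query("gene disease", 10)    # store up to 10 articles matching the query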

data/lib/rbbt/ner/NER.rb
@@ -1,18 +1,20 @@
  require 'rbbt/ner/annotations'
+ require 'rbbt/ner/annotations/named_entity'
+ require 'rbbt/ner/annotations/annotated'

  class NER
-   def entities(text, overlap = true, *args)
+   def entities(text, protect = false, *args)
      case
      when Array === text
        text.collect do |element|
-         matches = entities(element, overlap, *args)
+         matches = entities(element, protect, *args)
          matches.each{|match|
-           match.offset += element.offset if match.offset
+           match.offset += element.offset if match.offset and element.offset
          }
          matches
        end.flatten
-     when (Annotated === text and not overlap)
-       entities(text.split, overlap, *args)
+     when (Annotated === text and protect)
+       entities(text.split_segments(true), protect, *args)
      else
        match(text, *args)
      end
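
The reworked dispatch accepts a plain string, an array of (possibly offset-carrying) strings, or an Annotated text whose existing segments are split out so they are not matched again (the protect flag). A hedged sketch against any NER subclass implementing match; Abner is used only as an example and requires the ABNER software to be installed, and the mention attributes assume NamedEntity keeps its offset and type accessors:

    ner = Abner.new
    mentions = ner.entities("TP53 phosphorylates MDM2")
    mentions.each do |mention|
      puts [mention, mention.offset.inspect, mention.type] * "\t"
    end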

data/lib/rbbt/ner/abner.rb
@@ -7,7 +7,7 @@ require 'rbbt/ner/NER'
  # in Java Abner[http://www.cs.wisc.edu/~bsettles/abner/].
  class Abner < NER

-   Rbbt.add_software "ABNER" => ['','']
+   Rbbt.software.opt.ABNER.define_as_install Rbbt.share.install.software.ABNER.find

    @@JFile = Rjb::import('java.io.File')
    @@Tagger = Rjb::import('abner.Tagger')
@@ -27,14 +27,25 @@ class Abner < NER
    # returns all the mentions found, regardless of type, to be coherent
    # with the rest of NER packages in Rbbt.
    def match(text)
+     return [] if text.nil? or text.empty?

      res = @tagger.getEntities(text)
      types = res[1]
      strings = res[0]

+     global_offset = 0
      strings.zip(types).collect do |mention, type|
        mention = mention.to_s;
-       NamedEntity.annotate(mention, nil, type.to_s)
+       offset = text.index(mention)
+       if offset.nil?
+         NamedEntity.annotate(mention, nil, type.to_s)
+       else
+         NamedEntity.annotate(mention, offset + global_offset, type.to_s)
+         text = text[offset + mention.length..-1]
+         global_offset += offset + mention.length
+       end
+
+       mention
      end
    end


data/lib/rbbt/ner/annotations.rb
@@ -1,5 +1,63 @@
  module Segment
-   attr_accessor :offset
+   attr_accessor :offset, :docid
+
+   def self.included(base)
+     if base.instance_methods.include? "segment_types"
+       class << base
+         self.module_eval do
+           define_method "extended" do |object|
+             object.segment_types ||= []
+             object.segment_types << self.to_s unless object.segment_types.include? self.to_s
+           end
+         end
+       end
+     end
+   end
+
+   def self.annotate(string, offset = nil, docid = nil)
+     string.extend Segment
+     string.offset = offset
+     string.docid = docid
+     string
+   end
+
+   def id
+     new = info.dup
+     Digest::MD5.hexdigest(Misc.hash2string(new) << self << (offset || 0).to_s)
+   end
+
+   SKIP = %w(docid offset)
+   def info
+     equal_ascii = "="[0]
+     info = {}
+     singleton_methods.select{|method| method[-1] == equal_ascii}.
+       collect{|m| m[(0..-2)]}.each{|m| info[m] = self.send(m) if self.respond_to?(m) and not SKIP.include? m.to_s}
+     info
+     info.delete_if{|k,v| v.nil?}
+     info
+   end
+
+   def self.load(text, start, eend, info, docid = nil)
+     string = text[start.to_i..eend.to_i] if start and eend
+     string ||= info[:literal]
+     string.extend Segment
+
+     # add types
+     types = info.delete("segment_types")|| info.delete(:segment_types) || []
+     types.each do |type| string.extend Misc.string2const(type) end
+
+     # set info data
+     info.each do |key,value|
+       string.send key + '=', value if string.respond_to? key.to_sym
+     end
+
+     string.docid = docid
+     string.offset = start.to_i
+
+     string
+   end
+
+   # {{{ Sorting and splitting

    def self.sort(segments, inline = true)
      if inline
@@ -14,21 +72,43 @@ module Segment
          when (not a.range.include? b.offset and not b.range.include? a.offset)
            a.offset <=> b.offset
          else
-           b.length <=> a.length
+           a.length <=> b.length
          end
-       end.reverse
+       end
      else
-       segments.sort_by do |segment| segment.offset || 0 end
+       segments.sort_by do |segment| segment.offset || 0 end.reverse
+     end
+   end
+
+   def self.overlaps(sorted_segments)
+
+     last = nil
+     overlaped = []
+     sorted_segments.reverse.each do |segment|
+       overlaped << segment if (not last.nil?) and segment.range.end > last
+       last = segment.range.begin
      end
+
+     overlaped
+   end
+
+   def self.clean_sort(segments)
+     sorted = sort(segments).reject{|s| s.offset.nil?}
+     overlaps = overlaps(sorted)
+     overlaps.each do |s|
+       sorted.delete s
+     end
+
+     sorted
    end

-   def self.split(text, segments)
-     sorted_segments = sort segments
+   def self.split(text, segments, skip_segments = false)
+     sorted_segments = clean_sort segments

      chunks = []
      segment_end = 0
      text_offset = 0
-     sorted_segments.each do |segment|
+     sorted_segments.reverse.each do |segment|
        return chunks if text.nil? or text.empty?
        next if segment.offset.nil?
        offset = segment.offset - text_offset
@@ -45,12 +125,15 @@ module Segment

      segment_end = offset + segment.length - 1

-     chunk = text[offset..segment_end]
-     Segment.annotate(chunk, text_offset + offset)
-     chunks << chunk
+     if not skip_segments
+       chunk = text[offset..segment_end]
+       Segment.annotate(chunk, text_offset + offset)
+       chunks << chunk
+     end

      text_offset += segment_end + 1
      text = text[segment_end + 1..-1]
+
    end

    if not text.nil? and text.any?
@@ -62,62 +145,110 @@ module Segment
      chunks
    end

-   def self.annotate(string, offset = nil)
-     string.extend Segment
-     string.offset = offset
-     string
+   # {{{ Ranges and manipulation
+
+   def pull(offset)
+     if self.offset.nil? or offset.nil?
+       self.offset = nil
+     else
+       self.offset += offset
+     end
+
+     self
+   end
+
+   def push(offset)
+     if self.offset.nil? or offset.nil?
+       self.offset = nil
+     else
+       self.offset -= offset
+     end
+
+     self
+   end
+
+   def make_relative(segments)
+     segments.collect{|s| s.push offset}
+   end
+
+   def end
+     return nil if offset.nil?
+     offset + length - 1
    end

    def range
-     (offset..offset + length - 1)
+     raise "No offset specified" if offset.nil?
+     (offset..self.end)
    end
- end

- module Annotated
-   attr_accessor :annotations
-   def self.annotate(string)
-     string.extend Annotated
-     string.annotations = []
-     string
+   def range_in(container = nil)
+     raise "No offset specified" if offset.nil?
+     case
+     when (Segment === container and not container.offset.nil?)
+       ((offset - container.offset)..(self.end - container.offset))
+     when Integer === container
+       ((offset - container)..(self.end - container))
+     else
+       range
+     end
    end

-   def split
-     Segment.split(self, @annotations)
+   def self.align(text, parts)
+     pre_offset = 0
+     parts.each do |part|
+       offset = text.index part
+       next if offset.nil?
+       Segment.annotate(part, pre_offset + offset)
+       pre_offset += offset + part.length - 1
+       text = text[(offset + part.length - 1)..-1]
+     end
    end
- end

- module NamedEntity
-   include Segment
-   attr_accessor :type, :code, :score
+   class Index
+     attr_accessor :index, :data
+     def initialize(index, data)
+       @index = index
+       @data = data
+     end

-   def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
-     string.extend NamedEntity
-     string.offset = offset
-     string.type = type
-     string.code = code
-     string.score = score
-     string
+     def [](pos)
+       index[pos].collect{|id| data[id]}
+     end
    end

-   def to_s
-     <<-EOF
-     String: #{ self }
-     Offset: #{ offset.inspect }
-     Type: #{type.inspect}
-     Code: #{code.inspect}
-     Score: #{score.inspect}
-     EOF
+   def self.index(segments, persistence_file = :memory)
+
+     segments = segments.values.flatten if Hash === segments
+
+     annotation_index =
+       Persistence.persist("Index", :Index, :fwt, :persistence => (! (persistence_file.nil? or persistence_file == :memory)), :persistence_file => persistence_file, :range => true) do
+
+         value_size = 0
+         index_data = segments.collect{|segment|
+           next if segment.offset.nil?
+           range = segment.range
+           value_size = [segment.id.length, value_size].max
+           [segment.id, [range.begin, range.end]]
+         }.compact
+
+         fwt = FixWidthTable.get :memory, value_size, true
+         fwt.add_range index_data
+         fwt
+       end
+
+     data = {}
+     segments.each do |segment| data[segment.id] = segment end
+     Index.new annotation_index, data
    end
+
  end

- module Token
+ module Comment
    include Segment
-   attr_accessor :original
-   def self.annotate(string, offset = nil, original = nil)
-     string.extend Token
-     string.offset = offset
-     string.original = original
-     string
+   attr_accessor :comment
+   def self.annotate(text, comment = nil)
+     text.extend Comment
+     text.comment = (comment.nil? ? text : comment)
+     text
    end
  end
-
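
Taken together, Segment now carries a docid, exposes absolute and container-relative ranges, and can build a positional index over segments. A brief sketch of the range helpers added above (the sentence and offsets are invented for illustration):

    sentence = "This sentence mentions TP53 twice: TP53."
    gene = Segment.annotate("TP53", sentence.index("TP53"))
    gene.range          # => 23..26 (absolute character range)
    gene.range_in(10)   # => 13..16, relative to a container starting at offset 10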