rbbt-text 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. data/lib/rbbt/corpus/corpus.rb +15 -6
  2. data/lib/rbbt/corpus/document.rb +100 -127
  3. data/lib/rbbt/corpus/document_repo.rb +72 -51
  4. data/lib/rbbt/ner/NER.rb +4 -4
  5. data/lib/rbbt/ner/abner.rb +5 -4
  6. data/lib/rbbt/ner/banner.rb +3 -3
  7. data/lib/rbbt/ner/chemical_tagger.rb +3 -3
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
  9. data/lib/rbbt/ner/oscar3.rb +3 -3
  10. data/lib/rbbt/ner/oscar4.rb +3 -3
  11. data/lib/rbbt/ner/patterns.rb +15 -13
  12. data/lib/rbbt/ner/regexpNER.rb +3 -2
  13. data/lib/rbbt/ner/rnorm.rb +2 -2
  14. data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
  15. data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
  16. data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
  17. data/lib/rbbt/ner/segment/relationship.rb +20 -0
  18. data/lib/rbbt/ner/segment/segmented.rb +13 -0
  19. data/lib/rbbt/ner/segment/token.rb +24 -0
  20. data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
  21. data/lib/rbbt/ner/token_trieNER.rb +30 -22
  22. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  23. data/lib/rbbt/nlp/nlp.rb +23 -37
  24. data/test/rbbt/corpus/test_document.rb +39 -37
  25. data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
  26. data/test/rbbt/ner/segment/test_segmented.rb +23 -0
  27. data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
  28. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
  29. data/test/rbbt/ner/test_patterns.rb +11 -12
  30. data/test/rbbt/ner/test_regexpNER.rb +5 -4
  31. data/test/rbbt/ner/test_segment.rb +101 -0
  32. data/test/rbbt/ner/test_token_trieNER.rb +8 -9
  33. data/test/test_helper.rb +6 -6
  34. metadata +40 -22
  35. data/lib/rbbt/ner/annotations/annotated.rb +0 -15
  36. data/lib/rbbt/ner/annotations/relations.rb +0 -25
  37. data/lib/rbbt/ner/annotations/token.rb +0 -28
  38. data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
  39. data/test/rbbt/ner/test_annotations.rb +0 -70
@@ -7,17 +7,26 @@ class Corpus
7
7
  @corpora_path = case
8
8
  when corpora_path.nil?
9
9
  Rbbt.corpora
10
- when (not Resource::Path === corpora_path)
11
- Resource::Path.path(corpora_path)
10
+ when (not Path === corpora_path)
11
+ Path.setup(corpora_path)
12
12
  else
13
13
  corpora_path
14
14
  end
15
15
 
16
- @document_repo = DocumentRepo.get @corpora_path.document_repo, false
16
+ @corpora_path = @corpora_path.find
17
17
  @persistence_dir = File.join(@corpora_path, "annotations")
18
- @global_annotations = TSV.new(TCHash.get(File.join(@persistence_dir, "global_annotations"), :list), :list, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
19
- @global_annotations.unnamed = true
20
- end
18
+
19
+ Misc.lock(@persistence_dir) do
20
+ @global_annotations = TSV.setup(Persist.open_tokyocabinet(File.join(@persistence_dir, "global_annotations"), false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"])
21
+ @global_annotations.unnamed = true
22
+ @global_annotations.close
23
+ end
24
+
25
+ Misc.lock(@corpora_path.document_repo) do
26
+ @document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
27
+ end
28
+
29
+ end
21
30
 
22
31
  def persistence_for(docid)
23
32
  File.join(persistence_dir, docid)
@@ -1,19 +1,21 @@
1
- require 'rbbt/ner/annotations'
2
- require 'rbbt/util/tsv'
3
- require 'rbbt/util/resource'
1
+ require 'rbbt/ner/segment'
2
+ require 'rbbt/ner/segment/segmented'
3
+ require 'rbbt/tsv'
4
+ require 'rbbt/resource/path'
5
+ require 'rbbt/persist/tsv'
4
6
  require 'rbbt/util/misc'
5
7
  require 'json'
6
8
 
7
9
  class Document
8
10
 
9
- attr_accessor :text, :docid, :namespace, :id, :type, :hash, :annotations, :segment_indeces, :persistence_dir, :global_persistence
10
- def initialize(persistence_dir = nil, docid = nil, text = nil, global_persistence = nil)
11
- @annotations = {}
11
+ attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indeces, :persist_dir, :global_persistence
12
+ def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil)
13
+ @segments = {}
12
14
  @segment_indeces = {}
13
15
 
14
- if not persistence_dir.nil?
15
- @persistence_dir = persistence_dir
16
- @persistence_dir = Resource::Path.path(@persistence_dir) if not Resource::Path == @persistence_dir
16
+ if not persist_dir.nil?
17
+ @persist_dir = persist_dir
18
+ @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
17
19
  end
18
20
 
19
21
  @global_persistence = global_persistence
@@ -34,45 +36,6 @@ class Document
34
36
  update_docid
35
37
  end
36
38
 
37
- def self.save_segment(segment, fields = nil)
38
- if fields.nil?
39
- eend = case segment.offset; when nil; nil; when -1; -1; else segment.end; end
40
- [segment.offset, eend, segment.info.to_json]
41
- else
42
- eend = case segment.offset; when nil; nil; when -1; -1; else segment.end; end
43
- info = segment.info
44
- info["literal"] = segment.to_s.gsub(/\s/,' ')
45
- info.extend IndiferentHash
46
- [segment.offset, eend].concat info.values_at(*fields.collect{|f| f.downcase}).collect{|v| Array === v ? v * "|" : v}
47
- end
48
- end
49
-
50
- def self.load_segment(text, annotation, fields = nil)
51
- if fields.nil?
52
- start, eend, info = annotation.values_at 0,1,2
53
- info = JSON.parse(info)
54
- else
55
- start, eend = annotation.values_at 0,1
56
- info = Misc.process_to_hash(fields) do |fields| annotation.values_at(*fields.collect{|f| f.downcase}).collect{|v| v.index("|").nil? ? v : v.split("|")} end
57
- end
58
-
59
- Segment.load(text, start, eend, info, @docid)
60
- end
61
-
62
- def self.tsv(segments, fields = nil)
63
- tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
64
- if fields.nil?
65
- tsv.fields += ["Info"]
66
- else
67
- tsv.fields += fields
68
- end
69
-
70
- segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
71
-
72
- tsv
73
- end
74
-
75
-
76
39
  #{{{ PERSISTENCE
77
40
 
78
41
  TSV_REPOS = {}
@@ -82,30 +45,28 @@ class Document
82
45
  if not fields.nil?
83
46
  fields = [fields] if not Array === fields
84
47
  fields = fields.collect{|f| f.to_s}
85
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
48
+ FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
86
49
  end
87
50
 
88
51
  self.class_eval <<-EOC
89
- def load_with_persistence_#{entity}
52
+ def load_with_persistence_#{entity}(raw = false)
90
53
  fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
91
54
 
92
- annotations = Persistence.persist("#{ entity }", :Entity, :tsv_string,
93
- :persistence_file => File.join(@persistence_dir, "#{ entity }")) do
55
+ tsv_file = File.join(@persist_dir.find, "#{ entity }")
94
56
 
95
- tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
96
- if fields.nil?
97
- tsv.fields += ["Info"]
98
- else
99
- tsv.fields += fields
100
- end
57
+ return nil if raw == :check and File.exists? tsv_file
101
58
 
59
+ annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
102
60
  segments = produce_#{entity}
103
- segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
104
-
105
- tsv
61
+ tsv = Segment.tsv(segments, fields)
106
62
  end
107
63
 
108
- annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields)}
64
+ return annotations if raw
65
+
66
+ annotations.unnamed = true
67
+ annotations.collect{|id, annotation|
68
+ Segment.load_tsv_values(text, annotation, annotations.fields)
69
+ }
109
70
  end
110
71
  EOC
111
72
  end
@@ -125,47 +86,43 @@ class Document
125
86
  end
126
87
 
127
88
  self.class_eval <<-EOC
128
- def load_with_persistence_#{entity}
89
+ def load_with_persistence_#{entity}(raw = false)
129
90
  repo = TSV_REPOS["#{ entity }"]
130
91
  if repo.nil?
131
- raise "No persistence file or persistencr dir for persist_in_tsv" if persistence_dir.nil?
132
- repo = TCHash.get(persistence_dir.annotations_by_type.find, TCHash::TSVSerializer)
92
+ raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
93
+ repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
133
94
  end
134
95
 
135
-
136
96
  fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
137
-
138
97
  if not repo.include? "#{ entity }"
139
- tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
140
- if fields.nil?
141
- tsv.fields += ["Info"]
142
- else
143
- tsv.fields += fields
144
- end
145
-
146
- produce_#{entity}.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
98
+ segments = produce_#{entity}
147
99
  repo.write
148
- repo["#{entity}"] = tsv
100
+ repo["#{entity}"] = Segment.tsv(segments, fields)
149
101
  repo.read
102
+ else
103
+ if raw == :check
104
+ repo.close
105
+ return nil
106
+ end
150
107
  end
151
108
 
109
+
152
110
  annotations = repo["#{entity}"]
153
111
 
154
112
  repo.close
155
113
 
156
- annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields)}
114
+
115
+ return annotations if raw
116
+
117
+ annotations.unnamed = true
118
+ annotations.collect{|id, annotation|
119
+ Segment.load_tsv_values(text, annotation, annotations.fields)
120
+ }
157
121
  end
158
- EOC
122
+ EOC
159
123
  end
160
124
 
161
125
  def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
162
- if not tsv.nil? and not tsv.respond_to?(:keys)
163
- entity_field = doc_field if doc_field
164
- doc_field = fields if fields
165
- fields = tsv if tsv
166
- tsv = nil
167
- end
168
-
169
126
  doc_field ||= "Document ID"
170
127
  entity_field ||= "Entity Type"
171
128
 
@@ -174,34 +131,34 @@ class Document
174
131
  if not fields.nil?
175
132
  fields = [fields] if not Array === fields
176
133
  fields = fields.collect{|f| f.to_s}
177
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
134
+ else
135
+ fields = nil
178
136
  end
179
137
 
138
+ FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
139
+
180
140
  self.class_eval <<-EOC
181
- def load_with_persistence_#{entity}
141
+ def load_with_persistence_#{entity}(raw = false)
182
142
  fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
183
143
 
184
- data = TSV_REPOS["#{ entity }"]
144
+ data = TSV_REPOS["#{ entity }"] || @global_persistence
145
+
146
+ data.read true
147
+
148
+ fields = data.fields if fields.nil? and data.respond_to? :fields
185
149
 
186
- if data.nil?
187
- data = global_persistence
188
- end
189
150
 
190
151
  data.filter
191
152
  data.add_filter("field:#{ doc_field }", @docid)
192
153
  data.add_filter("field:#{ entity_field }", "#{ entity }")
154
+ keys = data.keys
155
+ data.pop_filter
156
+ data.pop_filter
193
157
 
194
- if data.keys.empty?
195
- tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
196
- if fields.nil?
197
- tsv.fields += ["Info"]
198
- else
199
- tsv.fields += fields
200
- end
201
-
158
+ if keys.empty?
202
159
  segments = produce_#{entity}
203
- segments << Segment.annotate("No #{entity} found in document #{ @docid }", -1) if segments.empty?
204
- segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
160
+ segments << Segment.setup("No #{entity} found in document #{ @docid }", -1) if segments.empty?
161
+ tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
205
162
 
206
163
  tsv.add_field "#{ doc_field }" do
207
164
  @docid
@@ -211,16 +168,31 @@ class Document
211
168
  "#{ entity }"
212
169
  end
213
170
 
214
- data.write
215
- data.merge!(tsv)
171
+ data.add_filter("field:#{ doc_field }", @docid)
172
+ data.add_filter("field:#{ entity_field }", "#{ entity }")
173
+ data.write true
174
+ keys = tsv.collect do |key, value|
175
+ data[key] = value
176
+ key
177
+ end
178
+ data.pop_filter
179
+ data.pop_filter
216
180
  data.read
181
+ else
182
+ if raw == :check
183
+ data.close
184
+ return nil
185
+ end
217
186
  end
218
187
 
219
- segments = []
220
- data.each{|id, annotation| segments << Document.load_segment(text, annotation, fields) unless annotation[1].to_i == -1}
188
+ return data.values if raw
221
189
 
222
- data.pop_filter
223
- data.pop_filter
190
+ start_pos = data.identify_field "Start"
191
+ segments = data.values_at(*keys).collect{|annotation|
192
+ pos = annotation[start_pos]
193
+ Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
194
+ }.compact
195
+ data.close
224
196
 
225
197
  segments
226
198
  end
@@ -232,21 +204,21 @@ class Document
232
204
  send :define_method, "produce_#{entity}", &block
233
205
 
234
206
  self.class_eval <<-EOC
235
- def load_#{entity}
236
- return if annotations.include? "#{ entity }"
237
- if self.respond_to?("load_with_persistence_#{entity}") and not @persistence_dir.nil?
238
- annotations["#{entity}"] = load_with_persistence_#{entity}
207
+ def load_#{entity}(raw = false)
208
+ return if segments.include? "#{ entity }"
209
+ if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
210
+ segments["#{entity}"] = load_with_persistence_#{entity}(raw)
239
211
  else
240
- annotations["#{ entity }"] = produce_#{entity}
212
+ segments["#{ entity }"] = produce_#{entity}
241
213
  end
242
214
  end
243
215
 
244
- def #{entity}
216
+ def #{entity}(raw = false)
245
217
  begin
246
- entities = annotations["#{ entity }"]
218
+ entities = segments["#{ entity }"]
247
219
  if entities.nil?
248
- load_#{entity}
249
- entities = annotations["#{ entity }"]
220
+ load_#{entity}(raw)
221
+ entities = segments["#{ entity }"]
250
222
  end
251
223
  end
252
224
 
@@ -254,34 +226,35 @@ class Document
254
226
  end
255
227
 
256
228
  def #{entity}_at(pos, persist = false)
257
- segment_index("#{ entity }", persist ? File.join(@persistence_dir, 'ranges') : nil)[pos]
229
+ segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
258
230
  end
259
231
 
260
232
  EOC
261
233
  end
262
234
 
263
- def segment_index(name, persistence_dir = nil)
264
- @segment_indeces[name] ||= Segment.index(self.send(name), persistence_dir.nil? ? :memory : File.join(persistence_dir, name + '.range'))
235
+ def segment_index(name, persist_dir = nil)
236
+ @segment_indeces[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
265
237
  end
266
238
 
267
239
  def load_into(segment, *annotations)
268
240
  options = annotations.pop if Hash === annotations.last
269
241
  options ||= {}
270
- if options[:persist] and not @persistence_dir.nil?
271
- persistence_dir = File.join(@persistence_dir, 'ranges')
242
+
243
+ if options[:persist] and not @persist_dir.nil?
244
+ persist_dir = File.join(@persist_dir, 'ranges')
272
245
  else
273
- persistence_dir = nil
246
+ persist_dir = nil
274
247
  end
275
248
 
276
- segment.extend Annotated
277
- segment.annotations ||= {}
249
+ Segmented.setup(segment, {})
278
250
  annotations.collect do |name|
279
251
  name = name.to_s
280
- annotations = segment_index(name, persistence_dir)[segment.range]
281
- segment.annotations[name] = annotations
252
+ index = segment_index(name, persist_dir)
253
+ annotations = index[segment.range]
254
+ segment.segments[name] = annotations
282
255
  class << segment
283
256
  self
284
- end.class_eval "def #{ name }; @annotations['#{ name }']; end"
257
+ end.class_eval "def #{ name }; @segments['#{ name }']; end"
285
258
  end
286
259
 
287
260
  segment
@@ -1,69 +1,89 @@
1
1
  require 'rbbt/util/misc'
2
2
  require 'tokyocabinet'
3
3
 
4
- class DocumentRepo < TokyoCabinet::BDB
4
+ module DocumentRepo
5
5
  class OpenError < StandardError;end
6
6
  class KeyFormatError < StandardError;end
7
7
 
8
- CONNECTIONS = {} unless defined? CONNECTIONS
8
+ TC_CONNECTIONS = {}
9
+ def self.open_tokyocabinet(path, write)
10
+ write = true if not File.exists?(path)
11
+ flags = (write ? TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT : TokyoCabinet::BDB::OREADER)
9
12
 
10
- def self.get(path, write = false)
13
+ FileUtils.mkdir_p File.dirname(path) unless File.exists?(File.dirname(path))
11
14
 
12
- if !File.exists?(path) or not CONNECTIONS.include? path
13
- CONNECTIONS[path] = self.new(path, true)
14
- end
15
+ database = TC_CONNECTIONS[path] ||= TokyoCabinet::BDB.new
16
+ database.close
15
17
 
16
- d = CONNECTIONS[path]
17
-
18
- if write and not d.write?
19
- d.write
20
- else
21
- d.read if d.write?
18
+ if !database.open(path, flags)
19
+ ecode = database.ecode
20
+ raise "Open error: #{database.errmsg(ecode)}. Trying to open file #{path}"
22
21
  end
23
22
 
24
- d
25
- end
26
-
23
+ class << database
24
+ attr_accessor :writable, :persistence_path
25
+
26
+ def read
27
+ return if not @writable
28
+ self.close
29
+ if !self.open(@persistence_path, TokyoCabinet::BDB::OREADER)
30
+ ecode = self.ecode
31
+ raise "Open error: #{self.errmsg(ecode)}. Trying to open file #{@persistence_path}"
32
+ end
33
+ @writable = false
34
+ self
35
+ end
36
+
37
+ def write
38
+ return if @writable
39
+ self.close
40
+ if !self.open(@persistence_path, TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT)
41
+ ecode = self.ecode
42
+ raise "Open error: #{self.errmsg(ecode)}. Trying to open file #{@persistence_path}"
43
+ end
44
+ @writable = true
45
+ self
46
+ end
47
+
48
+ def write?
49
+ @writable
50
+ end
51
+
52
+ def collect
53
+ res = []
54
+ each do |key, value|
55
+ res << if block_given?
56
+ yield key, value
57
+ else
58
+ [key, value]
59
+ end
60
+ end
61
+ res
62
+ end
63
+
64
+ def delete(key)
65
+ out(key)
66
+ end
67
+
68
+ def values_at(*keys)
69
+ keys.collect do |key|
70
+ self[key]
71
+ end
72
+ end
73
+
74
+ def merge!(hash)
75
+ hash.each do |key,values|
76
+ self[key] = values
77
+ end
78
+ end
27
79
 
28
- alias original_open open
29
- def open(write = false)
30
- flags = (write ? TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT : TokyoCabinet::BDB::OREADER)
31
-
32
- FileUtils.mkdir_p File.dirname(@path_to_db) unless File.exists?(File.dirname(@path_to_db))
33
- if !self.original_open(@path_to_db, flags)
34
- ecode = self.ecode
35
- raise OpenError, "Open error: #{self.errmsg(ecode)}. Trying to open file #{@path_to_db}"
36
80
  end
37
81
 
38
- @write = write
39
-
40
- end
41
-
42
- def write?
43
- @write
44
- end
82
+ database.persistence_path ||= path
45
83
 
46
- def write
47
- self.close
48
- self.open(true)
49
- end
50
-
51
- def read
52
- self.close
53
- self.open(false)
54
- end
84
+ database.extend DocumentRepo
55
85
 
56
- def initialize(path, write = false)
57
- super()
58
-
59
- @path_to_db = path
60
-
61
- if write || ! File.exists?(@path_to_db)
62
- self.setcache(100000) or raise "Error setting cache"
63
- self.open(true)
64
- else
65
- self.open(false)
66
- end
86
+ database
67
87
  end
68
88
 
69
89
  def docid2fields(docid)
@@ -79,9 +99,10 @@ class DocumentRepo < TokyoCabinet::BDB
79
99
  end
80
100
 
81
101
  def add(text, namespace, id, type, hash)
82
- write unless write?
102
+ write
83
103
  docid = fields2docid(namespace, id, type, hash)
84
104
  self[docid] = text unless self.include? docid
105
+ read
85
106
  docid
86
107
  end
87
108