rbbt-text 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (39)
  1. data/lib/rbbt/corpus/corpus.rb +15 -6
  2. data/lib/rbbt/corpus/document.rb +100 -127
  3. data/lib/rbbt/corpus/document_repo.rb +72 -51
  4. data/lib/rbbt/ner/NER.rb +4 -4
  5. data/lib/rbbt/ner/abner.rb +5 -4
  6. data/lib/rbbt/ner/banner.rb +3 -3
  7. data/lib/rbbt/ner/chemical_tagger.rb +3 -3
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
  9. data/lib/rbbt/ner/oscar3.rb +3 -3
  10. data/lib/rbbt/ner/oscar4.rb +3 -3
  11. data/lib/rbbt/ner/patterns.rb +15 -13
  12. data/lib/rbbt/ner/regexpNER.rb +3 -2
  13. data/lib/rbbt/ner/rnorm.rb +2 -2
  14. data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
  15. data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
  16. data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
  17. data/lib/rbbt/ner/segment/relationship.rb +20 -0
  18. data/lib/rbbt/ner/segment/segmented.rb +13 -0
  19. data/lib/rbbt/ner/segment/token.rb +24 -0
  20. data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
  21. data/lib/rbbt/ner/token_trieNER.rb +30 -22
  22. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  23. data/lib/rbbt/nlp/nlp.rb +23 -37
  24. data/test/rbbt/corpus/test_document.rb +39 -37
  25. data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
  26. data/test/rbbt/ner/segment/test_segmented.rb +23 -0
  27. data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
  28. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
  29. data/test/rbbt/ner/test_patterns.rb +11 -12
  30. data/test/rbbt/ner/test_regexpNER.rb +5 -4
  31. data/test/rbbt/ner/test_segment.rb +101 -0
  32. data/test/rbbt/ner/test_token_trieNER.rb +8 -9
  33. data/test/test_helper.rb +6 -6
  34. metadata +40 -22
  35. data/lib/rbbt/ner/annotations/annotated.rb +0 -15
  36. data/lib/rbbt/ner/annotations/relations.rb +0 -25
  37. data/lib/rbbt/ner/annotations/token.rb +0 -28
  38. data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
  39. data/test/rbbt/ner/test_annotations.rb +0 -70
@@ -7,17 +7,26 @@ class Corpus
7
7
  @corpora_path = case
8
8
  when corpora_path.nil?
9
9
  Rbbt.corpora
10
- when (not Resource::Path === corpora_path)
11
- Resource::Path.path(corpora_path)
10
+ when (not Path === corpora_path)
11
+ Path.setup(corpora_path)
12
12
  else
13
13
  corpora_path
14
14
  end
15
15
 
16
- @document_repo = DocumentRepo.get @corpora_path.document_repo, false
16
+ @corpora_path = @corpora_path.find
17
17
  @persistence_dir = File.join(@corpora_path, "annotations")
18
- @global_annotations = TSV.new(TCHash.get(File.join(@persistence_dir, "global_annotations"), :list), :list, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
19
- @global_annotations.unnamed = true
20
- end
18
+
19
+ Misc.lock(@persistence_dir) do
20
+ @global_annotations = TSV.setup(Persist.open_tokyocabinet(File.join(@persistence_dir, "global_annotations"), false, :list), :key => "ID", :fields => ["Start", "End", "JSON", "Document ID", "Entity Type"])
21
+ @global_annotations.unnamed = true
22
+ @global_annotations.close
23
+ end
24
+
25
+ Misc.lock(@corpora_path.document_repo) do
26
+ @document_repo = DocumentRepo.open_tokyocabinet @corpora_path.document_repo, false
27
+ end
28
+
29
+ end
21
30
 
22
31
  def persistence_for(docid)
23
32
  File.join(persistence_dir, docid)
@@ -1,19 +1,21 @@
1
- require 'rbbt/ner/annotations'
2
- require 'rbbt/util/tsv'
3
- require 'rbbt/util/resource'
1
+ require 'rbbt/ner/segment'
2
+ require 'rbbt/ner/segment/segmented'
3
+ require 'rbbt/tsv'
4
+ require 'rbbt/resource/path'
5
+ require 'rbbt/persist/tsv'
4
6
  require 'rbbt/util/misc'
5
7
  require 'json'
6
8
 
7
9
  class Document
8
10
 
9
- attr_accessor :text, :docid, :namespace, :id, :type, :hash, :annotations, :segment_indeces, :persistence_dir, :global_persistence
10
- def initialize(persistence_dir = nil, docid = nil, text = nil, global_persistence = nil)
11
- @annotations = {}
11
+ attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indeces, :persist_dir, :global_persistence
12
+ def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil)
13
+ @segments = {}
12
14
  @segment_indeces = {}
13
15
 
14
- if not persistence_dir.nil?
15
- @persistence_dir = persistence_dir
16
- @persistence_dir = Resource::Path.path(@persistence_dir) if not Resource::Path == @persistence_dir
16
+ if not persist_dir.nil?
17
+ @persist_dir = persist_dir
18
+ @persist_dir = Path.setup(@persist_dir) if not Path == @persist_dir
17
19
  end
18
20
 
19
21
  @global_persistence = global_persistence
@@ -34,45 +36,6 @@ class Document
34
36
  update_docid
35
37
  end
36
38
 
37
- def self.save_segment(segment, fields = nil)
38
- if fields.nil?
39
- eend = case segment.offset; when nil; nil; when -1; -1; else segment.end; end
40
- [segment.offset, eend, segment.info.to_json]
41
- else
42
- eend = case segment.offset; when nil; nil; when -1; -1; else segment.end; end
43
- info = segment.info
44
- info["literal"] = segment.to_s.gsub(/\s/,' ')
45
- info.extend IndiferentHash
46
- [segment.offset, eend].concat info.values_at(*fields.collect{|f| f.downcase}).collect{|v| Array === v ? v * "|" : v}
47
- end
48
- end
49
-
50
- def self.load_segment(text, annotation, fields = nil)
51
- if fields.nil?
52
- start, eend, info = annotation.values_at 0,1,2
53
- info = JSON.parse(info)
54
- else
55
- start, eend = annotation.values_at 0,1
56
- info = Misc.process_to_hash(fields) do |fields| annotation.values_at(*fields.collect{|f| f.downcase}).collect{|v| v.index("|").nil? ? v : v.split("|")} end
57
- end
58
-
59
- Segment.load(text, start, eend, info, @docid)
60
- end
61
-
62
- def self.tsv(segments, fields = nil)
63
- tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
64
- if fields.nil?
65
- tsv.fields += ["Info"]
66
- else
67
- tsv.fields += fields
68
- end
69
-
70
- segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
71
-
72
- tsv
73
- end
74
-
75
-
76
39
  #{{{ PERSISTENCE
77
40
 
78
41
  TSV_REPOS = {}
@@ -82,30 +45,28 @@ class Document
82
45
  if not fields.nil?
83
46
  fields = [fields] if not Array === fields
84
47
  fields = fields.collect{|f| f.to_s}
85
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
48
+ FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
86
49
  end
87
50
 
88
51
  self.class_eval <<-EOC
89
- def load_with_persistence_#{entity}
52
+ def load_with_persistence_#{entity}(raw = false)
90
53
  fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
91
54
 
92
- annotations = Persistence.persist("#{ entity }", :Entity, :tsv_string,
93
- :persistence_file => File.join(@persistence_dir, "#{ entity }")) do
55
+ tsv_file = File.join(@persist_dir.find, "#{ entity }")
94
56
 
95
- tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
96
- if fields.nil?
97
- tsv.fields += ["Info"]
98
- else
99
- tsv.fields += fields
100
- end
57
+ return nil if raw == :check and File.exists? tsv_file
101
58
 
59
+ annotations = Persist.persist("Entity[#{ entity }]", :tsv, :file => tsv_file) do
102
60
  segments = produce_#{entity}
103
- segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
104
-
105
- tsv
61
+ tsv = Segment.tsv(segments, fields)
106
62
  end
107
63
 
108
- annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields)}
64
+ return annotations if raw
65
+
66
+ annotations.unnamed = true
67
+ annotations.collect{|id, annotation|
68
+ Segment.load_tsv_values(text, annotation, annotations.fields)
69
+ }
109
70
  end
110
71
  EOC
111
72
  end
@@ -125,47 +86,43 @@ class Document
125
86
  end
126
87
 
127
88
  self.class_eval <<-EOC
128
- def load_with_persistence_#{entity}
89
+ def load_with_persistence_#{entity}(raw = false)
129
90
  repo = TSV_REPOS["#{ entity }"]
130
91
  if repo.nil?
131
- raise "No persistence file or persistencr dir for persist_in_tsv" if persistence_dir.nil?
132
- repo = TCHash.get(persistence_dir.annotations_by_type.find, TCHash::TSVSerializer)
92
+ raise "No persistence file or persistence dir for persist_in_tsv" if persist_dir.nil?
93
+ repo = Persist.open_tokyocabinet(persist_dir.annotations_by_type.find, true, :marshal_tsv)
133
94
  end
134
95
 
135
-
136
96
  fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
137
-
138
97
  if not repo.include? "#{ entity }"
139
- tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
140
- if fields.nil?
141
- tsv.fields += ["Info"]
142
- else
143
- tsv.fields += fields
144
- end
145
-
146
- produce_#{entity}.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
98
+ segments = produce_#{entity}
147
99
  repo.write
148
- repo["#{entity}"] = tsv
100
+ repo["#{entity}"] = Segment.tsv(segments, fields)
149
101
  repo.read
102
+ else
103
+ if raw == :check
104
+ repo.close
105
+ return nil
106
+ end
150
107
  end
151
108
 
109
+
152
110
  annotations = repo["#{entity}"]
153
111
 
154
112
  repo.close
155
113
 
156
- annotations.collect{|id, annotation| Document.load_segment(text, annotation, fields)}
114
+
115
+ return annotations if raw
116
+
117
+ annotations.unnamed = true
118
+ annotations.collect{|id, annotation|
119
+ Segment.load_tsv_values(text, annotation, annotations.fields)
120
+ }
157
121
  end
158
- EOC
122
+ EOC
159
123
  end
160
124
 
161
125
  def self.persist_in_global_tsv(entity, tsv = nil, fields = nil, doc_field = nil, entity_field = nil)
162
- if not tsv.nil? and not tsv.respond_to?(:keys)
163
- entity_field = doc_field if doc_field
164
- doc_field = fields if fields
165
- fields = tsv if tsv
166
- tsv = nil
167
- end
168
-
169
126
  doc_field ||= "Document ID"
170
127
  entity_field ||= "Entity Type"
171
128
 
@@ -174,34 +131,34 @@ class Document
174
131
  if not fields.nil?
175
132
  fields = [fields] if not Array === fields
176
133
  fields = fields.collect{|f| f.to_s}
177
- FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
134
+ else
135
+ fields = nil
178
136
  end
179
137
 
138
+ FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
139
+
180
140
  self.class_eval <<-EOC
181
- def load_with_persistence_#{entity}
141
+ def load_with_persistence_#{entity}(raw = false)
182
142
  fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
183
143
 
184
- data = TSV_REPOS["#{ entity }"]
144
+ data = TSV_REPOS["#{ entity }"] || @global_persistence
145
+
146
+ data.read true
147
+
148
+ fields = data.fields if fields.nil? and data.respond_to? :fields
185
149
 
186
- if data.nil?
187
- data = global_persistence
188
- end
189
150
 
190
151
  data.filter
191
152
  data.add_filter("field:#{ doc_field }", @docid)
192
153
  data.add_filter("field:#{ entity_field }", "#{ entity }")
154
+ keys = data.keys
155
+ data.pop_filter
156
+ data.pop_filter
193
157
 
194
- if data.keys.empty?
195
- tsv = TSV.new({}, :list, :key => "ID", :fields => %w(Start End))
196
- if fields.nil?
197
- tsv.fields += ["Info"]
198
- else
199
- tsv.fields += fields
200
- end
201
-
158
+ if keys.empty?
202
159
  segments = produce_#{entity}
203
- segments << Segment.annotate("No #{entity} found in document #{ @docid }", -1) if segments.empty?
204
- segments.each{|segment| tsv[segment.id] = Document.save_segment(segment, fields) unless segment.offset.nil?}
160
+ segments << Segment.setup("No #{entity} found in document #{ @docid }", -1) if segments.empty?
161
+ tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
205
162
 
206
163
  tsv.add_field "#{ doc_field }" do
207
164
  @docid
@@ -211,16 +168,31 @@ class Document
211
168
  "#{ entity }"
212
169
  end
213
170
 
214
- data.write
215
- data.merge!(tsv)
171
+ data.add_filter("field:#{ doc_field }", @docid)
172
+ data.add_filter("field:#{ entity_field }", "#{ entity }")
173
+ data.write true
174
+ keys = tsv.collect do |key, value|
175
+ data[key] = value
176
+ key
177
+ end
178
+ data.pop_filter
179
+ data.pop_filter
216
180
  data.read
181
+ else
182
+ if raw == :check
183
+ data.close
184
+ return nil
185
+ end
217
186
  end
218
187
 
219
- segments = []
220
- data.each{|id, annotation| segments << Document.load_segment(text, annotation, fields) unless annotation[1].to_i == -1}
188
+ return data.values if raw
221
189
 
222
- data.pop_filter
223
- data.pop_filter
190
+ start_pos = data.identify_field "Start"
191
+ segments = data.values_at(*keys).collect{|annotation|
192
+ pos = annotation[start_pos]
193
+ Segment.load_tsv_values(text, annotation, data.fields) unless [-1, "-1", [-1], ["-1"]].include? pos
194
+ }.compact
195
+ data.close
224
196
 
225
197
  segments
226
198
  end
@@ -232,21 +204,21 @@ class Document
232
204
  send :define_method, "produce_#{entity}", &block
233
205
 
234
206
  self.class_eval <<-EOC
235
- def load_#{entity}
236
- return if annotations.include? "#{ entity }"
237
- if self.respond_to?("load_with_persistence_#{entity}") and not @persistence_dir.nil?
238
- annotations["#{entity}"] = load_with_persistence_#{entity}
207
+ def load_#{entity}(raw = false)
208
+ return if segments.include? "#{ entity }"
209
+ if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
210
+ segments["#{entity}"] = load_with_persistence_#{entity}(raw)
239
211
  else
240
- annotations["#{ entity }"] = produce_#{entity}
212
+ segments["#{ entity }"] = produce_#{entity}
241
213
  end
242
214
  end
243
215
 
244
- def #{entity}
216
+ def #{entity}(raw = false)
245
217
  begin
246
- entities = annotations["#{ entity }"]
218
+ entities = segments["#{ entity }"]
247
219
  if entities.nil?
248
- load_#{entity}
249
- entities = annotations["#{ entity }"]
220
+ load_#{entity}(raw)
221
+ entities = segments["#{ entity }"]
250
222
  end
251
223
  end
252
224
 
@@ -254,34 +226,35 @@ class Document
254
226
  end
255
227
 
256
228
  def #{entity}_at(pos, persist = false)
257
- segment_index("#{ entity }", persist ? File.join(@persistence_dir, 'ranges') : nil)[pos]
229
+ segment_index("#{ entity }", persist ? File.join(@persist_dir, 'ranges') : nil)[pos]
258
230
  end
259
231
 
260
232
  EOC
261
233
  end
262
234
 
263
- def segment_index(name, persistence_dir = nil)
264
- @segment_indeces[name] ||= Segment.index(self.send(name), persistence_dir.nil? ? :memory : File.join(persistence_dir, name + '.range'))
235
+ def segment_index(name, persist_dir = nil)
236
+ @segment_indeces[name] ||= Segment.index(self.send(name), persist_dir.nil? ? :memory : File.join(persist_dir, name + '.range'))
265
237
  end
266
238
 
267
239
  def load_into(segment, *annotations)
268
240
  options = annotations.pop if Hash === annotations.last
269
241
  options ||= {}
270
- if options[:persist] and not @persistence_dir.nil?
271
- persistence_dir = File.join(@persistence_dir, 'ranges')
242
+
243
+ if options[:persist] and not @persist_dir.nil?
244
+ persist_dir = File.join(@persist_dir, 'ranges')
272
245
  else
273
- persistence_dir = nil
246
+ persist_dir = nil
274
247
  end
275
248
 
276
- segment.extend Annotated
277
- segment.annotations ||= {}
249
+ Segmented.setup(segment, {})
278
250
  annotations.collect do |name|
279
251
  name = name.to_s
280
- annotations = segment_index(name, persistence_dir)[segment.range]
281
- segment.annotations[name] = annotations
252
+ index = segment_index(name, persist_dir)
253
+ annotations = index[segment.range]
254
+ segment.segments[name] = annotations
282
255
  class << segment
283
256
  self
284
- end.class_eval "def #{ name }; @annotations['#{ name }']; end"
257
+ end.class_eval "def #{ name }; @segments['#{ name }']; end"
285
258
  end
286
259
 
287
260
  segment
@@ -1,69 +1,89 @@
1
1
  require 'rbbt/util/misc'
2
2
  require 'tokyocabinet'
3
3
 
4
- class DocumentRepo < TokyoCabinet::BDB
4
+ module DocumentRepo
5
5
  class OpenError < StandardError;end
6
6
  class KeyFormatError < StandardError;end
7
7
 
8
- CONNECTIONS = {} unless defined? CONNECTIONS
8
+ TC_CONNECTIONS = {}
9
+ def self.open_tokyocabinet(path, write)
10
+ write = true if not File.exists?(path)
11
+ flags = (write ? TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT : TokyoCabinet::BDB::OREADER)
9
12
 
10
- def self.get(path, write = false)
13
+ FileUtils.mkdir_p File.dirname(path) unless File.exists?(File.dirname(path))
11
14
 
12
- if !File.exists?(path) or not CONNECTIONS.include? path
13
- CONNECTIONS[path] = self.new(path, true)
14
- end
15
+ database = TC_CONNECTIONS[path] ||= TokyoCabinet::BDB.new
16
+ database.close
15
17
 
16
- d = CONNECTIONS[path]
17
-
18
- if write and not d.write?
19
- d.write
20
- else
21
- d.read if d.write?
18
+ if !database.open(path, flags)
19
+ ecode = database.ecode
20
+ raise "Open error: #{database.errmsg(ecode)}. Trying to open file #{path}"
22
21
  end
23
22
 
24
- d
25
- end
26
-
23
+ class << database
24
+ attr_accessor :writable, :persistence_path
25
+
26
+ def read
27
+ return if not @writable
28
+ self.close
29
+ if !self.open(@persistence_path, TokyoCabinet::BDB::OREADER)
30
+ ecode = self.ecode
31
+ raise "Open error: #{self.errmsg(ecode)}. Trying to open file #{@persistence_path}"
32
+ end
33
+ @writable = false
34
+ self
35
+ end
36
+
37
+ def write
38
+ return if @writable
39
+ self.close
40
+ if !self.open(@persistence_path, TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT)
41
+ ecode = self.ecode
42
+ raise "Open error: #{self.errmsg(ecode)}. Trying to open file #{@persistence_path}"
43
+ end
44
+ @writable = true
45
+ self
46
+ end
47
+
48
+ def write?
49
+ @writable
50
+ end
51
+
52
+ def collect
53
+ res = []
54
+ each do |key, value|
55
+ res << if block_given?
56
+ yield key, value
57
+ else
58
+ [key, value]
59
+ end
60
+ end
61
+ res
62
+ end
63
+
64
+ def delete(key)
65
+ out(key)
66
+ end
67
+
68
+ def values_at(*keys)
69
+ keys.collect do |key|
70
+ self[key]
71
+ end
72
+ end
73
+
74
+ def merge!(hash)
75
+ hash.each do |key,values|
76
+ self[key] = values
77
+ end
78
+ end
27
79
 
28
- alias original_open open
29
- def open(write = false)
30
- flags = (write ? TokyoCabinet::BDB::OWRITER | TokyoCabinet::BDB::OCREAT : TokyoCabinet::BDB::OREADER)
31
-
32
- FileUtils.mkdir_p File.dirname(@path_to_db) unless File.exists?(File.dirname(@path_to_db))
33
- if !self.original_open(@path_to_db, flags)
34
- ecode = self.ecode
35
- raise OpenError, "Open error: #{self.errmsg(ecode)}. Trying to open file #{@path_to_db}"
36
80
  end
37
81
 
38
- @write = write
39
-
40
- end
41
-
42
- def write?
43
- @write
44
- end
82
+ database.persistence_path ||= path
45
83
 
46
- def write
47
- self.close
48
- self.open(true)
49
- end
50
-
51
- def read
52
- self.close
53
- self.open(false)
54
- end
84
+ database.extend DocumentRepo
55
85
 
56
- def initialize(path, write = false)
57
- super()
58
-
59
- @path_to_db = path
60
-
61
- if write || ! File.exists?(@path_to_db)
62
- self.setcache(100000) or raise "Error setting cache"
63
- self.open(true)
64
- else
65
- self.open(false)
66
- end
86
+ database
67
87
  end
68
88
 
69
89
  def docid2fields(docid)
@@ -79,9 +99,10 @@ class DocumentRepo < TokyoCabinet::BDB
79
99
  end
80
100
 
81
101
  def add(text, namespace, id, type, hash)
82
- write unless write?
102
+ write
83
103
  docid = fields2docid(namespace, id, type, hash)
84
104
  self[docid] = text unless self.include? docid
105
+ read
85
106
  docid
86
107
  end
87
108