rbbt-text 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. data/lib/rbbt/corpus/corpus.rb +15 -6
  2. data/lib/rbbt/corpus/document.rb +100 -127
  3. data/lib/rbbt/corpus/document_repo.rb +72 -51
  4. data/lib/rbbt/ner/NER.rb +4 -4
  5. data/lib/rbbt/ner/abner.rb +5 -4
  6. data/lib/rbbt/ner/banner.rb +3 -3
  7. data/lib/rbbt/ner/chemical_tagger.rb +3 -3
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
  9. data/lib/rbbt/ner/oscar3.rb +3 -3
  10. data/lib/rbbt/ner/oscar4.rb +3 -3
  11. data/lib/rbbt/ner/patterns.rb +15 -13
  12. data/lib/rbbt/ner/regexpNER.rb +3 -2
  13. data/lib/rbbt/ner/rnorm.rb +2 -2
  14. data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
  15. data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
  16. data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
  17. data/lib/rbbt/ner/segment/relationship.rb +20 -0
  18. data/lib/rbbt/ner/segment/segmented.rb +13 -0
  19. data/lib/rbbt/ner/segment/token.rb +24 -0
  20. data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
  21. data/lib/rbbt/ner/token_trieNER.rb +30 -22
  22. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  23. data/lib/rbbt/nlp/nlp.rb +23 -37
  24. data/test/rbbt/corpus/test_document.rb +39 -37
  25. data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
  26. data/test/rbbt/ner/segment/test_segmented.rb +23 -0
  27. data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
  28. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
  29. data/test/rbbt/ner/test_patterns.rb +11 -12
  30. data/test/rbbt/ner/test_regexpNER.rb +5 -4
  31. data/test/rbbt/ner/test_segment.rb +101 -0
  32. data/test/rbbt/ner/test_token_trieNER.rb +8 -9
  33. data/test/test_helper.rb +6 -6
  34. metadata +40 -22
  35. data/lib/rbbt/ner/annotations/annotated.rb +0 -15
  36. data/lib/rbbt/ner/annotations/relations.rb +0 -25
  37. data/lib/rbbt/ner/annotations/token.rb +0 -28
  38. data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
  39. data/test/rbbt/ner/test_annotations.rb +0 -70

data/lib/rbbt/ner/{annotations → segment}/named_entity.rb RENAMED
@@ -1,17 +1,9 @@
-require 'rbbt/ner/annotations'
+require 'rbbt/ner/segment'
 
 module NamedEntity
-  attr_accessor :type, :code, :score, :segment_types
+  extend Annotation
   include Segment
-
-  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
-    string.extend NamedEntity
-    string.offset = offset unless offset.nil?
-    string.type = type unless type.nil?
-    string.code = code unless code.nil?
-    string.score = score unless score.nil?
-    string
-  end
+  self.annotation :type, :code, :score
 
   def report
     <<-EOF
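
Note: 0.6.0 replaces the hand-written annotate class methods with the generic Annotation mixin from rbbt-util: extending Annotation and declaring self.annotation generates both the accessors and a setup class method. A minimal sketch of the new call style; the argument order (offset from Segment, then the declared annotations) is inferred from the NamedEntity.setup(match, offset, type, codes) call in token_trieNER.rb below:

    require 'rbbt/ner/segment/named_entity'

    # offset comes from Segment; :type, :code and :score are the declared annotations
    ne = NamedEntity.setup("TP53", 10, "Gene", "TP53:7157", 0.9)
    ne.offset # => 10
    ne.type   # => "Gene"
    ne.code   # => "TP53:7157"
    ne.report # multi-line summary of the entity
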

data/lib/rbbt/ner/segment/relationship.rb ADDED
@@ -0,0 +1,20 @@
+require 'rbbt/ner/segment'
+
+module Relationship
+  extend Annotation
+  include Segment
+  self.annotation :terms
+
+  def html
+    text = <<-EOF
+<span class='Relationship'\
+>#{ self }</span>
+    EOF
+    text.chomp
+  end
+
+  def html_with_entities(*types)
+    annotations.values_at(*types).each do |segments|
+    end
+  end
+end
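
Note: Relationship is new in this release. A short usage sketch, assuming the same setup argument order as NamedEntity (offset first, then the declared :terms); html_with_entities is still a stub that iterates without producing output:

    require 'rbbt/ner/segment/relationship'

    rel = Relationship.setup("TP53 inhibits MDM2", 0, %w(TP53 MDM2))
    rel.terms # => ["TP53", "MDM2"]
    rel.html  # => "<span class='Relationship'>TP53 inhibits MDM2</span>"
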

data/lib/rbbt/ner/segment/segmented.rb ADDED
@@ -0,0 +1,13 @@
+require 'rbbt/annotations'
+require 'rbbt/ner/segment'
+
+module Segmented
+  extend Annotation
+  self.annotation :segments
+
+  def split_segments(skip_segments = false)
+    Segment.split(self, @segments, skip_segments)
+  end
+end
+
+
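
Note: Segmented attaches a list of segments to a string and delegates splitting to Segment.split. A sketch, assuming Segment.split (from rbbt/ner/segment) returns the stretches of text around the attached segments:

    require 'rbbt/ner/segment/segmented'
    require 'rbbt/ner/segment/named_entity'

    text = "TP53 is a tumor suppressor"
    gene = NamedEntity.setup("TP53", 0, "Gene")
    Segmented.setup(text, [gene])
    text.split_segments # => the text split around the TP53 segment
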

data/lib/rbbt/ner/segment/token.rb ADDED
@@ -0,0 +1,24 @@
+require 'rbbt/annotations'
+require 'rbbt/ner/segment'
+
+module Token
+  extend Annotation
+  include Segment
+  self.annotation :original
+
+  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
+
+    tokens = []
+    while matchdata = text.match(split_at)
+      tokens << Token.setup(matchdata.pre_match, start) unless matchdata.pre_match.empty?
+      tokens << Token.setup(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      start += matchdata.end(0)
+      text = matchdata.post_match
+    end
+
+    tokens << Token.setup(text, start) unless text.empty?
+
+    tokens
+  end
+end
+
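
Note: Token.tokenize above is self-contained: it splits on whitespace while keeping parentheses and punctuation as tokens of their own, each set up with its offset into the original string. Tracing the loop by hand:

    require 'rbbt/ner/segment/token'

    tokens = Token.tokenize("BRCA1 (breast cancer 1)")
    tokens              # => ["BRCA1", "(", "breast", "cancer", "1", ")"]
    tokens.first.offset # => 0
    tokens[1].offset    # => 6, the position of "(" in the input
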

data/lib/rbbt/ner/{annotations → segment}/transformed.rb RENAMED
@@ -1,7 +1,16 @@
-require 'rbbt/ner/annotations'
+require 'rbbt/ner/segment'
 module Transformed
   attr_accessor :transformation_offset_differences, :transformation_original
 
+  def self.transform(text, segments, replacement = nil, &block)
+    require 'rbbt/util/misc'
+
+    text.extend Transformed
+    text.replace(segments, replacement, &block)
+
+    text
+  end
+
   def self.with_transform(text, segments, replacement)
     require 'rbbt/util/misc'
 
@@ -14,16 +23,7 @@ module Transformed
 
     text.restore(segments, true)
   end
-
-  def self.transform(text, segments, replacement = nil, &block)
-    require 'rbbt/util/misc'
 
-    text.extend Transformed
-    text.replace(segments, replacement, &block)
-
-    text
-  end
-
   def transform_pos(pos)
     return pos if transformation_offset_differences.nil?
     # tranformation_offset_differences are assumed to be sorted in reverse
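
Note: with_transform masks the given segments while a block runs and then restores the original text (the text.restore(segments, true) call above); transform, now moved before it, applies the replacement without restoring. A hedged sketch, assuming the block receives the transformed text:

    require 'rbbt/ner/segment/transformed'
    require 'rbbt/ner/segment/named_entity'

    text = "TP53 is a tumor suppressor"
    gene = NamedEntity.setup("TP53", 0, "Gene")

    Transformed.with_transform(text, [gene], "[GENE]") do |masked|
      # here the segment is replaced: "[GENE] is a tumor suppressor"
    end
    # after the block the text reads "TP53 is a tumor suppressor" again
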

data/lib/rbbt/ner/token_trieNER.rb CHANGED
@@ -1,7 +1,7 @@
-require 'rbbt-util'
-require 'rbbt/util/tsv'
-require 'rbbt/ner/annotations'
-require 'rbbt/ner/annotations/token'
+require 'rbbt'
+require 'rbbt/tsv'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/token'
 require 'rbbt/ner/NER'
 
 class TokenTrieNER < NER
@@ -16,15 +16,15 @@ class TokenTrieNER < NER
   def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
     if no_clean
       if extend_to_token
-        Token.annotate(clean(token), start, token)
+        Token.setup(clean(token), start, token)
       else
-        clean(token)
+        token
       end
     else
      if extend_to_token
-        Token.annotate(clean(token), start, token)
+        Token.setup(clean(token), start, token)
      else
-        token
+        clean(token)
      end
    end
  end
@@ -137,6 +137,11 @@ class TokenTrieNER < NER
    hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
      names = Array === names ? names : [names]
      names.flatten! if Array === names.first and not Token === names.first.first
+
+      if names.empty?
+        names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
+      end
+
      names.each do |name|
        next if name.empty? or (String === name and name.length < 2)
 
@@ -167,7 +172,7 @@ class TokenTrieNER < NER
      return index[head]
    end
 
-    return nil unless (not TCHash === index ) and index.include? :PROCS
+    return nil unless (not TokyoCabinet::HDB === index ) and index.include? :PROCS
 
    index[:PROCS].each do |key,value|
      return value if key.call(head)
@@ -225,16 +230,16 @@ class TokenTrieNER < NER
    match_offset = match_tokens.first.offset
    match_tokens.each{|t|
      match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
-      match << (t.respond_to?(:original) ? t.original : t)
+      match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
    }
 
-    NamedEntity.annotate(match, match_tokens.first.offset, type, codes)
+    NamedEntity.setup(match, match_tokens.first.offset, type, codes)
  end
 
  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
  def initialize(type = nil, file = nil, options = {})
    options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
-      :persistence => false
+      :persist => false
    @slack = slack
    @longest_match = options.delete :longest_match
    @split_at = options.delete :split_at
@@ -242,16 +247,15 @@ class TokenTrieNER < NER
 
    file = [] if file.nil?
    file = [file] unless Array === file
-    @index = Persistence.persist(file, :TokenTRIE, :tsv, options) do |file, options, filename, persistecen_file|
-      if persistecen_file.nil?
-        @index = {}
-      else
-        FileUtils.mkdir_p File.dirname(persistecen_file) unless File.exists? File.dirname(persistecen_file)
-        @index = TCHash.get persistecen_file, true, :marshal
-      end
+    persist_options = Misc.pull_keys options, :persist
+    @index = Persist.persist_tsv(file, options, persist_options) do |data|
+      data.serializer = :marshal if data.respond_to? :serializer and data.serializer == :type
+
+      @index = data
      file.each do |f|
        merge(f, type)
      end
+
      @index
    end
  end
@@ -259,10 +263,10 @@ class TokenTrieNER < NER
  def merge(new, type = nil)
    case
    when TokenTrieNER === new
+      Log.debug "TokenTrieNER merging other TokenTrieNER"
      TokenTrieNER.merge(@index, new.index)
-    when Hash === new
-      TokenTrieNER.merge(@index, new)
    when TSV === new
+      Log.debug "TokenTrieNER merging TSV"
      old_unnamed = new.unnamed
      old_monitor = new.monitor
      new.unnamed = true
@@ -270,8 +274,12 @@ class TokenTrieNER < NER
      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
      new.unnamed = old_unnamed
      new.monitor = old_monitor
+    when Hash === new
+      Log.debug "TokenTrieNER merging Hash"
+      TokenTrieNER.merge(@index, new)
    when String === new
-      new = TSV.new(new, :flat)
+      Log.debug "TokenTrieNER merging file: #{ new }"
+      new = TSV.open(new, :flat)
      new.unnamed = true
      new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
      TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
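
Note: with the new persistence API the index is built through Persist.persist_tsv, and dictionaries can be merged from another TokenTrieNER, a TSV, a plain Hash, or a file path (now loaded with TSV.open instead of TSV.new). A hedged usage sketch; the dictionary file is hypothetical and the match call assumes the usual NER interface, which this diff does not show:

    require 'rbbt/ner/token_trieNER'

    ner = TokenTrieNER.new("Gene", nil, :persist => false, :longest_match => true)
    ner.merge('gene_synonyms.tsv')  # hypothetical flat TSV: code followed by synonyms
    matches = ner.match("TP53 and MDM2 interact")
    matches.each do |m|
      puts [m, m.offset, m.type, m.code].inspect  # matches are NamedEntity segments
    end
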

data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED
@@ -1,3 +1,4 @@
+require 'rbbt/ner/segment'
 module NLP
   def self.returnFeatures(prevWord, delimiter, nextWord)
     if nextWord.match(/__ss__/)
@@ -206,7 +207,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      Segment.annotate sentence, s
+      Segment.setup sentence, s
       sentence
     end
 
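
Note: Segment.setup replaces Segment.annotate throughout the gem; it attaches an offset, and Segment derives an inclusive end position from it, as the chunking code in nlp.rb below relies on (cstart..cend ranges). A minimal sketch:

    sentence = "This is a sentence."
    Segment.setup(sentence, 120)
    sentence.offset # => 120
    sentence.end    # => 138, assuming end is offset + length - 1
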
data/lib/rbbt/nlp/nlp.rb CHANGED
@@ -1,9 +1,9 @@
 require 'rbbt'
 require 'rbbt/util/tmpfile'
-require 'rbbt/util/persistence'
-require 'rbbt/util/resource'
-require 'rbbt/ner/annotations'
-require 'rbbt/ner/annotations/annotated'
+require 'rbbt/persist'
+require 'rbbt/resource'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'
 
@@ -11,7 +11,7 @@ require 'digest/md5'
 module NLP
 
   extend LocalPersist
-  self.local_persistence_dir = '/tmp/crap'
+  self.local_persist_dir = '/tmp/crap'
 
   #Rbbt.software.opt.StanfordParser.define_as_install Rbbt.share.install.software.StanfordParser.find
   #Rbbt.software.opt.StanfordParser.produce
@@ -81,44 +81,21 @@ module NLP
       sentence = text[s..e]
       next if sentence.nil?
       #sentence.gsub!(NEW_LINE_MASK, "\n")
-      Segment.annotate sentence, s
+      Segment.setup sentence, s
       sentence
     end
   end
 
   module GdepToken
-    attr_accessor :num, :token, :lemma, :chunk, :pos, :bio, :link, :dep
+    extend Annotation
     include Segment
-
-    def self.annotate(token, offset = nil, num = nil, lemma = nil, chunk = nil, pos = nil, bio = nil, link = nil, dep = nil)
-      token.extend GdepToken
-
-      token.offset = offset
-      token.num = num
-      token.lemma = lemma
-      token.chunk = chunk
-      token.pos = pos
-      token.bio = bio
-      token.link = link
-      token.dep = dep
-
-      token
-    end
+    self.annotation :num, :lemma, :chunk, :pos, :bio, :link, :dep
   end
 
   module GdepChunk
-    attr_accessor :type, :parts, :segment_types
+    extend Annotation
     include Segment
-
-    def self.annotate(string, offset = nil, type = nil, parts = nil)
-      string.extend GdepChunk
-
-      string.offset = offset
-      string.type = type
-      string.parts = parts
-
-      string
-    end
+    self.annotation :type, :parts
   end
 
   def self.merge_vp_chunks(chunk_list)
@@ -148,7 +125,7 @@ module NLP
     chunk_start = "B"[0]
     chunk_inside = "I"[0]
 
-    last = GdepToken.annotate("LW")
+    last = GdepToken.setup("LW")
 
     chunk_segments = []
     segment_list.each do |segment|
@@ -159,7 +136,7 @@ module NLP
         cstart = chunk_segments.first.offset
         cend = chunk_segments.last.end
         chunk = sentence[cstart..cend]
-        GdepChunk.annotate(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
+        GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
         chunks << chunk
       end
 
@@ -172,6 +149,15 @@ module NLP
       last = segment
     end
 
+    if chunk_segments.any?
+      cstart = chunk_segments.first.offset
+      cend = chunk_segments.last.end
+      chunk = sentence[cstart..cend]
+      GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
+      chunks << chunk
+    end
+
+
     chunks
   end
 
@@ -188,7 +174,7 @@ module NLP
     tokens = sentence.split(/\n/).collect do |line|
       next if line.empty?
       num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)
-      GdepToken.annotate(token, nil, num, lemma, chunk, pos, bio, link, dep)
+      GdepToken.setup(token, nil, num, lemma, chunk, pos, bio, link, dep)
     end.compact
   end
@@ -214,7 +200,7 @@ module NLP
     Gdep.new.tag(sentence).split(/\n/).collect do |line|
       next if line.empty?
       token, lemma, pos, chunk = line.split(/\t/)
-      GdepToken.annotate(token, nil, nil, lemma, chunk, pos)
+      GdepToken.setup(token, nil, nil, lemma, chunk, pos)
       token
     end.compact
   }
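
Note: GdepToken and GdepChunk now follow the same Annotation pattern as NamedEntity. A sketch of setting up a token from one tab-separated Gdep output line, mirroring the parsing code above (the sample line is made up):

    line = "1\tTP53\tTP53\tB-NP\tNN\tB-GENE\t2\tSUB"
    num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)
    GdepToken.setup(token, nil, num, lemma, chunk, pos, bio, link, dep)
    token.pos   # => "NN"
    token.chunk # => "B-NP"
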

data/test/rbbt/corpus/test_document.rb CHANGED
@@ -2,11 +2,6 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
 require 'rbbt/corpus/document'
 require 'test/unit'
 
-$persistence = TSV.new({})
-$tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
-$global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-$tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-
 class Document
   define :sentences do
     require 'rbbt/nlp/nlp'
@@ -14,22 +9,22 @@ class Document
   end
 
   define :tokens do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text)
   end
 
   define :long_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length > 5}
   end
 
   define :short_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length < 5}
   end
 
   define :even_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length % 2 == 0}
   end
 
@@ -40,17 +35,30 @@
   define :tokens_again do
     raise "This should be here already"
   end
-
-  persist :sentences
-  persist_in_tsv :tokens
-  persist_in_tsv :long_words, $tchash_persistence, :Literal
-  persist_in_global_tsv :short_words, $global_persistence
-  persist_in_global_tsv :even_words, $tchash_global_persistence
-  persist_in_global_tsv :missing, $tchash_global_persistence
 end
 
 class TestDocument < Test::Unit::TestCase
 
+  def setup
+    global_fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]
+    $persistence = TSV.setup({})
+    $tchash_persistence = Persist.open_tokyocabinet(Rbbt.tmp.test.document.persistence.find(:user), true, :tsv)
+    $global_persistence = TSV.setup({}, :key => "ID", :fields => global_fields)
+    $tchash_global_persistence = TSV.setup(Persist.open_tokyocabinet(Rbbt.tmp.test.global.persistence.find(:user), true, :list), :key => "ID", :fields => global_fields + ["Document ID", "Entity Type"])
+    $tchash_global_persistence.read
+    $tchash_global_persistence.write
+
+    Document.class_eval do
+
+      persist :sentences
+      persist_in_tsv :tokens, :literal
+      persist_in_tsv :long_words, $tchash_persistence, :literal
+      persist_in_global_tsv :short_words, $global_persistence
+      persist_in_global_tsv :even_words, $tchash_global_persistence
+      persist_in_global_tsv :missing, $tchash_global_persistence
+    end
+  end
+
   def test_annotations
 
     text =<<-EOF
@@ -127,7 +135,7 @@ another sentence.
     doc.text = text
 
     sentence = doc.sentences.last
-    Misc.benchmark(10) do
+    Misc.benchmark(1) do
      doc = Document.new(dir)
      doc.text = text
 
@@ -166,6 +174,15 @@ another sentence.
     assert_equal "another", sentence.tokens[2]
     assert_equal sentence.offset + 0, sentence.tokens[0].offset
 
+    assert_equal 2, sentence.long_words.length
+    doc = Document.new(dir)
+    doc.text = text * 10
+    doc.sentences
+    assert_equal sentence, doc.sentences.last
+
+    sentence = doc.sentences.last
+    doc.load_into sentence, :tokens, :long_words
+
     assert_equal 2, sentence.long_words.length
     assert_equal %w(another sentence), sentence.long_words
     assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
@@ -183,15 +200,16 @@ another sentence.
     FileUtils.mkdir_p dir
 
 
-    doc = Document.new(dir)
+    global_persistence = TSV.setup({}, :fields => %w(Start End annotation_types JSON) + ["Document ID", "Entity Type"])
+    doc = Document.new(dir, nil, nil, global_persistence)
     doc.text = text * 10
-    doc.docid = "FOOF"
-    doc.short_words
+    doc.docid = "TEST"
+
     doc.sentences
 
     doc = Document.new(dir)
     doc.text = text * 10
-    doc.docid = "FOOF"
+    doc.docid = "TEST"
 
     sentence = doc.sentences.last
 
@@ -201,22 +219,6 @@ another sentence.
       assert_equal 3, sentence.even_words.length
     end
   end
-
-  def test_dump
-    text =<<-EOF
-    This is a
-    sentence. This is
-    another sentence.
-    EOF
-
-    TmpFile.with_file do |dir|
-      FileUtils.mkdir_p dir
-
-      doc = Document.new(dir)
-      doc.text = text * 10
-      tsv = Document.tsv(doc.sentences, ["Literal"])
-    end
-  end
 end
 