rbbt-text 0.5.0 → 0.6.0

Files changed (39)
  1. data/lib/rbbt/corpus/corpus.rb +15 -6
  2. data/lib/rbbt/corpus/document.rb +100 -127
  3. data/lib/rbbt/corpus/document_repo.rb +72 -51
  4. data/lib/rbbt/ner/NER.rb +4 -4
  5. data/lib/rbbt/ner/abner.rb +5 -4
  6. data/lib/rbbt/ner/banner.rb +3 -3
  7. data/lib/rbbt/ner/chemical_tagger.rb +3 -3
  8. data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
  9. data/lib/rbbt/ner/oscar3.rb +3 -3
  10. data/lib/rbbt/ner/oscar4.rb +3 -3
  11. data/lib/rbbt/ner/patterns.rb +15 -13
  12. data/lib/rbbt/ner/regexpNER.rb +3 -2
  13. data/lib/rbbt/ner/rnorm.rb +2 -2
  14. data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
  15. data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
  16. data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
  17. data/lib/rbbt/ner/segment/relationship.rb +20 -0
  18. data/lib/rbbt/ner/segment/segmented.rb +13 -0
  19. data/lib/rbbt/ner/segment/token.rb +24 -0
  20. data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
  21. data/lib/rbbt/ner/token_trieNER.rb +30 -22
  22. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  23. data/lib/rbbt/nlp/nlp.rb +23 -37
  24. data/test/rbbt/corpus/test_document.rb +39 -37
  25. data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
  26. data/test/rbbt/ner/segment/test_segmented.rb +23 -0
  27. data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
  28. data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
  29. data/test/rbbt/ner/test_patterns.rb +11 -12
  30. data/test/rbbt/ner/test_regexpNER.rb +5 -4
  31. data/test/rbbt/ner/test_segment.rb +101 -0
  32. data/test/rbbt/ner/test_token_trieNER.rb +8 -9
  33. data/test/test_helper.rb +6 -6
  34. metadata +40 -22
  35. data/lib/rbbt/ner/annotations/annotated.rb +0 -15
  36. data/lib/rbbt/ner/annotations/relations.rb +0 -25
  37. data/lib/rbbt/ner/annotations/token.rb +0 -28
  38. data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
  39. data/test/rbbt/ner/test_annotations.rb +0 -70
data/lib/rbbt/ner/{annotations → segment}/named_entity.rb RENAMED
@@ -1,17 +1,9 @@
-require 'rbbt/ner/annotations'
+require 'rbbt/ner/segment'
 
 module NamedEntity
-  attr_accessor :type, :code, :score, :segment_types
+  extend Annotation
   include Segment
-
-  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
-    string.extend NamedEntity
-    string.offset = offset unless offset.nil?
-    string.type = type unless type.nil?
-    string.code = code unless code.nil?
-    string.score = score unless score.nil?
-    string
-  end
+  self.annotation :type, :code, :score
 
   def report
     <<-EOF
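
Note: the removed `NamedEntity.annotate` class method is superseded by the `setup` method that `extend Annotation` generates from the declared fields. A minimal usage sketch, assuming `setup` takes the Segment offset first and then the declared annotations in order (consistent with the `NamedEntity.setup(match, offset, type, codes)` call in token_trieNER.rb below):

    require 'rbbt/ner/segment/named_entity'

    entity = "TP53"
    # before: NamedEntity.annotate(entity, 10, "Gene", "P04637", 0.9)
    NamedEntity.setup(entity, 10, "Gene", "P04637", 0.9)

    entity.offset # => 10 (from Segment)
    entity.type   # => "Gene"
    entity.code   # => "P04637"
    entity.score  # => 0.9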
data/lib/rbbt/ner/segment/relationship.rb ADDED
@@ -0,0 +1,20 @@
+require 'rbbt/ner/segment'
+
+module Relationship
+  extend Annotation
+  include Segment
+  self.annotation :terms
+
+  def html
+    text = <<-EOF
+<span class='Relationship'\
+>#{ self }</span>
+    EOF
+    text.chomp
+  end
+
+  def html_with_entities(*types)
+    annotations.values_at(*types).each do |segments|
+    end
+  end
+end
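
A hedged sketch of the new Relationship module; the positional `setup` signature (offset first, then the declared `:terms`) is an assumption based on the Annotation pattern used throughout this release:

    require 'rbbt/ner/segment/relationship'

    rel = "TP53 inhibits MDM2"
    Relationship.setup(rel, 0, %w(TP53 MDM2)) # offset, then :terms

    rel.html # => "<span class='Relationship'>TP53 inhibits MDM2</span>"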
data/lib/rbbt/ner/segment/segmented.rb ADDED
@@ -0,0 +1,13 @@
+require 'rbbt/annotations'
+require 'rbbt/ner/segment'
+
+module Segmented
+  extend Annotation
+  self.annotation :segments
+
+  def split_segments(skip_segments = false)
+    Segment.split(self, @segments, skip_segments)
+  end
+end
+
+
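
`Segmented` declares only a `:segments` annotation, and `split_segments` delegates to `Segment.split`. A sketch, assuming `Segment.split` returns the stretches of text delimited by the given segments:

    require 'rbbt/ner/segment/segmented'
    require 'rbbt/ner/segment/named_entity'

    text = "TP53 is mutated."
    gene = NamedEntity.setup("TP53", 0, "Gene")

    Segmented.setup(text, [gene]) # assigns the :segments annotation
    text.split_segments           # text chunks around the segments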
data/lib/rbbt/ner/segment/token.rb ADDED
@@ -0,0 +1,24 @@
+require 'rbbt/annotations'
+require 'rbbt/ner/segment'
+
+module Token
+  extend Annotation
+  include Segment
+  self.annotation :original
+
+  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
+
+    tokens = []
+    while matchdata = text.match(split_at)
+      tokens << Token.setup(matchdata.pre_match, start) unless matchdata.pre_match.empty?
+      tokens << Token.setup(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      start += matchdata.end(0)
+      text = matchdata.post_match
+    end
+
+    tokens << Token.setup(text, start) unless text.empty?
+
+    tokens
+  end
+end
+
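
`Token.tokenize` walks the text with the `split_at` regexp, emitting a Token for every non-empty stretch and every captured delimiter while tracking offsets through `start`. For example:

    require 'rbbt/ner/segment/token'

    tokens = Token.tokenize("This is a sentence.")
    # => ["This", "is", "a", "sentence", "."]
    tokens.first.offset # => 0
    tokens.last.offset  # => 18 (the final '.')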
data/lib/rbbt/ner/{annotations → segment}/transformed.rb RENAMED
@@ -1,7 +1,16 @@
-require 'rbbt/ner/annotations'
+require 'rbbt/ner/segment'
 module Transformed
   attr_accessor :transformation_offset_differences, :transformation_original
 
+  def self.transform(text, segments, replacement = nil, &block)
+    require 'rbbt/util/misc'
+
+    text.extend Transformed
+    text.replace(segments, replacement, &block)
+
+    text
+  end
+
   def self.with_transform(text, segments, replacement)
     require 'rbbt/util/misc'
 
@@ -14,16 +23,7 @@ module Transformed
 
     text.restore(segments, true)
   end
-
-  def self.transform(text, segments, replacement = nil, &block)
-    require 'rbbt/util/misc'
 
-    text.extend Transformed
-    text.replace(segments, replacement, &block)
-
-    text
-  end
-
   def transform_pos(pos)
     return pos if transformation_offset_differences.nil?
     # tranformation_offset_differences are assumed to be sorted in reverse
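
`Transformed.transform` (moved ahead of `with_transform` here) extends the string and replaces each segment in place, recording offset differences so positions can later be mapped back via `transform_pos`. A masking sketch, assuming `replacement` may be a literal string as well as a block:

    require 'rbbt/ner/segment/transformed'
    require 'rbbt/ner/segment/named_entity'

    text = "TP53 is mutated."
    gene = NamedEntity.setup("TP53", 0, "Gene")

    Transformed.transform(text, [gene], "[GENE]")
    text # => "[GENE] is mutated."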
data/lib/rbbt/ner/token_trieNER.rb CHANGED
@@ -1,7 +1,7 @@
-require 'rbbt-util'
-require 'rbbt/util/tsv'
-require 'rbbt/ner/annotations'
-require 'rbbt/ner/annotations/token'
+require 'rbbt'
+require 'rbbt/tsv'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/token'
 require 'rbbt/ner/NER'
 
 class TokenTrieNER < NER
@@ -16,15 +16,15 @@ class TokenTrieNER < NER
   def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
     if no_clean
       if extend_to_token
-        Token.annotate(clean(token), start, token)
+        Token.setup(clean(token), start, token)
       else
-        clean(token)
+        token
       end
     else
       if extend_to_token
-        Token.annotate(clean(token), start, token)
+        Token.setup(clean(token), start, token)
       else
-        token
+        clean(token)
       end
     end
   end
@@ -137,6 +137,11 @@ class TokenTrieNER < NER
     hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
       names = Array === names ? names : [names]
       names.flatten! if Array === names.first and not Token === names.first.first
+
+      if names.empty?
+        names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
+      end
+
       names.each do |name|
         next if name.empty? or (String === name and name.length < 2)
 
@@ -167,7 +172,7 @@ class TokenTrieNER < NER
       return index[head]
     end
 
-    return nil unless (not TCHash === index ) and index.include? :PROCS
+    return nil unless (not TokyoCabinet::HDB === index ) and index.include? :PROCS
 
     index[:PROCS].each do |key,value|
       return value if key.call(head)
@@ -225,16 +230,16 @@ class TokenTrieNER < NER
     match_offset = match_tokens.first.offset
     match_tokens.each{|t|
       match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
-      match << (t.respond_to?(:original) ? t.original : t)
+      match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
 
-    NamedEntity.annotate(match, match_tokens.first.offset, type, codes)
+    NamedEntity.setup(match, match_tokens.first.offset, type, codes)
   end
 
   attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
-      :persistence => false
+      :persist => false
     @slack = slack
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
@@ -242,16 +247,15 @@ class TokenTrieNER < NER
 
     file = [] if file.nil?
     file = [file] unless Array === file
-    @index = Persistence.persist(file, :TokenTRIE, :tsv, options) do |file, options, filename, persistecen_file|
-      if persistecen_file.nil?
-        @index = {}
-      else
-        FileUtils.mkdir_p File.dirname(persistecen_file) unless File.exists? File.dirname(persistecen_file)
-        @index = TCHash.get persistecen_file, true, :marshal
-      end
+    persist_options = Misc.pull_keys options, :persist
+    @index = Persist.persist_tsv(file, options, persist_options) do |data|
+      data.serializer = :marshal if data.respond_to? :serializer and data.serializer == :type
+
+      @index = data
       file.each do |f|
         merge(f, type)
       end
+
       @index
     end
   end
@@ -259,10 +263,10 @@ class TokenTrieNER < NER
   def merge(new, type = nil)
     case
     when TokenTrieNER === new
+      Log.debug "TokenTrieNER merging other TokenTrieNER"
       TokenTrieNER.merge(@index, new.index)
-    when Hash === new
-      TokenTrieNER.merge(@index, new)
     when TSV === new
+      Log.debug "TokenTrieNER merging TSV"
       old_unnamed = new.unnamed
       old_monitor = new.monitor
       new.unnamed = true
@@ -270,8 +274,12 @@ class TokenTrieNER < NER
       TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
       new.unnamed = old_unnamed
       new.monitor = old_monitor
+    when Hash === new
+      Log.debug "TokenTrieNER merging Hash"
+      TokenTrieNER.merge(@index, new)
     when String === new
-      new = TSV.new(new, :flat)
+      Log.debug "TokenTrieNER merging file: #{ new }"
+      new = TSV.open(new, :flat)
       new.unnamed = true
       new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
       TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
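
Taken together: the constructor now takes `:persist` in place of `:persistence`, persistence goes through `Persist.persist_tsv`, and `merge` accepts another TokenTrieNER, a TSV, an already-built index Hash, or a file path (loaded with `TSV.open` rather than `TSV.new`). A hedged usage sketch, assuming `TSV.setup` can wrap a plain Hash as a :flat code-to-synonyms dictionary and that the `match` entry point is unchanged from 0.5.0:

    require 'rbbt/ner/token_trieNER'

    dict = TSV.setup({ "P04637" => %w(TP53 p53) }, :type => :flat)

    ner = TokenTrieNER.new("Gene", nil, :persist => false)
    ner.merge(dict)

    matches = ner.match("Mutations in p53 are common.")
    matches.first        # => "p53", a NamedEntity segment
    matches.first.offset # offset within the input text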
data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED
@@ -1,3 +1,4 @@
+require 'rbbt/ner/segment'
 module NLP
   def self.returnFeatures(prevWord, delimiter, nextWord)
     if nextWord.match(/__ss__/)
@@ -206,7 +207,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      Segment.annotate sentence, s
+      Segment.setup sentence, s
       sentence
     end
 
data/lib/rbbt/nlp/nlp.rb CHANGED
@@ -1,9 +1,9 @@
 require 'rbbt'
 require 'rbbt/util/tmpfile'
-require 'rbbt/util/persistence'
-require 'rbbt/util/resource'
-require 'rbbt/ner/annotations'
-require 'rbbt/ner/annotations/annotated'
+require 'rbbt/persist'
+require 'rbbt/resource'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'
 
@@ -11,7 +11,7 @@ require 'digest/md5'
 module NLP
 
   extend LocalPersist
-  self.local_persistence_dir = '/tmp/crap'
+  self.local_persist_dir = '/tmp/crap'
 
   #Rbbt.software.opt.StanfordParser.define_as_install Rbbt.share.install.software.StanfordParser.find
   #Rbbt.software.opt.StanfordParser.produce
@@ -81,44 +81,21 @@ module NLP
       sentence = text[s..e]
       next if sentence.nil?
       #sentence.gsub!(NEW_LINE_MASK, "\n")
-      Segment.annotate sentence, s
+      Segment.setup sentence, s
       sentence
     end
   end
 
   module GdepToken
-    attr_accessor :num, :token, :lemma, :chunk, :pos, :bio, :link, :dep
+    extend Annotation
     include Segment
-
-    def self.annotate(token, offset = nil, num = nil, lemma = nil, chunk = nil, pos = nil, bio = nil, link = nil, dep = nil)
-      token.extend GdepToken
-
-      token.offset = offset
-      token.num = num
-      token.lemma = lemma
-      token.chunk = chunk
-      token.pos = pos
-      token.bio = bio
-      token.link = link
-      token.dep = dep
-
-      token
-    end
+    self.annotation :num, :lemma, :chunk, :pos, :bio, :link, :dep
   end
 
   module GdepChunk
-    attr_accessor :type, :parts, :segment_types
+    extend Annotation
     include Segment
-
-    def self.annotate(string, offset = nil, type = nil, parts = nil)
-      string.extend GdepChunk
-
-      string.offset = offset
-      string.type = type
-      string.parts = parts
-
-      string
-    end
+    self.annotation :type, :parts
   end
 
   def self.merge_vp_chunks(chunk_list)
@@ -148,7 +125,7 @@ module NLP
     chunk_start = "B"[0]
     chunk_inside = "I"[0]
 
-    last = GdepToken.annotate("LW")
+    last = GdepToken.setup("LW")
 
     chunk_segments = []
     segment_list.each do |segment|
@@ -159,7 +136,7 @@ module NLP
        cstart = chunk_segments.first.offset
        cend = chunk_segments.last.end
        chunk = sentence[cstart..cend]
-        GdepChunk.annotate(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
+        GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
        chunks << chunk
      end
 
@@ -172,6 +149,15 @@ module NLP
       last = segment
     end
 
+    if chunk_segments.any?
+      cstart = chunk_segments.first.offset
+      cend = chunk_segments.last.end
+      chunk = sentence[cstart..cend]
+      GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
+      chunks << chunk
+    end
+
+
     chunks
   end
 
@@ -188,7 +174,7 @@ module NLP
     tokens = sentence.split(/\n/).collect do |line|
       next if line.empty?
       num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)
-      GdepToken.annotate(token, nil, num, lemma, chunk, pos, bio, link, dep)
+      GdepToken.setup(token, nil, num, lemma, chunk, pos, bio, link, dep)
     end.compact
   end
 end
@@ -214,7 +200,7 @@ module NLP
       Gdep.new.tag(sentence).split(/\n/).collect do |line|
         next if line.empty?
         token, lemma, pos, chunk = line.split(/\t/)
-        GdepToken.annotate(token, nil, nil, lemma, chunk, pos)
+        GdepToken.setup(token, nil, nil, lemma, chunk, pos)
         token
       end.compact
     }
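
Both parsing loops build annotated tokens straight from Gdep's tab-separated output, one line per token; `GdepToken.setup` replaces the old `annotate` with the same positional fields. With a hypothetical Gdep output line:

    line = "1\tTP53\tTP53\tB-NP\tNN\tB-GENE\t2\tSUB"
    num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)

    GdepToken.setup(token, nil, num, lemma, chunk, pos, bio, link, dep)
    token.pos   # => "NN"
    token.chunk # => "B-NP"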
data/test/rbbt/corpus/test_document.rb CHANGED
@@ -2,11 +2,6 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
 require 'rbbt/corpus/document'
 require 'test/unit'
 
-$persistence = TSV.new({})
-$tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
-$global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-$tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-
 class Document
   define :sentences do
     require 'rbbt/nlp/nlp'
@@ -14,22 +9,22 @@ class Document
   end
 
   define :tokens do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text)
   end
 
   define :long_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length > 5}
   end
 
   define :short_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length < 5}
   end
 
   define :even_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length % 2 == 0}
   end
 
@@ -40,17 +35,30 @@ class Document
   define :tokens_again do
     raise "This should be here already"
   end
-
-  persist :sentences
-  persist_in_tsv :tokens
-  persist_in_tsv :long_words, $tchash_persistence, :Literal
-  persist_in_global_tsv :short_words, $global_persistence
-  persist_in_global_tsv :even_words, $tchash_global_persistence
-  persist_in_global_tsv :missing, $tchash_global_persistence
 end
 
 class TestDocument < Test::Unit::TestCase
 
+  def setup
+    global_fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]
+    $persistence = TSV.setup({})
+    $tchash_persistence = Persist.open_tokyocabinet(Rbbt.tmp.test.document.persistence.find(:user), true, :tsv)
+    $global_persistence = TSV.setup({}, :key => "ID", :fields => global_fields)
+    $tchash_global_persistence = TSV.setup(Persist.open_tokyocabinet(Rbbt.tmp.test.global.persistence.find(:user), true, :list), :key => "ID", :fields => global_fields + ["Document ID", "Entity Type"])
+    $tchash_global_persistence.read
+    $tchash_global_persistence.write
+
+    Document.class_eval do
+
+      persist :sentences
+      persist_in_tsv :tokens, :literal
+      persist_in_tsv :long_words, $tchash_persistence, :literal
+      persist_in_global_tsv :short_words, $global_persistence
+      persist_in_global_tsv :even_words, $tchash_global_persistence
+      persist_in_global_tsv :missing, $tchash_global_persistence
+    end
+  end
+
   def test_annotations
 
     text =<<-EOF
@@ -127,7 +135,7 @@ another sentence.
       doc.text = text
 
       sentence = doc.sentences.last
-      Misc.benchmark(10) do
+      Misc.benchmark(1) do
        doc = Document.new(dir)
        doc.text = text
 
@@ -166,6 +174,15 @@ another sentence.
       assert_equal "another", sentence.tokens[2]
       assert_equal sentence.offset + 0, sentence.tokens[0].offset
 
+      assert_equal 2, sentence.long_words.length
+      doc = Document.new(dir)
+      doc.text = text * 10
+      doc.sentences
+      assert_equal sentence, doc.sentences.last
+
+      sentence = doc.sentences.last
+      doc.load_into sentence, :tokens, :long_words
+
       assert_equal 2, sentence.long_words.length
       assert_equal %w(another sentence), sentence.long_words
       assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
@@ -183,15 +200,16 @@ another sentence.
       FileUtils.mkdir_p dir
 
 
-      doc = Document.new(dir)
+      global_persistence = TSV.setup({}, :fields => %w(Start End annotation_types JSON) + ["Document ID", "Entity Type"])
+      doc = Document.new(dir, nil, nil, global_persistence)
       doc.text = text * 10
-      doc.docid = "FOOF"
-      doc.short_words
+      doc.docid = "TEST"
+
       doc.sentences
 
       doc = Document.new(dir)
       doc.text = text * 10
-      doc.docid = "FOOF"
+      doc.docid = "TEST"
 
       sentence = doc.sentences.last
 
@@ -201,22 +219,6 @@ another sentence.
       assert_equal 3, sentence.even_words.length
     end
   end
-
-  def test_dump
-    text =<<-EOF
-This is a
-sentence. This is
-another sentence.
-    EOF
-
-    TmpFile.with_file do |dir|
-      FileUtils.mkdir_p dir
-
-      doc = Document.new(dir)
-      doc.text = text * 10
-      tsv = Document.tsv(doc.sentences, ["Literal"])
-    end
-  end
 end
 