rbbt-text 1.3.0 → 1.3.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +20 -5
  5. data/lib/rbbt/document/annotation.rb +7 -4
  6. data/lib/rbbt/document/corpus.rb +30 -3
  7. data/lib/rbbt/document/corpus/pubmed.rb +2 -1
  8. data/lib/rbbt/ner/abner.rb +3 -2
  9. data/lib/rbbt/ner/banner.rb +3 -1
  10. data/lib/rbbt/ner/brat.rb +1 -1
  11. data/lib/rbbt/ner/g_norm_plus.rb +7 -1
  12. data/lib/rbbt/ner/linnaeus.rb +2 -1
  13. data/lib/rbbt/ner/patterns.rb +0 -1
  14. data/lib/rbbt/ner/rner.rb +229 -0
  15. data/lib/rbbt/ner/token_trieNER.rb +32 -18
  16. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  17. data/lib/rbbt/nlp/spaCy.rb +195 -0
  18. data/lib/rbbt/relationship.rb +24 -0
  19. data/lib/rbbt/segment.rb +9 -4
  20. data/lib/rbbt/segment/annotation.rb +3 -3
  21. data/lib/rbbt/segment/named_entity.rb +7 -0
  22. data/lib/rbbt/segment/range_index.rb +1 -1
  23. data/lib/rbbt/segment/relationship.rb +7 -0
  24. data/lib/rbbt/segment/transformed.rb +5 -1
  25. data/share/install/software/OpenNLP +1 -1
  26. data/share/rner/config.rb +51 -0
  27. data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
  28. data/test/rbbt/document/test_annotation.rb +15 -6
  29. data/test/rbbt/document/test_corpus.rb +15 -1
  30. data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
  31. data/test/rbbt/ner/test_rner.rb +132 -0
  32. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
  33. data/test/rbbt/segment/test_annotation.rb +3 -4
  34. data/test/rbbt/segment/test_encoding.rb +1 -1
  35. data/test/rbbt/segment/test_named_entity.rb +7 -5
  36. data/test/rbbt/segment/test_range_index.rb +1 -2
  37. data/test/rbbt/segment/test_transformed.rb +33 -4
  38. data/test/rbbt/test_segment.rb +5 -10
  39. data/test/test_spaCy.rb +144 -0
  40. metadata +12 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c2a24d8e7faf30d53e41a00a27f6145e8e9f18f0c10af57cdddaea0ee18c35d6
4
- data.tar.gz: 3475006965110391e35151cd1b5368028dacf467aa276f8eb68fce3320be1122
3
+ metadata.gz: '0846f900d745dd27df8006eecbc9d294f9f38a23dd76001de2a5dc0313db7e22'
4
+ data.tar.gz: 675985882a6c8b9813f620d7ef0a555efa5c148c7c2fe36e0030f84f3fd88cf0
5
5
  SHA512:
6
- metadata.gz: da40a039a4792eb5e7fa00270870279221c74dcbf51df1b5278b247496fefbfa888a87b7ab19f05676644c51a01177eb49e229cb0156fe7f0190dd4933d41e24
7
- data.tar.gz: a32fca5f21a987dcbb6b5541015cc33879330e6f1ef7c4a28e75debe5bdd1dc8bf7b98bfc91d828e605f29868aa972b55cd59bb4f86e66d2fb0cfea31fac2ae0
6
+ metadata.gz: dfd9c333b94181496134b825c63d6e93a0390f81d426526f79c00cf12556021b60004b29b57ca9b0b274141937027f7bc780552a60de007e5f790b19910354c0
7
+ data.tar.gz: 205beeb8829c8358fd29c0a18351522e566106e24220af3d7bec3676694d37d682b92243e4fd4cd495b542f9945a28cf8585e587342672d31779d0b21b53ae4e
data/lib/rbbt/bow/bow.rb CHANGED
@@ -69,6 +69,11 @@ module BagOfWords
69
69
  count = bigrams ? count(bigrams(text)) : count(words(text))
70
70
  count.values_at(*terms)
71
71
  end
72
+
73
+ def self.weighted_features(text, weights)
74
+ features = features(text, weights.keys)
75
+ features.zip(weights.values).collect{|f,w| f * w }
76
+ end
72
77
  end
73
78
 
74
79
  class String
@@ -82,5 +87,3 @@ class String
82
87
  BagOfWords.bigrams(self)
83
88
  end
84
89
  end
85
-
86
-
data/lib/rbbt/bow/dictionary.rb CHANGED
@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
74
74
  end
75
75
 
76
76
  def best(options = {})
77
- high, low, limit = {
78
- :low => 0,
79
- :high => 1,
80
- }.merge(options).
81
- values_at(:high, :low, :limit)
82
-
83
- num_docs = @num_docs.to_f
84
- best = df.select{|term, value|
85
- value >= low && value <= high
86
- }.collect{|p|
87
- term = p.first
88
- df_value = p.last
89
- [term,
90
- @terms[term].to_f / num_docs * Math::log(1.0/df_value)
91
- ]
92
- }
93
-
94
- if limit
95
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
96
- else
97
- Hash[*best.flatten]
98
- end
77
+ key = Misc.obj2digest(options)
78
+ @best ||= {}
79
+ @best[key] ||= begin
80
+ high, low, limit = {
81
+ :low => 0,
82
+ :high => 1,
83
+ }.merge(options).
84
+ values_at(:high, :low, :limit)
85
+
86
+ num_docs = @num_docs.to_f
87
+ best = df.select{|term, value|
88
+ value >= low && value <= high
89
+ }.collect{|p|
90
+ term = p.first
91
+ df_value = p.last
92
+ [term,
93
+ @terms[term].to_f / num_docs * Math::log(1.0/df_value)
94
+ ]
95
+ }
96
+
97
+ if limit
98
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
99
+ else
100
+ Hash[*best.flatten]
101
+ end
102
+ end
99
103
  end
100
104
 
101
105
  def weights(options = {})
@@ -173,7 +177,7 @@ class Dictionary::KL
173
177
  best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
174
178
  }
175
179
  if limit
176
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
180
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
177
181
  else
178
182
  best
179
183
  end
data/lib/rbbt/document.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  require 'rbbt-util'
2
2
  require 'rbbt/entity'
3
- require 'rbbt/document/annotation'
4
3
 
5
4
  module DocID
6
5
  extend Entity
@@ -19,10 +18,20 @@ module DocID
19
18
  DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
20
19
  end
21
20
 
22
- def document
23
- text = self.corpus[self]
24
- namespace, id, type = self.split(":")
25
- Document.setup(text, namespace, id, type, :corpus => corpus)
21
+ property :document => :both do
22
+ if Array === self
23
+ namespace, id, type = nil, nil, nil
24
+ docs = self.collect do |docid|
25
+ text = self.corpus[docid]
26
+ namespace, id, type = docid.split(":")
27
+ text
28
+ end
29
+ Document.setup(docs, :corpus => corpus)
30
+ else
31
+ text = self.corpus[self]
32
+ namespace, id, type = self.split(":")
33
+ Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
34
+ end
26
35
  end
27
36
  end
28
37
 
@@ -44,3 +53,9 @@ module Document
44
53
  alias id docid
45
54
  end
46
55
 
56
+ #class String
57
+ # def docid
58
+ # digest = Misc.digest(self)
59
+ # ["STRING", digest, nil, nil] * ":"
60
+ # end
61
+ #end
data/lib/rbbt/document/annotation.rb CHANGED
@@ -1,3 +1,4 @@
1
+ require 'rbbt/segment'
1
2
  require 'rbbt/segment/annotation'
2
3
 
3
4
  module Document
@@ -12,7 +13,7 @@ module Document
12
13
  end
13
14
 
14
15
  docid = self.docid
15
- segments.each{|s| s.docid = docid if s.docid.nil? }
16
+ segments.each{|s| s.docid = docid }
16
17
 
17
18
  segments
18
19
  end
@@ -22,18 +23,20 @@ module Document
22
23
  send :property, type => :multiple do |list|
23
24
  doc_segments = self.instance_exec list, &block
24
25
 
25
- doc_segments = doc_segments.chunked_values_at(self) if Hash === doc_segments
26
+ doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
26
27
 
27
28
  doc_segments.each_with_index do |segments,i|
29
+ next if segments.nil?
28
30
  document = list[i]
29
- Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
31
+ Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
30
32
 
31
33
  segments.each do |segment|
32
34
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
33
35
  end
34
36
 
35
37
  docid = document.docid
36
- segments.each{|s| s.docid = docid if s.docid.nil? }
38
+
39
+ segments.each{|s| s.docid = docid }
37
40
 
38
41
  segments
39
42
  end
data/lib/rbbt/document/corpus.rb CHANGED
@@ -3,17 +3,43 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
- corpus.extend Document::Corpus
6
+ corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
7
+ corpus.extend Document::Corpus unless Document::Corpus === corpus
8
+ corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
9
+ corpus
7
10
  end
8
11
 
9
12
  def add_document(document)
10
- self[document.docid] = document
13
+ docid = document.docid
14
+ return self[docid] if self.include?(docid)
15
+ self.write_and_close do
16
+ self[docid] = document
17
+ end
18
+ end
19
+
20
+ def docids(*prefix)
21
+ prefix = prefix * ":"
22
+ prefix += ":" unless prefix == :all || prefix[-1] == ":"
23
+ docids = self.read_and_close do
24
+ prefix == :all ? self.keys : self.prefix(prefix)
25
+ end
26
+ DocID.setup(docids, :corpus => self)
27
+ end
28
+
29
+ def documents(*prefix)
30
+ self.docids(*prefix).document
11
31
  end
12
32
 
13
33
  def [](*args)
14
34
  docid, *rest = args
15
- res = super(*args)
35
+
36
+ res = self.read_and_close do
37
+ super(*args)
38
+ end
39
+
40
+ res.force_encoding(Encoding.default_external) if res
16
41
  return res if args.length > 1
42
+
17
43
  namespace, id, type = docid.split(":")
18
44
 
19
45
  if res.nil?
@@ -22,6 +48,7 @@ module Document::Corpus
22
48
  end
23
49
  end
24
50
 
51
+ res.force_encoding(Encoding.default_external) if res
25
52
  Document.setup(res, namespace, id, type, self) unless res.nil?
26
53
 
27
54
  res
data/lib/rbbt/document/corpus/pubmed.rb CHANGED
@@ -6,7 +6,6 @@ module Document::Corpus
6
6
  type = nil if String === type and type.empty?
7
7
 
8
8
  res = PubMed.get_article(pmids).collect do |pmid, article|
9
- Log.debug "Loading pmid #{pmid}"
10
9
  document = if type.nil? || type.to_sym == :abstract
11
10
  Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
12
11
  elsif type.to_sym == :title
@@ -15,7 +14,9 @@ module Document::Corpus
15
14
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
16
15
  Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
17
16
  end
17
+ Log.debug "Loading pmid #{pmid}"
18
18
  add_document(document)
19
+ document
19
20
  end
20
21
 
21
22
  Document.setup(res)
data/lib/rbbt/ner/abner.rb CHANGED
@@ -39,14 +39,15 @@ class Abner < NER
39
39
  types = res[1]
40
40
  strings = res[0]
41
41
 
42
+ docid = Misc.digest(text)
42
43
  global_offset = 0
43
44
  strings.zip(types).collect do |mention, type|
44
45
  mention = mention.to_s;
45
46
  offset = text.index(mention)
46
47
  if offset.nil?
47
- NamedEntity.setup(mention, nil, type.to_s)
48
+ NamedEntity.setup(mention, :docid => docid, :entity_type => type)
48
49
  else
49
- NamedEntity.setup(mention, offset + global_offset, type.to_s)
50
+ NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
50
51
  text = text[offset + mention.length..-1]
51
52
  global_offset += offset + mention.length
52
53
  end
data/lib/rbbt/ner/banner.rb CHANGED
@@ -55,6 +55,7 @@ class Banner < NER
55
55
  # text.
56
56
  def match(text)
57
57
  return [] if text.nil?
58
+ text = text.dup if text.frozen?
58
59
  text.gsub!(/\n/,' ')
59
60
  text.gsub!(/\|/,'/') # Character | gives an error
60
61
  return [] if text.strip.empty?
@@ -66,6 +67,7 @@ class Banner < NER
66
67
  @parenPP.postProcess(sentence)
67
68
  tagged = sentence.getSGML
68
69
 
70
+ docid = Misc.digest text
69
71
  res = tagged.scan(/<GENE>.*?<\/GENE>/).
70
72
  collect{|r|
71
73
  r.match(/<GENE>(.*?)<\/GENE>/)
@@ -73,7 +75,7 @@ class Banner < NER
73
75
  mention.sub!(/^\s*/,'')
74
76
  mention.sub!(/\s*$/,'')
75
77
  offset = text.index(mention)
76
- NamedEntity.setup(mention, offset, 'GENE')
78
+ NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
77
79
  mention
78
80
  }
79
81
  res
data/lib/rbbt/ner/brat.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/segment/named_entity'
2
- require 'rbbt/text/segment/relationship'
2
+ require 'rbbt/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
5
5
 
data/lib/rbbt/ner/g_norm_plus.rb CHANGED
@@ -55,11 +55,16 @@ EOF
55
55
  Open.mkdir 'tmp'
56
56
 
57
57
  texts.each do |name,text|
58
+ text = Misc.fixutf8(text)
59
+
60
+ text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
61
+
58
62
  Open.write("input/#{name}.txt") do |f|
59
- f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
63
+ f.puts "#{name}|a|" << text
60
64
  f.puts
61
65
  end
62
66
  end
67
+
63
68
  Open.write('config', CONFIG)
64
69
  CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
65
70
 
@@ -95,6 +100,7 @@ EOF
95
100
 
96
101
  res[name] = segments
97
102
  end
103
+ res
98
104
  end
99
105
  end
100
106
 
data/lib/rbbt/ner/linnaeus.rb CHANGED
@@ -31,7 +31,8 @@ module Linnaeus
31
31
  init unless defined? @@Matcher
32
32
 
33
33
  @@Matcher.match(text).toArray().collect do |mention|
34
- NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
34
+ best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
35
+ NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
35
36
  end
36
37
  end
37
38
  end
data/lib/rbbt/ner/patterns.rb CHANGED
@@ -15,7 +15,6 @@ class PatternRelExt
15
15
  segments = sentence.segments
16
16
  segments = segments.values.flatten if Hash === segments
17
17
  Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
18
- ppp sentence
19
18
  regexpNER.entities(sentence)
20
19
  end
21
20
  end
data/lib/rbbt/ner/rner.rb ADDED
@@ -0,0 +1,229 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/util/misc'
4
+ require 'rbbt/util/simpleDSL'
5
+
6
+ class NERFeatures
7
+ include SimpleDSL
8
+
9
+ def self.tokens(text)
10
+ text.scan(/
11
+ \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
12
+ \w-\w*|
13
+ \w+-[A-Z](?!\w)|
14
+ \w+|
15
+ [.,()\/\[\]{}'"+-]
16
+ /x)
17
+ end
18
+
19
+ def self.reverse(text)
20
+ tokens(text).reverse.join(" ")
21
+ end
22
+
23
+ def define(name, *args, &block)
24
+ action = args[0] || block || /#{name.to_s}s?/i
25
+ raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
26
+
27
+ @types[name.to_s] = action
28
+ @order.push name.to_s
29
+
30
+ name.to_s
31
+ end
32
+
33
+ attr_accessor :reverse
34
+ def initialize(file = nil, reverse = false, &block)
35
+ @types = {}
36
+ @order = []
37
+ @context = []
38
+ @reverse = reverse
39
+
40
+ file ||= Rbbt.share.ner['config.rb'].find if !file && !block
41
+
42
+ parse(:define, file, &block)
43
+ end
44
+
45
+ def config
46
+ @config[:define]
47
+ end
48
+
49
+ def window(positions)
50
+ @window = positions
51
+ end
52
+
53
+ def context(name, &block)
54
+ if name.is_a? Array
55
+ @context += name
56
+ else
57
+ @context.push name
58
+
59
+ # The block might be wrongly assigned to this function
60
+ # instead of the actual definition, fix that.
61
+ if block
62
+ @types[name] = block
63
+ end
64
+ end
65
+ end
66
+
67
+ def direction(dir)
68
+ if dir.to_sym == :reverse
69
+ @reverse = true
70
+ end
71
+ end
72
+
73
+ def features(word)
74
+ values = [word]
75
+
76
+ @order.each{|features|
77
+ action = @types[features]
78
+ if action.is_a?(Proc)
79
+ values.push(action.call(word))
80
+ else
81
+ m = action.match(word)
82
+ if m
83
+ if m[1]
84
+ values.push(m[1])
85
+ else
86
+ values.push(m != nil)
87
+ end
88
+ else
89
+ values.push(false)
90
+ end
91
+ end
92
+ }
93
+ values
94
+ end
95
+
96
+ def template(window=nil)
97
+ window ||= @window || [1,-1]
98
+ template = ""
99
+
100
+ i = 1
101
+ @order.each{|feat|
102
+ template += "U#{ feat }: %x[0,#{ i }]\n"
103
+
104
+ if @context.include?(feat)
105
+ window.each{|p|
106
+ template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
107
+ }
108
+ end
109
+ i += 1
110
+ }
111
+
112
+ template += "B\n"
113
+
114
+ template
115
+ end
116
+
117
+
118
+ def text_features(text, positive = nil)
119
+ text = self.class.reverse(text) if @reverse
120
+ initial = true
121
+ self.class.tokens(text).collect{|token|
122
+ features = features(token)
123
+ if !positive.nil?
124
+ features << (positive ? (initial ? 1 : 2) : 0)
125
+ initial = false
126
+ end
127
+ features
128
+ }
129
+ end
130
+
131
+ def tagged_features(text, mentions)
132
+ mentions ||= []
133
+ mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
134
+ re = mentions.collect{|mention|
135
+ Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
136
+ }.join("|")
137
+
138
+ positive = false
139
+ features = []
140
+ chunks = text.split(/(#{re})/)
141
+ chunks.each{|t|
142
+ chunk_features = text_features(t, positive)
143
+ positive = !positive
144
+ if @reverse
145
+ features = chunk_features + features
146
+ else
147
+ features = features + chunk_features
148
+ end
149
+ }
150
+ features
151
+ end
152
+
153
+ def train(features, model)
154
+ tmp_template = TmpFile.tmp_file("template-")
155
+ Open.write(tmp_template,template)
156
+
157
+ cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}' '#{features}' '#{model}'"
158
+ system cmd
159
+ Open.write(model + '.config',config)
160
+ FileUtils.rm tmp_template
161
+ end
162
+
163
+ end
164
+
165
+ class NER
166
+
167
+ def initialize(model = nil)
168
+ begin
169
+ require 'CRFPP'
170
+ rescue Exception
171
+ require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
172
+ end
173
+
174
+ model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
175
+
176
+ @parser = NERFeatures.new(model + '.config')
177
+ @reverse = @parser.reverse
178
+ @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
179
+ end
180
+
181
+ def extract(text)
182
+ features = @parser.text_features(text)
183
+
184
+ @tagger.clear
185
+ features.each{|feats|
186
+ @tagger.add(feats.join(" "))
187
+ }
188
+
189
+ @tagger.parse
190
+
191
+ found = []
192
+ mention = []
193
+
194
+ @tagger.size.times{|i|
195
+ label = @tagger.y(i)
196
+ word = @tagger.x(i,0)
197
+
198
+ if word == ')'
199
+ mention.push(')') if mention.join =~ /\(/
200
+ next
201
+ end
202
+
203
+ case label
204
+ when 1
205
+ if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
206
+ found.push(mention)
207
+ mention = []
208
+ end
209
+ mention.push(word)
210
+ when 2
211
+ mention.push(word)
212
+ when 0
213
+ found.push(mention) if mention.any?
214
+ mention = []
215
+ end
216
+ }
217
+
218
+ found << mention if mention.any?
219
+
220
+ found.collect{|list|
221
+ list = list.reverse if @reverse
222
+ list.join(" ")
223
+ }
224
+ end
225
+
226
+ end
227
+
228
+
229
+