rbbt-text 1.3.0 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rbbt/bow/bow.rb +5 -2
  3. data/lib/rbbt/bow/dictionary.rb +27 -23
  4. data/lib/rbbt/document.rb +20 -5
  5. data/lib/rbbt/document/annotation.rb +7 -4
  6. data/lib/rbbt/document/corpus.rb +30 -3
  7. data/lib/rbbt/document/corpus/pubmed.rb +2 -1
  8. data/lib/rbbt/ner/abner.rb +3 -2
  9. data/lib/rbbt/ner/banner.rb +3 -1
  10. data/lib/rbbt/ner/brat.rb +1 -1
  11. data/lib/rbbt/ner/g_norm_plus.rb +7 -1
  12. data/lib/rbbt/ner/linnaeus.rb +2 -1
  13. data/lib/rbbt/ner/patterns.rb +0 -1
  14. data/lib/rbbt/ner/rner.rb +229 -0
  15. data/lib/rbbt/ner/token_trieNER.rb +32 -18
  16. data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
  17. data/lib/rbbt/nlp/spaCy.rb +195 -0
  18. data/lib/rbbt/relationship.rb +24 -0
  19. data/lib/rbbt/segment.rb +9 -4
  20. data/lib/rbbt/segment/annotation.rb +3 -3
  21. data/lib/rbbt/segment/named_entity.rb +7 -0
  22. data/lib/rbbt/segment/range_index.rb +1 -1
  23. data/lib/rbbt/segment/relationship.rb +7 -0
  24. data/lib/rbbt/segment/transformed.rb +5 -1
  25. data/share/install/software/OpenNLP +1 -1
  26. data/share/rner/config.rb +51 -0
  27. data/test/rbbt/document/corpus/test_pubmed.rb +2 -1
  28. data/test/rbbt/document/test_annotation.rb +15 -6
  29. data/test/rbbt/document/test_corpus.rb +15 -1
  30. data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
  31. data/test/rbbt/ner/test_rner.rb +132 -0
  32. data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
  33. data/test/rbbt/segment/test_annotation.rb +3 -4
  34. data/test/rbbt/segment/test_encoding.rb +1 -1
  35. data/test/rbbt/segment/test_named_entity.rb +7 -5
  36. data/test/rbbt/segment/test_range_index.rb +1 -2
  37. data/test/rbbt/segment/test_transformed.rb +33 -4
  38. data/test/rbbt/test_segment.rb +5 -10
  39. data/test/test_spaCy.rb +144 -0
  40. metadata +12 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c2a24d8e7faf30d53e41a00a27f6145e8e9f18f0c10af57cdddaea0ee18c35d6
4
- data.tar.gz: 3475006965110391e35151cd1b5368028dacf467aa276f8eb68fce3320be1122
3
+ metadata.gz: '0846f900d745dd27df8006eecbc9d294f9f38a23dd76001de2a5dc0313db7e22'
4
+ data.tar.gz: 675985882a6c8b9813f620d7ef0a555efa5c148c7c2fe36e0030f84f3fd88cf0
5
5
  SHA512:
6
- metadata.gz: da40a039a4792eb5e7fa00270870279221c74dcbf51df1b5278b247496fefbfa888a87b7ab19f05676644c51a01177eb49e229cb0156fe7f0190dd4933d41e24
7
- data.tar.gz: a32fca5f21a987dcbb6b5541015cc33879330e6f1ef7c4a28e75debe5bdd1dc8bf7b98bfc91d828e605f29868aa972b55cd59bb4f86e66d2fb0cfea31fac2ae0
6
+ metadata.gz: dfd9c333b94181496134b825c63d6e93a0390f81d426526f79c00cf12556021b60004b29b57ca9b0b274141937027f7bc780552a60de007e5f790b19910354c0
7
+ data.tar.gz: 205beeb8829c8358fd29c0a18351522e566106e24220af3d7bec3676694d37d682b92243e4fd4cd495b542f9945a28cf8585e587342672d31779d0b21b53ae4e
data/lib/rbbt/bow/bow.rb CHANGED
@@ -69,6 +69,11 @@ module BagOfWords
69
69
  count = bigrams ? count(bigrams(text)) : count(words(text))
70
70
  count.values_at(*terms)
71
71
  end
72
+
73
# Scales each term count of +text+ by the weight given for that term.
#
# text    - passed through to +features+ together with the weight keys.
# weights - Hash of term => numeric weight; key order defines output order.
#
# Returns an Array with one entry per weight, each the result of
# <tt>count * weight</tt> for the matching term.
def self.weighted_features(text, weights)
  counts = features(text, weights.keys)
  weights.values.each_with_index.map { |weight, index| counts[index] * weight }
end
72
77
  end
73
78
 
74
79
  class String
@@ -82,5 +87,3 @@ class String
82
87
  BagOfWords.bigrams(self)
83
88
  end
84
89
  end
85
-
86
-
@@ -74,28 +74,32 @@ class Dictionary::TF_IDF
74
74
  end
75
75
 
76
76
  def best(options = {})
77
- high, low, limit = {
78
- :low => 0,
79
- :high => 1,
80
- }.merge(options).
81
- values_at(:high, :low, :limit)
82
-
83
- num_docs = @num_docs.to_f
84
- best = df.select{|term, value|
85
- value >= low && value <= high
86
- }.collect{|p|
87
- term = p.first
88
- df_value = p.last
89
- [term,
90
- @terms[term].to_f / num_docs * Math::log(1.0/df_value)
91
- ]
92
- }
93
-
94
- if limit
95
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
96
- else
97
- Hash[*best.flatten]
98
- end
77
+ key = Misc.obj2digest(options)
78
+ @best ||= {}
79
+ @best[key] ||= begin
80
+ high, low, limit = {
81
+ :low => 0,
82
+ :high => 1,
83
+ }.merge(options).
84
+ values_at(:high, :low, :limit)
85
+
86
+ num_docs = @num_docs.to_f
87
+ best = df.select{|term, value|
88
+ value >= low && value <= high
89
+ }.collect{|p|
90
+ term = p.first
91
+ df_value = p.last
92
+ [term,
93
+ @terms[term].to_f / num_docs * Math::log(1.0/df_value)
94
+ ]
95
+ }
96
+
97
+ if limit
98
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
99
+ else
100
+ Hash[*best.flatten]
101
+ end
102
+ end
99
103
  end
100
104
 
101
105
  def weights(options = {})
@@ -173,7 +177,7 @@ class Dictionary::KL
173
177
  best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
174
178
  }
175
179
  if limit
176
- Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
180
+ Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
177
181
  else
178
182
  best
179
183
  end
data/lib/rbbt/document.rb CHANGED
@@ -1,6 +1,5 @@
1
1
  require 'rbbt-util'
2
2
  require 'rbbt/entity'
3
- require 'rbbt/document/annotation'
4
3
 
5
4
  module DocID
6
5
  extend Entity
@@ -19,10 +18,20 @@ module DocID
19
18
  DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
20
19
  end
21
20
 
22
- def document
23
- text = self.corpus[self]
24
- namespace, id, type = self.split(":")
25
- Document.setup(text, namespace, id, type, :corpus => corpus)
21
+ property :document => :both do
22
+ if Array === self
23
+ namespace, id, type = nil, nil, nil
24
+ docs = self.collect do |docid|
25
+ text = self.corpus[docid]
26
+ namespace, id, type = docid.split(":")
27
+ text
28
+ end
29
+ Document.setup(docs, :corpus => corpus)
30
+ else
31
+ text = self.corpus[self]
32
+ namespace, id, type = self.split(":")
33
+ Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
34
+ end
26
35
  end
27
36
  end
28
37
 
@@ -44,3 +53,9 @@ module Document
44
53
  alias id docid
45
54
  end
46
55
 
56
+ #class String
57
+ # def docid
58
+ # digest = Misc.digest(self)
59
+ # ["STRING", digest, nil, nil] * ":"
60
+ # end
61
+ #end
@@ -1,3 +1,4 @@
1
+ require 'rbbt/segment'
1
2
  require 'rbbt/segment/annotation'
2
3
 
3
4
  module Document
@@ -12,7 +13,7 @@ module Document
12
13
  end
13
14
 
14
15
  docid = self.docid
15
- segments.each{|s| s.docid = docid if s.docid.nil? }
16
+ segments.each{|s| s.docid = docid }
16
17
 
17
18
  segments
18
19
  end
@@ -22,18 +23,20 @@ module Document
22
23
  send :property, type => :multiple do |list|
23
24
  doc_segments = self.instance_exec list, &block
24
25
 
25
- doc_segments = doc_segments.chunked_values_at(self) if Hash === doc_segments
26
+ doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
26
27
 
27
28
  doc_segments.each_with_index do |segments,i|
29
+ next if segments.nil?
28
30
  document = list[i]
29
- Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
31
+ Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
30
32
 
31
33
  segments.each do |segment|
32
34
  SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
33
35
  end
34
36
 
35
37
  docid = document.docid
36
- segments.each{|s| s.docid = docid if s.docid.nil? }
38
+
39
+ segments.each{|s| s.docid = docid }
37
40
 
38
41
  segments
39
42
  end
@@ -3,17 +3,43 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
- corpus.extend Document::Corpus
6
+ corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
7
+ corpus.extend Document::Corpus unless Document::Corpus === corpus
8
+ corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
9
+ corpus
7
10
  end
8
11
 
9
12
  def add_document(document)
10
- self[document.docid] = document
13
+ docid = document.docid
14
+ return self[docid] if self.include?(docid)
15
+ self.write_and_close do
16
+ self[docid] = document
17
+ end
18
+ end
19
+
20
# Lists the document ids in the corpus that start with a prefix.
#
# prefix - the leading docid fields; they are joined with ":" and a
#          trailing ":" is appended when missing. Passing the single
#          Symbol :all returns every key instead.
#
# Returns the ids after passing them through DocID.setup with this
# corpus attached.
def docids(*prefix)
  # Decide this before joining: once joined, prefix is a String and can
  # never equal the Symbol :all, which sent :all down the prefix search.
  wants_all = prefix == [:all]

  prefix = prefix * ":"
  prefix += ":" unless wants_all || prefix[-1] == ":"

  docids = self.read_and_close do
    wants_all ? self.keys : self.prefix(prefix)
  end

  DocID.setup(docids, :corpus => self)
end
28
+
29
# Resolves the docids selected by +prefix+ (same arguments as +docids+)
# and returns what calling +document+ on them yields.
def documents(*prefix)
  selected = self.docids(*prefix)
  selected.document
end
12
32
 
13
33
  def [](*args)
14
34
  docid, *rest = args
15
- res = super(*args)
35
+
36
+ res = self.read_and_close do
37
+ super(*args)
38
+ end
39
+
40
+ res.force_encoding(Encoding.default_external) if res
16
41
  return res if args.length > 1
42
+
17
43
  namespace, id, type = docid.split(":")
18
44
 
19
45
  if res.nil?
@@ -22,6 +48,7 @@ module Document::Corpus
22
48
  end
23
49
  end
24
50
 
51
+ res.force_encoding(Encoding.default_external) if res
25
52
  Document.setup(res, namespace, id, type, self) unless res.nil?
26
53
 
27
54
  res
@@ -6,7 +6,6 @@ module Document::Corpus
6
6
  type = nil if String === type and type.empty?
7
7
 
8
8
  res = PubMed.get_article(pmids).collect do |pmid, article|
9
- Log.debug "Loading pmid #{pmid}"
10
9
  document = if type.nil? || type.to_sym == :abstract
11
10
  Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
12
11
  elsif type.to_sym == :title
@@ -15,7 +14,9 @@ module Document::Corpus
15
14
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
16
15
  Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
17
16
  end
17
+ Log.debug "Loading pmid #{pmid}"
18
18
  add_document(document)
19
+ document
19
20
  end
20
21
 
21
22
  Document.setup(res)
@@ -39,14 +39,15 @@ class Abner < NER
39
39
  types = res[1]
40
40
  strings = res[0]
41
41
 
42
+ docid = Misc.digest(text)
42
43
  global_offset = 0
43
44
  strings.zip(types).collect do |mention, type|
44
45
  mention = mention.to_s;
45
46
  offset = text.index(mention)
46
47
  if offset.nil?
47
- NamedEntity.setup(mention, nil, type.to_s)
48
+ NamedEntity.setup(mention, :docid => docid, :entity_type => type)
48
49
  else
49
- NamedEntity.setup(mention, offset + global_offset, type.to_s)
50
+ NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
50
51
  text = text[offset + mention.length..-1]
51
52
  global_offset += offset + mention.length
52
53
  end
@@ -55,6 +55,7 @@ class Banner < NER
55
55
  # text.
56
56
  def match(text)
57
57
  return [] if text.nil?
58
+ text = text.dup if text.frozen?
58
59
  text.gsub!(/\n/,' ')
59
60
  text.gsub!(/\|/,'/') # Character | gives an error
60
61
  return [] if text.strip.empty?
@@ -66,6 +67,7 @@ class Banner < NER
66
67
  @parenPP.postProcess(sentence)
67
68
  tagged = sentence.getSGML
68
69
 
70
+ docid = Misc.digest text
69
71
  res = tagged.scan(/<GENE>.*?<\/GENE>/).
70
72
  collect{|r|
71
73
  r.match(/<GENE>(.*?)<\/GENE>/)
@@ -73,7 +75,7 @@ class Banner < NER
73
75
  mention.sub!(/^\s*/,'')
74
76
  mention.sub!(/\s*$/,'')
75
77
  offset = text.index(mention)
76
- NamedEntity.setup(mention, offset, 'GENE')
78
+ NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
77
79
  mention
78
80
  }
79
81
  res
data/lib/rbbt/ner/brat.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/segment/named_entity'
2
- require 'rbbt/text/segment/relationship'
2
+ require 'rbbt/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
5
5
 
@@ -55,11 +55,16 @@ EOF
55
55
  Open.mkdir 'tmp'
56
56
 
57
57
  texts.each do |name,text|
58
+ text = Misc.fixutf8(text)
59
+
60
+ text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
61
+
58
62
  Open.write("input/#{name}.txt") do |f|
59
- f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
63
+ f.puts "#{name}|a|" << text
60
64
  f.puts
61
65
  end
62
66
  end
67
+
63
68
  Open.write('config', CONFIG)
64
69
  CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
65
70
 
@@ -95,6 +100,7 @@ EOF
95
100
 
96
101
  res[name] = segments
97
102
  end
103
+ res
98
104
  end
99
105
  end
100
106
 
@@ -31,7 +31,8 @@ module Linnaeus
31
31
  init unless defined? @@Matcher
32
32
 
33
33
  @@Matcher.match(text).toArray().collect do |mention|
34
- NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
34
+ best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
35
+ NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
35
36
  end
36
37
  end
37
38
  end
@@ -15,7 +15,6 @@ class PatternRelExt
15
15
  segments = sentence.segments
16
16
  segments = segments.values.flatten if Hash === segments
17
17
  Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
18
- ppp sentence
19
18
  regexpNER.entities(sentence)
20
19
  end
21
20
  end
@@ -0,0 +1,229 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/util/misc'
4
+ require 'rbbt/util/simpleDSL'
5
+
6
+ class NERFeatures
7
+ include SimpleDSL
8
+
9
# Splits +text+ into an Array of token Strings.
#
# The alternatives are tried in order: tokens containing a number
# (optionally with a "." or "," decimal part, a leading word/hyphen and
# trailing word characters), a single word character followed by a
# hyphen, a word followed by a hyphen and one capital letter, plain
# words, and finally single punctuation characters. Anything else
# (whitespace included) is dropped.
def self.tokens(text)
  token_pattern = /
    \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
    \w-\w*|
    \w+-[A-Z](?!\w)|
    \w+|
    [.,()\/\[\]{}'"+-]
  /x

  text.scan(token_pattern)
end
18
+
19
# Tokenizes +text+ and returns the tokens in reverse order, joined by
# single spaces.
def self.reverse(text)
  backwards = tokens(text).reverse_each.to_a
  backwards.join(" ")
end
22
+
23
# Registers a feature under +name+.
#
# name  - feature name; stored as a String.
# args  - optional; the first element is used as the action when given.
# block - used as the action when no positional action is given.
#
# When neither is supplied the action defaults to a case-insensitive
# Regexp built from the name with an optional trailing "s".
#
# Returns the feature name as a String.
# Raises RuntimeError ("Wrong format") unless the action is a Proc or a
# Regexp.
def define(name, *args, &block)
  action = args.first
  action ||= block
  action ||= /#{name.to_s}s?/i

  unless [Proc, Regexp].any? { |kind| action.is_a?(kind) }
    raise "Wrong format"
  end

  label = name.to_s
  @types[label] = action
  @order.push(name.to_s)

  name.to_s
end
32
+
33
+ attr_accessor :reverse
34
# Sets up an empty feature set and loads its definitions.
#
# file    - configuration file handed to +parse+; when neither a file nor
#           a block is given, the shared 'config.rb' is looked up.
# reverse - initial value of the +reverse+ flag.
# block   - definitions given inline; forwarded to +parse+.
def initialize(file = nil, reverse = false, &block)
  @types   = {}
  @order   = []
  @context = []
  @reverse = reverse

  # NOTE(review): this release ships its config as share/rner/config.rb,
  # while the lookup below goes through Rbbt.share.ner -- confirm the
  # directory name is right.
  file = Rbbt.share.ner['config.rb'].find unless file || block

  parse(:define, file, &block)
end
44
+
45
# Returns the entry stored under :define in @config.
def config
  definitions = @config[:define]
  definitions
end
48
+
49
# Records the relative positions that +template+ uses by default for
# context features. Returns +positions+.
def window(positions)
  @window = positions
  positions
end
52
+
53
# Marks one or several features as context features.
#
# name  - a feature name, or an Array of names to add all at once.
# block - only honoured for a single name (see below).
#
# Returns the updated context list for an Array, otherwise the block
# (nil when none was given).
def context(name, &block)
  return (@context += name) if name.is_a? Array

  @context.push name

  # A block written after a definition can end up bound to this call
  # rather than to the definition itself; when that happens, install it
  # as the action for the feature.
  @types[name] = block if block
end
66
+
67
# Switches the reverse flag on when +dir+ converts to :reverse; any other
# value leaves the flag as it was. Returns true when the flag was set and
# nil otherwise.
def direction(dir)
  @reverse = true if dir.to_sym == :reverse
end
72
+
73
# Computes the feature row for a single token.
#
# word - the token String.
#
# Returns an Array whose first element is the token itself, followed by
# one value per registered feature, in registration order:
#   * Proc actions contribute the value they return for the token;
#   * Regexp actions contribute the first capture group when it matched
#     something, true when the pattern matched without a first group
#     value, and false when the pattern did not match.
def features(word)
  computed = @order.map do |feature_name|
    action = @types[feature_name]
    next action.call(word) if action.is_a?(Proc)

    hit = action.match(word)
    hit ? (hit[1] || true) : false
  end

  [word] + computed
end
95
+
96
# Builds the feature template text handed to the CRF trainer.
#
# window - relative positions used for context features; falls back to
#          the positions recorded with +window+ and then to [1, -1].
#
# Every registered feature gets a line for the current position (column
# numbers start at 1, in registration order). Features listed as context
# features get one extra line per window position. The text ends with a
# single "B" line.
#
# Returns the template as a String.
def template(window=nil)
  offsets = window || @window || [1,-1]

  lines = []
  @order.each_with_index do |name, index|
    column = index + 1
    lines << "U#{name}: %x[0,#{column}]\n"

    next unless @context.include?(name)

    offsets.each do |offset|
      lines << "U#{name}##{offset}: %x[#{offset},#{column}]\n"
    end
  end
  lines << "B\n"

  lines.join
end
116
+
117
+
118
# Computes one feature row per token of +text+.
#
# text     - the text to tokenize; it is reversed token-wise first when
#            the reverse flag is set.
# positive - nil to emit plain feature rows; otherwise each row gets a
#            trailing label: 0 when false, and when true 1 for the first
#            token processed and 2 for every following one.
#
# Returns an Array of feature rows in processing order.
def text_features(text, positive = nil)
  text = self.class.reverse(text) if @reverse

  self.class.tokens(text).each_with_index.map do |token, position|
    row = features(token)

    unless positive.nil?
      label = if positive
                position.zero? ? 1 : 2
              else
                0
              end
      row << label
    end

    row
  end
end
130
+
131
# Computes labelled feature rows for +text+, marking the tokens that
# belong to any of the given mentions as positive.
#
# text     - the text to process.
# mentions - Array of mention Strings (nil counts as none). Runs of
#            whitespace inside a mention match any run of whitespace in
#            the text.
#
# The text is cut at every mention occurrence; pieces outside mentions
# are passed to +text_features+ as negative and the mentions themselves
# as positive. With the reverse flag set the pieces are emitted last to
# first.
#
# Returns the concatenated feature rows.
def tagged_features(text, mentions)
  mentions ||= []
  # Keep the alternation non-empty so the split below never matches.
  mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?

  re = mentions.collect{|mention|
    # Regexp.quote turns a space into backslash + space; swap each of
    # those for \s+ (block form, so the replacement is taken literally).
    Regexp.quote(mention.gsub(/\s+/,' ')).gsub('\\ ') { '\s+' }
  }.join("|")

  # The capture group makes split keep the mentions, so the pieces
  # alternate: outside, mention, outside, ...
  positive = false
  per_chunk = text.split(/(#{re})/).collect{|chunk|
    chunk_features = text_features(chunk, positive)
    positive = !positive
    chunk_features
  }

  per_chunk.reverse! if @reverse
  per_chunk.flatten(1)
end
152
+
153
# Trains a CRF model with the bundled crf_learn binary.
#
# features - path of the training file with the feature rows.
# model    - path where the model is written; the feature configuration
#            is saved next to it as "<model>.config".
#
# The template is written to a temporary file, which is removed again
# even when one of the steps raises.
def train(features, model)
  tmp_template = TmpFile.tmp_file("template-")

  begin
    Open.write(tmp_template,template)

    # Pass the arguments as a list so no shell is involved: paths that
    # contain quotes or spaces are handed over verbatim.
    crf_learn = File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')
    system(crf_learn, tmp_template.to_s, features.to_s, model.to_s)

    Open.write(model + '.config',config)
  ensure
    FileUtils.rm_f tmp_template
  end
end
162
+
163
+ end
164
+
165
+ class NER
166
+
167
# Loads the CRF++ bindings and prepares a tagger for +model+.
#
# model - path of the trained model; defaults to 'ner/model/BC2' under
#         Rbbt.datadir. The feature configuration is read from
#         "<model>.config".
def initialize(model = nil)
  # Loaded lazily on purpose: the bindings are only needed once a tagger
  # is built. Only a failed load falls back to the bundled copy; any
  # other error is left to propagate.
  begin
    require 'CRFPP'
  rescue LoadError
    require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
  end

  model ||= File.join(Rbbt.datadir, 'ner/model/BC2')

  @parser = NERFeatures.new(model + '.config')
  @reverse = @parser.reverse
  @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
end
180
+
181
# Tags +text+ and returns the mentions found in it.
#
# Each token's feature row is fed to the tagger; the labels it assigns
# drive the grouping below: 1 starts a mention, 2 continues it and 0
# ends it.
#
# Returns an Array of Strings, each the tokens of one mention joined by
# single spaces (put back in original order when the reverse flag is
# set).
def extract(text)
  rows = @parser.text_features(text)

  @tagger.clear
  rows.each { |row| @tagger.add(row.join(" ")) }
  @tagger.parse

  mentions = []
  current = []

  (0...@tagger.size).each do |index|
    label = @tagger.y(index)
    token = @tagger.x(index, 0)

    # A closing parenthesis is kept only when the mention collected so
    # far contains an opening one; its own label is never looked at.
    if token == ')'
      current.push(')') if current.join =~ /\(/
      next
    end

    case label
    when 1
      # A new start closes the pending mention only when that mention,
      # or one of its tokens, is special; otherwise the token is simply
      # appended to it.
      if current.any? && (current.join(" ").is_special? || current.select { |part| part.is_special? }.any?)
        mentions.push(current)
        current = []
      end
      current.push(token)
    when 2
      current.push(token)
    when 0
      mentions.push(current) if current.any?
      current = []
    end
  end

  mentions << current if current.any?

  mentions.map do |parts|
    parts = parts.reverse if @reverse
    parts.join(" ")
  end
end
225
+
226
+ end
227
+
228
+
229
+