rbbt-text 1.3.4 → 1.3.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 496288d7d3ff1215ded1fd210192d5887a6a071eea5f322295a669a5d648d77b
4
- data.tar.gz: 47996496009cbcdaab38a9dc9bf6efbbe7fc0145f315b0a48bfab0f543742f94
3
+ metadata.gz: 8dfc374254fcbe88c8be6bfffd9a3cfabf6e23c953c11ecd2f61cf41027ff3d6
4
+ data.tar.gz: 3d3211f41cfecea05862505d1508a4b7b76eecb3c90b3b0000194eb08033715e
5
5
  SHA512:
6
- metadata.gz: 36e7415ad06207066844a30001c8541865f066d1e83a4a2ddc5182c54b704cd3d442cbccce219bd2114717a83656d07558c42725eca75597fea239b6e13244ab
7
- data.tar.gz: 988eff4d242d0425910b96fac4188df079c8c53c3abea2825cc97d5af5118841680705fa33461a5b4cfa7b8d6b32a486465e44b75f20fad324e4623c6c8083d8
6
+ metadata.gz: 7ed870e46bae2c113d0885697bfbade6064732a89477833c640eaf4ee8bdb2c0fbf52f69f456af5eb30a82e56a7f0aeb37e71127f884430c3d315202a07fa3cb
7
+ data.tar.gz: e31853e816321a5ead788036b5f67eecaca179c75168c0bb2804be1f18ae844031ab808a4e3c9d67e1f9a52f94ca478949798b8101e164eba32481c0182a1f58
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2010-2022 Miguel Vázquez García
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -13,7 +13,7 @@ module Document
13
13
  end
14
14
 
15
15
  docid = self.docid
16
- segments.each{|s| s.docid = docid if s.docid.nil? }
16
+ segments.each{|s| s.docid = docid }
17
17
 
18
18
  segments
19
19
  end
@@ -36,7 +36,7 @@ module Document
36
36
 
37
37
  docid = document.docid
38
38
 
39
- segments.each{|s| s.docid = docid if s.docid.nil? }
39
+ segments.each{|s| s.docid = docid }
40
40
 
41
41
  segments
42
42
  end
@@ -1,21 +1,30 @@
1
1
  require 'rbbt/sources/pubmed'
2
2
 
3
3
  module Document::Corpus
4
- def add_pmid(pmid, type = nil)
4
+ PUBMED_NAMESPACE="PMID"
5
+ def add_pmid(pmid, type = nil, update = false)
6
+ type = :abstract if type.nil?
7
+ if update == false
8
+ id = [PUBMED_NAMESPACE, pmid, type].collect{|e| e.to_s}*":"
9
+ documents = self.documents(id)
10
+ return documents if documents.any?
11
+ end
12
+
5
13
  pmids = Array === pmid ? pmid : [pmid]
6
14
  type = nil if String === type and type.empty?
7
15
 
8
16
  res = PubMed.get_article(pmids).collect do |pmid, article|
9
- document = if type.nil? || type.to_sym == :abstract
10
- Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
17
+ document = if type.to_sym == :abstract
18
+ Document.setup(article.abstract || "", PUBMED_NAMESPACE, pmid, :abstract, self, :corpus => self)
11
19
  elsif type.to_sym == :title
12
- Document.setup(article.title, :PMID, pmid, :title, self)
20
+ Document.setup(article.title, PUBMED_NAMESPACE, pmid, :title, self)
13
21
  else
14
22
  raise "No FullText available for #{ pmid }" if article.full_text.nil?
15
- Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
23
+ Document.setup(article.full_text, PUBMED_NAMESPACE, pmid, :fulltext, self, :corpus => self)
16
24
  end
17
25
  Log.debug "Loading pmid #{pmid}"
18
26
  add_document(document)
27
+ document
19
28
  end
20
29
 
21
30
  Document.setup(res)
@@ -3,8 +3,10 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
+ corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
6
7
  corpus.extend Document::Corpus unless Document::Corpus === corpus
7
8
  corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
9
+ corpus.close
8
10
  corpus
9
11
  end
10
12
 
@@ -16,22 +18,23 @@ module Document::Corpus
16
18
  end
17
19
  end
18
20
 
19
- def docids(prefix)
20
- prefix += ":" unless prefix == :all || prefix[-1] == ":"
21
+ def docids(*prefix)
22
+ prefix = prefix * ":"
23
+ prefix += ":" unless prefix == :all || prefix == "all" || prefix[-1] == ":"
21
24
  docids = self.read_and_close do
22
- prefix == :all ? self.keys : self.prefix(prefix)
25
+ prefix == "all" ? self.keys : self.prefix(prefix)
23
26
  end
24
27
  DocID.setup(docids, :corpus => self)
25
28
  end
26
29
 
27
- def documents(prefix)
28
- self.docids(prefix).document
30
+ def documents(*prefix)
31
+ self.docids(*prefix).document
29
32
  end
30
33
 
31
34
  def [](*args)
32
35
  docid, *rest = args
33
36
 
34
- res = self.read_and_close do
37
+ res = self.with_read do
35
38
  super(*args)
36
39
  end
37
40
 
@@ -41,7 +44,7 @@ module Document::Corpus
41
44
  namespace, id, type = docid.split(":")
42
45
 
43
46
  if res.nil?
44
- if Document::Corpus.claims.include?(namespace.to_s)
47
+ if Document::Corpus.claims && Document::Corpus.claims.include?(namespace.to_s)
45
48
  res = self.instance_exec(id, type, &Document::Corpus.claims[namespace.to_s])
46
49
  end
47
50
  end
data/lib/rbbt/document.rb CHANGED
@@ -22,9 +22,7 @@ module DocID
22
22
  if Array === self
23
23
  namespace, id, type = nil, nil, nil
24
24
  docs = self.collect do |docid|
25
- text = self.corpus[docid]
26
- namespace, id, type = docid.split(":")
27
- text
25
+ self.corpus[docid]
28
26
  end
29
27
  Document.setup(docs, :corpus => corpus)
30
28
  else
@@ -53,3 +51,9 @@ module Document
53
51
  alias id docid
54
52
  end
55
53
 
54
+ #class String
55
+ # def docid
56
+ # digest = Misc.digest(self)
57
+ # ["STRING", digest, nil, nil] * ":"
58
+ # end
59
+ #end
@@ -39,14 +39,15 @@ class Abner < NER
39
39
  types = res[1]
40
40
  strings = res[0]
41
41
 
42
+ docid = Misc.digest(text)
42
43
  global_offset = 0
43
44
  strings.zip(types).collect do |mention, type|
44
45
  mention = mention.to_s;
45
46
  offset = text.index(mention)
46
47
  if offset.nil?
47
- NamedEntity.setup(mention, nil, type.to_s)
48
+ NamedEntity.setup(mention, :docid => docid, :entity_type => type)
48
49
  else
49
- NamedEntity.setup(mention, offset + global_offset, type.to_s)
50
+ NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
50
51
  text = text[offset + mention.length..-1]
51
52
  global_offset += offset + mention.length
52
53
  end
@@ -55,6 +55,7 @@ class Banner < NER
55
55
  # text.
56
56
  def match(text)
57
57
  return [] if text.nil?
58
+ text = text.dup if text.frozen?
58
59
  text.gsub!(/\n/,' ')
59
60
  text.gsub!(/\|/,'/') # Character | gives an error
60
61
  return [] if text.strip.empty?
@@ -66,6 +67,7 @@ class Banner < NER
66
67
  @parenPP.postProcess(sentence)
67
68
  tagged = sentence.getSGML
68
69
 
70
+ docid = Misc.digest text
69
71
  res = tagged.scan(/<GENE>.*?<\/GENE>/).
70
72
  collect{|r|
71
73
  r.match(/<GENE>(.*?)<\/GENE>/)
@@ -73,7 +75,7 @@ class Banner < NER
73
75
  mention.sub!(/^\s*/,'')
74
76
  mention.sub!(/\s*$/,'')
75
77
  offset = text.index(mention)
76
- NamedEntity.setup(mention, offset, 'GENE')
78
+ NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
77
79
  mention
78
80
  }
79
81
  res
data/lib/rbbt/ner/brat.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/segment/named_entity'
2
- require 'rbbt/text/segment/relationship'
2
+ require 'rbbt/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
5
5
 
@@ -31,7 +31,8 @@ module Linnaeus
31
31
  init unless defined? @@Matcher
32
32
 
33
33
  @@Matcher.match(text).toArray().collect do |mention|
34
- NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
34
+ best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
35
+ NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
35
36
  end
36
37
  end
37
38
  end
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'libxml'
4
3
  require 'rbbt/ner/NER'
5
4
  require 'rbbt/util/log'
6
5
 
@@ -1,6 +1,5 @@
1
1
  require 'rbbt'
2
2
  require 'rjb'
3
- require 'libxml'
4
3
  require 'rbbt/segment'
5
4
  require 'rbbt/ner/NER'
6
5
  require 'rbbt/util/log'
@@ -0,0 +1,229 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/util/misc'
4
+ require 'rbbt/util/simpleDSL'
5
+
6
+ class NERFeatures
7
+ include SimpleDSL
8
+
9
+ def self.tokens(text)
10
+ text.scan(/
11
+ \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
12
+ \w-\w*|
13
+ \w+-[A-Z](?!\w)|
14
+ \w+|
15
+ [.,()\/\[\]{}'"+-]
16
+ /x)
17
+ end
18
+
19
+ def self.reverse(text)
20
+ tokens(text).reverse.join(" ")
21
+ end
22
+
23
+ def define(name, *args, &block)
24
+ action = args[0] || block || /#{name.to_s}s?/i
25
+ raise "Wrong format" unless (action.is_a?(Proc) || action.is_a?(Regexp))
26
+
27
+ @types[name.to_s] = action
28
+ @order.push name.to_s
29
+
30
+ name.to_s
31
+ end
32
+
33
+ attr_accessor :reverse
34
+ def initialize(file = nil, reverse = false, &block)
35
+ @types = {}
36
+ @order = []
37
+ @context = []
38
+ @reverse = reverse
39
+
40
+ file ||= Rbbt.share.ner['config.rb'].find if !file && !block
41
+
42
+ parse(:define, file, &block)
43
+ end
44
+
45
+ def config
46
+ @config[:define]
47
+ end
48
+
49
+ def window(positions)
50
+ @window = positions
51
+ end
52
+
53
+ def context(name, &block)
54
+ if name.is_a? Array
55
+ @context += name
56
+ else
57
+ @context.push name
58
+
59
+ # The block might be wrongly assigned to this function
60
+ # instead of the actual definition, fix that.
61
+ if block
62
+ @types[name] = block
63
+ end
64
+ end
65
+ end
66
+
67
+ def direction(dir)
68
+ if dir.to_sym == :reverse
69
+ @reverse = true
70
+ end
71
+ end
72
+
73
+ def features(word)
74
+ values = [word]
75
+
76
+ @order.each{|features|
77
+ action = @types[features]
78
+ if action.is_a?(Proc)
79
+ values.push(action.call(word))
80
+ else
81
+ m = action.match(word)
82
+ if m
83
+ if m[1]
84
+ values.push(m[1])
85
+ else
86
+ values.push(m != nil)
87
+ end
88
+ else
89
+ values.push(false)
90
+ end
91
+ end
92
+ }
93
+ values
94
+ end
95
+
96
+ def template(window=nil)
97
+ window ||= @window || [1,-1]
98
+ template = ""
99
+
100
+ i = 1
101
+ @order.each{|feat|
102
+ template += "U#{ feat }: %x[0,#{ i }]\n"
103
+
104
+ if @context.include?(feat)
105
+ window.each{|p|
106
+ template += "U#{ feat }##{ p}: %x[#{ p },#{ i }]\n"
107
+ }
108
+ end
109
+ i += 1
110
+ }
111
+
112
+ template += "B\n"
113
+
114
+ template
115
+ end
116
+
117
+
118
+ def text_features(text, positive = nil)
119
+ text = self.class.reverse(text) if @reverse
120
+ initial = true
121
+ self.class.tokens(text).collect{|token|
122
+ features = features(token)
123
+ if !positive.nil?
124
+ features << (positive ? (initial ? 1 : 2) : 0)
125
+ initial = false
126
+ end
127
+ features
128
+ }
129
+ end
130
+
131
+ def tagged_features(text, mentions)
132
+ mentions ||= []
133
+ mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?
134
+ re = mentions.collect{|mention|
135
+ Regexp.quote(mention.gsub(/\s+/,' ')).sub(/\\s/,'\s+')
136
+ }.join("|")
137
+
138
+ positive = false
139
+ features = []
140
+ chunks = text.split(/(#{re})/)
141
+ chunks.each{|t|
142
+ chunk_features = text_features(t, positive)
143
+ positive = !positive
144
+ if @reverse
145
+ features = chunk_features + features
146
+ else
147
+ features = features + chunk_features
148
+ end
149
+ }
150
+ features
151
+ end
152
+
153
+ def train(features, model)
154
+ tmp_template = TmpFile.tmp_file("template-")
155
+ Open.write(tmp_template,template)
156
+
157
+ cmd = "#{File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')} '#{tmp_template}' '#{features}' '#{model}'"
158
+ system cmd
159
+ Open.write(model + '.config',config)
160
+ FileUtils.rm tmp_template
161
+ end
162
+
163
+ end
164
+
165
+ class NER
166
+
167
+ def initialize(model = nil)
168
+ begin
169
+ require 'CRFPP'
170
+ rescue Exception
171
+ require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
172
+ end
173
+
174
+ model ||= File.join(Rbbt.datadir, + 'ner/model/BC2')
175
+
176
+ @parser = NERFeatures.new(model + '.config')
177
+ @reverse = @parser.reverse
178
+ @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
179
+ end
180
+
181
+ def extract(text)
182
+ features = @parser.text_features(text)
183
+
184
+ @tagger.clear
185
+ features.each{|feats|
186
+ @tagger.add(feats.join(" "))
187
+ }
188
+
189
+ @tagger.parse
190
+
191
+ found = []
192
+ mention = []
193
+
194
+ @tagger.size.times{|i|
195
+ label = @tagger.y(i)
196
+ word = @tagger.x(i,0)
197
+
198
+ if word == ')'
199
+ mention.push(')') if mention.join =~ /\(/
200
+ next
201
+ end
202
+
203
+ case label
204
+ when 1
205
+ if mention.any? && ( mention.join(" ").is_special? || mention.select{|m| m.is_special?}.any?)
206
+ found.push(mention)
207
+ mention = []
208
+ end
209
+ mention.push(word)
210
+ when 2
211
+ mention.push(word)
212
+ when 0
213
+ found.push(mention) if mention.any?
214
+ mention = []
215
+ end
216
+ }
217
+
218
+ found << mention if mention.any?
219
+
220
+ found.collect{|list|
221
+ list = list.reverse if @reverse
222
+ list.join(" ")
223
+ }
224
+ end
225
+
226
+ end
227
+
228
+
229
+
@@ -172,6 +172,7 @@ class Tokenizer
172
172
 
173
173
  #{{{ Token Types
174
174
  GREEK_RE = "(?:" + $greek.keys.select{|w| w.length > 3}.collect{|w| w.downcase}.join("|") + ")"
175
+ GREEK_LETTER_RE = "(?:" + $inverse_greek.keys.select{|w| w.length == 1}.collect{|w| w.upcase}.join("|") + ")"
175
176
  def tokenize(word)
176
177
  return word.
177
178
  gsub(/([^IVX])I$/,'\1|I|'). # Separate last roman number
@@ -180,6 +181,7 @@ class Tokenizer
180
181
  gsub(/([A-Z]{2,})([a-z])/,'\1-\2').
181
182
  gsub(/^(#{GREEK_RE})/,'\1-').
182
183
  gsub(/(#{GREEK_RE})$/,'-\1').
184
+ gsub(/(#{GREEK_LETTER_RE})$/,'-\1').
183
185
  split( /[^\w.]+/). # Split by separator char
184
186
  select{|t| !t.empty? }
185
187
  end
@@ -204,7 +206,7 @@ class Tokenizer
204
206
  end
205
207
 
206
208
  #{{{ Comparisons
207
-
209
+
208
210
  def evaluate_tokens(list1, list2)
209
211
  @operations.inject(0){|acc, o|
210
212
  acc + o.eval(list1, list2)
@@ -18,6 +18,10 @@ class Normalizer
18
18
  values.select{|p| p[1] == best}
19
19
  end
20
20
 
21
+ def token_evaluate(mention, name)
22
+ @tokens.evaluate(mention, name)
23
+ end
24
+
21
25
  # Compares the tokens and gives each candidate a score based on the
22
26
  # commonalities and differences amongst the tokens.
23
27
  def token_score(code, mention)
@@ -31,7 +35,7 @@ class Normalizer
31
35
  when mention.downcase.gsub(/\s/,'') == name.downcase.gsub(/\s/,'')
32
36
  80
33
37
  else
34
- @tokens.evaluate(mention, name)
38
+ token_evaluate(mention, name)
35
39
  end
36
40
  [value, name]
37
41
  }.sort_by{|value, name| value }.last
@@ -249,7 +249,8 @@ class TokenTrieNER < NER
249
249
  match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
250
250
  }
251
251
 
252
- NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
252
+ type = type.first
253
+ NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
253
254
  end
254
255
 
255
256
  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
@@ -7,7 +7,7 @@ module OpenNLP
7
7
  Rbbt.claim Rbbt.software.opt.OpenNLP, :install, Rbbt.share.install.software.OpenNLP.find
8
8
 
9
9
 
10
- Rbbt.claim Rbbt.software.opt.OpenNLP.models["da-sent.bin"], :url, "http://opennlp.sourceforge.net/models-1.5/de-sent.bin"
10
+ Rbbt.claim Rbbt.software.opt.OpenNLP.models["da-sent.bin"], :url, "https://www.apache.org/dyn/closer.cgi/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin"
11
11
 
12
12
  MAX = 5
13
13