rbbt-text 1.3.4 → 1.3.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 496288d7d3ff1215ded1fd210192d5887a6a071eea5f322295a669a5d648d77b
4
- data.tar.gz: 47996496009cbcdaab38a9dc9bf6efbbe7fc0145f315b0a48bfab0f543742f94
3
+ metadata.gz: '0846f900d745dd27df8006eecbc9d294f9f38a23dd76001de2a5dc0313db7e22'
4
+ data.tar.gz: 675985882a6c8b9813f620d7ef0a555efa5c148c7c2fe36e0030f84f3fd88cf0
5
5
  SHA512:
6
- metadata.gz: 36e7415ad06207066844a30001c8541865f066d1e83a4a2ddc5182c54b704cd3d442cbccce219bd2114717a83656d07558c42725eca75597fea239b6e13244ab
7
- data.tar.gz: 988eff4d242d0425910b96fac4188df079c8c53c3abea2825cc97d5af5118841680705fa33461a5b4cfa7b8d6b32a486465e44b75f20fad324e4623c6c8083d8
6
+ metadata.gz: dfd9c333b94181496134b825c63d6e93a0390f81d426526f79c00cf12556021b60004b29b57ca9b0b274141937027f7bc780552a60de007e5f790b19910354c0
7
+ data.tar.gz: 205beeb8829c8358fd29c0a18351522e566106e24220af3d7bec3676694d37d682b92243e4fd4cd495b542f9945a28cf8585e587342672d31779d0b21b53ae4e
data/lib/rbbt/document.rb CHANGED
@@ -53,3 +53,9 @@ module Document
53
53
  alias id docid
54
54
  end
55
55
 
56
+ #class String
57
+ # def docid
58
+ # digest = Misc.digest(self)
59
+ # ["STRING", digest, nil, nil] * ":"
60
+ # end
61
+ #end
@@ -13,7 +13,7 @@ module Document
13
13
  end
14
14
 
15
15
  docid = self.docid
16
- segments.each{|s| s.docid = docid if s.docid.nil? }
16
+ segments.each{|s| s.docid = docid }
17
17
 
18
18
  segments
19
19
  end
@@ -36,7 +36,7 @@ module Document
36
36
 
37
37
  docid = document.docid
38
38
 
39
- segments.each{|s| s.docid = docid if s.docid.nil? }
39
+ segments.each{|s| s.docid = docid }
40
40
 
41
41
  segments
42
42
  end
@@ -3,6 +3,7 @@ require 'rbbt-util'
3
3
  module Document::Corpus
4
4
 
5
5
  def self.setup(corpus)
6
+ corpus = Persist.open_tokyocabinet(corpus, true, :single, "BDB") if String === corpus
6
7
  corpus.extend Document::Corpus unless Document::Corpus === corpus
7
8
  corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
8
9
  corpus
@@ -16,7 +17,8 @@ module Document::Corpus
16
17
  end
17
18
  end
18
19
 
19
- def docids(prefix)
20
+ def docids(*prefix)
21
+ prefix = prefix * ":"
20
22
  prefix += ":" unless prefix == :all || prefix[-1] == ":"
21
23
  docids = self.read_and_close do
22
24
  prefix == :all ? self.keys : self.prefix(prefix)
@@ -24,8 +26,8 @@ module Document::Corpus
24
26
  DocID.setup(docids, :corpus => self)
25
27
  end
26
28
 
27
- def documents(prefix)
28
- self.docids(prefix).document
29
+ def documents(*prefix)
30
+ self.docids(*prefix).document
29
31
  end
30
32
 
31
33
  def [](*args)
@@ -16,6 +16,7 @@ module Document::Corpus
16
16
  end
17
17
  Log.debug "Loading pmid #{pmid}"
18
18
  add_document(document)
19
+ document
19
20
  end
20
21
 
21
22
  Document.setup(res)
@@ -39,14 +39,15 @@ class Abner < NER
39
39
  types = res[1]
40
40
  strings = res[0]
41
41
 
42
+ docid = Misc.digest(text)
42
43
  global_offset = 0
43
44
  strings.zip(types).collect do |mention, type|
44
45
  mention = mention.to_s;
45
46
  offset = text.index(mention)
46
47
  if offset.nil?
47
- NamedEntity.setup(mention, nil, type.to_s)
48
+ NamedEntity.setup(mention, :docid => docid, :entity_type => type)
48
49
  else
49
- NamedEntity.setup(mention, offset + global_offset, type.to_s)
50
+ NamedEntity.setup(mention, :offset => offset + global_offset, :docid => docid, :entity_type => type.to_s)
50
51
  text = text[offset + mention.length..-1]
51
52
  global_offset += offset + mention.length
52
53
  end
@@ -55,6 +55,7 @@ class Banner < NER
55
55
  # text.
56
56
  def match(text)
57
57
  return [] if text.nil?
58
+ text = text.dup if text.frozen?
58
59
  text.gsub!(/\n/,' ')
59
60
  text.gsub!(/\|/,'/') # Character | gives an error
60
61
  return [] if text.strip.empty?
@@ -66,6 +67,7 @@ class Banner < NER
66
67
  @parenPP.postProcess(sentence)
67
68
  tagged = sentence.getSGML
68
69
 
70
+ docid = Misc.digest text
69
71
  res = tagged.scan(/<GENE>.*?<\/GENE>/).
70
72
  collect{|r|
71
73
  r.match(/<GENE>(.*?)<\/GENE>/)
@@ -73,7 +75,7 @@ class Banner < NER
73
75
  mention.sub!(/^\s*/,'')
74
76
  mention.sub!(/\s*$/,'')
75
77
  offset = text.index(mention)
76
- NamedEntity.setup(mention, offset, 'GENE')
78
+ NamedEntity.setup(mention, :offset => offset, :docid => docid, :entity_type => 'GENE')
77
79
  mention
78
80
  }
79
81
  res
data/lib/rbbt/ner/brat.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  require 'rbbt/segment/named_entity'
2
- require 'rbbt/text/segment/relationship'
2
+ require 'rbbt/relationship'
3
3
  module Brat
4
4
  Rbbt.claim Rbbt.software.opt.Brat, :install, "https://github.com/nlplab/brat.git"
5
5
 
@@ -31,7 +31,8 @@ module Linnaeus
31
31
  init unless defined? @@Matcher
32
32
 
33
33
  @@Matcher.match(text).toArray().collect do |mention|
34
- NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => mention.ids(), :score => mention.probabilities())
34
+ best_id, best_prob = mention.ids().zip(mention.probabilities()).sort_by{|i,p| p.to_f }.last
35
+ NamedEntity.setup(mention.text(), :offset => mention.start(), :entity_type => "Organism", :code => best_id, :score => best_prob)
35
36
  end
36
37
  end
37
38
  end
@@ -0,0 +1,229 @@
1
+ require 'rbbt'
2
+ require 'rbbt/util/open'
3
+ require 'rbbt/util/misc'
4
+ require 'rbbt/util/simpleDSL'
5
+
6
+ class NERFeatures
7
+ include SimpleDSL
8
+
9
# Splits +text+ into word, number and punctuation tokens.
#
# At each position the alternatives are tried in order: number-like tokens
# (optionally preceded by word characters and a hyphen), a single word
# character followed by a hyphen and more word characters, a word followed
# by a hyphen and one final capital letter, plain words, and finally one of
# the listed punctuation/delimiter characters. Text matching none of them
# (whitespace, for instance) is skipped.
#
# Returns an Array of Strings in order of appearance.
def self.tokens(text)
  token_re = /
    \w*-?(?:\d*\d[.,]\d\d*|\d+)\w*|
    \w-\w*|
    \w+-[A-Z](?!\w)|
    \w+|
    [.,()\/\[\]{}'"+-]
  /x

  text.scan(token_re)
end
18
+
19
# Tokenizes +text+ with +tokens+ and returns the tokens in reverse order,
# joined by single spaces.
def self.reverse(text)
  reversed = tokens(text).reverse
  reversed.join(" ")
end
22
+
23
# Registers the feature +name+.
#
# The action is the first extra argument, else the block, else a
# case-insensitive Regexp built from the name with an optional trailing "s".
# It must be a Proc or a Regexp; anything else raises "Wrong format".
#
# The feature is stored in @types and appended to @order.
# Returns the feature name as a String.
def define(name, *args, &block)
  action = args.first || block || /#{name.to_s}s?/i

  case action
  when Proc, Regexp
    @types[name.to_s] = action
    @order << name.to_s
    name.to_s
  else
    raise "Wrong format"
  end
end
32
+
33
+ attr_accessor :reverse
34
# Builds a feature extractor from a set of feature definitions.
#
# file    - config file with the definitions. When neither a file nor a
#           block is given, the default config shipped with the gem under
#           share/rner/config.rb is used.
# reverse - initial value of the +reverse+ flag.
# block   - inline definitions, used when no file is given.
#
# The definitions are handed to +parse+ under the :define key.
def initialize(file = nil, reverse = false, &block)
  @types = {}
  @order = []
  @context = []
  @reverse = reverse

  # The default definitions live in share/rner (not share/ner)
  file ||= Rbbt.share.rner['config.rb'].find unless block

  parse(:define, file, &block)
end
44
+
45
# Returns the entry stored under :define in @config.
def config
  definitions = @config[:define]
  definitions
end
48
+
49
# Stores +positions+ as the default context window; +template+ falls back
# to it when called without an explicit window.
#
# Returns +positions+.
def window(positions)
  @window = positions
end
52
+
53
# Marks features as context features, i.e. features for which +template+
# also emits one entry per window position.
#
# name  - a feature name, or an Array of feature names.
# block - when a single name comes with a block, the block is taken as that
#         feature's definition: the DSL may attach the block to this call
#         instead of the definition it was written next to.
def context(name, &block)
  return @context += name if name.is_a?(Array)

  @context.push name
  @types[name] = block if block
end
66
+
67
# Turns the +reverse+ flag on when +dir+ is :reverse (or "reverse").
# Any other direction leaves the flag untouched.
def direction(dir)
  @reverse = true if dir.to_sym == :reverse
end
72
+
73
# Computes the feature row for a single token.
#
# The row starts with +word+ itself, followed by one value per feature in
# definition order:
# * Proc features contribute the result of calling the Proc with the word;
# * Regexp features contribute the first capture group when it matched,
#   +true+ when the Regexp matched without one, and +false+ otherwise.
#
# Returns an Array.
def features(word)
  computed = @order.map do |feature_name|
    action = @types[feature_name]
    next action.call(word) if action.is_a?(Proc)

    found = action.match(word)
    found ? (found[1] || true) : false
  end

  [word] + computed
end
95
+
96
# Builds the feature template text passed to crf_learn by +train+.
#
# Every feature gets a unigram line pointing at its column (columns are
# numbered from 1, in definition order). Features registered through
# +context+ additionally get one line per position in +window+. A final
# "B" line closes the template.
#
# window - positions to use; defaults to @window, or [1,-1] if unset.
#
# Returns the template as a String.
def template(window=nil)
  window ||= @window || [1,-1]
  lines = []

  @order.each_with_index do |feat, idx|
    column = idx + 1
    lines << "U#{ feat }: %x[0,#{ column }]\n"
    next unless @context.include?(feat)

    window.each do |pos|
      lines << "U#{ feat }##{ pos }: %x[#{ pos },#{ column }]\n"
    end
  end

  lines << "B\n"
  lines.join
end
116
+
117
+
118
# Computes the feature rows for every token of +text+.
#
# The text is reversed token-wise first when the +reverse+ flag is set.
#
# positive - nil leaves the rows unlabelled. Otherwise a label is appended
#            to each row: 0 for every token when false; when true, 1 for
#            the first token and 2 for the following ones.
#
# Returns an Array of rows, one per token.
def text_features(text, positive = nil)
  text = self.class.reverse(text) if @reverse

  self.class.tokens(text).each_with_index.map do |token, position|
    row = features(token)

    unless positive.nil?
      label = if !positive
                0
              elsif position.zero?
                1
              else
                2
              end
      row << label
    end

    row
  end
end
130
+
131
# Computes labelled feature rows for +text+, given the mentions it contains.
#
# The text is split around the mentions; chunks that are a mention are
# passed to +text_features+ as positive and the chunks in between as
# negative. With the +reverse+ flag set the chunks are accumulated in
# reverse order.
#
# text     - String to process.
# mentions - Array of mention Strings (nil or empty means no mentions).
#            Whitespace inside a mention matches any run of whitespace in
#            the text.
#
# Returns the concatenated rows of all chunks.
def tagged_features(text, mentions)
  mentions ||= []
  mentions = ['IMPOSSIBLE_MATCH'] if mentions.empty?

  re = mentions.collect{|mention|
    # Regexp.quote escapes a space as "\ "; relax every escaped space to
    # \s+. The block form keeps the replacement text literal.
    Regexp.quote(mention.gsub(/\s+/,' ')).gsub('\\ ') { '\s+' }
  }.join("|")

  positive = false
  features = []

  # The capture group makes split keep the mentions, so chunks alternate
  # between non-mention and mention text, starting with non-mention.
  text.split(/(#{re})/).each do |chunk|
    chunk_features = text_features(chunk, positive)
    positive = !positive
    features = @reverse ? chunk_features + features : features + chunk_features
  end

  features
end
152
+
153
# Trains a CRF++ model with the crf_learn binary found under
# third_party/crf++/bin in Rbbt.datadir.
#
# features - path of the training features file.
# model    - path where the model is written; the feature definitions
#            returned by +config+ are saved next to it as "<model>.config".
#
# The template produced by +template+ is written to a temporary file,
# which is removed even when a step fails.
def train(features, model)
  tmp_template = TmpFile.tmp_file("template-")

  begin
    Open.write(tmp_template,template)

    # Pass the arguments as a list so no shell is involved and paths
    # containing quotes or spaces cannot break the command.
    crf_learn = File.join(Rbbt.datadir, 'third_party/crf++/bin/crf_learn')
    system crf_learn, tmp_template.to_s, features.to_s, model.to_s

    Open.write(model + '.config',config)
  ensure
    FileUtils.rm_f tmp_template
  end
end
162
+
163
+ end
164
+
165
+ class NER
166
+
167
# Loads the CRF++ Ruby bindings, the feature definitions and the tagger.
#
# model - path of the CRF++ model; its feature definitions are read from
#         "<model>.config". Defaults to ner/model/BC2 under Rbbt.datadir.
def initialize(model = nil)
  begin
    require 'CRFPP'
  rescue LoadError
    # Bindings not installed system-wide: use the copy under Rbbt.datadir.
    # Only LoadError is rescued so signals and other errors are not masked.
    require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP')
  end

  model ||= File.join(Rbbt.datadir, 'ner/model/BC2')

  @parser = NERFeatures.new(model + '.config')
  @reverse = @parser.reverse
  @tagger = CRFPP::Tagger.new("-m #{ model } -v 3 -n2")
end
180
+
181
# Tags +text+ and returns the mentions found as an Array of Strings.
#
# Each token's feature row is fed to the tagger. Consecutive tokens are
# then grouped by label: 1 starts a mention, 2 continues it and 0 closes
# it. A token labelled 1 right after an open mention closes that mention
# first, but only if the mention (joined, or any of its tokens) answers
# true to is_special?; otherwise the token is appended to it. A ")" token
# is kept only when the open mention contains "(" and never changes the
# grouping otherwise.
#
# Mention tokens are joined with spaces, in reverse order when the parser
# works in reverse.
def extract(text)
  features = @parser.text_features(text)

  @tagger.clear
  features.each { |feats| @tagger.add(feats.join(" ")) }
  @tagger.parse

  found = []
  mention = []

  (0...@tagger.size).each do |i|
    label = @tagger.y(i)
    word = @tagger.x(i,0)

    if word == ')'
      mention.push(')') if mention.join.include?('(')
      next
    end

    case label
    when 0
      found.push(mention) if mention.any?
      mention = []
    when 1
      if mention.any? && (mention.join(" ").is_special? || mention.any? { |m| m.is_special? })
        found.push(mention)
        mention = []
      end
      mention.push(word)
    when 2
      mention.push(word)
    end
  end

  found << mention if mention.any?

  found.collect { |list| (@reverse ? list.reverse : list).join(" ") }
end
225
+
226
+ end
227
+
228
+
229
+
@@ -249,7 +249,8 @@ class TokenTrieNER < NER
249
249
  match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
250
250
  }
251
251
 
252
- NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
252
+ type = type.first
253
+ NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes, :type => type)
253
254
  end
254
255
 
255
256
  attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
@@ -2,30 +2,55 @@ require 'rbbt/segment'
2
2
  require 'rbbt/document'
3
3
  require 'rbbt/segment/annotation'
4
4
  require 'rbbt/util/python'
5
+ require 'rbbt/network/paths'
5
6
 
6
7
  module SpaCy
7
8
 
8
- PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
9
+ TOKEN_PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
10
+ CHUNK_PROPERTIES = %w(lemma_)
9
11
 
10
- def self.tokens(text, lang = 'en')
12
# Returns the spaCy pipeline for +lang+, loading it through RbbtPython on
# first use and reusing the cached object on later calls.
def self.nlp(lang = 'en_core_web_md')
  @@nlp ||= {}
  @@nlp[lang] ||= RbbtPython.run(:spacy) { spacy.load(lang) }
end
18
+
19
+ def self.tokens(text, lang = 'en_core_web_sm')
11
20
 
12
21
  tokens = []
13
- RbbtPython.run 'spacy' do
14
- nlp = spacy.load(lang)
15
- doc = nlp.call(text)
16
- doc.__len__.times do |i|
17
- tokens << doc.__getitem__(i)
18
- end
22
+
23
+ nlp = nlp(lang)
24
+ doc = nlp.call(text)
25
+
26
+ doc.__len__.times do |i|
27
+ tokens << doc.__getitem__(i)
28
+ end
29
+
30
+ tokens
31
+ end
32
+
33
+ def self.chunks(text, lang = 'en_core_web_sm')
34
+
35
+ tokens = []
36
+ nlp = nlp(lang)
37
+
38
+ doc = nlp.call(text)
39
+ chunks = doc.noun_chunks.__iter__
40
+
41
+ RbbtPython.iterate chunks do |item|
42
+ tokens << item
19
43
  end
44
+
20
45
  tokens
21
46
  end
22
47
 
23
- def self.segments(text, lang = 'en')
24
- docid = text.docid if Document === text
48
+ def self.segments(text, lang = 'en_core_web_sm')
49
+ docid = text.docid if Document === text
25
50
  corpus = text.corpus if Document === text
26
51
  tokens = self.tokens(text, lang).collect do |token|
27
52
  info = {}
28
- PROPERTIES.each do |p|
53
+ TOKEN_PROPERTIES.each do |p|
29
54
  info[p] = token.instance_eval(p.to_s)
30
55
  end
31
56
  info[:type] = "SpaCy"
@@ -35,7 +60,120 @@ module SpaCy
35
60
  info[:corpus] = corpus if corpus
36
61
  SpaCyToken.setup(token.text, info)
37
62
  end
38
- SpaCyToken.setup(tokens, :corpus => corpus)
63
+
64
+ tokens
65
+ end
66
+
67
# Returns the noun chunks of +text+ (as given by +chunks+) set up as
# SpaCySpan segments.
#
# Each segment carries the CHUNK_PROPERTIES read from the chunk, the type
# "SpaCy", the offset of the chunk's first token and a :dep annotation.
# The :dep value lists, separated by ";", the dependencies of the chunk's
# tokens whose head lies outside the chunk, each written as
# "<token idx>:<dep>-><head idx>". When +text+ is a Document its docid and
# corpus are copied to the segments.
def self.chunk_segments(text, lang = 'en_core_web_sm')
  is_document = Document === text
  docid = text.docid if is_document
  corpus = text.corpus if is_document

  self.chunks(text, lang).collect do |chunk|
    info = {}
    CHUNK_PROPERTIES.each { |property| info[property] = chunk.instance_eval(property.to_s) }

    start = eend = nil
    deps = []
    RbbtPython.iterate chunk.__iter__ do |token|
      # Chunk boundaries are fixed by the first token seen
      start = token.idx if start.nil?
      eend = start + chunk.text.length if eend.nil?

      head_idx = token.head.idx
      next unless head_idx < start || head_idx > eend

      deps << token.idx.to_s + ":" + token.dep_ + "->" + head_idx.to_s
    end

    info[:type] = "SpaCy"
    info[:offset] = chunk.__iter__.__next__.idx
    info[:dep] = deps.join(";")
    info[:docid] = docid if docid
    info[:corpus] = corpus if corpus

    SpaCySpan.setup(chunk.text, info)
  end
end
92
+
93
# Builds the dependency graph between the token segments of +text+.
#
# Each token's segid is associated with whatever Segment.index holds at
# the position found after "->" in the token's :dep annotation. With
# +reverse+ set, every association is also added in the opposite direction.
#
# Returns a Hash from segid to a list of associated entries.
def self.dep_graph(text, reverse = false, lang = 'en_core_web_md')
  tokens = self.segments(text, lang)
  index = Segment.index(tokens)

  associations = {}
  tokens.each do |token|
    _dep_type, head_pos = token.dep.split("->")
    associations[token.segid] = index[head_pos.to_i]
  end

  if reverse
    # Walk a copy so the entries added below are not visited
    associations.dup.each do |source, targets|
      targets.each do |target|
        linked = (associations[target] ||= [])
        associations[target] = linked + [source] unless linked.include?(source)
      end
    end
  end

  associations
end
115
+
116
# Extends the token dependency graph of +text+ with its noun chunks.
#
# Starts from the non-reversed +dep_graph+. For every chunk, the positions
# after "->" in its :dep annotation are looked up in an index over tokens
# and chunks; each entry found is turned into the range of positions
# encoded at the end of its id, and everything indexed at those positions
# becomes a target of the chunk. With +reverse+ set, every association is
# also added in the opposite direction.
#
# Returns a Hash from segid to a list of associated entries.
def self.chunk_dep_graph(text, reverse = false, lang = 'en_core_web_md')
  associations = dep_graph(text, false, lang)

  chunks = self.chunk_segments(text, lang)
  tokens = self.segments(text, lang)
  index = Segment.index(tokens + chunks)

  chunks.each do |chunk|
    head_ids = chunk.dep.split(";").collect { |dep|
      head_pos = dep.split("->")[1]
      index[head_pos.to_i]
    }.flatten

    targets = head_ids.collect { |head_id|
      # ids end in "<start>..<end>"
      span = Range.new(*head_id.split(":").last.split("..").map(&:to_i))
      span.collect { |pos| index[pos] }.uniq
    }.flatten

    associations[chunk.segid] = targets
  end

  if reverse
    # Walk a copy so the entries added below are not visited
    associations.dup.each do |source, linked_targets|
      linked_targets.each do |target|
        linked = (associations[target] ||= [])
        associations[target] = linked + [source] unless linked.include?(source)
      end
    end
  end

  associations
end
150
+
151
# Finds a path between two segments in the chunk dependency graph of +text+.
#
# source, target - segments; each is replaced by the first entry indexed
#                  at its offset among the noun chunks, or by its own
#                  segid when there is none.
# reverse        - passed on to +chunk_dep_graph+.
#
# Returns the result of Paths.dijkstra reversed, or nil when no path is
# found.
def self.paths(text, source, target, reverse = true, lang = 'en_core_web_md')
  graph = SpaCy.chunk_dep_graph(text, reverse, lang)
  chunk_index = Segment.index(SpaCy.chunk_segments(text, lang))

  endpoints = [source, target].collect do |segment|
    chunk_index[segment.offset].first || segment.segid
  end
  source_id, target_id = endpoints

  path = Paths.dijkstra(graph, source_id, [target_id])
  path.reverse unless path.nil?
end
165
+
166
# Fills in a spaCy config by running `spacy init fill-config`.
#
# base   - handed to TmpFile.with_file; the path it yields is used as the
#          base config of the command (presumably a temporary file holding
#          +base+ -- confirm against TmpFile).
# target - path where the filled config is written. When nil, the command
#          writes to a path yielded by TmpFile.with_file and that file is
#          read back instead.
def self.config(base, target = nil)
  TmpFile.with_file(base) do |baseconfig|
    if target
      CMD.cmd(:spacy, "init fill-config #{baseconfig} #{target}")
    else
      TmpFile.with_file do |tmptarget|
        CMD.cmd(:spacy, "init fill-config #{baseconfig} #{tmptarget}")
        # Read the file the command has just written
        Open.read(tmptarget)
      end
    end
  end
end
40
178
  end
41
179
 
@@ -43,10 +181,15 @@ module SpaCyToken
43
181
  extend Entity
44
182
  include SegmentAnnotation
45
183
 
46
- self.annotation *SpaCy::PROPERTIES
184
+ self.annotation *SpaCy::TOKEN_PROPERTIES
47
185
  self.annotation :dep
48
186
  end
49
187
 
50
- if __FILE__ == $0
51
- ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
188
+ module SpaCySpan
189
+ extend Entity
190
+ include SegmentAnnotation
191
+
192
+ self.annotation *SpaCy::CHUNK_PROPERTIES
193
+ self.annotation :dep
52
194
  end
195
+
@@ -0,0 +1,24 @@
1
+ require 'rbbt/segment'
2
+
3
# Annotation module with the +segment+, +terms+ and +type+ annotations.
module Relationship
  extend Annotation
  self.annotation :segment, :terms, :type

  # The segment when there is one; otherwise the type followed by the
  # terms, as in "type: term1, term2".
  def text
    segment || (type + ": " + terms * ", ")
  end

  # The text wrapped in a span element of class Relationship.
  def html
    "<span class='Relationship'>#{ self.text }</span>"
  end
end
data/lib/rbbt/segment.rb CHANGED
@@ -49,10 +49,13 @@ module Segment
49
49
  length
50
50
  end
51
51
 
52
+
52
53
  def eend
53
54
  offset.to_i + length - 1
54
55
  end
55
56
 
57
+ alias end eend
58
+
56
59
  def range
57
60
  (offset.to_i..eend)
58
61
  end
@@ -8,6 +8,10 @@ module NamedEntity
8
8
 
9
9
  self.annotation :entity_type, :code, :score
10
10
 
11
+ def entity_type
12
+ annotation_values[:entity_type] || annotation_values[:type]
13
+ end
14
+
11
15
  def report
12
16
  <<-EOF
13
17
  String: #{ self }
@@ -6,7 +6,7 @@ module Segment::RangeIndex
6
6
  SegID.setup(res, :corpus => corpus)
7
7
  end
8
8
 
9
- def self.index(segments, corpus, persist_file = :memory)
9
+ def self.index(segments, corpus = nil, persist_file = :memory)
10
10
  segments = segments.values.flatten if Hash === segments
11
11
 
12
12
  annotation_index =
@@ -0,0 +1,51 @@
1
# Default feature definitions for NERFeatures. Each line names a feature
# and gives the Regexp or block used to compute it for a token.

# Tests on the whole token
isLetters /^[A-Z]+$/i
isUpper /^[A-Z]+$/
isLower /^[a-z]+$/
isDigits /^[0-9]+$/i
isRoman /^[IVX]+$/
isGreek /^(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)$/i
isPunctuation /^[,.;]$/
isDelim /^[\/()\[\]{}\-]$/
isNonWord /^[^\w]+$/
# Grouped so the anchors apply to every alternative; ungrouped, any token
# that merely contains "or" or "&" would match.
isConjunction /^(?:and|or|&|,)$/

# Tests on any part of the token
hasLetters /[A-Z]/i
# NOTE(review): the leading "." skips an upper-case letter in first
# position -- confirm this is intended.
hasUpper /.[A-Z]/
hasLower /[a-z]/
hasDigits /[0-9]/i
hasGreek /(?:alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)/i
hasPunctuation /[,.;]/
hasDelim /[\/()\[\]{}\-]/
hasNonWord /[^\w]/
caspMix /[a-z].[A-Z]/
keywords /(?:protein|gene|domain|ase)s?$/
hasSuffix /[a-z][A-Z0-9]$/

# Counts
numLetters do |w| w.scan(/[A-Z]/i).length end
numDigits do |w| w.scan(/[0-9]/).length end

# First and last 3 and 4 characters
prefix_3 /^(...)/
prefix_4 /^(....)/
suffix_3 /(...)$/
suffix_4 /(....)$/


# Token shapes.
# NOTE(review): sub only replaces the first match of each class -- confirm
# whether gsub was intended before changing it, since trained models depend
# on these definitions.
token1 do |w|
  w.sub(/[A-Z]/,'A').
    sub(/[a-z]/,'a').
    sub(/[0-9]/,'0').
    sub(/[^0-9a-z]/i,'x')
end
token2 do |w|
  w.sub(/[A-Z]+/,'A').
    sub(/[a-z]+/,'a').
    sub(/[0-9]+/,'0').
    sub(/[^0-9a-z]+/i,'x')
end
token3 do |w| w.downcase end
special do |w| w.is_special? end

# Features that are also emitted for the neighbouring positions
context %w(special token2 isPunctuation isDelim)
window %w(1 2 3 -1 -2 -3)
#direction :reverse
51
+
@@ -7,7 +7,8 @@ class TestCorpusPubmed < Test::Unit::TestCase
7
7
  def test_add_pmid
8
8
  corpus = Document::Corpus.setup({})
9
9
 
10
- document = corpus.add_pmid("32299157", :abstract).first
10
+ document = corpus.add_pmid("33359141", :abstract).first
11
+ iii document.docid
11
12
  title = document.to(:title)
12
13
  assert title.include?("COVID-19")
13
14
  end
@@ -4,6 +4,7 @@ require 'rbbt/document/corpus'
4
4
  require 'rbbt/segment'
5
5
  require 'rbbt/document/annotation'
6
6
  require 'rbbt/segment/named_entity'
7
+ require 'rbbt/ner/abner'
7
8
 
8
9
  class TestAnnotation < Test::Unit::TestCase
9
10
  class CalledOnce < Exception; end
@@ -28,6 +29,12 @@ class TestAnnotation < Test::Unit::TestCase
28
29
  self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
29
30
  end
30
31
 
32
+ Document.define :abner do
33
+ $called_once = true
34
+ Abner.new.match(self)
35
+ end
36
+
37
+
31
38
  Document.persist :ner
32
39
  end
33
40
 
@@ -133,7 +140,9 @@ class TestAnnotation < Test::Unit::TestCase
133
140
  text.ner
134
141
 
135
142
  assert ! $called_once
136
-
143
+
144
+ assert_equal text.abner.first.docid, text.docid
145
+
137
146
  assert text.ner.first.segid.include?("TEST:")
138
147
  end
139
148
  end
@@ -29,5 +29,19 @@ class TestDocumentCorpus < Test::Unit::TestCase
29
29
  assert corpus.docids("TEST:").include?(text.docid)
30
30
  end
31
31
  end
32
+
33
+ def test_load
34
+ text = "This is a document"
35
+ Document.setup(text, "TEST", "test_doc1", nil)
36
+
37
+ TmpFile.with_file do |path|
38
+ corpus = Persist.open_tokyocabinet(path, true, :single, "BDB")
39
+ corpus.extend Document::Corpus
40
+
41
+ corpus.add_document(text)
42
+
43
+ assert corpus.docids("TEST:").include?(text.docid)
44
+ end
45
+ end
32
46
  end
33
47
 
@@ -0,0 +1,132 @@
1
+ require File.dirname(__FILE__) + '/../../test_helper'
2
+ require 'rbbt'
3
+ require 'rbbt/ner/rner'
4
+ require 'test/unit'
5
+
6
+ class TestRNer < Test::Unit::TestCase
7
+
8
+ def setup
9
+ @parser = NERFeatures.new() do
10
+ isLetters /^[A-Z]+$/i
11
+ context prefix_3 /^(...)/
12
+ downcase do |w| w.downcase end
13
+
14
+ context %w(downcase)
15
+ end
16
+ end
17
+
18
+ def test_config
19
+ config = <<-EOC
20
+ isLetters /^[A-Z]+$/i
21
+ context prefix_3 /^(...)/
22
+ downcase do |w| w.downcase end
23
+
24
+ context %w(downcase)
25
+ EOC
26
+
27
+ assert_equal config.strip, @parser.config.strip
28
+ end
29
+
30
+ def test_reverse
31
+ assert_equal("protein P53", NERFeatures.reverse("P53 protein"))
32
+ assert_equal(
33
+ ". LH of assay - radioimmuno serum the with compared was LH urinary for ) GONAVIS - HI ( test hemagglutination direct new A",
34
+ NERFeatures.reverse(
35
+ "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
36
+ ))
37
+ end
38
+
39
# A token's row holds the token followed by one value per defined feature.
def test_features
  # Expected value first, so failure messages label the values correctly
  assert_equal ["abCdE",true,'abC','abcde'], @parser.features("abCdE")
end
42
+
43
+ def test_template
44
+ template =<<-EOT
45
+ UisLetters: %x[0,1]
46
+ Uprefix_3: %x[0,2]
47
+ Uprefix_3#1: %x[1,2]
48
+ Uprefix_3#-1: %x[-1,2]
49
+ Udowncase: %x[0,3]
50
+ Udowncase#1: %x[1,3]
51
+ Udowncase#-1: %x[-1,3]
52
+ B
53
+ EOT
54
+
55
+ assert(@parser.template == template)
56
+ end
57
+
58
# Tokenization splits punctuation and hyphens from words and drops whitespace.
def test_tokens
  text = "A new direct hemagglutination test (HI-GONAVIS) for urinary LH was compared with the serum\n radioimmuno-assay of LH."
  expected = ["A", "new", "direct", "hemagglutination", "test", "(", "HI", "-", "GONAVIS", ")", "for", "urinary", "LH", "was", "compared", "with", "the", "serum", "radioimmuno", "-", "assay", "of", "LH", "."]

  # assert_equal reports both values on failure, unlike assert(a == b)
  assert_equal expected, NERFeatures.tokens(text)
end
64
# Rows are unlabelled by default, labelled 1 then 2 for positive text and
# 0 for negative text.
def test_text_features
  # assert_equal reports both values on failure, unlike assert(a == b)
  assert_equal [["abCdE",true, "abC", "abcde"], ["1234",false, "123", "1234"]],
    @parser.text_features("abCdE 1234")
  assert_equal [["abCdE",true, "abC", "abcde",1], ["1234",false, "123", "1234",2]],
    @parser.text_features("abCdE 1234",true)
  assert_equal [["abCdE",true, "abC", "abcde",0], ["1234",false, "123", "1234",0]],
    @parser.text_features("abCdE 1234",false)
end
71
+
72
+ def test_tagged_features
73
+ assert_equal(
74
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
75
+ ["of",true, false, "of", 0],
76
+ ["GENE1",false, "GEN", "gene1", 1],
77
+ [".", false, false, ".", 0]],
78
+ @parser.tagged_features("phosphorilation of GENE1.",['GENE1']))
79
+
80
+ assert_equal(
81
+ [["GENE1",false, "GEN", "gene1", 1],
82
+ ["phosphorilation",true, "pho", "phosphorilation", 0]],
83
+ @parser.tagged_features("GENE1 phosphorilation",['GENE1']))
84
+
85
+
86
+ assert_equal(
87
+ [["phosphorilation",true, "pho", "phosphorilation", 0],
88
+ ["of",true, false, "of", 0],
89
+ ["GENE",true, "GEN", "gene", 1],
90
+ ["1",false, false, "1", 2],
91
+ [".", false, false, ".", 0]],
92
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
93
+ end
94
+
95
+ def test_tagged_features_reverse
96
+ @parser.reverse = true
97
+ assert_equal(
98
+ [
99
+ ["GENE1",false, "GEN", "gene1", 1],
100
+ ["of",true, false, "of", 0],
101
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
102
+ ],
103
+ @parser.tagged_features("phosphorilation of GENE1",['GENE1']))
104
+
105
+ assert_equal(
106
+ [
107
+ [".", false, false, ".", 0],
108
+ ["1",false, false, "1", 1],
109
+ ["GENE",true, "GEN", "gene", 2],
110
+ ["of",true, false, "of", 0],
111
+ ["phosphorilation",true, "pho", "phosphorilation", 0]
112
+ ],
113
+ @parser.tagged_features("phosphorilation of GENE 1.",['GENE 1']))
114
+ end
115
+
116
+ def test_default_config
117
+ require 'rbbt/bow/misc'
118
+ text =<<-EOF
119
+ This text explains how MDM2 interacts with TP53.
120
+ EOF
121
+ @parser = NERFeatures.new Rbbt.share.rner["config.rb"].find
122
+ features = @parser.tagged_features text, %w(TP53 MDM2)
123
+ assert features.first.first == "This"
124
+ end
125
+
126
+
127
+
128
+ def __test_CRFPP_install
129
+ assert(require File.join(Rbbt.datadir, 'third_party/crf++/ruby/CRFPP'))
130
+ end
131
+
132
+ end
@@ -24,7 +24,8 @@ class TestClass < Test::Unit::TestCase
24
24
 
25
25
  def test_tsv
26
26
  a = "test"
27
- NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
27
+ NamedEntity.setup a, 10, "DocID", "TYPE", "CODE", "SCORE"
28
+ ppp Annotated.tsv([a,a])
28
29
  assert Annotated.tsv([a]).fields.include? "code"
29
30
  assert Annotated.tsv([a], nil).fields.include? "code"
30
31
  assert Annotated.tsv([a], :all).fields.include? "code"
@@ -144,7 +144,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
144
144
  gene2.entity_type = "Protein"
145
145
 
146
146
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
147
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
147
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
148
148
  end
149
149
  end
150
150
 
@@ -165,7 +165,7 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
165
165
  gene2.entity_type = "Protein"
166
166
 
167
167
  Transformed.with_transform(a, [gene1,gene2], Proc.new{|e| e.html}) do
168
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein'>CDK5R1</span> protein", a
168
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the <span class='Entity' attr-entity-type='Protein' title='Protein'>CDK5R1</span> protein", a
169
169
  end
170
170
  end
171
171
 
@@ -185,9 +185,9 @@ More recently, PPAR activators were shown to inhibit the activation of inflammat
185
185
  assert_equal [gene1], Segment.overlaps(Segment.sort([gene1,gene2]))
186
186
 
187
187
  Transformed.with_transform(a, [gene1], Proc.new{|e| e.html}) do
188
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene'>TP53</span> gene and the CDK5R1 protein", a
188
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene and the CDK5R1 protein", a
189
189
  Transformed.with_transform(a, [gene2], Proc.new{|e| e.html}) do
190
- assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene'><span class='Entity' attr-entity-type='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
190
+ assert_equal "This sentence mentions the <span class='Entity' attr-entity-type='Expanded Gene' title='Expanded Gene'><span class='Entity' attr-entity-type='Gene' title='Gene'>TP53</span> gene</span> and the CDK5R1 protein", a
191
191
  end
192
192
  end
193
193
  end
data/test/test_spaCy.rb CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/nlp/spaCy'
3
3
  require 'rbbt/document/corpus'
4
4
 
5
5
  class TestSpaCy < Test::Unit::TestCase
6
- def _test_tokens
6
+ def test_tokens
7
7
  text = "I tell a story"
8
8
 
9
9
  tokens = SpaCy.tokens(text)
@@ -12,6 +12,16 @@ class TestSpaCy < Test::Unit::TestCase
12
12
  assert_equal "tell", tokens[1].to_s
13
13
  end
14
14
 
15
+ def test_chunks
16
+ text = "Miguel Vazquez tell a good story"
17
+
18
+ tokens = SpaCy.chunks(text)
19
+
20
+ assert_equal 2, tokens.length
21
+ assert_equal "Miguel Vazquez", tokens[0].to_s
22
+ end
23
+
24
+
15
25
  def test_segments
16
26
  text = "I tell a story. It's a very good story."
17
27
 
@@ -28,5 +38,107 @@ class TestSpaCy < Test::Unit::TestCase
28
38
  assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
29
39
  end
30
40
  end
41
+
42
+ def test_chunk_segments
43
+ text = "I tell a story. It's a very good story."
44
+
45
+ corpus = Document::Corpus.setup({})
46
+
47
+ Document.setup(text, "TEST", "test_doc1", "simple_sentence")
48
+
49
+ corpus.add_document text
50
+ text.corpus = corpus
51
+
52
+ segments = SpaCy.chunk_segments(text)
53
+
54
+ segments.each do |segment|
55
+ assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
56
+ end
57
+ end
58
+
59
+ def test_dep_graph
60
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
61
+ graph = SpaCy.dep_graph(text, true)
62
+
63
+ tokens = SpaCy.segments(text)
64
+ index = Segment.index tokens
65
+ tf_s = tokens.select{|t| t == "TF" }.first
66
+ tg_s = tokens.select{|t| t == "TG" }.first
67
+
68
+ require 'rbbt/network/paths'
69
+
70
+ path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
71
+ path_tokens = path.collect do |segid|
72
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
73
+ text[range]
74
+ end
75
+
76
+ assert path_tokens.include? 'increase'
77
+
78
+ end
79
+
80
+ def test_chunk_dep_graph
81
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
82
+ graph = SpaCy.chunk_dep_graph(text, true)
83
+
84
+ tokens = SpaCy.chunk_segments(text)
85
+ index = Segment.index tokens
86
+ tf_s = tokens.select{|t| t.include? "TF" }.first
87
+ tg_s = tokens.select{|t| t.include? "TG" }.first
88
+
89
+
90
+ require 'rbbt/network/paths'
91
+
92
+ path = Paths.dijkstra(graph, tf_s.segid, [tg_s.segid])
93
+ path_tokens = path.collect do |segid|
94
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
95
+ text[range]
96
+ end
97
+
98
+ assert path_tokens.include? 'increase'
99
+ end
100
+
101
+ def test_paths
102
+ text = "Meanwhile, TF antisense treatment activated the human ASBT promoter 5-fold and not only abrogated interleukin-1beta-mediated repression but led to a paradoxical increase in TG promoter activity"
103
+ path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
104
+
105
+
106
+ path_tokens = path.collect do |segid|
107
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
108
+ text[range]
109
+ end
110
+
111
+ ppp text
112
+ iii path_tokens
113
+
114
+ assert path_tokens.include? 'increase'
115
+ end
116
+
117
+ def test_paths2
118
+ text = "Deletion and domain swap experiments identified small, discreet positive and negative elements in A-Myb and TF that were required for the regulation of specific genes, such as DHRS2, TG, and mim-1"
119
+ path = SpaCy.paths(text, Segment.setup("TF", :offset => text.index("TF")), Segment.setup("TG",:offset => text.index("TG")))
120
+
121
+
122
+ path_tokens = path.collect do |segid|
123
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
124
+ text[range]
125
+ end
126
+
127
+ iii path_tokens
128
+
129
+
130
+ assert path_tokens.include? 'regulation'
131
+ end
132
+
133
+ def test_paths3
134
+ text = "Therefore, we speculate that PEA3 factors may contribute to the up-regulation of COX-2 expression resulting from both APC mutation and Wnt1 expression"
135
+ path = SpaCy.paths(text, *Segment.align(text,["PEA3", "Wnt1"]))
136
+
137
+ path_tokens = path.collect do |segid|
138
+ range = Range.new(*segid.split(":").last.split("..").map(&:to_i))
139
+ text[range]
140
+ end
141
+
142
+ end
31
143
  end
32
144
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rbbt-text
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.4
4
+ version: 1.3.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Miguel Vazquez
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-23 00:00:00.000000000 Z
11
+ date: 2021-06-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rbbt-util
@@ -95,6 +95,7 @@ files:
95
95
  - lib/rbbt/ner/oscar4.rb
96
96
  - lib/rbbt/ner/patterns.rb
97
97
  - lib/rbbt/ner/regexpNER.rb
98
+ - lib/rbbt/ner/rner.rb
98
99
  - lib/rbbt/ner/rnorm.rb
99
100
  - lib/rbbt/ner/rnorm/cue_index.rb
100
101
  - lib/rbbt/ner/rnorm/tokens.rb
@@ -103,6 +104,7 @@ files:
103
104
  - lib/rbbt/nlp/nlp.rb
104
105
  - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
105
106
  - lib/rbbt/nlp/spaCy.rb
107
+ - lib/rbbt/relationship.rb
106
108
  - lib/rbbt/segment.rb
107
109
  - lib/rbbt/segment/annotation.rb
108
110
  - lib/rbbt/segment/encoding.rb
@@ -126,6 +128,7 @@ files:
126
128
  - share/install/software/OpenNLP
127
129
  - share/install/software/StanfordParser
128
130
  - share/patterns/drug_induce_disease
131
+ - share/rner/config.rb
129
132
  - share/rnorm/cue_default
130
133
  - share/rnorm/tokens_default
131
134
  - share/wordlists/stopwords
@@ -148,6 +151,7 @@ files:
148
151
  - test/rbbt/ner/test_oscar4.rb
149
152
  - test/rbbt/ner/test_patterns.rb
150
153
  - test/rbbt/ner/test_regexpNER.rb
154
+ - test/rbbt/ner/test_rner.rb
151
155
  - test/rbbt/ner/test_rnorm.rb
152
156
  - test/rbbt/ner/test_token_trieNER.rb
153
157
  - test/rbbt/nlp/genia/test_sentence_splitter.rb
@@ -182,7 +186,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
182
186
  - !ruby/object:Gem::Version
183
187
  version: '0'
184
188
  requirements: []
185
- rubygems_version: 3.0.6
189
+ rubygems_version: 3.1.4
186
190
  signing_key:
187
191
  specification_version: 4
188
192
  summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
@@ -210,6 +214,7 @@ test_files:
210
214
  - test/rbbt/ner/test_banner.rb
211
215
  - test/rbbt/ner/test_token_trieNER.rb
212
216
  - test/rbbt/ner/test_finder.rb
217
+ - test/rbbt/ner/test_rner.rb
213
218
  - test/rbbt/ner/test_linnaeus.rb
214
219
  - test/rbbt/ner/test_oscar4.rb
215
220
  - test/rbbt/test_segment.rb