rbbt-text 1.3.1 → 1.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +0 -2
- data/lib/rbbt/bow/dictionary.rb +2 -2
- data/lib/rbbt/document.rb +15 -5
- data/lib/rbbt/document/annotation.rb +5 -2
- data/lib/rbbt/document/corpus.rb +26 -3
- data/lib/rbbt/document/corpus/pubmed.rb +1 -1
- data/lib/rbbt/ner/g_norm_plus.rb +7 -1
- data/lib/rbbt/ner/patterns.rb +0 -1
- data/lib/rbbt/ner/token_trieNER.rb +28 -15
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +6 -4
- data/lib/rbbt/segment/annotation.rb +3 -3
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/segment/transformed.rb +2 -2
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/test_annotation.rb +5 -5
- data/test/rbbt/document/test_corpus.rb +1 -1
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
- data/test/rbbt/segment/test_annotation.rb +3 -4
- data/test/rbbt/segment/test_encoding.rb +1 -1
- data/test/rbbt/segment/test_named_entity.rb +5 -4
- data/test/rbbt/segment/test_range_index.rb +1 -2
- data/test/rbbt/test_segment.rb +5 -10
- data/test/test_spaCy.rb +32 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 05b1cf1981e955652598dd3db811cf8e6a7d64b68535e21834012abe90efe388
|
4
|
+
data.tar.gz: 67017f8a10cbfae51664999218336d638ea6be7c29b5ec305872473672977a41
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 03b02dcea1040edfa653e976d9f2f808ed25f9e0164add2fc85afa4417cf8e10ff8dfb27e1927c457f0ff6c6ee90311765ac364b7c5d7c8d9fd51cfff4ab9434
|
7
|
+
data.tar.gz: a5c44f475241da67863ac33ea446e7dbc64283ca53d6642c7c67c1c6e2e34a5d28b1ad678d2a5a44bf3316ed3c063f2d084628b8ff4d79aee8be04db3f8a6ab1
|
data/lib/rbbt/bow/bow.rb
CHANGED
data/lib/rbbt/bow/dictionary.rb
CHANGED
@@ -95,7 +95,7 @@ class Dictionary::TF_IDF
|
|
95
95
|
}
|
96
96
|
|
97
97
|
if limit
|
98
|
-
Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
|
98
|
+
Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
|
99
99
|
else
|
100
100
|
Hash[*best.flatten]
|
101
101
|
end
|
@@ -177,7 +177,7 @@ class Dictionary::KL
|
|
177
177
|
best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
|
178
178
|
}
|
179
179
|
if limit
|
180
|
-
Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
|
180
|
+
Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
|
181
181
|
else
|
182
182
|
best
|
183
183
|
end
|
data/lib/rbbt/document.rb
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
require 'rbbt-util'
|
2
2
|
require 'rbbt/entity'
|
3
|
-
require 'rbbt/document/annotation'
|
4
3
|
|
5
4
|
module DocID
|
6
5
|
extend Entity
|
@@ -19,10 +18,21 @@ module DocID
|
|
19
18
|
DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
|
20
19
|
end
|
21
20
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
21
|
+
property :document => :both do
|
22
|
+
if Array === self
|
23
|
+
namespace, id, type = nil, nil, nil
|
24
|
+
docs = self.collect do |docid|
|
25
|
+
text = self.corpus[docid]
|
26
|
+
namespace, id, type = docid.split(":")
|
27
|
+
#Document.setup(text, namespace, id, type, :corpus => corpus)
|
28
|
+
text
|
29
|
+
end
|
30
|
+
Document.setup(docs, :corpus => corpus)
|
31
|
+
else
|
32
|
+
text = self.corpus[self]
|
33
|
+
namespace, id, type = self.split(":")
|
34
|
+
Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
|
35
|
+
end
|
26
36
|
end
|
27
37
|
end
|
28
38
|
|
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'rbbt/segment'
|
1
2
|
require 'rbbt/segment/annotation'
|
2
3
|
|
3
4
|
module Document
|
@@ -22,17 +23,19 @@ module Document
|
|
22
23
|
send :property, type => :multiple do |list|
|
23
24
|
doc_segments = self.instance_exec list, &block
|
24
25
|
|
25
|
-
doc_segments = doc_segments.chunked_values_at(
|
26
|
+
doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
|
26
27
|
|
27
28
|
doc_segments.each_with_index do |segments,i|
|
29
|
+
next if segments.nil?
|
28
30
|
document = list[i]
|
29
|
-
Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
|
31
|
+
Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
|
30
32
|
|
31
33
|
segments.each do |segment|
|
32
34
|
SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
|
33
35
|
end
|
34
36
|
|
35
37
|
docid = document.docid
|
38
|
+
|
36
39
|
segments.each{|s| s.docid = docid if s.docid.nil? }
|
37
40
|
|
38
41
|
segments
|
data/lib/rbbt/document/corpus.rb
CHANGED
@@ -3,17 +3,40 @@ require 'rbbt-util'
|
|
3
3
|
module Document::Corpus
|
4
4
|
|
5
5
|
def self.setup(corpus)
|
6
|
-
corpus.extend Document::Corpus
|
6
|
+
corpus.extend Document::Corpus unless Document::Corpus === corpus
|
7
|
+
corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
|
8
|
+
corpus
|
7
9
|
end
|
8
10
|
|
9
11
|
def add_document(document)
|
10
|
-
|
12
|
+
docid = document.docid
|
13
|
+
return document if self.include?(docid)
|
14
|
+
self.write_and_close do
|
15
|
+
self[docid] = document
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def docids(prefix)
|
20
|
+
prefix += ":" unless prefix[-1] == ":"
|
21
|
+
docids = self.read_and_close do
|
22
|
+
self.prefix(prefix)
|
23
|
+
end
|
24
|
+
DocID.setup(docids, :corpus => self)
|
25
|
+
end
|
26
|
+
|
27
|
+
def documents(prefix)
|
28
|
+
self.docids(prefix).document
|
11
29
|
end
|
12
30
|
|
13
31
|
def [](*args)
|
14
32
|
docid, *rest = args
|
15
|
-
|
33
|
+
|
34
|
+
res = self.read_and_close do
|
35
|
+
super(*args)
|
36
|
+
end
|
37
|
+
|
16
38
|
return res if args.length > 1
|
39
|
+
|
17
40
|
namespace, id, type = docid.split(":")
|
18
41
|
|
19
42
|
if res.nil?
|
@@ -6,7 +6,6 @@ module Document::Corpus
|
|
6
6
|
type = nil if String === type and type.empty?
|
7
7
|
|
8
8
|
res = PubMed.get_article(pmids).collect do |pmid, article|
|
9
|
-
Log.debug "Loading pmid #{pmid}"
|
10
9
|
document = if type.nil? || type.to_sym == :abstract
|
11
10
|
Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
|
12
11
|
elsif type.to_sym == :title
|
@@ -15,6 +14,7 @@ module Document::Corpus
|
|
15
14
|
raise "No FullText available for #{ pmid }" if article.full_text.nil?
|
16
15
|
Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
|
17
16
|
end
|
17
|
+
Log.debug "Loading pmid #{pmid}"
|
18
18
|
add_document(document)
|
19
19
|
end
|
20
20
|
|
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
@@ -55,11 +55,16 @@ EOF
|
|
55
55
|
Open.mkdir 'tmp'
|
56
56
|
|
57
57
|
texts.each do |name,text|
|
58
|
+
text = Misc.fixutf8(text)
|
59
|
+
|
60
|
+
text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
|
61
|
+
|
58
62
|
Open.write("input/#{name}.txt") do |f|
|
59
|
-
f.puts "#{name}|a|" << text
|
63
|
+
f.puts "#{name}|a|" << text
|
60
64
|
f.puts
|
61
65
|
end
|
62
66
|
end
|
67
|
+
|
63
68
|
Open.write('config', CONFIG)
|
64
69
|
CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
|
65
70
|
|
@@ -95,6 +100,7 @@ EOF
|
|
95
100
|
|
96
101
|
res[name] = segments
|
97
102
|
end
|
103
|
+
res
|
98
104
|
end
|
99
105
|
end
|
100
106
|
|
data/lib/rbbt/ner/patterns.rb
CHANGED
@@ -15,7 +15,6 @@ class PatternRelExt
|
|
15
15
|
segments = sentence.segments
|
16
16
|
segments = segments.values.flatten if Hash === segments
|
17
17
|
Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
|
18
|
-
ppp sentence
|
19
18
|
regexpNER.entities(sentence)
|
20
19
|
end
|
21
20
|
end
|
@@ -5,15 +5,27 @@ require 'rbbt/ner/NER'
|
|
5
5
|
require 'rbbt/segment/token'
|
6
6
|
|
7
7
|
class TokenTrieNER < NER
|
8
|
-
def self.clean(token)
|
8
|
+
def self.clean(token, stem = false)
|
9
9
|
if token.length > 3
|
10
|
-
token
|
10
|
+
upcase = token !~ /[a-z]/
|
11
|
+
token = token.downcase.sub(/-/,'')
|
12
|
+
|
13
|
+
if stem && ! upcase
|
14
|
+
require 'stemmer'
|
15
|
+
if stem == :double
|
16
|
+
token = token.stem.stem
|
17
|
+
else
|
18
|
+
token = token.stem
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
token
|
11
23
|
else
|
12
24
|
token
|
13
25
|
end
|
14
26
|
end
|
15
27
|
|
16
|
-
def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
|
28
|
+
def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
|
17
29
|
if no_clean
|
18
30
|
if extend_to_token
|
19
31
|
Token.setup(token, :offset => start, :original => token)
|
@@ -22,25 +34,25 @@ class TokenTrieNER < NER
|
|
22
34
|
end
|
23
35
|
else
|
24
36
|
if extend_to_token
|
25
|
-
Token.setup(clean(token), :offset => start, :original => token)
|
37
|
+
Token.setup(clean(token, stem), :offset => start, :original => token)
|
26
38
|
else
|
27
|
-
clean(token)
|
39
|
+
clean(token, stem)
|
28
40
|
end
|
29
41
|
end
|
30
42
|
end
|
31
43
|
|
32
|
-
def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
|
44
|
+
def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
|
33
45
|
split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
|
34
46
|
|
35
47
|
tokens = []
|
36
48
|
while matchdata = text.match(split_at)
|
37
|
-
tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
|
38
|
-
tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
|
49
|
+
tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
|
50
|
+
tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
|
39
51
|
start += matchdata.end(0)
|
40
52
|
text = matchdata.post_match
|
41
53
|
end
|
42
54
|
|
43
|
-
tokens << prepare_token(text, start, extend_to_token) unless text.empty?
|
55
|
+
tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
|
44
56
|
|
45
57
|
tokens
|
46
58
|
end
|
@@ -130,7 +142,7 @@ class TokenTrieNER < NER
|
|
130
142
|
index1
|
131
143
|
end
|
132
144
|
|
133
|
-
def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
|
145
|
+
def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
|
134
146
|
|
135
147
|
chunk_size = hash.size / 100
|
136
148
|
items_in_chunk = 0
|
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
|
|
146
158
|
names.each do |name|
|
147
159
|
next if name.empty? or (String === name and name.length < 2)
|
148
160
|
|
149
|
-
tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
|
161
|
+
tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
|
150
162
|
tokens.extend EnumeratedArray
|
151
163
|
|
152
164
|
token_index = index_for_tokens(tokens, code, type, slack)
|
@@ -240,7 +252,7 @@ class TokenTrieNER < NER
|
|
240
252
|
NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
|
241
253
|
end
|
242
254
|
|
243
|
-
attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
|
255
|
+
attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
|
244
256
|
def initialize(type = nil, file = nil, options = {})
|
245
257
|
options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
|
246
258
|
:persist => false
|
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
|
|
248
260
|
@longest_match = options.delete :longest_match
|
249
261
|
@split_at = options.delete :split_at
|
250
262
|
@no_clean = options.delete :no_clean
|
263
|
+
@stem = options.delete :stem
|
251
264
|
|
252
265
|
file = [] if file.nil?
|
253
266
|
file = [file] unless Array === file
|
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
|
|
273
286
|
Log.debug "TokenTrieNER merging TSV"
|
274
287
|
new.with_unnamed do
|
275
288
|
new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
|
276
|
-
TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
|
289
|
+
TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
|
277
290
|
end
|
278
291
|
end
|
279
292
|
when Hash === new
|
@@ -284,14 +297,14 @@ class TokenTrieNER < NER
|
|
284
297
|
new = TSV.open(new, :flat)
|
285
298
|
new.with_unnamed do
|
286
299
|
new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
|
287
|
-
TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
|
300
|
+
TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
|
288
301
|
end
|
289
302
|
end
|
290
303
|
end
|
291
304
|
end
|
292
305
|
|
293
306
|
def match(text)
|
294
|
-
tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
|
307
|
+
tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
|
295
308
|
|
296
309
|
tokens.extend EnumeratedArray
|
297
310
|
tokens.pos = 0
|
@@ -239,6 +239,7 @@ module NLP
|
|
239
239
|
end
|
240
240
|
|
241
241
|
def self.geniass_sentence_splitter(text)
|
242
|
+
Rbbt.software.opt.Geniass.produce
|
242
243
|
offsets = []
|
243
244
|
|
244
245
|
cleaned = text.gsub("\n",NEW_LINE_MASK)
|
@@ -294,7 +295,7 @@ module NLP
|
|
294
295
|
offsets.collect do |s,e|
|
295
296
|
sentence = text[s..e]
|
296
297
|
next if sentence.nil?
|
297
|
-
|
298
|
+
sentence.gsub!(NEW_LINE_MASK, "\n")
|
298
299
|
Segment.setup sentence, s
|
299
300
|
sentence
|
300
301
|
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/segment/annotation'
|
4
|
+
require 'rbbt/util/python'
|
5
|
+
|
6
|
+
module SpaCy
|
7
|
+
|
8
|
+
PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
|
9
|
+
|
10
|
+
def self.tokens(text, lang = 'en')
|
11
|
+
|
12
|
+
tokens = []
|
13
|
+
RbbtPython.run 'spacy' do
|
14
|
+
nlp = spacy.load(lang)
|
15
|
+
doc = nlp.call(text)
|
16
|
+
doc.__len__.times do |i|
|
17
|
+
tokens << doc.__getitem__(i)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
tokens
|
21
|
+
end
|
22
|
+
|
23
|
+
def self.segments(text, lang = 'en')
|
24
|
+
docid = text.docid if Document === text
|
25
|
+
corpus = text.corpus if Document === text
|
26
|
+
tokens = self.tokens(text, lang).collect do |token|
|
27
|
+
info = {}
|
28
|
+
PROPERTIES.each do |p|
|
29
|
+
info[p] = token.instance_eval(p.to_s)
|
30
|
+
end
|
31
|
+
info[:type] = "SpaCy"
|
32
|
+
info[:offset] = token.idx
|
33
|
+
info[:dep] = token.dep_ + "->" + token.head.idx.to_s
|
34
|
+
info[:docid] = docid if docid
|
35
|
+
info[:corpus] = corpus if corpus
|
36
|
+
SpaCyToken.setup(token.text, info)
|
37
|
+
end
|
38
|
+
SpaCyToken.setup(tokens, :corpus => corpus)
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
module SpaCyToken
|
43
|
+
extend Entity
|
44
|
+
include SegmentAnnotation
|
45
|
+
|
46
|
+
self.annotation *SpaCy::PROPERTIES
|
47
|
+
self.annotation :dep
|
48
|
+
end
|
49
|
+
|
50
|
+
if __FILE__ == $0
|
51
|
+
ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
|
52
|
+
end
|
data/lib/rbbt/segment.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'rbbt-util'
|
2
2
|
require 'rbbt/entity'
|
3
|
+
require 'rbbt/document'
|
3
4
|
|
4
5
|
module SegID
|
5
6
|
extend Entity
|
@@ -10,11 +11,11 @@ module SegID
|
|
10
11
|
end
|
11
12
|
|
12
13
|
def range
|
13
|
-
@range ||= Range.new(*_parts.
|
14
|
+
@range ||= Range.new(*_parts[4].split("..").map(&:to_i))
|
14
15
|
end
|
15
16
|
|
16
17
|
def docid
|
17
|
-
@docid ||= _parts[0..3] * ":"
|
18
|
+
@docid ||= DocID.setup(_parts[0..3] * ":")
|
18
19
|
end
|
19
20
|
|
20
21
|
def offset
|
@@ -25,12 +26,13 @@ module SegID
|
|
25
26
|
range.end - range.begin + 1
|
26
27
|
end
|
27
28
|
|
28
|
-
property :segment do
|
29
|
+
property :segment => :single do
|
30
|
+
docid = self.docid
|
29
31
|
document = DocID.setup(docid, :corpus => corpus).document
|
30
32
|
|
31
33
|
text = document[range]
|
32
34
|
|
33
|
-
Segment.setup(text, docid)
|
35
|
+
Segment.setup(text, :docid => docid, :offset => offset)
|
34
36
|
end
|
35
37
|
|
36
38
|
property :segid do
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'rbbt-util'
|
2
|
-
require 'rbbt/entity'
|
3
2
|
require 'rbbt/segment'
|
3
|
+
require 'rbbt/entity'
|
4
4
|
|
5
5
|
module AnnotID
|
6
6
|
extend Entity
|
@@ -32,7 +32,7 @@ end
|
|
32
32
|
|
33
33
|
module SegmentAnnotation
|
34
34
|
extend Entity
|
35
|
-
include Segment
|
35
|
+
include Object::Segment
|
36
36
|
self.annotation :type
|
37
37
|
|
38
38
|
property :segid do
|
@@ -47,7 +47,7 @@ module SegmentAnnotation
|
|
47
47
|
end
|
48
48
|
|
49
49
|
property :annotid do |corpus=nil|
|
50
|
-
AnnotID.setup([segid, type] * ":", :corpus => corpus)
|
50
|
+
AnnotID.setup([segid, type, Misc.obj2digest(self.info)] * ":", :corpus => corpus)
|
51
51
|
end
|
52
52
|
|
53
53
|
alias id annotid
|
@@ -69,8 +69,8 @@ module Transformed
|
|
69
69
|
segments = [segments] unless Array === segments
|
70
70
|
orig_length = self.length
|
71
71
|
|
72
|
-
offset = self.respond_to?(:offset) ? self.offset : 0
|
73
|
-
segments = segments.select{|s| s.offset >= offset && s.offset <= offset + self.length - 1 }
|
72
|
+
offset = self.respond_to?(:offset) ? self.offset.to_i : 0
|
73
|
+
segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
|
74
74
|
|
75
75
|
Segment.clean_sort(segments).each do |segment|
|
76
76
|
next if segment.offset.nil?
|
@@ -36,7 +36,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
36
36
|
Document.setup(text, "TEST", "test_doc1", nil)
|
37
37
|
|
38
38
|
corpus = {}
|
39
|
-
|
39
|
+
Document::Corpus.setup corpus
|
40
40
|
|
41
41
|
corpus.add_document(text)
|
42
42
|
|
@@ -50,7 +50,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
50
50
|
Document.setup(text2, "TEST", "test_doc2", nil)
|
51
51
|
|
52
52
|
corpus = {}
|
53
|
-
|
53
|
+
Document::Corpus.setup corpus
|
54
54
|
|
55
55
|
corpus.add_document(text1)
|
56
56
|
corpus.add_document(text2)
|
@@ -68,7 +68,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
68
68
|
Document.setup(text, "TEST", "test_doc1", nil)
|
69
69
|
|
70
70
|
corpus = {}
|
71
|
-
|
71
|
+
Document::Corpus.setup corpus
|
72
72
|
|
73
73
|
corpus.add_document(text)
|
74
74
|
|
@@ -95,7 +95,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
95
95
|
Document.setup(text, "TEST", "test_doc1", nil)
|
96
96
|
|
97
97
|
corpus = {}
|
98
|
-
|
98
|
+
Document::Corpus.setup corpus
|
99
99
|
|
100
100
|
corpus.add_document(text)
|
101
101
|
|
@@ -122,7 +122,7 @@ class TestAnnotation < Test::Unit::TestCase
|
|
122
122
|
Document.setup(text, "TEST", "test_doc1", nil)
|
123
123
|
|
124
124
|
corpus = {}
|
125
|
-
|
125
|
+
Document::Corpus.setup corpus
|
126
126
|
|
127
127
|
corpus.add_document(text)
|
128
128
|
|
@@ -5,12 +5,17 @@ Log.severity = 0
|
|
5
5
|
class TestGNormPlus < Test::Unit::TestCase
|
6
6
|
def test_match
|
7
7
|
text =<<-EOF
|
8
|
-
|
8
|
+
|
9
|
+
Introduction
|
10
|
+
|
11
|
+
We found that TP53 is regulated by MDM2 in Homo
|
12
|
+
sapiens
|
9
13
|
EOF
|
10
14
|
|
11
15
|
mentions = GNormPlus.process({:file => text})
|
16
|
+
|
12
17
|
assert_equal 1, mentions.length
|
13
|
-
assert_equal
|
18
|
+
assert_equal 3, mentions["file"].length
|
14
19
|
end
|
15
20
|
|
16
21
|
def test_entities
|
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
|
|
19
24
|
EOF
|
20
25
|
|
21
26
|
mentions = GNormPlus.entities({:file => text})
|
22
|
-
mentions["file"].include?
|
27
|
+
assert mentions["file"].include?("TP53")
|
28
|
+
mentions["file"].each do |mention|
|
29
|
+
assert_equal mention, text[mention.range].sub("\n", ' ')
|
30
|
+
end
|
23
31
|
end
|
24
32
|
end
|
25
33
|
|
@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
|
|
7
7
|
This is a sentence.
|
8
8
|
A funky character ™ in a sentence.
|
9
9
|
This is a sentence.
|
10
|
-
This is a
|
10
|
+
This is a broken
|
11
11
|
sentence. This is
|
12
|
-
another sentence.
|
12
|
+
another broken sentence.
|
13
13
|
EOF
|
14
14
|
|
15
|
-
|
15
|
+
iii NLP.geniass_sentence_splitter(text)
|
16
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
16
17
|
end
|
17
18
|
|
19
|
+
def test_sentences_2
|
20
|
+
text =<<-EOF
|
21
|
+
This is a sentence.
|
22
|
+
This is a sentence.
|
23
|
+
This is a broken
|
24
|
+
sentence. This is
|
25
|
+
another broken sentence.
|
26
|
+
EOF
|
27
|
+
|
28
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_sentences_ext
|
32
|
+
text =<<-EOF
|
33
|
+
This is a sentence.
|
34
|
+
This is a sentence.
|
35
|
+
This is a broken
|
36
|
+
sentence. This is
|
37
|
+
another broken sentence.
|
38
|
+
EOF
|
39
|
+
|
40
|
+
assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
|
41
|
+
end
|
18
42
|
end
|
19
43
|
|
@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
|
|
12
12
|
segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
|
13
13
|
annotation = SegmentAnnotation.setup(segment, :type => :verb)
|
14
14
|
|
15
|
-
assert_equal 'verb', annotation.annotid.split(":")
|
15
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
16
16
|
|
17
17
|
annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
|
18
|
-
assert_equal 'verb', annotation.annotid.split(":")
|
18
|
+
assert_equal 'verb', annotation.annotid.split(":")[5]
|
19
19
|
end
|
20
20
|
|
21
21
|
def test_annotid
|
22
22
|
text = "This is a document"
|
23
23
|
Document.setup(text, "TEST", "test_doc1", nil)
|
24
24
|
|
25
|
-
corpus = {}
|
26
|
-
corpus.extend Document::Corpus
|
25
|
+
corpus = Document::Corpus.setup({})
|
27
26
|
|
28
27
|
corpus.add_document(text)
|
29
28
|
|
@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
|
|
2
2
|
require 'rbbt/segment/encoding'
|
3
3
|
|
4
4
|
class TestEncoding < Test::Unit::TestCase
|
5
|
-
def
|
5
|
+
def test_bad_chars
|
6
6
|
text = "A funky character ™ in a sentence."
|
7
7
|
|
8
8
|
assert_equal ["™"], Segment.bad_chars(text)
|
@@ -22,12 +22,13 @@ class TestClass < Test::Unit::TestCase
|
|
22
22
|
assert_equal "SCORE", a.score
|
23
23
|
end
|
24
24
|
|
25
|
-
def
|
25
|
+
def test_tsv
|
26
26
|
a = "test"
|
27
27
|
NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
|
28
|
-
assert
|
29
|
-
assert
|
30
|
-
assert
|
28
|
+
assert Annotated.tsv([a]).fields.include? "code"
|
29
|
+
assert Annotated.tsv([a], nil).fields.include? "code"
|
30
|
+
assert Annotated.tsv([a], :all).fields.include? "code"
|
31
|
+
assert Annotated.tsv([a], :all).fields.include? "literal"
|
31
32
|
end
|
32
33
|
|
33
34
|
def __test_segment_brat
|
@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
|
|
9
9
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
10
10
|
Document.setup(text, "TEST", "test_doc1", nil)
|
11
11
|
|
12
|
-
corpus = {}
|
13
|
-
corpus.extend Document::Corpus
|
12
|
+
corpus = Document::Corpus.setup({})
|
14
13
|
|
15
14
|
corpus.add_document(text)
|
16
15
|
|
data/test/rbbt/test_segment.rb
CHANGED
@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
|
|
17
17
|
text = "This is a document"
|
18
18
|
Document.setup(text, "TEST", "test_doc1", nil)
|
19
19
|
|
20
|
-
corpus = {}
|
21
|
-
corpus.extend Document::Corpus
|
20
|
+
corpus = Document::Corpus.setup({})
|
22
21
|
|
23
22
|
corpus.add_document(text)
|
24
23
|
|
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
|
|
41
40
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
42
41
|
Document.setup(text, "TEST", "test_doc1", nil)
|
43
42
|
|
44
|
-
corpus = {}
|
45
|
-
corpus.extend Document::Corpus
|
43
|
+
corpus = Document::Corpus.setup({})
|
46
44
|
|
47
45
|
corpus.add_document(text)
|
48
46
|
|
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
|
|
65
63
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
66
64
|
Document.setup(text, "TEST", "test_doc1", nil)
|
67
65
|
|
68
|
-
corpus = {}
|
69
|
-
corpus.extend Document::Corpus
|
66
|
+
corpus = Document::Corpus.setup({})
|
70
67
|
|
71
68
|
corpus.add_document(text)
|
72
69
|
|
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
|
|
94
91
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
95
92
|
Document.setup(text, "TEST", "test_doc1", nil)
|
96
93
|
|
97
|
-
corpus = {}
|
98
|
-
corpus.extend Document::Corpus
|
94
|
+
corpus = Document::Corpus.setup({})
|
99
95
|
|
100
96
|
corpus.add_document(text)
|
101
97
|
|
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
|
|
142
138
|
text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
|
143
139
|
Document.setup(text, "TEST", "test_doc1", nil)
|
144
140
|
|
145
|
-
corpus = {}
|
146
|
-
corpus.extend Document::Corpus
|
141
|
+
corpus = Document::Corpus.setup({})
|
147
142
|
|
148
143
|
corpus.add_document(text)
|
149
144
|
|
data/test/test_spaCy.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
|
2
|
+
require 'rbbt/nlp/spaCy'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
|
5
|
+
class TestSpaCy < Test::Unit::TestCase
|
6
|
+
def _test_tokens
|
7
|
+
text = "I tell a story"
|
8
|
+
|
9
|
+
tokens = SpaCy.tokens(text)
|
10
|
+
|
11
|
+
assert_equal 4, tokens.length
|
12
|
+
assert_equal "tell", tokens[1].to_s
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_segments
|
16
|
+
text = "I tell a story. It's a very good story."
|
17
|
+
|
18
|
+
corpus = Document::Corpus.setup({})
|
19
|
+
|
20
|
+
Document.setup(text, "TEST", "test_doc1", "simple_sentence")
|
21
|
+
|
22
|
+
corpus.add_document text
|
23
|
+
text.corpus = corpus
|
24
|
+
|
25
|
+
segments = SpaCy.segments(text)
|
26
|
+
|
27
|
+
segments.each do |segment|
|
28
|
+
assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.3.
|
4
|
+
version: 1.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-05-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|
@@ -102,12 +102,14 @@ files:
|
|
102
102
|
- lib/rbbt/nlp/genia/sentence_splitter.rb
|
103
103
|
- lib/rbbt/nlp/nlp.rb
|
104
104
|
- lib/rbbt/nlp/open_nlp/sentence_splitter.rb
|
105
|
+
- lib/rbbt/nlp/spaCy.rb
|
105
106
|
- lib/rbbt/segment.rb
|
106
107
|
- lib/rbbt/segment/annotation.rb
|
107
108
|
- lib/rbbt/segment/encoding.rb
|
108
109
|
- lib/rbbt/segment/named_entity.rb
|
109
110
|
- lib/rbbt/segment/overlaps.rb
|
110
111
|
- lib/rbbt/segment/range_index.rb
|
112
|
+
- lib/rbbt/segment/relationship.rb
|
111
113
|
- lib/rbbt/segment/segmented.rb
|
112
114
|
- lib/rbbt/segment/token.rb
|
113
115
|
- lib/rbbt/segment/transformed.rb
|
@@ -161,6 +163,7 @@ files:
|
|
161
163
|
- test/rbbt/test_document.rb
|
162
164
|
- test/rbbt/test_segment.rb
|
163
165
|
- test/test_helper.rb
|
166
|
+
- test/test_spaCy.rb
|
164
167
|
homepage: http://github.com/mikisvaz/rbbt-util
|
165
168
|
licenses: []
|
166
169
|
metadata: {}
|
@@ -217,4 +220,5 @@ test_files:
|
|
217
220
|
- test/rbbt/segment/test_encoding.rb
|
218
221
|
- test/rbbt/segment/test_range_index.rb
|
219
222
|
- test/rbbt/segment/test_corpus.rb
|
223
|
+
- test/test_spaCy.rb
|
220
224
|
- test/test_helper.rb
|