rbbt-text 1.3.1 → 1.3.2
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +0 -2
- data/lib/rbbt/bow/dictionary.rb +2 -2
- data/lib/rbbt/document.rb +15 -5
- data/lib/rbbt/document/annotation.rb +5 -2
- data/lib/rbbt/document/corpus.rb +26 -3
- data/lib/rbbt/document/corpus/pubmed.rb +1 -1
- data/lib/rbbt/ner/g_norm_plus.rb +7 -1
- data/lib/rbbt/ner/patterns.rb +0 -1
- data/lib/rbbt/ner/token_trieNER.rb +28 -15
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +6 -4
- data/lib/rbbt/segment/annotation.rb +3 -3
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/segment/transformed.rb +2 -2
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/test_annotation.rb +5 -5
- data/test/rbbt/document/test_corpus.rb +1 -1
- data/test/rbbt/ner/test_g_norm_plus.rb +11 -3
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +27 -3
- data/test/rbbt/segment/test_annotation.rb +3 -4
- data/test/rbbt/segment/test_encoding.rb +1 -1
- data/test/rbbt/segment/test_named_entity.rb +5 -4
- data/test/rbbt/segment/test_range_index.rb +1 -2
- data/test/rbbt/test_segment.rb +5 -10
- data/test/test_spaCy.rb +32 -0
- metadata +6 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 05b1cf1981e955652598dd3db811cf8e6a7d64b68535e21834012abe90efe388
+  data.tar.gz: 67017f8a10cbfae51664999218336d638ea6be7c29b5ec305872473672977a41
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 03b02dcea1040edfa653e976d9f2f808ed25f9e0164add2fc85afa4417cf8e10ff8dfb27e1927c457f0ff6c6ee90311765ac364b7c5d7c8d9fd51cfff4ab9434
+  data.tar.gz: a5c44f475241da67863ac33ea446e7dbc64283ca53d6642c7c67c1c6e2e34a5d28b1ad678d2a5a44bf3316ed3c063f2d084628b8ff4d79aee8be04db3f8a6ab1
data/lib/rbbt/bow/bow.rb
CHANGED
data/lib/rbbt/bow/dictionary.rb
CHANGED

@@ -95,7 +95,7 @@ class Dictionary::TF_IDF
       }
 
       if limit
-        Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
+        Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
       else
         Hash[*best.flatten]
       end
@@ -177,7 +177,7 @@ class Dictionary::KL
       best[term] = pos * Math::log(pos / neg) + neg * Math::log(neg / pos)
     }
     if limit
-      Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit).flatten]
+      Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit-1).flatten]
     else
       best
     end
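A note on this change: `Array#slice(0, n)` keeps `n` elements, so switching to `limit-1` makes both the TF-IDF and KL trimming return one entry fewer than `limit`. A minimal sketch of the idiom, with made-up scores:

    best = { "gene" => 0.9, "cell" => 0.7, "assay" => 0.4 }
    limit = 2

    # sort (term, score) pairs by descending score, keep limit-1 of them, rebuild the Hash
    Hash[*best.sort{|a,b| b[1] <=> a[1]}.slice(0, limit - 1).flatten]
    # => {"gene"=>0.9}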
data/lib/rbbt/document.rb
CHANGED

@@ -1,6 +1,5 @@
 require 'rbbt-util'
 require 'rbbt/entity'
-require 'rbbt/document/annotation'
 
 module DocID
   extend Entity
@@ -19,10 +18,21 @@ module DocID
     DocID.setup([namespace, code, "title"] * ":", :corpus => corpus)
   end
 
-
-
-
-
+  property :document => :both do
+    if Array === self
+      namespace, id, type = nil, nil, nil
+      docs = self.collect do |docid|
+        text = self.corpus[docid]
+        namespace, id, type = docid.split(":")
+        #Document.setup(text, namespace, id, type, :corpus => corpus)
+        text
+      end
+      Document.setup(docs, :corpus => corpus)
+    else
+      text = self.corpus[self]
+      namespace, id, type = self.split(":")
+      Document.setup(text, :namespace => namespace, :code => id, :type => type, :corpus => corpus)
+    end
   end
 end
 
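The new `:document` property resolves a DocID, or an array of them, back to text by looking each id up in the corpus and wrapping the result with `Document.setup`. A minimal usage sketch, following the setup used in the tests further down:

    require 'rbbt/document'
    require 'rbbt/document/corpus'

    corpus = Document::Corpus.setup({})

    text = "This is a document"
    Document.setup(text, "TEST", "test_doc1", nil)
    corpus.add_document(text)

    docid = DocID.setup(text.docid, :corpus => corpus)
    docid.document   # fetches the text from the corpus and sets it up as a Document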
data/lib/rbbt/document/annotation.rb
CHANGED

@@ -1,3 +1,4 @@
+require 'rbbt/segment'
 require 'rbbt/segment/annotation'
 
 module Document
@@ -22,17 +23,19 @@ module Document
     send :property, type => :multiple do |list|
       doc_segments = self.instance_exec list, &block
 
-      doc_segments = doc_segments.chunked_values_at(
+      doc_segments = doc_segments.chunked_values_at(list) if Hash === doc_segments
 
       doc_segments.each_with_index do |segments,i|
+        next if segments.nil?
         document = list[i]
-        Segment.align(document, segments) unless segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
+        Segment.align(document, segments) unless segments.nil? || segments.empty? || (Segment === segments && segments.offset) || (Segment === segments.first && segments.first.offset)
 
         segments.each do |segment|
           SegmentAnnotation.setup(segment, :type => type.to_s) unless SegmentAnnotation === segment && segment.type
         end
 
         docid = document.docid
+
         segments.each{|s| s.docid = docid if s.docid.nil? }
 
         segments
data/lib/rbbt/document/corpus.rb
CHANGED

@@ -3,17 +3,40 @@ require 'rbbt-util'
 module Document::Corpus
 
   def self.setup(corpus)
-    corpus.extend Document::Corpus
+    corpus.extend Document::Corpus unless Document::Corpus === corpus
+    corpus.extend Persist::TSVAdapter unless Persist::TSVAdapter === corpus
+    corpus
   end
 
   def add_document(document)
-
+    docid = document.docid
+    return document if self.include?(docid)
+    self.write_and_close do
+      self[docid] = document
+    end
+  end
+
+  def docids(prefix)
+    prefix += ":" unless prefix[-1] == ":"
+    docids = self.read_and_close do
+      self.prefix(prefix)
+    end
+    DocID.setup(docids, :corpus => self)
+  end
+
+  def documents(prefix)
+    self.docids(prefix).document
   end
 
   def [](*args)
     docid, *rest = args
-
+
+    res = self.read_and_close do
+      super(*args)
+    end
+
     return res if args.length > 1
+
     namespace, id, type = docid.split(":")
 
     if res.nil?
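Taken together, the corpus now acts as a small document store: `setup` layers the module (plus `Persist::TSVAdapter`) over a hash-like backend, `add_document` keys each text by its docid, and `docids`/`documents` answer prefix queries. A sketch in the style of the tests; a plain Hash backend is what the tests use, while the prefix queries assume a backend that implements `prefix`, as the persisted adapters do:

    corpus = Document::Corpus.setup({})

    text = "This sentence mentions the TP53 gene"
    Document.setup(text, "TEST", "test_doc1", nil)
    corpus.add_document(text)

    corpus.docids("TEST")      # DocID list for ids starting with "TEST:"
    corpus.documents("TEST")   # the corresponding documents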
data/lib/rbbt/document/corpus/pubmed.rb
CHANGED

@@ -6,7 +6,6 @@ module Document::Corpus
     type = nil if String === type and type.empty?
 
     res = PubMed.get_article(pmids).collect do |pmid, article|
-      Log.debug "Loading pmid #{pmid}"
       document = if type.nil? || type.to_sym == :abstract
                    Document.setup(article.abstract || "", "PMID", pmid, :abstract, self, :corpus => self)
                  elsif type.to_sym == :title
@@ -15,6 +14,7 @@ module Document::Corpus
                    raise "No FullText available for #{ pmid }" if article.full_text.nil?
                    Document.setup(article.full_text, :PMID, pmid, :fulltext, self, :corpus => self)
                  end
+      Log.debug "Loading pmid #{pmid}"
       add_document(document)
     end
 
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED

@@ -55,11 +55,16 @@ EOF
     Open.mkdir 'tmp'
 
     texts.each do |name,text|
+      text = Misc.fixutf8(text)
+
+      text = text.gsub('|', '#').gsub("\n", " ").gsub(/\t/,' ')
+
       Open.write("input/#{name}.txt") do |f|
-        f.puts "#{name}|a|" << text
+        f.puts "#{name}|a|" << text
         f.puts
       end
     end
+
     Open.write('config', CONFIG)
     CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
 
@@ -95,6 +100,7 @@ EOF
 
       res[name] = segments
     end
+    res
   end
 end
 
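For context on the sanitization above: the writer emits one `name|a|text` record per document (the `f.puts "#{name}|a|" << text` line), so stray pipes, newlines, or tabs inside the text would corrupt the record structure that GNormPlus parses. The effect of the substitution chain on a made-up string:

    text = "TP53|MDM2\tregulation\nin Homo sapiens"
    text.gsub('|', '#').gsub("\n", " ").gsub(/\t/, ' ')
    # => "TP53#MDM2 regulation in Homo sapiens"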
data/lib/rbbt/ner/patterns.rb
CHANGED

@@ -15,7 +15,6 @@ class PatternRelExt
     segments = sentence.segments
     segments = segments.values.flatten if Hash === segments
     Transformed.with_transform(sentence, segments, Proc.new{|s| s.entity_type.to_s.upcase}) do |sentence|
-      ppp sentence
       regexpNER.entities(sentence)
     end
   end
data/lib/rbbt/ner/token_trieNER.rb
CHANGED

@@ -5,15 +5,27 @@ require 'rbbt/ner/NER'
 require 'rbbt/segment/token'
 
 class TokenTrieNER < NER
-  def self.clean(token)
+  def self.clean(token, stem = false)
     if token.length > 3
-      token
+      upcase = token !~ /[a-z]/
+      token = token.downcase.sub(/-/,'')
+
+      if stem && ! upcase
+        require 'stemmer'
+        if stem == :double
+          token = token.stem.stem
+        else
+          token = token.stem
+        end
+      end
+
+      token
     else
       token
     end
   end
 
-  def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
+  def self.prepare_token(token, start, extend_to_token = true, no_clean = false, stem = false)
     if no_clean
       if extend_to_token
         Token.setup(token, :offset => start, :original => token)
@@ -22,25 +34,25 @@ class TokenTrieNER < NER
       end
     else
       if extend_to_token
-        Token.setup(clean(token), :offset => start, :original => token)
+        Token.setup(clean(token, stem), :offset => start, :original => token)
       else
-        clean(token)
+        clean(token, stem)
       end
     end
   end
 
-  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, start = 0)
+  def self.tokenize(text, extend_to_token = true, split_at = nil, no_clean = false, stem = false, start = 0)
     split_at = /\s|(\(|\)|[-."':,])/ if split_at.nil?
 
     tokens = []
     while matchdata = text.match(split_at)
-      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean) unless matchdata.pre_match.empty?
-      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      tokens << prepare_token(matchdata.pre_match, start, extend_to_token, no_clean, stem) unless matchdata.pre_match.empty?
+      tokens << prepare_token(matchdata.captures.first, start + matchdata.begin(1), extend_to_token, no_clean, stem) if matchdata.captures.any? and not matchdata.captures.first.empty?
       start += matchdata.end(0)
       text = matchdata.post_match
     end
 
-    tokens << prepare_token(text, start, extend_to_token) unless text.empty?
+    tokens << prepare_token(text, start, extend_to_token, no_clean, stem) unless text.empty?
 
     tokens
   end
@@ -130,7 +142,7 @@ class TokenTrieNER < NER
     index1
   end
 
-  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false)
+  def self.process(index, hash, type = nil, slack = nil, split_at = nil, no_clean = false, stem = false)
 
     chunk_size = hash.size / 100
     items_in_chunk = 0
@@ -146,7 +158,7 @@ class TokenTrieNER < NER
     names.each do |name|
       next if name.empty? or (String === name and name.length < 2)
 
-      tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
+      tokens = Array === name ? name : tokenize(name, false, split_at, no_clean, stem)
       tokens.extend EnumeratedArray
 
       token_index = index_for_tokens(tokens, code, type, slack)
@@ -240,7 +252,7 @@ class TokenTrieNER < NER
       NamedEntity.setup(match, :offset => match_tokens.first.offset, :entity_type => type, :code => codes)
     end
 
-    attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
+    attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean, :stem
     def initialize(type = nil, file = nil, options = {})
       options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
         :persist => false
@@ -248,6 +260,7 @@ class TokenTrieNER < NER
       @longest_match = options.delete :longest_match
       @split_at = options.delete :split_at
       @no_clean = options.delete :no_clean
+      @stem = options.delete :stem
 
       file = [] if file.nil?
       file = [file] unless Array === file
@@ -273,7 +286,7 @@ class TokenTrieNER < NER
         Log.debug "TokenTrieNER merging TSV"
         new.with_unnamed do
           new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
           end
         end
       when Hash === new
@@ -284,14 +297,14 @@ class TokenTrieNER < NER
         new = TSV.open(new, :flat)
         new.with_unnamed do
          new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
-            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
+            TokenTrieNER.process(@index, new, type, slack, split_at, no_clean, stem)
          end
        end
      end
    end
 
   def match(text)
-    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean)
+    tokens = Array === text ? text : TokenTrieNER.tokenize(text, true, split_at, no_clean, stem)
 
     tokens.extend EnumeratedArray
     tokens.pos = 0
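The `:stem` option threads from the constructor through `process`, `tokenize`, and `prepare_token` down to `clean`, so dictionary names and query text are reduced the same way before trie lookup; tokens of three characters or fewer, and all-uppercase tokens, are left unstemmed, and `:double` stems twice. A hedged usage sketch (the names TSV is hypothetical, and stemming requires the `stemmer` gem):

    require 'rbbt/ner/token_trieNER'

    ner = TokenTrieNER.new("Genes", 'gene_names.tsv', :stem => true)
    matches = ner.match("This sentence mentions the TP53 gene")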
data/lib/rbbt/nlp/genia/sentence_splitter.rb
CHANGED

@@ -239,6 +239,7 @@ module NLP
   end
 
   def self.geniass_sentence_splitter(text)
+    Rbbt.software.opt.Geniass.produce
     offsets = []
 
     cleaned = text.gsub("\n",NEW_LINE_MASK)
@@ -294,7 +295,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-
+      sentence.gsub!(NEW_LINE_MASK, "\n")
       Segment.setup sentence, s
       sentence
     end
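Two behavior fixes here: the splitter now produces the Geniass software on demand before splitting, and the newline mask applied to the input is undone on each returned sentence, so segments match the original text again. Mirroring the updated tests:

    sentences = NLP.geniass_sentence_splitter("This is a broken\nsentence. This is another sentence.")
    sentences.each{|s| s.offset }   # each sentence is a Segment carrying its offset into the text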
data/lib/rbbt/nlp/spaCy.rb
ADDED

@@ -0,0 +1,52 @@
+require 'rbbt/segment'
+require 'rbbt/document'
+require 'rbbt/segment/annotation'
+require 'rbbt/util/python'
+
+module SpaCy
+
+  PROPERTIES = %w(lemma_ is_punct is_space shape_ pos_ tag_)
+
+  def self.tokens(text, lang = 'en')
+
+    tokens = []
+    RbbtPython.run 'spacy' do
+      nlp = spacy.load(lang)
+      doc = nlp.call(text)
+      doc.__len__.times do |i|
+        tokens << doc.__getitem__(i)
+      end
+    end
+    tokens
+  end
+
+  def self.segments(text, lang = 'en')
+    docid = text.docid if Document === text
+    corpus = text.corpus if Document === text
+    tokens = self.tokens(text, lang).collect do |token|
+      info = {}
+      PROPERTIES.each do |p|
+        info[p] = token.instance_eval(p.to_s)
+      end
+      info[:type] = "SpaCy"
+      info[:offset] = token.idx
+      info[:dep] = token.dep_ + "->" + token.head.idx.to_s
+      info[:docid] = docid if docid
+      info[:corpus] = corpus if corpus
+      SpaCyToken.setup(token.text, info)
+    end
+    SpaCyToken.setup(tokens, :corpus => corpus)
+  end
+end
+
+module SpaCyToken
+  extend Entity
+  include SegmentAnnotation
+
+  self.annotation *SpaCy::PROPERTIES
+  self.annotation :dep
+end
+
+if __FILE__ == $0
+  ppp Annotated.tsv(SpaCy.segments("I tell a story"), :all)
+end
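The new module drives spaCy through RbbtPython and wraps every token as a SegmentAnnotation, so spaCy output round-trips through segment ids like any other annotation. A sketch following the bundled test; it assumes a Python environment where `spacy.load('en')` succeeds:

    require 'rbbt/nlp/spaCy'
    require 'rbbt/document/corpus'

    corpus = Document::Corpus.setup({})

    text = "I tell a story. It's a very good story."
    Document.setup(text, "TEST", "test_doc1", "simple_sentence")
    corpus.add_document text
    text.corpus = corpus

    tokens = SpaCy.segments(text)
    tokens.first.offset   # character offset of the token in the document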
data/lib/rbbt/segment.rb
CHANGED

@@ -1,5 +1,6 @@
 require 'rbbt-util'
 require 'rbbt/entity'
+require 'rbbt/document'
 
 module SegID
   extend Entity
@@ -10,11 +11,11 @@ module SegID
   end
 
   def range
-    @range ||= Range.new(*_parts.
+    @range ||= Range.new(*_parts[4].split("..").map(&:to_i))
   end
 
   def docid
-    @docid ||= _parts[0..3] * ":"
+    @docid ||= DocID.setup(_parts[0..3] * ":")
   end
 
   def offset
@@ -25,12 +26,13 @@ module SegID
     range.end - range.begin + 1
   end
 
-  property :segment do
+  property :segment => :single do
+    docid = self.docid
     document = DocID.setup(docid, :corpus => corpus).document
 
     text = document[range]
 
-    Segment.setup(text, docid)
+    Segment.setup(text, :docid => docid, :offset => offset)
   end
 
   property :segid do
data/lib/rbbt/segment/annotation.rb
CHANGED

@@ -1,6 +1,6 @@
 require 'rbbt-util'
-require 'rbbt/entity'
 require 'rbbt/segment'
+require 'rbbt/entity'
 
 module AnnotID
   extend Entity
@@ -32,7 +32,7 @@ end
 
 module SegmentAnnotation
   extend Entity
-  include Segment
+  include Object::Segment
   self.annotation :type
 
   property :segid do
@@ -47,7 +47,7 @@ module SegmentAnnotation
   end
 
   property :annotid do |corpus=nil|
-    AnnotID.setup([segid, type] * ":", :corpus => corpus)
+    AnnotID.setup([segid, type, Misc.obj2digest(self.info)] * ":", :corpus => corpus)
   end
 
   alias id annotid
data/lib/rbbt/segment/transformed.rb
CHANGED

@@ -69,8 +69,8 @@ module Transformed
     segments = [segments] unless Array === segments
     orig_length = self.length
 
-    offset = self.respond_to?(:offset) ? self.offset : 0
-    segments = segments.select{|s| s.offset >= offset && s.offset <= offset + self.length - 1 }
+    offset = self.respond_to?(:offset) ? self.offset.to_i : 0
+    segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
 
     Segment.clean_sort(segments).each do |segment|
       next if segment.offset.nil?
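The `to_i` coercion makes the range check tolerant of offsets stored as strings or left unset, since `nil.to_i` is 0 and `"27".to_i` is 27; the previous direct comparison could raise when a segment's offset was nil. In short:

    nil.to_i          # => 0
    "27".to_i >= 0    # => true: string offsets now compare safely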
data/test/rbbt/document/test_annotation.rb
CHANGED

@@ -36,7 +36,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
 
     corpus = {}
-
+    Document::Corpus.setup corpus
 
     corpus.add_document(text)
 
@@ -50,7 +50,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text2, "TEST", "test_doc2", nil)
 
     corpus = {}
-
+    Document::Corpus.setup corpus
 
     corpus.add_document(text1)
     corpus.add_document(text2)
@@ -68,7 +68,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
 
     corpus = {}
-
+    Document::Corpus.setup corpus
 
     corpus.add_document(text)
 
@@ -95,7 +95,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
 
     corpus = {}
-
+    Document::Corpus.setup corpus
 
     corpus.add_document(text)
 
@@ -122,7 +122,7 @@ class TestAnnotation < Test::Unit::TestCase
     Document.setup(text, "TEST", "test_doc1", nil)
 
     corpus = {}
-
+    Document::Corpus.setup corpus
 
     corpus.add_document(text)
 
data/test/rbbt/ner/test_g_norm_plus.rb
CHANGED

@@ -5,12 +5,17 @@ Log.severity = 0
 class TestGNormPlus < Test::Unit::TestCase
   def test_match
     text =<<-EOF
-
+
+Introduction
+
+We found that TP53 is regulated by MDM2 in Homo
+sapiens
     EOF
 
     mentions = GNormPlus.process({:file => text})
+
     assert_equal 1, mentions.length
-    assert_equal
+    assert_equal 3, mentions["file"].length
   end
 
   def test_entities
@@ -19,7 +24,10 @@ We found that TP53 is regulated by MDM2 in Homo sapiens
     EOF
 
     mentions = GNormPlus.entities({:file => text})
-    mentions["file"].include?
+    assert mentions["file"].include?("TP53")
+    mentions["file"].each do |mention|
+      assert_equal mention, text[mention.range].sub("\n", ' ')
+    end
   end
 end
 
data/test/rbbt/nlp/genia/test_sentence_splitter.rb
CHANGED

@@ -7,13 +7,37 @@ class TestNLP < Test::Unit::TestCase
 This is a sentence.
 A funky character ™ in a sentence.
 This is a sentence.
-This is a
+This is a broken
 sentence. This is
-another sentence.
+another broken sentence.
     EOF
 
-
+    iii NLP.geniass_sentence_splitter(text)
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
   end
 
+  def test_sentences_2
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+    EOF
+
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter(text)[2].strip
+  end
+
+  def test_sentences_ext
+    text =<<-EOF
+This is a sentence.
+This is a sentence.
+This is a broken
+sentence. This is
+another broken sentence.
+    EOF
+
+    assert_equal "This is a broken\nsentence.", NLP.geniass_sentence_splitter_extension(text)[2].strip
+  end
 end
 
data/test/rbbt/segment/test_annotation.rb
CHANGED

@@ -12,18 +12,17 @@ class TestAnnotation < Test::Unit::TestCase
     segment = Segment.setup("is", :offset => text.index("is"), :docid => text.docid)
     annotation = SegmentAnnotation.setup(segment, :type => :verb)
 
-    assert_equal 'verb', annotation.annotid.split(":")
+    assert_equal 'verb', annotation.annotid.split(":")[5]
 
     annotation = SegmentAnnotation.setup(segment.segid, :type => :verb)
-    assert_equal 'verb', annotation.annotid.split(":")
+    assert_equal 'verb', annotation.annotid.split(":")[5]
   end
 
   def test_annotid
     text = "This is a document"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
data/test/rbbt/segment/test_encoding.rb
CHANGED

@@ -2,7 +2,7 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helpe
 require 'rbbt/segment/encoding'
 
 class TestEncoding < Test::Unit::TestCase
-  def
+  def test_bad_chars
     text = "A funky character ™ in a sentence."
 
     assert_equal ["™"], Segment.bad_chars(text)
data/test/rbbt/segment/test_named_entity.rb
CHANGED

@@ -22,12 +22,13 @@ class TestClass < Test::Unit::TestCase
     assert_equal "SCORE", a.score
   end
 
-  def
+  def test_tsv
     a = "test"
     NamedEntity.setup a, 10, "TYPE", "CODE", "SCORE"
-    assert
-    assert
-    assert
+    assert Annotated.tsv([a]).fields.include? "code"
+    assert Annotated.tsv([a], nil).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "code"
+    assert Annotated.tsv([a], :all).fields.include? "literal"
   end
 
   def __test_segment_brat
data/test/rbbt/segment/test_range_index.rb
CHANGED

@@ -9,8 +9,7 @@ class TestRangeIndex < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
data/test/rbbt/test_segment.rb
CHANGED

@@ -17,8 +17,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This is a document"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -41,8 +40,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -65,8 +63,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -94,8 +91,7 @@ class TestSegment < Test::Unit::TestCase
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
@@ -142,8 +138,7 @@ Atypical teratoid/rhabdoid tumors (AT/RTs) are highly aggressive brain tumors of
     text = "This sentence mentions the TP53 gene and the CDK5R1 protein"
     Document.setup(text, "TEST", "test_doc1", nil)
 
-    corpus = {}
-    corpus.extend Document::Corpus
+    corpus = Document::Corpus.setup({})
 
     corpus.add_document(text)
 
data/test/test_spaCy.rb
ADDED

@@ -0,0 +1,32 @@
+require File.join(File.expand_path(File.dirname(__FILE__)), '', 'test_helper.rb')
+require 'rbbt/nlp/spaCy'
+require 'rbbt/document/corpus'
+
+class TestSpaCy < Test::Unit::TestCase
+  def _test_tokens
+    text = "I tell a story"
+
+    tokens = SpaCy.tokens(text)
+
+    assert_equal 4, tokens.length
+    assert_equal "tell", tokens[1].to_s
+  end
+
+  def test_segments
+    text = "I tell a story. It's a very good story."
+
+    corpus = Document::Corpus.setup({})
+
+    Document.setup(text, "TEST", "test_doc1", "simple_sentence")
+
+    corpus.add_document text
+    text.corpus = corpus
+
+    segments = SpaCy.segments(text)
+
+    segments.each do |segment|
+      assert_equal segment, segment.segid.tap{|e| e.corpus = corpus}.segment
+    end
+  end
+end
+
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.3.1
+  version: 1.3.2
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-
+date: 2020-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util
@@ -102,12 +102,14 @@ files:
 - lib/rbbt/nlp/genia/sentence_splitter.rb
 - lib/rbbt/nlp/nlp.rb
 - lib/rbbt/nlp/open_nlp/sentence_splitter.rb
+- lib/rbbt/nlp/spaCy.rb
 - lib/rbbt/segment.rb
 - lib/rbbt/segment/annotation.rb
 - lib/rbbt/segment/encoding.rb
 - lib/rbbt/segment/named_entity.rb
 - lib/rbbt/segment/overlaps.rb
 - lib/rbbt/segment/range_index.rb
+- lib/rbbt/segment/relationship.rb
 - lib/rbbt/segment/segmented.rb
 - lib/rbbt/segment/token.rb
 - lib/rbbt/segment/transformed.rb
@@ -161,6 +163,7 @@ files:
 - test/rbbt/test_document.rb
 - test/rbbt/test_segment.rb
 - test/test_helper.rb
+- test/test_spaCy.rb
 homepage: http://github.com/mikisvaz/rbbt-util
 licenses: []
 metadata: {}
@@ -217,4 +220,5 @@ test_files:
 - test/rbbt/segment/test_encoding.rb
 - test/rbbt/segment/test_range_index.rb
 - test/rbbt/segment/test_corpus.rb
+- test/test_spaCy.rb
 - test/test_helper.rb