rbbt-text 1.2.0 → 1.3.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +55 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +63 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +26 -3
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/entity'
|
4
|
+
|
5
|
+
# Identifier of a single annotation, stored as a colon-separated string.
#
# The layout mirrors what SegmentAnnotation#annotid builds: the segment id
# comes first, then the annotation type, then a digest. The segment id is
# taken to be the first five fields and the type the sixth.
module AnnotID
  extend Entity
  self.annotation :corpus

  # Colon-separated fields of this identifier (memoized).
  def _parts
    @parts ||= split(":")
  end

  # The segment-id portion: the first five fields joined back with ":".
  def segid
    @segid ||= _parts.first(5).join(":")
  end

  # The annotation type: the sixth field.
  def type
    @type ||= _parts[5]
  end

  # Resolves the identifier into the segment it refers to (through SegID and
  # the attached corpus) and tags that segment with this identifier's type.
  property :annotation do
    resolved = SegID.setup(segid, :corpus => corpus).segment

    SegmentAnnotation.setup(resolved, :type => type)
  end

  # An AnnotID is its own annotation id.
  property :annotid do
    self
  end
end
|
32
|
+
|
33
|
+
# A segment that additionally carries an annotation +type+.
module SegmentAnnotation
  extend Entity
  include Object::Segment
  self.annotation :type

  # Segment id for this object: a SegID is returned as-is, a Segment defers
  # to the inherited implementation, and anything else is rejected.
  property :segid do
    if SegID === self
      self
    elsif Segment === self
      super()
    else
      raise "Unknown object: #{self}"
    end
  end

  # Builds the AnnotID for this annotation: segment id, type and a digest of
  # +info+, joined with ":". The optional +corpus+ is attached to the result.
  property :annotid do |corpus=nil|
    fields = [segid, type, Misc.obj2digest(self.info)]
    AnnotID.setup(fields.join(":"), :corpus => corpus)
  end

  alias id annotid

  # A SegmentAnnotation is already its own annotation.
  property :annotation do
    self
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
module Segment
  # Returns one Segment per non-ASCII character in +text+, each positioned at
  # that character's index within +text+.
  def self.bad_chars(text)
    non_ascii = text.each_char.with_index.reject { |char, _index| char.ascii_only? }
    non_ascii.map { |char, index| Segment.setup(char, :offset => index) }
  end

  # Runs Transformed.with_transform over the non-ASCII characters of +text+.
  # +replace+ defaults to "?" when nil; the block, if any, is forwarded.
  def self.ascii(text, replace = nil, &block)
    offending = bad_chars(text)
    substitute = replace.nil? ? "?" : replace
    Transformed.with_transform(text, offending, substitute, &block)
  end
end
|
@@ -1,42 +1,46 @@
|
|
1
|
-
require 'rbbt/
|
2
|
-
require 'rbbt/
|
1
|
+
require 'rbbt/segment'
|
2
|
+
require 'rbbt/segment/annotation'
|
3
3
|
|
4
|
-
module NamedEntity
|
4
|
+
module NamedEntity
|
5
5
|
extend Entity
|
6
6
|
include Segment
|
7
|
+
include SegmentAnnotation
|
7
8
|
|
8
|
-
self.annotation :
|
9
|
+
self.annotation :entity_type, :code, :score
|
9
10
|
|
10
11
|
def report
|
11
12
|
<<-EOF
|
12
13
|
String: #{ self }
|
13
14
|
Offset: #{ offset.inspect }
|
14
|
-
Type: #{
|
15
|
+
Type: #{entity_type.inspect}
|
15
16
|
Code: #{code.inspect}
|
16
17
|
Score: #{score.inspect}
|
17
18
|
EOF
|
18
19
|
end
|
19
20
|
|
20
21
|
def html
|
22
|
+
title = code.nil? ? entity_type : [entity_type, code].compact * ":"
|
23
|
+
|
21
24
|
text = <<-EOF
|
22
25
|
<span class='Entity'\
|
23
|
-
#{
|
26
|
+
#{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
|
24
27
|
#{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
|
25
28
|
#{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
|
29
|
+
#{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
|
26
30
|
>#{ self }</span>
|
27
31
|
EOF
|
28
32
|
text.chomp
|
29
33
|
end
|
30
34
|
|
31
35
|
def entity(params = nil)
|
32
|
-
code = self.dup
|
36
|
+
code = self.code || self.dup
|
33
37
|
format, entity = code.split(":")
|
34
38
|
entity, format = format, nil if entity.nil?
|
35
|
-
|
36
|
-
if defined?(Entity) && Entity.formats.include?(
|
39
|
+
|
40
|
+
if defined?(Entity) && Entity.formats.include?(entity_type) or Entity.formats.include?(format)
|
37
41
|
params ||= {}
|
38
42
|
params[:format] = format if format and params[:format].nil?
|
39
|
-
mod = (Entity.formats[
|
43
|
+
mod = (Entity.formats[entity_type] || Entity.format[entity])
|
40
44
|
mod.setup(entity, params)
|
41
45
|
end
|
42
46
|
|
@@ -44,4 +48,3 @@ Score: #{score.inspect}
|
|
44
48
|
end
|
45
49
|
|
46
50
|
end
|
47
|
-
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Offset arithmetic and overlap predicates for text segments.
#
# The methods below rely on the including object to provide +offset+,
# +offset=+, +eend+, +range+ and +segment_length+; none are defined here.
module Segment
  # Adds +offset+ to this segment's offset. If either value is nil the
  # segment's offset is set to nil. Returns self.
  def pull(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset += offset
    end

    self
  end

  # Subtracts +offset+ from this segment's offset. If either value is nil the
  # segment's offset is set to nil. Returns self.
  def push(offset)
    if self.offset.nil? || offset.nil?
      self.offset = nil
    else
      self.offset -= offset
    end

    self
  end

  # Re-expresses the offsets of +segments+ relative to this segment by
  # subtracting this segment's offset from each of them (in place).
  #
  # Without a block the segments stay shifted. With a block they are yielded
  # while shifted and shifted back afterwards; the +ensure+ guarantees the
  # shift back also happens when the block raises, so a failure cannot leave
  # the caller's segments with relative offsets.
  #
  # Returns +segments+ in both forms.
  def make_relative(segments, &block)
    segments.each { |s| s.push offset }
    return segments unless block_given?

    begin
      yield(segments)
    ensure
      segments.each { |s| s.pull offset }
    end

    segments
  end

  # Range covered by this segment, relative to +container+:
  # * a Segment with a non-nil offset: relative to that segment's offset,
  # * an Integer: relative to that position,
  # * anything else (including nil): this segment's own +range+.
  #
  # Raises RuntimeError when this segment has no offset.
  def range_in(container = nil)
    raise "No offset specified" if offset.nil?

    if Segment === container && !container.offset.nil?
      ((offset - container.offset)..(self.eend - container.offset))
    elsif Integer === container
      ((offset - container)..(self.eend - container))
    else
      range
    end
  end

  # True when +segment+ starts at or after this segment and ends at or before
  # this segment's end. Nil offsets and lengths are treated as 0.
  def includes?(segment)
    (segment.offset.to_i >= self.offset.to_i) &&
      (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
  end

  # True when either segment starts inside the span of the other.
  # Nil offsets are treated as 0.
  def overlaps?(segment)
    (segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.eend) ||
      (self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.eend)
  end

  # The subset of +segments+ that overlap this segment.
  def overlaps(segments)
    segments.select { |s| self.overlaps?(s) }
  end

  # The members of +secondary+ that overlap at least one member of +main+.
  # Uses any? with a block so the scan over +main+ stops at the first hit
  # instead of collecting every overlapping segment first.
  def self.collisions(main, secondary)
    secondary.select do |ss|
      main.any? { |ms| ms.overlaps? ss }
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Mixin for a range index of segment ids: lookups are wrapped as SegID
# entities bound to the index's +corpus+.
module Segment::RangeIndex
  attr_accessor :corpus

  # Delegates the lookup to the underlying index and sets the result up as
  # SegID with this index's corpus.
  def [](*args)
    SegID.setup(super(*args), :corpus => corpus)
  end

  # Builds a range index over +segments+ (a list, or a Hash whose values are
  # flattened into a list). Segments without an offset are left out.
  #
  # +persist_file+ of nil or :memory disables persistence; any other value is
  # handed to Persist.persist as the :file option.
  #
  # Returns the index, extended with Segment::RangeIndex and bound to +corpus+.
  def self.index(segments, corpus, persist_file = :memory)
    segments = segments.values.flatten if Hash === segments

    use_persistence = !persist_file.nil? && persist_file != :memory

    annotation_index =
      Persist.persist("Segment_index", :fwt, :persist => use_persistence, :file => persist_file) do
        # Widest segment id seen so far; passed to FixWidthTable.get below.
        value_size = 0
        index_data = []

        segments.each do |segment|
          next if segment.offset.nil?

          range = segment.range
          value_size = [segment.segid.length, value_size].max
          index_data << [segment.segid, [range.begin, range.end]]
        end

        fwt = FixWidthTable.get :memory, value_size, true
        fwt.add_range index_data

        fwt
      end

    annotation_index.extend Segment::RangeIndex
    annotation_index.corpus = corpus
    annotation_index
  end
end
|
35
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
|
3
|
+
# A Segment that represents one token of a text.
module Token
  extend Entity
  include Segment

  self.annotation :original

  # Splits +text+ into Token segments.
  #
  # text     - String to tokenize.
  # split_at - Regexp marking token boundaries. Text matched by its first
  #            capture group is kept as a token of its own; any other
  #            matched text is discarded. The default discards whitespace
  #            and keeps parentheses and common punctuation.
  # start    - Offset assigned to the first character of +text+.
  #
  # Returns an Array of tokens, each set up with its offset.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []

    while (matchdata = text.match(split_at))
      # A match that ends at position 0 consumed nothing and has no prefix,
      # so +text+ would be identical on the next pass and the loop would
      # never finish. Stop splitting and keep the remainder as one token.
      break if matchdata.end(0).zero?

      prefix = matchdata.pre_match
      tokens << Token.setup(prefix, :offset => start) unless prefix.empty?

      # The first group is nil when the discarded alternative matched, or
      # when a custom pattern matched through a later group.
      kept = matchdata.captures.first
      tokens << Token.setup(kept, :offset => start + matchdata.begin(1)) unless kept.nil? || kept.empty?

      start += matchdata.end(0)
      text = matchdata.post_match
    end

    tokens << Token.setup(text, :offset => start) unless text.empty?

    tokens
  end
end
|
@@ -1,6 +1,3 @@
|
|
1
|
-
require 'rbbt/util/misc'
|
2
|
-
require 'rbbt/text/segment'
|
3
|
-
|
4
1
|
module Transformed
|
5
2
|
|
6
3
|
def self.transform(text, segments, replacement = nil, &block)
|
@@ -71,6 +68,10 @@ module Transformed
|
|
71
68
|
|
72
69
|
segments = [segments] unless Array === segments
|
73
70
|
orig_length = self.length
|
71
|
+
|
72
|
+
offset = self.respond_to?(:offset) ? self.offset.to_i : 0
|
73
|
+
segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
|
74
|
+
|
74
75
|
Segment.clean_sort(segments).each do |segment|
|
75
76
|
next if segment.offset.nil?
|
76
77
|
|
@@ -89,7 +90,7 @@ module Transformed
|
|
89
90
|
|
90
91
|
updated_text = self[updated_begin..updated_end]
|
91
92
|
if updated_text.nil?
|
92
|
-
Log.warn "Range outside of segment: #{self.length} #{segment.
|
93
|
+
Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
|
93
94
|
next
|
94
95
|
end
|
95
96
|
|
@@ -111,10 +112,10 @@ module Transformed
|
|
111
112
|
|
112
113
|
self[updated_begin..updated_end] = new
|
113
114
|
|
114
|
-
@transformed_segments[segment.
|
115
|
+
@transformed_segments[segment.object_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
|
115
116
|
|
116
117
|
segment.replace original_text
|
117
|
-
stack << segment.
|
118
|
+
stack << segment.object_id
|
118
119
|
end
|
119
120
|
@transformation_stack << stack
|
120
121
|
end
|
@@ -122,13 +123,13 @@ module Transformed
|
|
122
123
|
def fix_segment(segment, range, diff)
|
123
124
|
case
|
124
125
|
# Before
|
125
|
-
when segment.
|
126
|
+
when segment.eend < range.begin
|
126
127
|
# After
|
127
128
|
when segment.offset.to_i > range.end + diff
|
128
129
|
segment.offset = segment.offset.to_i - diff
|
129
130
|
# Includes
|
130
|
-
when (segment.offset.to_i <= range.begin and segment.
|
131
|
-
segment.replace self[segment.offset.to_i..segment.
|
131
|
+
when (segment.offset.to_i <= range.begin and segment.eend >= range.end + diff)
|
132
|
+
segment.replace self[segment.offset.to_i..segment.eend - diff]
|
132
133
|
else
|
133
134
|
raise "Segment Overlaps"
|
134
135
|
end
|
@@ -141,7 +142,8 @@ module Transformed
|
|
141
142
|
|
142
143
|
if first_only
|
143
144
|
@transformation_stack.pop.reverse.each do |id|
|
144
|
-
|
145
|
+
segment_info = @transformed_segments.delete id
|
146
|
+
orig_range, diff, text, range = segment_info
|
145
147
|
|
146
148
|
new_range = (range.begin..range.last + diff)
|
147
149
|
self[new_range] = text
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#module Segment
|
2
|
+
#
|
3
|
+
# def self.set_tsv_fields(fields, segments)
|
4
|
+
# tsv_fields = []
|
5
|
+
# add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
|
6
|
+
# literal = (fields.delete(:literal) || fields.delete("literal"))
|
7
|
+
# tsv_fields << "Start" << "End"
|
8
|
+
# tsv_fields << :annotation_types if add_types
|
9
|
+
# tsv_fields << :literal if literal
|
10
|
+
#
|
11
|
+
# if fields.any? and not (fields == [:all] or fields == ["all"])
|
12
|
+
# tsv_fields.concat fields
|
13
|
+
# else
|
14
|
+
# tsv_fields.concat segments.first.annotations if segments.any?
|
15
|
+
# end
|
16
|
+
# tsv_fields
|
17
|
+
# tsv_fields.collect!{|f| f.to_s}
|
18
|
+
# tsv_fields.delete "offset"
|
19
|
+
# tsv_fields
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# def self.tsv(segments, *fields)
|
23
|
+
# fields = set_tsv_fields fields, segments
|
24
|
+
# tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
|
25
|
+
#
|
26
|
+
# segments.each do |segment|
|
27
|
+
# tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# tsv
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
# def self.load_tsv(tsv)
|
34
|
+
# fields = tsv.fields
|
35
|
+
# tsv.with_unnamed do
|
36
|
+
# tsv.collect do |id, values|
|
37
|
+
# Annotated.load_tsv_values(id, values, fields)
|
38
|
+
# end
|
39
|
+
# end
|
40
|
+
# end
|
41
|
+
#end
|
@@ -12,7 +12,7 @@ pkg_dir="`opt_dir \"$name\"`"
|
|
12
12
|
build_dir=`build_dir`
mv "$build_dir" "$pkg_dir"
# Use $HOME rather than "~": tilde is not expanded inside double quotes, so
# the previous value created a literal "~" directory under the current
# working directory instead of writing below the user's home.
tmp_file="$HOME/.rbbt/tmp/species-proxy-properties.tmp"
mkdir -p "$(dirname "$tmp_file")"
# Truncate (>) instead of appending so leftovers from an earlier install are
# not carried into the regenerated properties file.
cat "$pkg_dir/species-proxy/properties.conf" |grep -v "^.dir =" > "$tmp_file"
echo "\$dir = $pkg_dir/species-proxy/" > "$pkg_dir/species-proxy/properties.conf"
cat "$tmp_file" | grep -v "^#" >> "$pkg_dir/species-proxy/properties.conf"
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/document/corpus/pubmed'
|
5
|
+
|
6
|
+
# Checks that a PubMed article can be added to a corpus by PMID.
class TestCorpusPubmed < Test::Unit::TestCase
  # Adds the abstract of PMID 32299157 and checks that the title derived
  # from the resulting document mentions "COVID-19".
  def test_add_pmid
    corpus = Document::Corpus.setup({})

    added = corpus.add_pmid("32299157", :abstract)
    title = added.first.to(:title)

    assert title.include?("COVID-19")
  end
end
|
15
|
+
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/document/annotation'
|
6
|
+
require 'rbbt/segment/named_entity'
|
7
|
+
|
8
|
+
# Exercises Document.define / define_multiple / persist with annotations.
class TestAnnotation < Test::Unit::TestCase
  # Raised by the :persisted_words extractor when it runs a second time
  # without $called_once having been reset.
  class CalledOnce < Exception; end

  # Registers the extractors used by the tests below. $called_once records
  # whether an extractor body actually ran.
  def setup
    Document.define :words do
      self.split(" ")
    end

    $called_once = false
    Document.define :persisted_words do
      raise CalledOnce if $called_once
      $called_once = true
      self.split(" ")
    end

    Document.define_multiple :multiple_words do |list|
      list.collect{|doc| doc.words}
    end

    Document.define :ner do
      $called_once = true
      self.split(" ").collect{|e| NamedEntity.setup(e, :code => Misc.digest(e)) }
    end

    Document.persist :ner
  end

  def test_define
    text = as_document(gene_sentence, "test_doc1")
    corpus_holding(text)

    second_word = text.words[1]
    assert_equal text[second_word.range], text.words[1]
  end

  def test_define_multiple
    text1 = as_document(gene_sentence, "test_doc1")
    text2 = as_document("This is another sentence", "test_doc2")
    corpus_holding(text1, text2)

    assert_equal 2, Document.setup([text1, text2]).multiple_words.length
    assert_equal text1.split(" "), text1.multiple_words

    # Persisted variant, disabled in the original test:
    #Document.persist :multiple_words, :annotations, :annotation_repo => Rbbt.tmp.test.multiple_words
    #assert_equal 2, Document.setup([text1, text2]).multiple_words.length
    #assert_equal text1.split(" "), text1.multiple_words
  end

  def test_persist
    text = as_document(gene_sentence, "test_doc1")
    corpus_holding(text)

    assert_equal "persisted_words", text.persisted_words.first.type

    # Not persisted yet, so the extractor runs again and raises.
    assert_raise CalledOnce do
      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
    end

    Log.severity = 0
    Document.persist :persisted_words, :annotations, :file => Rbbt.tmp.test.persisted_words.find(:user)

    $called_once = false
    text.persisted_words
    assert $called_once

    assert_nothing_raised do
      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
    end
  end

  def test_persist_annotation_repo
    text = as_document(gene_sentence, "test_doc1")
    corpus_holding(text)

    assert_equal "persisted_words", text.persisted_words.first.type

    # Not persisted yet, so the extractor runs again and raises.
    assert_raise CalledOnce do
      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
    end

    Log.severity = 0
    Document.persist :persisted_words, :annotations, :annotation_repo => Rbbt.tmp.test.persisted_words_repo.find(:user)

    $called_once = false
    text.persisted_words
    assert $called_once

    assert_nothing_raised do
      assert_equal text[text.persisted_words[1].range], text.persisted_words[1]
    end
  end

  def test_persist_ner
    text = as_document(gene_sentence, "test_doc1")
    corpus_holding(text)

    text.ner

    # :ner is persisted in setup, so the second call must not run the body.
    $called_once = false
    text.ner

    assert ! $called_once

    assert text.ner.first.segid.include?("TEST:")
  end

  private

  # A fresh String on every call; Document.setup is applied to the object
  # itself, so tests must not share one instance.
  def gene_sentence
    "This sentence mentions the TP53 gene and the CDK5R1 protein"
  end

  # Sets +content+ up as a Document in the "TEST" namespace with the given
  # id and returns the same string object.
  def as_document(content, id)
    Document.setup(content, "TEST", id, nil)
    content
  end

  # Creates a Hash-backed corpus and adds +documents+ to it in order.
  def corpus_holding(*documents)
    corpus = {}
    Document::Corpus.setup corpus

    documents.each { |document| corpus.add_document(document) }

    corpus
  end
end
|
140
|
+
|