rbbt-text 1.2.0 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/bow/bow.rb +5 -2
- data/lib/rbbt/bow/dictionary.rb +27 -23
- data/lib/rbbt/document.rb +55 -0
- data/lib/rbbt/document/annotation.rb +45 -0
- data/lib/rbbt/document/corpus.rb +63 -0
- data/lib/rbbt/document/corpus/pubmed.rb +33 -0
- data/lib/rbbt/ner/NER.rb +3 -3
- data/lib/rbbt/ner/abner.rb +1 -1
- data/lib/rbbt/ner/banner.rb +1 -1
- data/lib/rbbt/ner/brat.rb +1 -1
- data/lib/rbbt/ner/chemical_tagger.rb +1 -2
- data/lib/rbbt/ner/g_norm_plus.rb +26 -3
- data/lib/rbbt/ner/linnaeus.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +3 -3
- data/lib/rbbt/ner/oscar3.rb +1 -2
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +5 -5
- data/lib/rbbt/ner/regexpNER.rb +1 -2
- data/lib/rbbt/ner/token_trieNER.rb +35 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +3 -2
- data/lib/rbbt/nlp/nlp.rb +5 -5
- data/lib/rbbt/nlp/open_nlp/sentence_splitter.rb +37 -36
- data/lib/rbbt/nlp/spaCy.rb +52 -0
- data/lib/rbbt/segment.rb +179 -0
- data/lib/rbbt/segment/annotation.rb +58 -0
- data/lib/rbbt/segment/encoding.rb +18 -0
- data/lib/rbbt/{text/segment → segment}/named_entity.rb +14 -11
- data/lib/rbbt/segment/overlaps.rb +63 -0
- data/lib/rbbt/segment/range_index.rb +35 -0
- data/lib/rbbt/segment/relationship.rb +7 -0
- data/lib/rbbt/{text/segment → segment}/segmented.rb +1 -1
- data/lib/rbbt/segment/token.rb +23 -0
- data/lib/rbbt/{text/segment → segment}/transformed.rb +12 -10
- data/lib/rbbt/segment/tsv.rb +41 -0
- data/share/install/software/Linnaeus +1 -1
- data/share/install/software/OpenNLP +1 -1
- data/test/rbbt/document/corpus/test_pubmed.rb +15 -0
- data/test/rbbt/document/test_annotation.rb +140 -0
- data/test/rbbt/document/test_corpus.rb +33 -0
- data/test/rbbt/ner/test_finder.rb +3 -3
- data/test/rbbt/ner/test_g_norm_plus.rb +20 -2
- data/test/rbbt/ner/test_patterns.rb +9 -9
- data/test/rbbt/ner/test_regexpNER.rb +14 -14
- data/test/rbbt/ner/test_rnorm.rb +3 -4
- data/test/rbbt/ner/test_token_trieNER.rb +1 -0
- data/test/rbbt/nlp/genia/test_sentence_splitter.rb +37 -3
- data/test/rbbt/nlp/open_nlp/test_sentence_splitter.rb +20 -4
- data/test/rbbt/segment/test_annotation.rb +39 -0
- data/test/rbbt/segment/test_corpus.rb +36 -0
- data/test/rbbt/segment/test_encoding.rb +24 -0
- data/test/rbbt/{text/segment → segment}/test_named_entity.rb +15 -11
- data/test/rbbt/segment/test_overlaps.rb +69 -0
- data/test/rbbt/segment/test_range_index.rb +42 -0
- data/test/rbbt/{text/segment → segment}/test_transformed.rb +105 -51
- data/test/rbbt/test_document.rb +14 -0
- data/test/rbbt/test_segment.rb +182 -0
- data/test/test_helper.rb +5 -3
- data/test/test_spaCy.rb +32 -0
- metadata +44 -32
- data/lib/rbbt/text/corpus.rb +0 -106
- data/lib/rbbt/text/corpus/document.rb +0 -383
- data/lib/rbbt/text/corpus/document_repo.rb +0 -68
- data/lib/rbbt/text/corpus/sources/pmid.rb +0 -34
- data/lib/rbbt/text/document.rb +0 -39
- data/lib/rbbt/text/segment.rb +0 -363
- data/lib/rbbt/text/segment/docid.rb +0 -46
- data/lib/rbbt/text/segment/relationship.rb +0 -24
- data/lib/rbbt/text/segment/token.rb +0 -49
- data/test/rbbt/text/corpus/sources/test_pmid.rb +0 -33
- data/test/rbbt/text/corpus/test_document.rb +0 -82
- data/test/rbbt/text/segment/test_relationship.rb +0 -0
- data/test/rbbt/text/segment/test_segmented.rb +0 -23
- data/test/rbbt/text/test_corpus.rb +0 -34
- data/test/rbbt/text/test_document.rb +0 -58
- data/test/rbbt/text/test_segment.rb +0 -100
@@ -0,0 +1,58 @@
|
|
1
|
+
require 'rbbt-util'
|
2
|
+
require 'rbbt/segment'
|
3
|
+
require 'rbbt/entity'
|
4
|
+
|
5
|
+
# Identifier of one annotation, held as a colon-separated string.
#
# As read by the accessors below: fields 0..4 form the segment id and
# field 5 is the annotation type. The +corpus+ annotation is handed on to
# SegID when the annotated segment is resolved.
# NOTE(review): +annotation+ and +property+ come from Entity, which is not
# defined in this file; their exact semantics should be confirmed there.
module AnnotID
  extend Entity
  self.annotation :corpus

  # The identifier split on ":" (memoized).
  def _parts
    @parts ||= split(":")
  end

  # Segment id: the first five fields joined back with ":" (memoized).
  def segid
    @segid ||= _parts.first(5).join(":")
  end

  # Annotation type: the sixth field (memoized once non-nil).
  def type
    @type ||= _parts[5]
  end

  # Resolves the segment named by +segid+ through SegID and sets it up as a
  # SegmentAnnotation carrying this identifier's type.
  property :annotation do
    located = SegID.setup(segid, :corpus => corpus).segment
    SegmentAnnotation.setup(located, :type => type)
  end

  # An AnnotID is its own annotation id.
  property :annotid do
    self
  end
end
|
32
|
+
|
33
|
+
# A segment that also carries an annotation +type+.
# NOTE(review): +annotation+ and +property+ come from Entity, which is not
# defined in this file; confirm their semantics there.
module SegmentAnnotation
  extend Entity
  include Object::Segment
  self.annotation :type

  # Segment id of this annotation: the object itself when it already is a
  # SegID, otherwise whatever the inherited +segid+ returns for a Segment.
  # Raises for any other kind of object.
  property :segid do
    if SegID === self
      self
    elsif Segment === self
      super()
    else
      raise "Unknown object: #{self}"
    end
  end

  # Builds the AnnotID for this annotation: segment id, type and a digest of
  # +info+, joined with ":". The optional corpus is attached to the result.
  property :annotid do |corpus=nil|
    fields = [segid, type, Misc.obj2digest(info)]
    AnnotID.setup(fields.join(":"), :corpus => corpus)
  end

  alias id annotid

  # A SegmentAnnotation is its own annotation.
  property :annotation do
    self
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
module Segment
  # Collects every non-ASCII character of +text+ as a Segment whose offset
  # is the character's index in +text+. Returns them in text order.
  def self.bad_chars(text)
    text.chars.each_with_index.
      reject { |char, _position| char.ascii_only? }.
      map { |char, position| Segment.setup(char, :offset => position) }
  end

  # Runs Transformed.with_transform over +text+ with its non-ASCII
  # characters as the segments to substitute. +replace+ defaults to "?"
  # when nil; the block is forwarded untouched.
  def self.ascii(text, replace = nil, &block)
    non_ascii = bad_chars(text)
    substitute = replace.nil? ? "?" : replace
    Transformed.with_transform(text, non_ascii, substitute, &block)
  end
end
|
@@ -1,42 +1,46 @@
|
|
1
|
-
require 'rbbt/
|
2
|
-
require 'rbbt/
|
1
|
+
require 'rbbt/segment'
|
2
|
+
require 'rbbt/segment/annotation'
|
3
3
|
|
4
|
-
module NamedEntity
|
4
|
+
module NamedEntity
|
5
5
|
extend Entity
|
6
6
|
include Segment
|
7
|
+
include SegmentAnnotation
|
7
8
|
|
8
|
-
self.annotation :
|
9
|
+
self.annotation :entity_type, :code, :score
|
9
10
|
|
10
11
|
def report
|
11
12
|
<<-EOF
|
12
13
|
String: #{ self }
|
13
14
|
Offset: #{ offset.inspect }
|
14
|
-
Type: #{
|
15
|
+
Type: #{entity_type.inspect}
|
15
16
|
Code: #{code.inspect}
|
16
17
|
Score: #{score.inspect}
|
17
18
|
EOF
|
18
19
|
end
|
19
20
|
|
20
21
|
def html
|
22
|
+
title = code.nil? ? entity_type : [entity_type, code].compact * ":"
|
23
|
+
|
21
24
|
text = <<-EOF
|
22
25
|
<span class='Entity'\
|
23
|
-
#{
|
26
|
+
#{entity_type.nil? ? "" : " attr-entity-type='#{Array === entity_type ? entity_type * " " : entity_type}'"}\
|
24
27
|
#{code.nil? ? "" : " attr-entity-code='#{Array === code ? code * " " : code}'"}\
|
25
28
|
#{score.nil? ? "" : " attr-entity-score='#{Array === score ? score * " " : score}'"}\
|
29
|
+
#{title.nil? ? "" : " title='#{Array === title ? title * " " : title}'"}\
|
26
30
|
>#{ self }</span>
|
27
31
|
EOF
|
28
32
|
text.chomp
|
29
33
|
end
|
30
34
|
|
31
35
|
def entity(params = nil)
|
32
|
-
code = self.dup
|
36
|
+
code = self.code || self.dup
|
33
37
|
format, entity = code.split(":")
|
34
38
|
entity, format = format, nil if entity.nil?
|
35
|
-
|
36
|
-
if defined?(Entity) && Entity.formats.include?(
|
39
|
+
|
40
|
+
if defined?(Entity) && Entity.formats.include?(entity_type) or Entity.formats.include?(format)
|
37
41
|
params ||= {}
|
38
42
|
params[:format] = format if format and params[:format].nil?
|
39
|
-
mod = (Entity.formats[
|
43
|
+
mod = (Entity.formats[entity_type] || Entity.format[entity])
|
40
44
|
mod.setup(entity, params)
|
41
45
|
end
|
42
46
|
|
@@ -44,4 +48,3 @@ Score: #{score.inspect}
|
|
44
48
|
end
|
45
49
|
|
46
50
|
end
|
47
|
-
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# Offset arithmetic and overlap queries for segments.
#
# These methods rely on the including object providing +offset+, +offset=+,
# +eend+, +range+ and +segment_length+.
module Segment
  # Moves this segment forward by +offset+ positions and returns self.
  # If either offset is nil the segment's offset becomes nil.
  def pull(offset)
    self.offset = (self.offset.nil? || offset.nil?) ? nil : self.offset + offset
    self
  end

  # Moves this segment back by +offset+ positions and returns self.
  # If either offset is nil the segment's offset becomes nil.
  def push(offset)
    self.offset = (self.offset.nil? || offset.nil?) ? nil : self.offset - offset
    self
  end

  # Rebases +segments+ so their offsets are relative to this segment.
  #
  # Without a block the segments are left rebased. With a block they are
  # rebased only for the duration of the block and then shifted back; the
  # shift back happens in an +ensure+ so an exception raised by the block
  # cannot leave the caller's segments with relative offsets.
  #
  # Returns +segments+.
  def make_relative(segments, &block)
    segments.each { |s| s.push offset }
    return segments unless block_given?

    begin
      yield(segments)
    ensure
      segments.each { |s| s.pull offset }
    end

    segments
  end

  # Range covered by this segment, expressed relative to +container+:
  # a Segment with a known offset, or an Integer offset. With any other
  # container the segment's own +range+ is returned.
  #
  # Raises if this segment has no offset.
  def range_in(container = nil)
    raise "No offset specified" if offset.nil?

    if Segment === container && !container.offset.nil?
      ((offset - container.offset)..(self.eend - container.offset))
    elsif Integer === container
      ((offset - container)..(self.eend - container))
    else
      range
    end
  end

  # True when +segment+ lies entirely inside this segment. Missing offsets
  # and lengths are treated as 0 (via +to_i+).
  def includes?(segment)
    (segment.offset.to_i >= self.offset.to_i) &&
      (segment.offset.to_i + segment.segment_length.to_i <= self.offset.to_i + self.segment_length.to_i)
  end

  # True when either segment starts within the span of the other.
  def overlaps?(segment)
    (segment.offset.to_i >= self.offset.to_i && segment.offset.to_i <= self.eend) ||
      (self.offset.to_i >= segment.offset.to_i && self.offset.to_i <= segment.eend)
  end

  # The subset of +segments+ that overlap this segment.
  def overlaps(segments)
    segments.select { |s| overlaps?(s) }
  end

  # The segments of +secondary+ that overlap at least one segment of +main+.
  def self.collisions(main, secondary)
    # any? stops at the first overlapping segment instead of collecting all
    secondary.select do |ss|
      main.any? { |ms| ms.overlaps? ss }
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Mixin for a range index whose lookups return segment ids bound to a corpus.
module Segment::RangeIndex
  attr_accessor :corpus

  # Delegates the lookup to the extended index and sets the result up as
  # SegID with this index's corpus.
  def [](*args)
    SegID.setup(super(*args), :corpus => corpus)
  end

  # Builds an index from segment ranges to segment ids.
  #
  # segments     - list of segments, or a Hash whose values are flattened
  #                into such a list. Segments without offset are ignored.
  # corpus       - corpus attached to the resulting index.
  # persist_file - file handed to Persist; persistence is only requested
  #                when this is neither nil nor :memory.
  #
  # Returns the index extended with Segment::RangeIndex.
  # NOTE(review): Persist and FixWidthTable are defined elsewhere; the
  # meaning of their arguments should be confirmed against their sources.
  def self.index(segments, corpus, persist_file = :memory)
    segments = segments.values.flatten if Hash === segments
    on_disk = !persist_file.nil? && persist_file != :memory

    annotation_index = Persist.persist("Segment_index", :fwt, :persist => on_disk, :file => persist_file) do
      value_size = 0
      index_data = []

      segments.each do |segment|
        next if segment.offset.nil?

        range = segment.range
        # the table is created with the length of the longest segment id
        value_size = [segment.segid.length, value_size].max
        index_data << [segment.segid, [range.begin, range.end]]
      end

      fwt = FixWidthTable.get :memory, value_size, true
      fwt.add_range index_data
      fwt
    end

    annotation_index.extend Segment::RangeIndex
    annotation_index.corpus = corpus
    annotation_index
  end
end
|
35
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'rbbt/segment'
|
2
|
+
|
3
|
+
# A piece of text produced by Token.tokenize, set up with its offset.
# NOTE(review): +annotation+ and +setup+ come from Entity, defined elsewhere.
module Token
  extend Entity
  include Segment

  self.annotation :original

  # Splits +text+ into tokens.
  #
  # text     - String to tokenize.
  # split_at - pattern marking token boundaries. Text matched by its first
  #            capture group is kept as a token of its own; the rest of the
  #            match is discarded. The default discards whitespace and
  #            keeps parentheses and common punctuation.
  # start    - offset assigned to the first character of +text+.
  #
  # Returns an Array of Tokens in text order, each set up with its offset.
  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
    tokens = []
    search_from = 0

    while (matchdata = text.match(split_at, search_from))
      # A zero-width match at the very start consumes nothing, so retrying
      # from the same position would never terminate. Look for the next
      # boundary one character further on instead.
      if matchdata.end(0).zero?
        search_from = 1
        next
      end
      search_from = 0

      leading = matchdata.pre_match
      tokens << Token.setup(leading, :offset => start) unless leading.empty?

      # The first group is nil when it did not take part in the match
      # (e.g. the whitespace alternative of the default pattern).
      separator = matchdata.captures.first
      unless separator.nil? || separator.empty?
        tokens << Token.setup(separator, :offset => start + matchdata.begin(1))
      end

      start += matchdata.end(0)
      text = matchdata.post_match
    end

    tokens << Token.setup(text, :offset => start) unless text.empty?

    tokens
  end
end
|
@@ -1,6 +1,3 @@
|
|
1
|
-
require 'rbbt/util/misc'
|
2
|
-
require 'rbbt/text/segment'
|
3
|
-
|
4
1
|
module Transformed
|
5
2
|
|
6
3
|
def self.transform(text, segments, replacement = nil, &block)
|
@@ -71,6 +68,10 @@ module Transformed
|
|
71
68
|
|
72
69
|
segments = [segments] unless Array === segments
|
73
70
|
orig_length = self.length
|
71
|
+
|
72
|
+
offset = self.respond_to?(:offset) ? self.offset.to_i : 0
|
73
|
+
segments = segments.select{|s| s.offset.to_i >= offset && s.offset.to_i <= offset + self.length - 1 }
|
74
|
+
|
74
75
|
Segment.clean_sort(segments).each do |segment|
|
75
76
|
next if segment.offset.nil?
|
76
77
|
|
@@ -89,7 +90,7 @@ module Transformed
|
|
89
90
|
|
90
91
|
updated_text = self[updated_begin..updated_end]
|
91
92
|
if updated_text.nil?
|
92
|
-
Log.warn "Range outside of segment: #{self.length} #{segment.
|
93
|
+
Log.warn "Range outside of segment: #{self.length} #{segment.range} (#{updated_range})"
|
93
94
|
next
|
94
95
|
end
|
95
96
|
|
@@ -111,10 +112,10 @@ module Transformed
|
|
111
112
|
|
112
113
|
self[updated_begin..updated_end] = new
|
113
114
|
|
114
|
-
@transformed_segments[segment.
|
115
|
+
@transformed_segments[segment.object_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
|
115
116
|
|
116
117
|
segment.replace original_text
|
117
|
-
stack << segment.
|
118
|
+
stack << segment.object_id
|
118
119
|
end
|
119
120
|
@transformation_stack << stack
|
120
121
|
end
|
@@ -122,13 +123,13 @@ module Transformed
|
|
122
123
|
def fix_segment(segment, range, diff)
|
123
124
|
case
|
124
125
|
# Before
|
125
|
-
when segment.
|
126
|
+
when segment.eend < range.begin
|
126
127
|
# After
|
127
128
|
when segment.offset.to_i > range.end + diff
|
128
129
|
segment.offset = segment.offset.to_i - diff
|
129
130
|
# Includes
|
130
|
-
when (segment.offset.to_i <= range.begin and segment.
|
131
|
-
segment.replace self[segment.offset.to_i..segment.
|
131
|
+
when (segment.offset.to_i <= range.begin and segment.eend >= range.end + diff)
|
132
|
+
segment.replace self[segment.offset.to_i..segment.eend - diff]
|
132
133
|
else
|
133
134
|
raise "Segment Overlaps"
|
134
135
|
end
|
@@ -141,7 +142,8 @@ module Transformed
|
|
141
142
|
|
142
143
|
if first_only
|
143
144
|
@transformation_stack.pop.reverse.each do |id|
|
144
|
-
|
145
|
+
segment_info = @transformed_segments.delete id
|
146
|
+
orig_range, diff, text, range = segment_info
|
145
147
|
|
146
148
|
new_range = (range.begin..range.last + diff)
|
147
149
|
self[new_range] = text
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#module Segment
|
2
|
+
#
|
3
|
+
# def self.set_tsv_fields(fields, segments)
|
4
|
+
# tsv_fields = []
|
5
|
+
# add_types = ! (fields.delete(:no_types) || fields.delete("no_types") || fields.include?(:JSON) || fields.include?("JSON"))
|
6
|
+
# literal = (fields.delete(:literal) || fields.delete("literal"))
|
7
|
+
# tsv_fields << "Start" << "End"
|
8
|
+
# tsv_fields << :annotation_types if add_types
|
9
|
+
# tsv_fields << :literal if literal
|
10
|
+
#
|
11
|
+
# if fields.any? and not (fields == [:all] or fields == ["all"])
|
12
|
+
# tsv_fields.concat fields
|
13
|
+
# else
|
14
|
+
# tsv_fields.concat segments.first.annotations if segments.any?
|
15
|
+
# end
|
16
|
+
# tsv_fields
|
17
|
+
# tsv_fields.collect!{|f| f.to_s}
|
18
|
+
# tsv_fields.delete "offset"
|
19
|
+
# tsv_fields
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# def self.tsv(segments, *fields)
|
23
|
+
# fields = set_tsv_fields fields, segments
|
24
|
+
# tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
|
25
|
+
#
|
26
|
+
# segments.each do |segment|
|
27
|
+
# tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# tsv
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
# def self.load_tsv(tsv)
|
34
|
+
# fields = tsv.fields
|
35
|
+
# tsv.with_unnamed do
|
36
|
+
# tsv.collect do |id, values|
|
37
|
+
# Annotated.load_tsv_values(id, values, fields)
|
38
|
+
# end
|
39
|
+
# end
|
40
|
+
# end
|
41
|
+
#end
|
@@ -12,7 +12,7 @@ pkg_dir="`opt_dir \"$name\"`"
|
|
12
12
|
build_dir=`build_dir`
mv "$build_dir" "$pkg_dir"

# Rewrite species-proxy/properties.conf so that its $dir entry points at the
# installed package, keeping the remaining non-comment settings.
# $HOME is used because a tilde inside double quotes is not expanded.
tmp_file="$HOME/.rbbt/tmp/species-proxy-properties.tmp"
mkdir -p "$(dirname "$tmp_file")"
# Truncate (>) instead of appending so lines from a previous install are not carried over.
grep -v "^.dir =" "$pkg_dir/species-proxy/properties.conf" > "$tmp_file"
echo "\$dir = $pkg_dir/species-proxy/" > "$pkg_dir/species-proxy/properties.conf"
grep -v "^#" "$tmp_file" >> "$pkg_dir/species-proxy/properties.conf"
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/document/corpus/pubmed'
|
5
|
+
|
6
|
+
# Checks that a document added to a corpus by PubMed id exposes its title.
class TestCorpusPubmed < Test::Unit::TestCase
  # NOTE(review): add_pmid presumably retrieves the article remotely, so this
  # test likely needs network access - confirm.
  def test_add_pmid
    corpus = Document::Corpus.setup({})

    documents = corpus.add_pmid("32299157", :abstract)
    title = documents.first.to(:title)

    assert title.include?("COVID-19")
  end
end
|
15
|
+
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require File.join(File.expand_path(File.dirname(__FILE__)), '../..', 'test_helper.rb')
|
2
|
+
require 'rbbt/document'
|
3
|
+
require 'rbbt/document/corpus'
|
4
|
+
require 'rbbt/segment'
|
5
|
+
require 'rbbt/document/annotation'
|
6
|
+
require 'rbbt/segment/named_entity'
|
7
|
+
|
8
|
+
# Exercises Document.define / define_multiple / persist.
#
# The global $called_once records whether the body of a defined annotation
# has run, which lets the tests tell a recomputation from a persisted result.
class TestAnnotation < Test::Unit::TestCase
  # Raised by the :persisted_words body when it runs a second time.
  class CalledOnce < Exception
  end

  # Registers the annotations used by the tests below.
  def setup
    Document.define :words do
      split(" ")
    end

    $called_once = false
    Document.define :persisted_words do
      # a second execution means the result was not reused
      raise CalledOnce if $called_once
      $called_once = true
      split(" ")
    end

    Document.define_multiple :multiple_words do |list|
      list.map { |doc| doc.words }
    end

    Document.define :ner do
      $called_once = true
      split(" ").map { |word| NamedEntity.setup(word, :code => Misc.digest(word)) }
    end

    Document.persist :ner
  end

  def test_define
    text = sample_text
    Document.setup(text, "TEST", "test_doc1", nil)

    corpus_holding(text)

    second_word = text.words[1]
    assert_equal text[second_word.range], text.words[1]
  end

  def test_define_multiple
    text1 = sample_text
    text2 = "This is another sentence"
    Document.setup(text1, "TEST", "test_doc1", nil)
    Document.setup(text2, "TEST", "test_doc2", nil)

    corpus_holding(text1, text2)

    assert_equal 2, Document.setup([text1, text2]).multiple_words.length
    assert_equal text1.split(" "), text1.multiple_words

    #Document.persist :multiple_words, :annotations, :annotation_repo => Rbbt.tmp.test.multiple_words
    #assert_equal 2, Document.setup([text1, text2]).multiple_words.length
    #assert_equal text1.split(" "), text1.multiple_words
  end

  def test_persist
    text = sample_text
    Document.setup(text, "TEST", "test_doc1", nil)

    corpus_holding(text)

    assert_equal "persisted_words", text.persisted_words.first.type

    # not persisted yet: asking again runs the body a second time
    assert_raise CalledOnce do
      covered = text[text.persisted_words[1].range]
      assert_equal covered, text.persisted_words[1]
    end

    Log.severity = 0
    Document.persist :persisted_words, :annotations, :file => Rbbt.tmp.test.persisted_words.find(:user)

    $called_once = false
    text.persisted_words
    assert $called_once

    # persisted now: further calls must not run the body again
    assert_nothing_raised do
      covered = text[text.persisted_words[1].range]
      assert_equal covered, text.persisted_words[1]
    end
  end

  def test_persist_annotation_repo
    text = sample_text
    Document.setup(text, "TEST", "test_doc1", nil)

    corpus_holding(text)

    assert_equal "persisted_words", text.persisted_words.first.type

    # not persisted yet: asking again runs the body a second time
    assert_raise CalledOnce do
      covered = text[text.persisted_words[1].range]
      assert_equal covered, text.persisted_words[1]
    end

    Log.severity = 0
    Document.persist :persisted_words, :annotations, :annotation_repo => Rbbt.tmp.test.persisted_words_repo.find(:user)

    $called_once = false
    text.persisted_words
    assert $called_once

    # persisted now: further calls must not run the body again
    assert_nothing_raised do
      covered = text[text.persisted_words[1].range]
      assert_equal covered, text.persisted_words[1]
    end
  end

  def test_persist_ner
    text = sample_text
    Document.setup(text, "TEST", "test_doc1", nil)

    corpus_holding(text)

    text.ner

    # :ner was persisted in setup, so the second call must not run its body
    $called_once = false
    text.ner

    assert ! $called_once

    assert text.ner.first.segid.include?("TEST:")
  end

  private

  # A fresh copy of the sentence shared by most tests.
  def sample_text
    "This sentence mentions the TP53 gene and the CDK5R1 protein".dup
  end

  # Sets up an empty corpus and adds the given documents to it, in order.
  def corpus_holding(*documents)
    corpus = {}
    Document::Corpus.setup corpus

    documents.each { |document| corpus.add_document(document) }
    corpus
  end
end
|
140
|
+
|