rbbt-text 0.6.2 → 0.6.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/rbbt/bow/dictionary.rb +1 -0
- data/lib/rbbt/corpus/document.rb +5 -2
- data/lib/rbbt/corpus/document_repo.rb +2 -1
- data/lib/rbbt/entity/document.rb +40 -0
- data/lib/rbbt/ner/segment.rb +9 -2
- data/lib/rbbt/ner/segment/named_entity.rb +4 -2
- data/lib/rbbt/ner/segment/token.rb +28 -3
- data/lib/rbbt/ner/segment/transformed.rb +116 -115
- data/lib/rbbt/ner/token_trieNER.rb +17 -13
- data/test/rbbt/corpus/test_document.rb +22 -10
- data/test/rbbt/ner/segment/test_named_entity.rb +1 -1
- data/test/rbbt/ner/segment/test_transformed.rb +38 -3
- metadata +6 -7
data/lib/rbbt/bow/dictionary.rb
CHANGED
data/lib/rbbt/corpus/document.rb
CHANGED
@@ -148,7 +148,9 @@ class Document
|
|
148
148
|
fields = data.fields if fields.nil? and data.respond_to? :fields
|
149
149
|
|
150
150
|
|
151
|
-
data.
|
151
|
+
if data.respond_to? :persistence_path and String === data.persistence_path
|
152
|
+
data.filter(data.persistence_path + '.filters')
|
153
|
+
end
|
152
154
|
data.add_filter("field:#{ doc_field }", @docid)
|
153
155
|
data.add_filter("field:#{ entity_field }", "#{ entity }")
|
154
156
|
keys = data.keys
|
@@ -157,7 +159,7 @@ class Document
|
|
157
159
|
|
158
160
|
if keys.empty?
|
159
161
|
segments = produce_#{entity}
|
160
|
-
segments << Segment.setup("No #{entity} found in document
|
162
|
+
segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
|
161
163
|
tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
|
162
164
|
|
163
165
|
tsv.add_field "#{ doc_field }" do
|
@@ -178,6 +180,7 @@ class Document
|
|
178
180
|
data.pop_filter
|
179
181
|
data.pop_filter
|
180
182
|
data.read
|
183
|
+
|
181
184
|
else
|
182
185
|
if raw == :check
|
183
186
|
data.close
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'rbbt/entity'
|
2
|
+
|
3
|
+
module Document
|
4
|
+
extend Entity
|
5
|
+
|
6
|
+
class << self
|
7
|
+
attr_accessor :corpus
|
8
|
+
end
|
9
|
+
|
10
|
+
property :text => :array2single do
|
11
|
+
article_text = {}
|
12
|
+
missing = []
|
13
|
+
|
14
|
+
self.each do |doc|
|
15
|
+
Document.corpus.read if Document.corpus.respond_to? :read
|
16
|
+
if Document.corpus.include?(doc)
|
17
|
+
article_text[doc] = Document.corpus[doc]
|
18
|
+
else
|
19
|
+
missing << doc
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
if missing.any?
|
24
|
+
missing.first.annotate missing
|
25
|
+
missing_text = Misc.process_to_hash(missing){|list| list._get_text}
|
26
|
+
|
27
|
+
Misc.lock Document.corpus.persistence_path do
|
28
|
+
Document.corpus.write if Document.corpus.respond_to? :write
|
29
|
+
missing_text.each do |doc, text|
|
30
|
+
article_text[doc] = text
|
31
|
+
Document.corpus[doc] = text
|
32
|
+
end
|
33
|
+
Document.corpus.read if Document.corpus.respond_to? :read
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
article_text.values_at *self
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
data/lib/rbbt/ner/segment.rb
CHANGED
@@ -5,6 +5,10 @@ module Segment
|
|
5
5
|
extend Annotation
|
6
6
|
self.annotation :offset
|
7
7
|
|
8
|
+
def offset=(offset)
|
9
|
+
@offset = offset.nil? ? nil : offset.to_i
|
10
|
+
end
|
11
|
+
|
8
12
|
#{{{ Ranges
|
9
13
|
|
10
14
|
def end
|
@@ -297,8 +301,11 @@ module Segment
|
|
297
301
|
end
|
298
302
|
|
299
303
|
def self.load_tsv(tsv)
|
300
|
-
tsv.
|
301
|
-
|
304
|
+
fields = tsv.fields
|
305
|
+
tsv.with_unnamed do
|
306
|
+
tsv.collect do |id, values|
|
307
|
+
Annotated.load_tsv_values(id, values, fields)
|
308
|
+
end
|
302
309
|
end
|
303
310
|
end
|
304
311
|
|
@@ -2,9 +2,34 @@ require 'rbbt/annotations'
|
|
2
2
|
require 'rbbt/ner/segment'
|
3
3
|
|
4
4
|
module Token
|
5
|
-
|
6
|
-
|
7
|
-
self.
|
5
|
+
attr_accessor :offset, :original
|
6
|
+
|
7
|
+
def self.all_annotations
|
8
|
+
[:offset, :original]
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.setup(text, start, original = nil)
|
12
|
+
text.extend Token
|
13
|
+
text.offset = start
|
14
|
+
text.original = original
|
15
|
+
text
|
16
|
+
end
|
17
|
+
|
18
|
+
def info
|
19
|
+
{:original => original, :offset => offset}
|
20
|
+
end
|
21
|
+
|
22
|
+
def id
|
23
|
+
Misc.hash2md5 info.merge :self => self
|
24
|
+
end
|
25
|
+
|
26
|
+
def end
|
27
|
+
offset + self.length - 1
|
28
|
+
end
|
29
|
+
|
30
|
+
def range
|
31
|
+
(offset..self.end)
|
32
|
+
end
|
8
33
|
|
9
34
|
def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
|
10
35
|
|
@@ -1,9 +1,9 @@
|
|
1
|
+
require 'rbbt/util/misc'
|
1
2
|
require 'rbbt/ner/segment'
|
3
|
+
|
2
4
|
module Transformed
|
3
|
-
attr_accessor :transformation_offset_differences, :transformation_original
|
4
5
|
|
5
6
|
def self.transform(text, segments, replacement = nil, &block)
|
6
|
-
require 'rbbt/util/misc'
|
7
7
|
|
8
8
|
text.extend Transformed
|
9
9
|
text.replace(segments, replacement, &block)
|
@@ -12,7 +12,6 @@ module Transformed
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def self.with_transform(text, segments, replacement)
|
15
|
-
require 'rbbt/util/misc'
|
16
15
|
|
17
16
|
text.extend Transformed
|
18
17
|
text.replace(segments, replacement)
|
@@ -24,147 +23,149 @@ module Transformed
|
|
24
23
|
text.restore(segments, true)
|
25
24
|
end
|
26
25
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
26
|
+
attr_accessor :transformed_segments, :transformation_stack
|
27
|
+
|
28
|
+
def shift(segment_o)
|
29
|
+
begin_shift = 0
|
30
|
+
end_shift = 0
|
31
|
+
|
32
|
+
@transformed_segments.sort_by{|id, info| info.last}.each{|id,info|
|
33
|
+
pseg_o, diff = info
|
34
|
+
|
35
|
+
case
|
36
|
+
# Before
|
37
|
+
when segment_o.last + end_shift < pseg_o.begin
|
38
|
+
# After
|
39
|
+
when (segment_o.begin + begin_shift > pseg_o.last)
|
40
|
+
begin_shift += diff
|
41
|
+
end_shift += diff
|
42
|
+
# Includes
|
43
|
+
when (segment_o.begin + begin_shift <= pseg_o.begin and segment_o.last + end_shift >= pseg_o.last)
|
44
|
+
end_shift += diff
|
45
|
+
# Inside
|
46
|
+
when (segment_o.begin + begin_shift >= pseg_o.begin and segment_o.last + end_shift <= pseg_o.last)
|
47
|
+
return nil
|
48
|
+
# Overlaps start
|
49
|
+
when (segment_o.begin + begin_shift <= pseg_o.begin and segment_o.last + end_shift <= pseg_o.last)
|
50
|
+
return nil
|
51
|
+
# Overlaps end
|
52
|
+
when (segment_o.begin + begin_shift >= pseg_o.begin and segment_o.last + end_shift >= pseg_o.last)
|
53
|
+
return nil
|
54
|
+
else
|
55
|
+
raise "Unknown overlaps: #{segment_o.inspect} - #{pseg_o.inspect}"
|
36
56
|
end
|
37
|
-
|
38
|
-
end
|
57
|
+
}
|
39
58
|
|
40
|
-
|
59
|
+
[begin_shift, end_shift]
|
41
60
|
end
|
42
61
|
|
43
|
-
def
|
44
|
-
|
62
|
+
def self.sort(segments)
|
63
|
+
segments.compact.sort do |a,b|
|
64
|
+
case
|
65
|
+
when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
|
66
|
+
0
|
67
|
+
when (a.nil? or a.offset.nil?)
|
68
|
+
-1
|
69
|
+
when (b.nil? or b.offset.nil?)
|
70
|
+
+1
|
71
|
+
# Non-overlap
|
72
|
+
when (a.end < b.offset or b.end < a.offset)
|
73
|
+
b.offset <=> a.offset
|
74
|
+
# b includes a
|
75
|
+
when (a.offset >= b.offset and a.end <= b.end)
|
76
|
+
-1
|
77
|
+
# b includes a
|
78
|
+
when (b.offset >= a.offset and b.end <= a.end)
|
79
|
+
+1
|
80
|
+
# Overlap
|
81
|
+
when (a.offset > b.offset and a.end > b.end or b.offset < a.offset and b.end > a.end)
|
82
|
+
a.length <=> b.length
|
83
|
+
else
|
84
|
+
raise "Unexpected case in sort: #{a.range} - #{b.range}"
|
85
|
+
end
|
86
|
+
end
|
45
87
|
end
|
46
88
|
|
47
|
-
def
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
when Integer === pos
|
52
|
-
transform_pos(pos)
|
53
|
-
else
|
54
|
-
raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
|
55
|
-
end
|
56
|
-
|
57
|
-
self[transformed_pos] = value
|
58
|
-
end
|
89
|
+
def replace(segments, replacement = nil, &block)
|
90
|
+
@transformed_segments ||= {}
|
91
|
+
@transformation_stack ||= []
|
92
|
+
stack = []
|
59
93
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
transform_range(pos)
|
64
|
-
when Integer === pos
|
65
|
-
transform_pos(pos)
|
66
|
-
else
|
67
|
-
raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
|
68
|
-
end
|
69
|
-
|
70
|
-
self[transformed_pos]
|
71
|
-
end
|
94
|
+
Transformed.sort(segments).each do |segment|
|
95
|
+
next if segment.offset.nil?
|
96
|
+
shift = shift segment.range
|
72
97
|
|
73
|
-
|
74
|
-
return false if @transformation_offset_differences.nil? or @transformation_offset_differences.empty?
|
75
|
-
transformation_offset_difference = @transformation_offset_differences.last
|
98
|
+
next if shift.nil?
|
76
99
|
|
77
|
-
|
78
|
-
offset, diff, orig_length, trans_length = info
|
79
|
-
return true if segment_range.begin > offset and segment_range.begin < offset + trans_length or
|
80
|
-
segment_range.end > offset and segment_range.end < offset + trans_length
|
81
|
-
end
|
100
|
+
shift_begin, shift_end = shift
|
82
101
|
|
83
|
-
|
84
|
-
|
102
|
+
text_offset = self.respond_to?(:offset)? self.offset : 0
|
103
|
+
updated_begin = segment.offset + shift_begin - text_offset
|
104
|
+
updated_end = segment.range.last + shift_end - text_offset
|
85
105
|
|
86
|
-
|
87
|
-
replacement ||= block
|
88
|
-
raise "No replacement given" if replacement.nil?
|
89
|
-
transformation_offset_differences = []
|
90
|
-
transformation_original = []
|
106
|
+
updated_range = (updated_begin..updated_end)
|
91
107
|
|
92
|
-
|
93
|
-
untransformed_segment_range_here= segment.range_in(self)
|
94
|
-
transformed_segment_range = self.transform_range(untransformed_segment_range_here)
|
95
|
-
next if conflict?(transformed_segment_range)
|
108
|
+
updated_text = self[updated_begin..updated_end]
|
96
109
|
|
97
|
-
|
110
|
+
original_text = segment.dup
|
111
|
+
segment.replace updated_text
|
98
112
|
|
99
113
|
case
|
114
|
+
when block_given?
|
115
|
+
new = block.call(segment)
|
100
116
|
when String === replacement
|
101
|
-
|
117
|
+
new = replacement
|
102
118
|
when Proc === replacement
|
119
|
+
new = replacement.call(segment)
|
120
|
+
end
|
103
121
|
|
104
|
-
|
105
|
-
save_segment_text = segment.dup
|
106
|
-
save_offset = segment.offset
|
107
|
-
segment.replace text_before_transform
|
108
|
-
segment.offset = transformed_segment_range.begin
|
122
|
+
diff = new.length - segment.length
|
109
123
|
|
110
|
-
|
124
|
+
self[updated_begin..updated_end] = new
|
111
125
|
|
112
|
-
|
113
|
-
segment.replace save_segment_text
|
114
|
-
segment.offset = save_offset
|
115
|
-
else
|
116
|
-
raise "Replacemente not String nor Proc"
|
117
|
-
end
|
118
|
-
diff = segment.length - transformed_text.length
|
119
|
-
self[transformed_segment_range] = transformed_text
|
126
|
+
@transformed_segments[segment.object_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
|
120
127
|
|
121
|
-
|
122
|
-
|
128
|
+
segment.replace original_text
|
129
|
+
stack << segment.object_id
|
123
130
|
end
|
131
|
+
@transformation_stack << stack
|
132
|
+
end
|
124
133
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
134
|
+
def fix_segment(segment, range, diff)
|
135
|
+
case
|
136
|
+
# Before
|
137
|
+
when segment.end < range.begin
|
138
|
+
# After
|
139
|
+
when segment.offset > range.end + diff
|
140
|
+
segment.offset -= diff
|
141
|
+
# Includes
|
142
|
+
when (segment.offset <= range.begin and segment.end >= range.end + diff)
|
143
|
+
segment.replace self[segment.offset..segment.end - diff]
|
144
|
+
else
|
145
|
+
raise "Segment Overlaps"
|
146
|
+
end
|
129
147
|
end
|
130
148
|
|
131
|
-
def restore(segments
|
132
|
-
|
133
|
-
while self.transformation_offset_differences.any? and not stop
|
134
|
-
transformation_offset_differences = self.transformation_offset_differences.pop
|
135
|
-
transformation_original = self.transformation_original.pop
|
149
|
+
def restore(segments, first_only = false)
|
150
|
+
return segments if @transformation_stack.empty?
|
136
151
|
|
137
|
-
|
138
|
-
|
139
|
-
|
152
|
+
if first_only
|
153
|
+
@transformation_stack.pop.reverse.each do |id|
|
154
|
+
orig_range, diff, text, range = @transformed_segments.delete id
|
140
155
|
|
141
|
-
|
142
|
-
self
|
156
|
+
new_range = (range.begin..range.last + diff)
|
157
|
+
self[new_range] = text
|
158
|
+
segments.each do |segment|
|
159
|
+
next unless Segment === segment
|
160
|
+
fix_segment(segment, range, diff)
|
161
|
+
end if Array === segments
|
143
162
|
end
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
segment_ranges = segments.each do |segment|
|
150
|
-
r = segment.range
|
151
|
-
|
152
|
-
s = r.begin
|
153
|
-
e = r.end
|
154
|
-
sdiff = 0
|
155
|
-
ediff = 0
|
156
|
-
transformation_offset_differences.reverse.each do |offset,diff,orig_length,rep_length|
|
157
|
-
sdiff += diff if offset < s
|
158
|
-
ediff += diff if offset + rep_length - 1 < e
|
159
|
-
end
|
160
|
-
|
161
|
-
segment.offset = s + sdiff
|
162
|
-
segment.replace self[(s+sdiff)..(e + ediff)]
|
163
|
+
segments
|
164
|
+
else
|
165
|
+
while @transformation_stack.any?
|
166
|
+
restore(segments, true)
|
163
167
|
end
|
168
|
+
segments
|
164
169
|
end
|
165
|
-
|
166
|
-
segments
|
167
170
|
end
|
168
171
|
end
|
169
|
-
|
170
|
-
|
@@ -110,7 +110,7 @@ class TokenTrieNER < NER
|
|
110
110
|
end
|
111
111
|
|
112
112
|
def self.merge(index1, index2)
|
113
|
-
index1.write if index1.respond_to? :write
|
113
|
+
index1.write if index1.respond_to? :write and not index1.write?
|
114
114
|
index2.each do |key, new_index2|
|
115
115
|
case
|
116
116
|
when key == :END
|
@@ -119,7 +119,8 @@ class TokenTrieNER < NER
|
|
119
119
|
end1.uniq!
|
120
120
|
index1[:END] = end1
|
121
121
|
when index1.include?(key)
|
122
|
-
|
122
|
+
new = merge(index1[key], new_index2)
|
123
|
+
index1[key] = new
|
123
124
|
else
|
124
125
|
index1[key] = new_index2
|
125
126
|
end
|
@@ -148,7 +149,10 @@ class TokenTrieNER < NER
|
|
148
149
|
tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
|
149
150
|
tokens.extend EnumeratedArray
|
150
151
|
|
151
|
-
|
152
|
+
token_index = index_for_tokens(tokens, code, type, slack)
|
153
|
+
|
154
|
+
tmp_index = merge(tmp_index, token_index) unless tokens.empty?
|
155
|
+
|
152
156
|
items_in_chunk += 1
|
153
157
|
|
154
158
|
if items_in_chunk > chunk_size
|
@@ -267,22 +271,22 @@ class TokenTrieNER < NER
|
|
267
271
|
TokenTrieNER.merge(@index, new.index)
|
268
272
|
when TSV === new
|
269
273
|
Log.debug "TokenTrieNER merging TSV"
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
new.unnamed = old_unnamed
|
276
|
-
new.monitor = old_monitor
|
274
|
+
new.with_unnamed do
|
275
|
+
new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
|
276
|
+
TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
|
277
|
+
end
|
278
|
+
end
|
277
279
|
when Hash === new
|
278
280
|
Log.debug "TokenTrieNER merging Hash"
|
279
281
|
TokenTrieNER.merge(@index, new)
|
280
282
|
when String === new
|
281
283
|
Log.debug "TokenTrieNER merging file: #{ new }"
|
282
284
|
new = TSV.open(new, :flat)
|
283
|
-
new.
|
284
|
-
|
285
|
-
|
285
|
+
new.with_unnamed do
|
286
|
+
new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
|
287
|
+
TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
|
288
|
+
end
|
289
|
+
end
|
286
290
|
end
|
287
291
|
end
|
288
292
|
|
@@ -2,7 +2,19 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
|
|
2
2
|
require 'rbbt/corpus/document'
|
3
3
|
require 'test/unit'
|
4
4
|
|
5
|
+
module TokenEntity
|
6
|
+
extend Annotation
|
7
|
+
include Segment
|
8
|
+
self.annotation :original
|
9
|
+
end
|
5
10
|
class Document
|
11
|
+
|
12
|
+
def tokenize(text)
|
13
|
+
Token.tokenize(text).collect do |token|
|
14
|
+
TokenEntity.setup(token.dup, token.offset, token.original)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
6
18
|
define :sentences do
|
7
19
|
require 'rbbt/nlp/nlp'
|
8
20
|
NLP.geniass_sentence_splitter(text)
|
@@ -10,22 +22,22 @@ class Document
|
|
10
22
|
|
11
23
|
define :tokens do
|
12
24
|
require 'rbbt/ner/segment/token'
|
13
|
-
|
25
|
+
tokenize(text)
|
14
26
|
end
|
15
27
|
|
16
28
|
define :long_words do
|
17
29
|
require 'rbbt/ner/segment/token'
|
18
|
-
|
30
|
+
tokenize(text).select{|tok| tok.length > 5}
|
19
31
|
end
|
20
32
|
|
21
33
|
define :short_words do
|
22
34
|
require 'rbbt/ner/segment/token'
|
23
|
-
|
35
|
+
tokenize(text).select{|tok| tok.length < 5}
|
24
36
|
end
|
25
37
|
|
26
38
|
define :even_words do
|
27
39
|
require 'rbbt/ner/segment/token'
|
28
|
-
|
40
|
+
tokenize(text).select{|tok| tok.length % 2 == 0}
|
29
41
|
end
|
30
42
|
|
31
43
|
define :missing do
|
@@ -110,7 +122,7 @@ another sentence.
|
|
110
122
|
doc = Document.new(dir)
|
111
123
|
doc.text = text
|
112
124
|
|
113
|
-
sentence = doc.sentences.last
|
125
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
114
126
|
doc.load_into sentence, :tokens
|
115
127
|
|
116
128
|
assert_equal 5, sentence.tokens.length
|
@@ -134,7 +146,7 @@ another sentence.
|
|
134
146
|
doc = Document.new(dir)
|
135
147
|
doc.text = text
|
136
148
|
|
137
|
-
sentence = doc.sentences.last
|
149
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
138
150
|
Misc.benchmark(1) do
|
139
151
|
doc = Document.new(dir)
|
140
152
|
doc.text = text
|
@@ -166,7 +178,7 @@ another sentence.
|
|
166
178
|
doc = Document.new(dir)
|
167
179
|
doc.text = text * 10
|
168
180
|
|
169
|
-
sentence = doc.sentences.last
|
181
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
170
182
|
|
171
183
|
doc.load_into sentence, :tokens, :long_words
|
172
184
|
|
@@ -178,9 +190,9 @@ another sentence.
|
|
178
190
|
doc = Document.new(dir)
|
179
191
|
doc.text = text * 10
|
180
192
|
doc.sentences
|
181
|
-
assert_equal sentence, doc.sentences.last
|
193
|
+
assert_equal sentence, doc.sentences.sort_by{|sentence| sentence.offset}.last
|
182
194
|
|
183
|
-
sentence = doc.sentences.last
|
195
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
184
196
|
doc.load_into sentence, :tokens, :long_words
|
185
197
|
|
186
198
|
assert_equal 2, sentence.long_words.length
|
@@ -211,7 +223,7 @@ another sentence.
|
|
211
223
|
doc.text = text * 10
|
212
224
|
doc.docid = "TEST"
|
213
225
|
|
214
|
-
sentence = doc.sentences.last
|
226
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
215
227
|
|
216
228
|
doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
|
217
229
|
|
@@ -1,9 +1,11 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/ner/segment/transformed'
|
3
3
|
require 'rbbt/ner/segment/named_entity'
|
4
|
+
require 'rexml/document'
|
5
|
+
require 'rand'
|
4
6
|
|
5
7
|
class TestClass < Test::Unit::TestCase
|
6
|
-
def
|
8
|
+
def tttest_transform
|
7
9
|
a = "This sentence mentions the TP53 gene and the CDK5 protein"
|
8
10
|
original = a.dup
|
9
11
|
|
@@ -56,11 +58,13 @@ class TestClass < Test::Unit::TestCase
|
|
56
58
|
Transformed.with_transform(a, [gene1], "GN") do
|
57
59
|
assert_equal original.sub("TP53", 'GN'), a
|
58
60
|
end
|
61
|
+
|
59
62
|
assert_equal original, a
|
60
63
|
|
61
|
-
Transformed.with_transform(a, [gene1,gene2], "GN") do
|
64
|
+
Transformed.with_transform(a, [gene1, gene2], "GN") do
|
62
65
|
assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
|
63
66
|
end
|
67
|
+
|
64
68
|
assert_equal original, a
|
65
69
|
|
66
70
|
Transformed.with_transform(a, [gene1], "GN") do
|
@@ -69,6 +73,7 @@ class TestClass < Test::Unit::TestCase
|
|
69
73
|
end
|
70
74
|
assert_equal original.gsub(/TP53/, 'GN'), a
|
71
75
|
end
|
76
|
+
|
72
77
|
assert_equal original, a
|
73
78
|
|
74
79
|
exp1, exp2 = nil, nil
|
@@ -169,7 +174,37 @@ class TestClass < Test::Unit::TestCase
|
|
169
174
|
assert_equal one, a
|
170
175
|
end
|
171
176
|
end
|
172
|
-
|
173
177
|
end
|
178
|
+
|
179
|
+
def test_error
|
180
|
+
a = "Do not have a diagnosis of another hereditary APC resistance/Factor V Leiden, Protein S or C deficiency, prothrombin gene mutation (G20210A), or acquired (lupus anticoagulant) thrombophilic disorder"
|
181
|
+
|
182
|
+
entity1 = "gene"
|
183
|
+
entity1.extend NamedEntity
|
184
|
+
entity1.offset = a.index entity1
|
185
|
+
entity1.type = "Gene"
|
186
|
+
|
187
|
+
entity2 = "prothrombin gene mutation"
|
188
|
+
entity2.extend NamedEntity
|
189
|
+
entity2.offset = a.index entity2
|
190
|
+
entity2.type = "Mutation"
|
191
|
+
|
192
|
+
entity3 = "Protein S or C"
|
193
|
+
entity3.extend NamedEntity
|
194
|
+
entity3.offset = a.index entity3
|
195
|
+
entity3.type = "Gene"
|
196
|
+
|
197
|
+
entity4 = "prothrombin gene mutation"
|
198
|
+
entity4.extend NamedEntity
|
199
|
+
entity4.offset = a.index entity2
|
200
|
+
entity4.type = "Disease"
|
201
|
+
|
202
|
+
|
203
|
+
Transformed.with_transform(a, [entity1].sort_by{rand}, Proc.new{|e| e.html}) do
|
204
|
+
Transformed.with_transform(a, [entity3, entity2, entity4].sort_by{rand}, Proc.new{|e| e.html}) do
|
205
|
+
assert_nothing_raised{REXML::Document.new "<xml>"+ a + "</xml>"}
|
206
|
+
end
|
207
|
+
end
|
208
|
+
end
|
174
209
|
end
|
175
210
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 1
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 6
|
9
|
-
-
|
10
|
-
version: 0.6.
|
9
|
+
- 3
|
10
|
+
version: 0.6.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
19
|
-
default_executable: get_ppis.rb
|
18
|
+
date: 2012-02-09 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: rbbt-util
|
@@ -106,6 +105,7 @@ files:
|
|
106
105
|
- lib/rbbt/corpus/document.rb
|
107
106
|
- lib/rbbt/corpus/document_repo.rb
|
108
107
|
- lib/rbbt/corpus/sources/pubmed.rb
|
108
|
+
- lib/rbbt/entity/document.rb
|
109
109
|
- lib/rbbt/ner/NER.rb
|
110
110
|
- lib/rbbt/ner/abner.rb
|
111
111
|
- lib/rbbt/ner/banner.rb
|
@@ -161,7 +161,6 @@ files:
|
|
161
161
|
- test/rbbt/corpus/test_corpus.rb
|
162
162
|
- test/rbbt/corpus/test_document.rb
|
163
163
|
- bin/get_ppis.rb
|
164
|
-
has_rdoc: true
|
165
164
|
homepage: http://github.com/mikisvaz/rbbt-util
|
166
165
|
licenses: []
|
167
166
|
|
@@ -191,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
191
190
|
requirements: []
|
192
191
|
|
193
192
|
rubyforge_project:
|
194
|
-
rubygems_version: 1.
|
193
|
+
rubygems_version: 1.8.10
|
195
194
|
signing_key:
|
196
195
|
specification_version: 3
|
197
196
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|