rbbt-text 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/bow/dictionary.rb +1 -0
- data/lib/rbbt/corpus/document.rb +5 -2
- data/lib/rbbt/corpus/document_repo.rb +2 -1
- data/lib/rbbt/entity/document.rb +40 -0
- data/lib/rbbt/ner/segment.rb +9 -2
- data/lib/rbbt/ner/segment/named_entity.rb +4 -2
- data/lib/rbbt/ner/segment/token.rb +28 -3
- data/lib/rbbt/ner/segment/transformed.rb +116 -115
- data/lib/rbbt/ner/token_trieNER.rb +17 -13
- data/test/rbbt/corpus/test_document.rb +22 -10
- data/test/rbbt/ner/segment/test_named_entity.rb +1 -1
- data/test/rbbt/ner/segment/test_transformed.rb +38 -3
- metadata +6 -7
data/lib/rbbt/bow/dictionary.rb
CHANGED
data/lib/rbbt/corpus/document.rb
CHANGED
@@ -148,7 +148,9 @@ class Document
|
|
148
148
|
fields = data.fields if fields.nil? and data.respond_to? :fields
|
149
149
|
|
150
150
|
|
151
|
-
data.
|
151
|
+
if data.respond_to? :persistence_path and String === data.persistence_path
|
152
|
+
data.filter(data.persistence_path + '.filters')
|
153
|
+
end
|
152
154
|
data.add_filter("field:#{ doc_field }", @docid)
|
153
155
|
data.add_filter("field:#{ entity_field }", "#{ entity }")
|
154
156
|
keys = data.keys
|
@@ -157,7 +159,7 @@ class Document
|
|
157
159
|
|
158
160
|
if keys.empty?
|
159
161
|
segments = produce_#{entity}
|
160
|
-
segments << Segment.setup("No #{entity} found in document
|
162
|
+
segments << Segment.setup("No #{entity} found in document " + @docid.to_s, -1) if segments.empty?
|
161
163
|
tsv = Segment.tsv(segments, *fields.reject{|f| ["#{doc_field}", "#{entity_field}", "Start", "End", "annotation_types"].include? f})
|
162
164
|
|
163
165
|
tsv.add_field "#{ doc_field }" do
|
@@ -178,6 +180,7 @@ class Document
|
|
178
180
|
data.pop_filter
|
179
181
|
data.pop_filter
|
180
182
|
data.read
|
183
|
+
|
181
184
|
else
|
182
185
|
if raw == :check
|
183
186
|
data.close
|
@@ -0,0 +1,40 @@
|
|
1
|
+
require 'rbbt/entity'
|
2
|
+
|
3
|
+
module Document
|
4
|
+
extend Entity
|
5
|
+
|
6
|
+
class << self
|
7
|
+
attr_accessor :corpus
|
8
|
+
end
|
9
|
+
|
10
|
+
property :text => :array2single do
|
11
|
+
article_text = {}
|
12
|
+
missing = []
|
13
|
+
|
14
|
+
self.each do |doc|
|
15
|
+
Document.corpus.read if Document.corpus.respond_to? :read
|
16
|
+
if Document.corpus.include?(doc)
|
17
|
+
article_text[doc] = Document.corpus[doc]
|
18
|
+
else
|
19
|
+
missing << doc
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
if missing.any?
|
24
|
+
missing.first.annotate missing
|
25
|
+
missing_text = Misc.process_to_hash(missing){|list| list._get_text}
|
26
|
+
|
27
|
+
Misc.lock Document.corpus.persistence_path do
|
28
|
+
Document.corpus.write if Document.corpus.respond_to? :write
|
29
|
+
missing_text.each do |doc, text|
|
30
|
+
article_text[doc] = text
|
31
|
+
Document.corpus[doc] = text
|
32
|
+
end
|
33
|
+
Document.corpus.read if Document.corpus.respond_to? :read
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
article_text.values_at *self
|
38
|
+
end
|
39
|
+
|
40
|
+
end
|
data/lib/rbbt/ner/segment.rb
CHANGED
@@ -5,6 +5,10 @@ module Segment
|
|
5
5
|
extend Annotation
|
6
6
|
self.annotation :offset
|
7
7
|
|
8
|
+
def offset=(offset)
|
9
|
+
@offset = offset.nil? ? nil : offset.to_i
|
10
|
+
end
|
11
|
+
|
8
12
|
#{{{ Ranges
|
9
13
|
|
10
14
|
def end
|
@@ -297,8 +301,11 @@ module Segment
|
|
297
301
|
end
|
298
302
|
|
299
303
|
def self.load_tsv(tsv)
|
300
|
-
tsv.
|
301
|
-
|
304
|
+
fields = tsv.fields
|
305
|
+
tsv.with_unnamed do
|
306
|
+
tsv.collect do |id, values|
|
307
|
+
Annotated.load_tsv_values(id, values, fields)
|
308
|
+
end
|
302
309
|
end
|
303
310
|
end
|
304
311
|
|
@@ -2,9 +2,34 @@ require 'rbbt/annotations'
|
|
2
2
|
require 'rbbt/ner/segment'
|
3
3
|
|
4
4
|
module Token
|
5
|
-
|
6
|
-
|
7
|
-
self.
|
5
|
+
attr_accessor :offset, :original
|
6
|
+
|
7
|
+
def self.all_annotations
|
8
|
+
[:offset, :original]
|
9
|
+
end
|
10
|
+
|
11
|
+
def self.setup(text, start, original = nil)
|
12
|
+
text.extend Token
|
13
|
+
text.offset = start
|
14
|
+
text.original = original
|
15
|
+
text
|
16
|
+
end
|
17
|
+
|
18
|
+
def info
|
19
|
+
{:original => original, :offset => offset}
|
20
|
+
end
|
21
|
+
|
22
|
+
def id
|
23
|
+
Misc.hash2md5 info.merge :self => self
|
24
|
+
end
|
25
|
+
|
26
|
+
def end
|
27
|
+
offset + self.length - 1
|
28
|
+
end
|
29
|
+
|
30
|
+
def range
|
31
|
+
(offset..self.end)
|
32
|
+
end
|
8
33
|
|
9
34
|
def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
|
10
35
|
|
@@ -1,9 +1,9 @@
|
|
1
|
+
require 'rbbt/util/misc'
|
1
2
|
require 'rbbt/ner/segment'
|
3
|
+
|
2
4
|
module Transformed
|
3
|
-
attr_accessor :transformation_offset_differences, :transformation_original
|
4
5
|
|
5
6
|
def self.transform(text, segments, replacement = nil, &block)
|
6
|
-
require 'rbbt/util/misc'
|
7
7
|
|
8
8
|
text.extend Transformed
|
9
9
|
text.replace(segments, replacement, &block)
|
@@ -12,7 +12,6 @@ module Transformed
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def self.with_transform(text, segments, replacement)
|
15
|
-
require 'rbbt/util/misc'
|
16
15
|
|
17
16
|
text.extend Transformed
|
18
17
|
text.replace(segments, replacement)
|
@@ -24,147 +23,149 @@ module Transformed
|
|
24
23
|
text.restore(segments, true)
|
25
24
|
end
|
26
25
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
26
|
+
attr_accessor :transformed_segments, :transformation_stack
|
27
|
+
|
28
|
+
def shift(segment_o)
|
29
|
+
begin_shift = 0
|
30
|
+
end_shift = 0
|
31
|
+
|
32
|
+
@transformed_segments.sort_by{|id, info| info.last}.each{|id,info|
|
33
|
+
pseg_o, diff = info
|
34
|
+
|
35
|
+
case
|
36
|
+
# Before
|
37
|
+
when segment_o.last + end_shift < pseg_o.begin
|
38
|
+
# After
|
39
|
+
when (segment_o.begin + begin_shift > pseg_o.last)
|
40
|
+
begin_shift += diff
|
41
|
+
end_shift += diff
|
42
|
+
# Includes
|
43
|
+
when (segment_o.begin + begin_shift <= pseg_o.begin and segment_o.last + end_shift >= pseg_o.last)
|
44
|
+
end_shift += diff
|
45
|
+
# Inside
|
46
|
+
when (segment_o.begin + begin_shift >= pseg_o.begin and segment_o.last + end_shift <= pseg_o.last)
|
47
|
+
return nil
|
48
|
+
# Overlaps start
|
49
|
+
when (segment_o.begin + begin_shift <= pseg_o.begin and segment_o.last + end_shift <= pseg_o.last)
|
50
|
+
return nil
|
51
|
+
# Overlaps end
|
52
|
+
when (segment_o.begin + begin_shift >= pseg_o.begin and segment_o.last + end_shift >= pseg_o.last)
|
53
|
+
return nil
|
54
|
+
else
|
55
|
+
raise "Unknown overlaps: #{segment_o.inspect} - #{pseg_o.inspect}"
|
36
56
|
end
|
37
|
-
|
38
|
-
end
|
57
|
+
}
|
39
58
|
|
40
|
-
|
59
|
+
[begin_shift, end_shift]
|
41
60
|
end
|
42
61
|
|
43
|
-
def
|
44
|
-
|
62
|
+
def self.sort(segments)
|
63
|
+
segments.compact.sort do |a,b|
|
64
|
+
case
|
65
|
+
when ((a.nil? and b.nil?) or (a.offset.nil? and b.offset.nil?))
|
66
|
+
0
|
67
|
+
when (a.nil? or a.offset.nil?)
|
68
|
+
-1
|
69
|
+
when (b.nil? or b.offset.nil?)
|
70
|
+
+1
|
71
|
+
# Non-overlap
|
72
|
+
when (a.end < b.offset or b.end < a.offset)
|
73
|
+
b.offset <=> a.offset
|
74
|
+
# b includes a
|
75
|
+
when (a.offset >= b.offset and a.end <= b.end)
|
76
|
+
-1
|
77
|
+
# a includes b
|
78
|
+
when (b.offset >= a.offset and b.end <= a.end)
|
79
|
+
+1
|
80
|
+
# Overlap
|
81
|
+
when (a.offset > b.offset and a.end > b.end or b.offset < a.offset and b.end > a.end)
|
82
|
+
a.length <=> b.length
|
83
|
+
else
|
84
|
+
raise "Unexpected case in sort: #{a.range} - #{b.range}"
|
85
|
+
end
|
86
|
+
end
|
45
87
|
end
|
46
88
|
|
47
|
-
def
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
when Integer === pos
|
52
|
-
transform_pos(pos)
|
53
|
-
else
|
54
|
-
raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
|
55
|
-
end
|
56
|
-
|
57
|
-
self[transformed_pos] = value
|
58
|
-
end
|
89
|
+
def replace(segments, replacement = nil, &block)
|
90
|
+
@transformed_segments ||= {}
|
91
|
+
@transformation_stack ||= []
|
92
|
+
stack = []
|
59
93
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
transform_range(pos)
|
64
|
-
when Integer === pos
|
65
|
-
transform_pos(pos)
|
66
|
-
else
|
67
|
-
raise "Text position not understood '#{pos.inspect}'. Not Range or Integer"
|
68
|
-
end
|
69
|
-
|
70
|
-
self[transformed_pos]
|
71
|
-
end
|
94
|
+
Transformed.sort(segments).each do |segment|
|
95
|
+
next if segment.offset.nil?
|
96
|
+
shift = shift segment.range
|
72
97
|
|
73
|
-
|
74
|
-
return false if @transformation_offset_differences.nil? or @transformation_offset_differences.empty?
|
75
|
-
transformation_offset_difference = @transformation_offset_differences.last
|
98
|
+
next if shift.nil?
|
76
99
|
|
77
|
-
|
78
|
-
offset, diff, orig_length, trans_length = info
|
79
|
-
return true if segment_range.begin > offset and segment_range.begin < offset + trans_length or
|
80
|
-
segment_range.end > offset and segment_range.end < offset + trans_length
|
81
|
-
end
|
100
|
+
shift_begin, shift_end = shift
|
82
101
|
|
83
|
-
|
84
|
-
|
102
|
+
text_offset = self.respond_to?(:offset)? self.offset : 0
|
103
|
+
updated_begin = segment.offset + shift_begin - text_offset
|
104
|
+
updated_end = segment.range.last + shift_end - text_offset
|
85
105
|
|
86
|
-
|
87
|
-
replacement ||= block
|
88
|
-
raise "No replacement given" if replacement.nil?
|
89
|
-
transformation_offset_differences = []
|
90
|
-
transformation_original = []
|
106
|
+
updated_range = (updated_begin..updated_end)
|
91
107
|
|
92
|
-
|
93
|
-
untransformed_segment_range_here= segment.range_in(self)
|
94
|
-
transformed_segment_range = self.transform_range(untransformed_segment_range_here)
|
95
|
-
next if conflict?(transformed_segment_range)
|
108
|
+
updated_text = self[updated_begin..updated_end]
|
96
109
|
|
97
|
-
|
110
|
+
original_text = segment.dup
|
111
|
+
segment.replace updated_text
|
98
112
|
|
99
113
|
case
|
114
|
+
when block_given?
|
115
|
+
new = block.call(segment)
|
100
116
|
when String === replacement
|
101
|
-
|
117
|
+
new = replacement
|
102
118
|
when Proc === replacement
|
119
|
+
new = replacement.call(segment)
|
120
|
+
end
|
103
121
|
|
104
|
-
|
105
|
-
save_segment_text = segment.dup
|
106
|
-
save_offset = segment.offset
|
107
|
-
segment.replace text_before_transform
|
108
|
-
segment.offset = transformed_segment_range.begin
|
122
|
+
diff = new.length - segment.length
|
109
123
|
|
110
|
-
|
124
|
+
self[updated_begin..updated_end] = new
|
111
125
|
|
112
|
-
|
113
|
-
segment.replace save_segment_text
|
114
|
-
segment.offset = save_offset
|
115
|
-
else
|
116
|
-
raise "Replacemente not String nor Proc"
|
117
|
-
end
|
118
|
-
diff = segment.length - transformed_text.length
|
119
|
-
self[transformed_segment_range] = transformed_text
|
126
|
+
@transformed_segments[segment.object_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
|
120
127
|
|
121
|
-
|
122
|
-
|
128
|
+
segment.replace original_text
|
129
|
+
stack << segment.object_id
|
123
130
|
end
|
131
|
+
@transformation_stack << stack
|
132
|
+
end
|
124
133
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
134
|
+
def fix_segment(segment, range, diff)
|
135
|
+
case
|
136
|
+
# Before
|
137
|
+
when segment.end < range.begin
|
138
|
+
# After
|
139
|
+
when segment.offset > range.end + diff
|
140
|
+
segment.offset -= diff
|
141
|
+
# Includes
|
142
|
+
when (segment.offset <= range.begin and segment.end >= range.end + diff)
|
143
|
+
segment.replace self[segment.offset..segment.end - diff]
|
144
|
+
else
|
145
|
+
raise "Segment Overlaps"
|
146
|
+
end
|
129
147
|
end
|
130
148
|
|
131
|
-
def restore(segments
|
132
|
-
|
133
|
-
while self.transformation_offset_differences.any? and not stop
|
134
|
-
transformation_offset_differences = self.transformation_offset_differences.pop
|
135
|
-
transformation_original = self.transformation_original.pop
|
149
|
+
def restore(segments, first_only = false)
|
150
|
+
return segments if @transformation_stack.empty?
|
136
151
|
|
137
|
-
|
138
|
-
|
139
|
-
|
152
|
+
if first_only
|
153
|
+
@transformation_stack.pop.reverse.each do |id|
|
154
|
+
orig_range, diff, text, range = @transformed_segments.delete id
|
140
155
|
|
141
|
-
|
142
|
-
self
|
156
|
+
new_range = (range.begin..range.last + diff)
|
157
|
+
self[new_range] = text
|
158
|
+
segments.each do |segment|
|
159
|
+
next unless Segment === segment
|
160
|
+
fix_segment(segment, range, diff)
|
161
|
+
end if Array === segments
|
143
162
|
end
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
segment_ranges = segments.each do |segment|
|
150
|
-
r = segment.range
|
151
|
-
|
152
|
-
s = r.begin
|
153
|
-
e = r.end
|
154
|
-
sdiff = 0
|
155
|
-
ediff = 0
|
156
|
-
transformation_offset_differences.reverse.each do |offset,diff,orig_length,rep_length|
|
157
|
-
sdiff += diff if offset < s
|
158
|
-
ediff += diff if offset + rep_length - 1 < e
|
159
|
-
end
|
160
|
-
|
161
|
-
segment.offset = s + sdiff
|
162
|
-
segment.replace self[(s+sdiff)..(e + ediff)]
|
163
|
+
segments
|
164
|
+
else
|
165
|
+
while @transformation_stack.any?
|
166
|
+
restore(segments, true)
|
163
167
|
end
|
168
|
+
segments
|
164
169
|
end
|
165
|
-
|
166
|
-
segments
|
167
170
|
end
|
168
171
|
end
|
169
|
-
|
170
|
-
|
@@ -110,7 +110,7 @@ class TokenTrieNER < NER
|
|
110
110
|
end
|
111
111
|
|
112
112
|
def self.merge(index1, index2)
|
113
|
-
index1.write if index1.respond_to? :write
|
113
|
+
index1.write if index1.respond_to? :write and not index1.write?
|
114
114
|
index2.each do |key, new_index2|
|
115
115
|
case
|
116
116
|
when key == :END
|
@@ -119,7 +119,8 @@ class TokenTrieNER < NER
|
|
119
119
|
end1.uniq!
|
120
120
|
index1[:END] = end1
|
121
121
|
when index1.include?(key)
|
122
|
-
|
122
|
+
new = merge(index1[key], new_index2)
|
123
|
+
index1[key] = new
|
123
124
|
else
|
124
125
|
index1[key] = new_index2
|
125
126
|
end
|
@@ -148,7 +149,10 @@ class TokenTrieNER < NER
|
|
148
149
|
tokens = Array === name ? name : tokenize(name, false, split_at, no_clean)
|
149
150
|
tokens.extend EnumeratedArray
|
150
151
|
|
151
|
-
|
152
|
+
token_index = index_for_tokens(tokens, code, type, slack)
|
153
|
+
|
154
|
+
tmp_index = merge(tmp_index, token_index) unless tokens.empty?
|
155
|
+
|
152
156
|
items_in_chunk += 1
|
153
157
|
|
154
158
|
if items_in_chunk > chunk_size
|
@@ -267,22 +271,22 @@ class TokenTrieNER < NER
|
|
267
271
|
TokenTrieNER.merge(@index, new.index)
|
268
272
|
when TSV === new
|
269
273
|
Log.debug "TokenTrieNER merging TSV"
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
new.unnamed = old_unnamed
|
276
|
-
new.monitor = old_monitor
|
274
|
+
new.with_unnamed do
|
275
|
+
new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
|
276
|
+
TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
|
277
|
+
end
|
278
|
+
end
|
277
279
|
when Hash === new
|
278
280
|
Log.debug "TokenTrieNER merging Hash"
|
279
281
|
TokenTrieNER.merge(@index, new)
|
280
282
|
when String === new
|
281
283
|
Log.debug "TokenTrieNER merging file: #{ new }"
|
282
284
|
new = TSV.open(new, :flat)
|
283
|
-
new.
|
284
|
-
|
285
|
-
|
285
|
+
new.with_unnamed do
|
286
|
+
new.with_monitor({:step => 1000, :desc => "Processing TSV into TokenTrieNER"}) do
|
287
|
+
TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
|
288
|
+
end
|
289
|
+
end
|
286
290
|
end
|
287
291
|
end
|
288
292
|
|
@@ -2,7 +2,19 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
|
|
2
2
|
require 'rbbt/corpus/document'
|
3
3
|
require 'test/unit'
|
4
4
|
|
5
|
+
module TokenEntity
|
6
|
+
extend Annotation
|
7
|
+
include Segment
|
8
|
+
self.annotation :original
|
9
|
+
end
|
5
10
|
class Document
|
11
|
+
|
12
|
+
def tokenize(text)
|
13
|
+
Token.tokenize(text).collect do |token|
|
14
|
+
TokenEntity.setup(token.dup, token.offset, token.original)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
6
18
|
define :sentences do
|
7
19
|
require 'rbbt/nlp/nlp'
|
8
20
|
NLP.geniass_sentence_splitter(text)
|
@@ -10,22 +22,22 @@ class Document
|
|
10
22
|
|
11
23
|
define :tokens do
|
12
24
|
require 'rbbt/ner/segment/token'
|
13
|
-
|
25
|
+
tokenize(text)
|
14
26
|
end
|
15
27
|
|
16
28
|
define :long_words do
|
17
29
|
require 'rbbt/ner/segment/token'
|
18
|
-
|
30
|
+
tokenize(text).select{|tok| tok.length > 5}
|
19
31
|
end
|
20
32
|
|
21
33
|
define :short_words do
|
22
34
|
require 'rbbt/ner/segment/token'
|
23
|
-
|
35
|
+
tokenize(text).select{|tok| tok.length < 5}
|
24
36
|
end
|
25
37
|
|
26
38
|
define :even_words do
|
27
39
|
require 'rbbt/ner/segment/token'
|
28
|
-
|
40
|
+
tokenize(text).select{|tok| tok.length % 2 == 0}
|
29
41
|
end
|
30
42
|
|
31
43
|
define :missing do
|
@@ -110,7 +122,7 @@ another sentence.
|
|
110
122
|
doc = Document.new(dir)
|
111
123
|
doc.text = text
|
112
124
|
|
113
|
-
sentence = doc.sentences.last
|
125
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
114
126
|
doc.load_into sentence, :tokens
|
115
127
|
|
116
128
|
assert_equal 5, sentence.tokens.length
|
@@ -134,7 +146,7 @@ another sentence.
|
|
134
146
|
doc = Document.new(dir)
|
135
147
|
doc.text = text
|
136
148
|
|
137
|
-
sentence = doc.sentences.last
|
149
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
138
150
|
Misc.benchmark(1) do
|
139
151
|
doc = Document.new(dir)
|
140
152
|
doc.text = text
|
@@ -166,7 +178,7 @@ another sentence.
|
|
166
178
|
doc = Document.new(dir)
|
167
179
|
doc.text = text * 10
|
168
180
|
|
169
|
-
sentence = doc.sentences.last
|
181
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
170
182
|
|
171
183
|
doc.load_into sentence, :tokens, :long_words
|
172
184
|
|
@@ -178,9 +190,9 @@ another sentence.
|
|
178
190
|
doc = Document.new(dir)
|
179
191
|
doc.text = text * 10
|
180
192
|
doc.sentences
|
181
|
-
assert_equal sentence, doc.sentences.last
|
193
|
+
assert_equal sentence, doc.sentences.sort_by{|sentence| sentence.offset}.last
|
182
194
|
|
183
|
-
sentence = doc.sentences.last
|
195
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
184
196
|
doc.load_into sentence, :tokens, :long_words
|
185
197
|
|
186
198
|
assert_equal 2, sentence.long_words.length
|
@@ -211,7 +223,7 @@ another sentence.
|
|
211
223
|
doc.text = text * 10
|
212
224
|
doc.docid = "TEST"
|
213
225
|
|
214
|
-
sentence = doc.sentences.last
|
226
|
+
sentence = doc.sentences.sort_by{|sentence| sentence.offset}.last
|
215
227
|
|
216
228
|
doc.load_into sentence, :tokens, :long_words, :short_words, :even_words
|
217
229
|
|
@@ -1,9 +1,11 @@
|
|
1
1
|
require File.join(File.expand_path(File.dirname(__FILE__)), '../../..', 'test_helper.rb')
|
2
2
|
require 'rbbt/ner/segment/transformed'
|
3
3
|
require 'rbbt/ner/segment/named_entity'
|
4
|
+
require 'rexml/document'
|
5
|
+
require 'rand'
|
4
6
|
|
5
7
|
class TestClass < Test::Unit::TestCase
|
6
|
-
def
|
8
|
+
def tttest_transform
|
7
9
|
a = "This sentence mentions the TP53 gene and the CDK5 protein"
|
8
10
|
original = a.dup
|
9
11
|
|
@@ -56,11 +58,13 @@ class TestClass < Test::Unit::TestCase
|
|
56
58
|
Transformed.with_transform(a, [gene1], "GN") do
|
57
59
|
assert_equal original.sub("TP53", 'GN'), a
|
58
60
|
end
|
61
|
+
|
59
62
|
assert_equal original, a
|
60
63
|
|
61
|
-
Transformed.with_transform(a, [gene1,gene2], "GN") do
|
64
|
+
Transformed.with_transform(a, [gene1, gene2], "GN") do
|
62
65
|
assert_equal original.gsub(/TP53|CDK5R1/, 'GN'), a
|
63
66
|
end
|
67
|
+
|
64
68
|
assert_equal original, a
|
65
69
|
|
66
70
|
Transformed.with_transform(a, [gene1], "GN") do
|
@@ -69,6 +73,7 @@ class TestClass < Test::Unit::TestCase
|
|
69
73
|
end
|
70
74
|
assert_equal original.gsub(/TP53/, 'GN'), a
|
71
75
|
end
|
76
|
+
|
72
77
|
assert_equal original, a
|
73
78
|
|
74
79
|
exp1, exp2 = nil, nil
|
@@ -169,7 +174,37 @@ class TestClass < Test::Unit::TestCase
|
|
169
174
|
assert_equal one, a
|
170
175
|
end
|
171
176
|
end
|
172
|
-
|
173
177
|
end
|
178
|
+
|
179
|
+
def test_error
|
180
|
+
a = "Do not have a diagnosis of another hereditary APC resistance/Factor V Leiden, Protein S or C deficiency, prothrombin gene mutation (G20210A), or acquired (lupus anticoagulant) thrombophilic disorder"
|
181
|
+
|
182
|
+
entity1 = "gene"
|
183
|
+
entity1.extend NamedEntity
|
184
|
+
entity1.offset = a.index entity1
|
185
|
+
entity1.type = "Gene"
|
186
|
+
|
187
|
+
entity2 = "prothrombin gene mutation"
|
188
|
+
entity2.extend NamedEntity
|
189
|
+
entity2.offset = a.index entity2
|
190
|
+
entity2.type = "Mutation"
|
191
|
+
|
192
|
+
entity3 = "Protein S or C"
|
193
|
+
entity3.extend NamedEntity
|
194
|
+
entity3.offset = a.index entity3
|
195
|
+
entity3.type = "Gene"
|
196
|
+
|
197
|
+
entity4 = "prothrombin gene mutation"
|
198
|
+
entity4.extend NamedEntity
|
199
|
+
entity4.offset = a.index entity2
|
200
|
+
entity4.type = "Disease"
|
201
|
+
|
202
|
+
|
203
|
+
Transformed.with_transform(a, [entity1].sort_by{rand}, Proc.new{|e| e.html}) do
|
204
|
+
Transformed.with_transform(a, [entity3, entity2, entity4].sort_by{rand}, Proc.new{|e| e.html}) do
|
205
|
+
assert_nothing_raised{REXML::Document.new "<xml>"+ a + "</xml>"}
|
206
|
+
end
|
207
|
+
end
|
208
|
+
end
|
174
209
|
end
|
175
210
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 1
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 6
|
9
|
-
-
|
10
|
-
version: 0.6.
|
9
|
+
- 3
|
10
|
+
version: 0.6.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Miguel Vazquez
|
@@ -15,8 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
19
|
-
default_executable: get_ppis.rb
|
18
|
+
date: 2012-02-09 00:00:00 Z
|
20
19
|
dependencies:
|
21
20
|
- !ruby/object:Gem::Dependency
|
22
21
|
name: rbbt-util
|
@@ -106,6 +105,7 @@ files:
|
|
106
105
|
- lib/rbbt/corpus/document.rb
|
107
106
|
- lib/rbbt/corpus/document_repo.rb
|
108
107
|
- lib/rbbt/corpus/sources/pubmed.rb
|
108
|
+
- lib/rbbt/entity/document.rb
|
109
109
|
- lib/rbbt/ner/NER.rb
|
110
110
|
- lib/rbbt/ner/abner.rb
|
111
111
|
- lib/rbbt/ner/banner.rb
|
@@ -161,7 +161,6 @@ files:
|
|
161
161
|
- test/rbbt/corpus/test_corpus.rb
|
162
162
|
- test/rbbt/corpus/test_document.rb
|
163
163
|
- bin/get_ppis.rb
|
164
|
-
has_rdoc: true
|
165
164
|
homepage: http://github.com/mikisvaz/rbbt-util
|
166
165
|
licenses: []
|
167
166
|
|
@@ -191,7 +190,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
191
190
|
requirements: []
|
192
191
|
|
193
192
|
rubyforge_project:
|
194
|
-
rubygems_version: 1.
|
193
|
+
rubygems_version: 1.8.10
|
195
194
|
signing_key:
|
196
195
|
specification_version: 3
|
197
196
|
summary: Text mining tools for the Ruby Bioinformatics Toolkit (rbbt)
|