rbbt-text 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/corpus/corpus.rb +15 -6
- data/lib/rbbt/corpus/document.rb +100 -127
- data/lib/rbbt/corpus/document_repo.rb +72 -51
- data/lib/rbbt/ner/NER.rb +4 -4
- data/lib/rbbt/ner/abner.rb +5 -4
- data/lib/rbbt/ner/banner.rb +3 -3
- data/lib/rbbt/ner/chemical_tagger.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
- data/lib/rbbt/ner/oscar3.rb +3 -3
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +15 -13
- data/lib/rbbt/ner/regexpNER.rb +3 -2
- data/lib/rbbt/ner/rnorm.rb +2 -2
- data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
- data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
- data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
- data/lib/rbbt/ner/segment/relationship.rb +20 -0
- data/lib/rbbt/ner/segment/segmented.rb +13 -0
- data/lib/rbbt/ner/segment/token.rb +24 -0
- data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
- data/lib/rbbt/ner/token_trieNER.rb +30 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/nlp.rb +23 -37
- data/test/rbbt/corpus/test_document.rb +39 -37
- data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
- data/test/rbbt/ner/segment/test_segmented.rb +23 -0
- data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
- data/test/rbbt/ner/test_patterns.rb +11 -12
- data/test/rbbt/ner/test_regexpNER.rb +5 -4
- data/test/rbbt/ner/test_segment.rb +101 -0
- data/test/rbbt/ner/test_token_trieNER.rb +8 -9
- data/test/test_helper.rb +6 -6
- metadata +40 -22
- data/lib/rbbt/ner/annotations/annotated.rb +0 -15
- data/lib/rbbt/ner/annotations/relations.rb +0 -25
- data/lib/rbbt/ner/annotations/token.rb +0 -28
- data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
- data/test/rbbt/ner/test_annotations.rb +0 -70
data/lib/rbbt/ner/{annotations → segment}/named_entity.rb
RENAMED
@@ -1,17 +1,9 @@
-require 'rbbt/ner/
+require 'rbbt/ner/segment'
 
 module NamedEntity
-
+  extend Annotation
   include Segment
-
-  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
-    string.extend NamedEntity
-    string.offset = offset unless offset.nil?
-    string.type = type unless type.nil?
-    string.code = code unless code.nil?
-    string.score = score unless score.nil?
-    string
-  end
+  self.annotation :type, :code, :score
 
   def report
     <<-EOF
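This hunk is the template for the whole release: hand-written `self.annotate` constructors are replaced by `extend Annotation` plus a `self.annotation` declaration, and callers switch from `Module.annotate(...)` to `Module.setup(...)`. A minimal sketch of the new call pattern, assuming the Annotation API this version of rbbt-util provides (the string and values are illustrative):

    require 'rbbt/ner/segment/named_entity'

    # setup takes the string, then the Segment offset, then the declared
    # annotations (:type, :code, :score) in order, matching the
    # NamedEntity.setup call inside token_trieNER.rb below.
    gene = NamedEntity.setup("TP53", 10, "Gene", "GN:TP53", 0.9)
    gene.offset #=> 10
    gene.type   #=> "Gene"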
data/lib/rbbt/ner/segment/relationship.rb
ADDED
@@ -0,0 +1,20 @@
+require 'rbbt/ner/segment'
+
+module Relationship
+  extend Annotation
+  include Segment
+  self.annotation :terms
+
+  def html
+    text = <<-EOF
+<span class='Relationship'\
+>#{ self }</span>
+    EOF
+    text.chomp
+  end
+
+  def html_with_entities(*types)
+    annotations.values_at(*types).each do |segments|
+    end
+  end
+end
data/lib/rbbt/ner/segment/token.rb
ADDED
@@ -0,0 +1,24 @@
+require 'rbbt/annotations'
+require 'rbbt/ner/segment'
+
+module Token
+  extend Annotation
+  include Segment
+  self.annotation :original
+
+  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
+
+    tokens = []
+    while matchdata = text.match(split_at)
+      tokens << Token.setup(matchdata.pre_match, start) unless matchdata.pre_match.empty?
+      tokens << Token.setup(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      start += matchdata.end(0)
+      text = matchdata.post_match
+    end
+
+    tokens << Token.setup(text, start) unless text.empty?
+
+    tokens
+  end
+end
+
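Token.tokenize is fully visible above, so its behaviour can be read off the diff: the text is consumed match by match, each stretch before a separator becomes a Token, captured punctuation becomes its own Token, and `start` tracks the running offset. For example:

    require 'rbbt/ner/segment/token'

    tokens = Token.tokenize("TP53 phosphorylates MDM2.")
    tokens.collect{|t| [t, t.offset]}
    #=> [["TP53", 0], ["phosphorylates", 5], ["MDM2", 20], [".", 24]]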
data/lib/rbbt/ner/{annotations → segment}/transformed.rb
RENAMED
@@ -1,7 +1,16 @@
-require 'rbbt/ner/
+require 'rbbt/ner/segment'
 module Transformed
   attr_accessor :transformation_offset_differences, :transformation_original
 
+  def self.transform(text, segments, replacement = nil, &block)
+    require 'rbbt/util/misc'
+
+    text.extend Transformed
+    text.replace(segments, replacement, &block)
+
+    text
+  end
+
   def self.with_transform(text, segments, replacement)
     require 'rbbt/util/misc'
 
@@ -14,16 +23,7 @@ module Transformed
 
     text.restore(segments, true)
   end
-
-  def self.transform(text, segments, replacement = nil, &block)
-    require 'rbbt/util/misc'
 
-    text.extend Transformed
-    text.replace(segments, replacement, &block)
-
-    text
-  end
-
   def transform_pos(pos)
     return pos if transformation_offset_differences.nil?
     # tranformation_offset_differences are assumed to be sorted in reverse
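The net effect of these two hunks is a move: `Transformed.transform` now sits before `with_transform` instead of after it. `transform` extends the string and rewrites the given segments in place; `with_transform` (body mostly outside this hunk) does the same and then calls `text.restore(segments, true)` once its block is done. A sketch of the intended use, assuming `replace` yields each segment to the block, as its callers here suggest:

    require 'rbbt/ner/segment/transformed'
    require 'rbbt/ner/segment/named_entity'

    text = "TP53 phosphorylates MDM2."
    gene = NamedEntity.setup("TP53", 0, "Gene")

    # Rewrite each segment with the block's return value; offsets of later
    # segments are corrected through transformation_offset_differences.
    Transformed.transform(text, [gene]) { |segment| "[#{segment.type}]" }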
data/lib/rbbt/ner/token_trieNER.rb
CHANGED
@@ -1,7 +1,7 @@
-require 'rbbt
-require 'rbbt/
-require 'rbbt/ner/
-require 'rbbt/ner/
+require 'rbbt'
+require 'rbbt/tsv'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/token'
 require 'rbbt/ner/NER'
 
 class TokenTrieNER < NER
@@ -16,15 +16,15 @@ class TokenTrieNER < NER
   def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
     if no_clean
       if extend_to_token
-        Token.
+        Token.setup(clean(token), start, token)
       else
-
+        token
       end
     else
       if extend_to_token
-        Token.
+        Token.setup(clean(token), start, token)
       else
-        token
+        clean(token)
       end
     end
   end
@@ -137,6 +137,11 @@ class TokenTrieNER < NER
     hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
       names = Array === names ? names : [names]
       names.flatten! if Array === names.first and not Token === names.first.first
+
+      if names.empty?
+        names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
+      end
+
       names.each do |name|
         next if name.empty? or (String === name and name.length < 2)
 
@@ -167,7 +172,7 @@ class TokenTrieNER < NER
       return index[head]
     end
 
-    return nil unless (not
+    return nil unless (not TokyoCabinet::HDB === index ) and index.include? :PROCS
 
     index[:PROCS].each do |key,value|
       return value if key.call(head)
@@ -225,16 +230,16 @@ class TokenTrieNER < NER
     match_offset = match_tokens.first.offset
     match_tokens.each{|t|
       match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
-      match << (t.respond_to?(:original) ? t.original : t)
+      match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
 
-    NamedEntity.
+    NamedEntity.setup(match, match_tokens.first.offset, type, codes)
   end
 
   attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
-      :
+      :persist => false
     @slack = slack
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
@@ -242,16 +247,15 @@ class TokenTrieNER < NER
 
     file = [] if file.nil?
     file = [file] unless Array === file
-
-
-
-
-
-      @index = TCHash.get persistecen_file, true, :marshal
-    end
+    persist_options = Misc.pull_keys options, :persist
+    @index = Persist.persist_tsv(file, options, persist_options) do |data|
+      data.serializer = :marshal if data.respond_to? :serializer and data.serializer == :type
+
+      @index = data
       file.each do |f|
         merge(f, type)
       end
+
       @index
     end
   end
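Index construction now goes through `Persist.persist_tsv`, gated by a `:persist` option (default `false`) that `Misc.pull_keys` strips from the option hash, replacing the old hand-rolled `TCHash` branch. Presumably the trie is built once and reloaded from the persistence store on later runs; a sketch, with the dictionary path purely hypothetical:

    # First run builds the trie from the dictionary and persists it;
    # later runs with the same file and options reuse the stored index.
    ner = TokenTrieNER.new("Gene", "dictionaries/genes.tsv", :persist => true)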
@@ -259,10 +263,10 @@ class TokenTrieNER < NER
   def merge(new, type = nil)
     case
     when TokenTrieNER === new
+      Log.debug "TokenTrieNER merging other TokenTrieNER"
       TokenTrieNER.merge(@index, new.index)
-    when Hash === new
-      TokenTrieNER.merge(@index, new)
     when TSV === new
+      Log.debug "TokenTrieNER merging TSV"
       old_unnamed = new.unnamed
       old_monitor = new.monitor
       new.unnamed = true
@@ -270,8 +274,12 @@ class TokenTrieNER < NER
       TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
       new.unnamed = old_unnamed
       new.monitor = old_monitor
+    when Hash === new
+      Log.debug "TokenTrieNER merging Hash"
+      TokenTrieNER.merge(@index, new)
     when String === new
-
+      Log.debug "TokenTrieNER merging file: #{ new }"
+      new = TSV.open(new, :flat)
       new.unnamed = true
       new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
       TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
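`merge` now logs which branch it takes and handles, in order: another TokenTrieNER, a TSV, a plain Hash, and a String filename that is opened as a `:flat` TSV. A minimal sketch of feeding it a Hash and scanning text; `match` is assumed from the rest of the class (the `find`/`NamedEntity.setup` path above) to return NamedEntity segments:

    require 'rbbt/ner/token_trieNER'

    dict = { "GN:TP53" => ["TP53", "p53"] }  # code => names, as merge expects

    ner = TokenTrieNER.new("Gene")
    ner.merge(dict)

    mentions = ner.match("TP53 phosphorylates MDM2.")
    # each mention is a NamedEntity with offset, type and codes filled in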
data/lib/rbbt/nlp/genia/sentence_splitter.rb
CHANGED
@@ -1,3 +1,4 @@
+require 'rbbt/ner/segment'
 module NLP
   def self.returnFeatures(prevWord, delimiter, nextWord)
     if nextWord.match(/__ss__/)
@@ -206,7 +207,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      Segment.
+      Segment.setup sentence, s
       sentence
     end
 
data/lib/rbbt/nlp/nlp.rb
CHANGED
@@ -1,9 +1,9 @@
 require 'rbbt'
 require 'rbbt/util/tmpfile'
-require 'rbbt/
-require 'rbbt/
-require 'rbbt/ner/
-require 'rbbt/ner/
+require 'rbbt/persist'
+require 'rbbt/resource'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'
 
@@ -11,7 +11,7 @@ require 'digest/md5'
 module NLP
 
   extend LocalPersist
-  self.
+  self.local_persist_dir = '/tmp/crap'
 
   #Rbbt.software.opt.StanfordParser.define_as_install Rbbt.share.install.software.StanfordParser.find
   #Rbbt.software.opt.StanfordParser.produce
@@ -81,44 +81,21 @@ module NLP
       sentence = text[s..e]
       next if sentence.nil?
       #sentence.gsub!(NEW_LINE_MASK, "\n")
-      Segment.
+      Segment.setup sentence, s
       sentence
     end
   end
 
   module GdepToken
-
+    extend Annotation
     include Segment
-
-    def self.annotate(token, offset = nil, num = nil, lemma = nil, chunk = nil, pos = nil, bio = nil, link = nil, dep = nil)
-      token.extend GdepToken
-
-      token.offset = offset
-      token.num = num
-      token.lemma = lemma
-      token.chunk = chunk
-      token.pos = pos
-      token.bio = bio
-      token.link = link
-      token.dep = dep
-
-      token
-    end
+    self.annotation :num, :lemma, :chunk, :pos, :bio, :link, :dep
   end
 
   module GdepChunk
-
+    extend Annotation
     include Segment
-
-    def self.annotate(string, offset = nil, type = nil, parts = nil)
-      string.extend GdepChunk
-
-      string.offset = offset
-      string.type = type
-      string.parts = parts
-
-      string
-    end
+    self.annotation :type, :parts
   end
 
   def self.merge_vp_chunks(chunk_list)
@@ -148,7 +125,7 @@ module NLP
     chunk_start = "B"[0]
     chunk_inside = "I"[0]
 
-    last = GdepToken.
+    last = GdepToken.setup("LW")
 
     chunk_segments = []
     segment_list.each do |segment|
@@ -159,7 +136,7 @@ module NLP
       cstart = chunk_segments.first.offset
       cend = chunk_segments.last.end
       chunk = sentence[cstart..cend]
-      GdepChunk.
+      GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
       chunks << chunk
     end
 
@@ -172,6 +149,15 @@ module NLP
       last = segment
     end
 
+    if chunk_segments.any?
+      cstart = chunk_segments.first.offset
+      cend = chunk_segments.last.end
+      chunk = sentence[cstart..cend]
+      GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
+      chunks << chunk
+    end
+
+
     chunks
   end
 
@@ -188,7 +174,7 @@ module NLP
     tokens = sentence.split(/\n/).collect do |line|
       next if line.empty?
       num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)
-      GdepToken.
+      GdepToken.setup(token, nil, num, lemma, chunk, pos, bio, link, dep)
     end.compact
   end
 end
@@ -214,7 +200,7 @@ module NLP
     Gdep.new.tag(sentence).split(/\n/).collect do |line|
       next if line.empty?
       token, lemma, pos, chunk = line.split(/\t/)
-      GdepToken.
+      GdepToken.setup(token, nil, nil, lemma, chunk, pos)
       token
     end.compact
   }
data/test/rbbt/corpus/test_document.rb
CHANGED
@@ -2,11 +2,6 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.r
 require 'rbbt/corpus/document'
 require 'test/unit'
 
-$persistence = TSV.new({})
-$tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
-$global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-$tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-
 class Document
   define :sentences do
     require 'rbbt/nlp/nlp'
@@ -14,22 +9,22 @@ class Document
   end
 
   define :tokens do
-    require 'rbbt/ner/
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text)
   end
 
   define :long_words do
-    require 'rbbt/ner/
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length > 5}
   end
 
   define :short_words do
-    require 'rbbt/ner/
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length < 5}
   end
 
   define :even_words do
-    require 'rbbt/ner/
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length % 2 == 0}
   end
 
@@ -40,17 +35,30 @@ class Document
   define :tokens_again do
     raise "This should be here already"
   end
-
-  persist :sentences
-  persist_in_tsv :tokens
-  persist_in_tsv :long_words, $tchash_persistence, :Literal
-  persist_in_global_tsv :short_words, $global_persistence
-  persist_in_global_tsv :even_words, $tchash_global_persistence
-  persist_in_global_tsv :missing, $tchash_global_persistence
 end
 
 class TestDocument < Test::Unit::TestCase
 
+  def setup
+    global_fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]
+    $persistence = TSV.setup({})
+    $tchash_persistence = Persist.open_tokyocabinet(Rbbt.tmp.test.document.persistence.find(:user), true, :tsv)
+    $global_persistence = TSV.setup({}, :key => "ID", :fields => global_fields)
+    $tchash_global_persistence = TSV.setup(Persist.open_tokyocabinet(Rbbt.tmp.test.global.persistence.find(:user), true, :list), :key => "ID", :fields => global_fields + ["Document ID", "Entity Type"])
+    $tchash_global_persistence.read
+    $tchash_global_persistence.write
+
+    Document.class_eval do
+
+      persist :sentences
+      persist_in_tsv :tokens, :literal
+      persist_in_tsv :long_words, $tchash_persistence, :literal
+      persist_in_global_tsv :short_words, $global_persistence
+      persist_in_global_tsv :even_words, $tchash_global_persistence
+      persist_in_global_tsv :missing, $tchash_global_persistence
+    end
+  end
+
   def test_annotations
 
     text =<<-EOF
@@ -127,7 +135,7 @@ another sentence.
     doc.text = text
 
     sentence = doc.sentences.last
-    Misc.benchmark(
+    Misc.benchmark(1) do
       doc = Document.new(dir)
       doc.text = text
 
@@ -166,6 +174,15 @@ another sentence.
     assert_equal "another", sentence.tokens[2]
     assert_equal sentence.offset + 0, sentence.tokens[0].offset
 
+    assert_equal 2, sentence.long_words.length
+    doc = Document.new(dir)
+    doc.text = text * 10
+    doc.sentences
+    assert_equal sentence, doc.sentences.last
+
+    sentence = doc.sentences.last
+    doc.load_into sentence, :tokens, :long_words
+
     assert_equal 2, sentence.long_words.length
     assert_equal %w(another sentence), sentence.long_words
     assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
@@ -183,15 +200,16 @@ another sentence.
       FileUtils.mkdir_p dir
 
 
-
+      global_persistence = TSV.setup({}, :fields => %w(Start End annotation_types JSON) + ["Document ID", "Entity Type"])
+      doc = Document.new(dir, nil, nil, global_persistence)
       doc.text = text * 10
-      doc.docid = "
-
+      doc.docid = "TEST"
+
       doc.sentences
 
       doc = Document.new(dir)
       doc.text = text * 10
-      doc.docid = "
+      doc.docid = "TEST"
 
       sentence = doc.sentences.last
 
@@ -201,22 +219,6 @@ another sentence.
       assert_equal 3, sentence.even_words.length
     end
   end
-
-  def test_dump
-    text =<<-EOF
-This is a
-sentence. This is
-another sentence.
-    EOF
-
-    TmpFile.with_file do |dir|
-      FileUtils.mkdir_p dir
-
-      doc = Document.new(dir)
-      doc.text = text * 10
-      tsv = Document.tsv(doc.sentences, ["Literal"])
-    end
-  end
 end
 