rbbt-text 0.5.0 → 0.6.0
- data/lib/rbbt/corpus/corpus.rb +15 -6
- data/lib/rbbt/corpus/document.rb +100 -127
- data/lib/rbbt/corpus/document_repo.rb +72 -51
- data/lib/rbbt/ner/NER.rb +4 -4
- data/lib/rbbt/ner/abner.rb +5 -4
- data/lib/rbbt/ner/banner.rb +3 -3
- data/lib/rbbt/ner/chemical_tagger.rb +3 -3
- data/lib/rbbt/ner/ngram_prefix_dictionary.rb +45 -10
- data/lib/rbbt/ner/oscar3.rb +3 -3
- data/lib/rbbt/ner/oscar4.rb +3 -3
- data/lib/rbbt/ner/patterns.rb +15 -13
- data/lib/rbbt/ner/regexpNER.rb +3 -2
- data/lib/rbbt/ner/rnorm.rb +2 -2
- data/lib/rbbt/ner/rnorm/cue_index.rb +2 -2
- data/lib/rbbt/ner/{annotations.rb → segment.rb} +161 -109
- data/lib/rbbt/ner/{annotations → segment}/named_entity.rb +3 -11
- data/lib/rbbt/ner/segment/relationship.rb +20 -0
- data/lib/rbbt/ner/segment/segmented.rb +13 -0
- data/lib/rbbt/ner/segment/token.rb +24 -0
- data/lib/rbbt/ner/{annotations → segment}/transformed.rb +10 -10
- data/lib/rbbt/ner/token_trieNER.rb +30 -22
- data/lib/rbbt/nlp/genia/sentence_splitter.rb +2 -1
- data/lib/rbbt/nlp/nlp.rb +23 -37
- data/test/rbbt/corpus/test_document.rb +39 -37
- data/test/rbbt/ner/segment/test_named_entity.rb +29 -0
- data/test/rbbt/ner/segment/test_segmented.rb +23 -0
- data/test/rbbt/ner/{annotations → segment}/test_transformed.rb +6 -6
- data/test/rbbt/ner/test_ngram_prefix_dictionary.rb +15 -1
- data/test/rbbt/ner/test_patterns.rb +11 -12
- data/test/rbbt/ner/test_regexpNER.rb +5 -4
- data/test/rbbt/ner/test_segment.rb +101 -0
- data/test/rbbt/ner/test_token_trieNER.rb +8 -9
- data/test/test_helper.rb +6 -6
- metadata +40 -22
- data/lib/rbbt/ner/annotations/annotated.rb +0 -15
- data/lib/rbbt/ner/annotations/relations.rb +0 -25
- data/lib/rbbt/ner/annotations/token.rb +0 -28
- data/test/rbbt/ner/annotations/test_named_entity.rb +0 -14
- data/test/rbbt/ner/test_annotations.rb +0 -70
data/lib/rbbt/ner/{annotations → segment}/named_entity.rb RENAMED
@@ -1,17 +1,9 @@
-require 'rbbt/ner/annotations'
+require 'rbbt/ner/segment'
 
 module NamedEntity
-
+  extend Annotation
   include Segment
-
-  def self.annotate(string, offset = nil, type = nil, code = nil, score = nil)
-    string.extend NamedEntity
-    string.offset = offset unless offset.nil?
-    string.type = type unless type.nil?
-    string.code = code unless code.nil?
-    string.score = score unless score.nil?
-    string
-  end
+  self.annotation :type, :code, :score
 
   def report
     <<-EOF
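The hand-rolled annotate class method is replaced by the Annotation mixin: self.annotation declares the fields and Annotation generates the accessors plus a setup constructor. A minimal sketch of the new call, assuming setup takes the annotations positionally in declaration order with Segment's offset first (the TokenTrieNER hunk below calls it the same way):

    require 'rbbt/ner/segment/named_entity'

    # "Gene" is the type; code and score are optional trailing annotations
    entity = NamedEntity.setup("TP53", 10, "Gene", "GENE:TP53", 0.9)
    entity.offset # => 10
    entity.type   # => "Gene"
    entity.score  # => 0.9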
data/lib/rbbt/ner/segment/relationship.rb ADDED
@@ -0,0 +1,20 @@
+require 'rbbt/ner/segment'
+
+module Relationship
+  extend Annotation
+  include Segment
+  self.annotation :terms
+
+  def html
+    text = <<-EOF
+<span class='Relationship'\
+>#{ self }</span>
+    EOF
+    text.chomp
+  end
+
+  def html_with_entities(*types)
+    annotations.values_at(*types).each do |segments|
+    end
+  end
+end
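Relationship is a new Segment type whose single annotation is the list of related terms; html wraps the text in a span (html_with_entities is still a stub). A usage sketch under the same positional setup assumption:

    require 'rbbt/ner/segment/relationship'

    rel = Relationship.setup("TP53 inhibits MDM2", 0, ["TP53", "MDM2"])
    rel.terms # => ["TP53", "MDM2"]
    rel.html  # => "<span class='Relationship'>TP53 inhibits MDM2</span>"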
data/lib/rbbt/ner/segment/token.rb ADDED
@@ -0,0 +1,24 @@
+require 'rbbt/annotations'
+require 'rbbt/ner/segment'
+
+module Token
+  extend Annotation
+  include Segment
+  self.annotation :original
+
+  def self.tokenize(text, split_at = /\s|(\(|\)|[-."':,])/, start = 0)
+
+    tokens = []
+    while matchdata = text.match(split_at)
+      tokens << Token.setup(matchdata.pre_match, start) unless matchdata.pre_match.empty?
+      tokens << Token.setup(matchdata.captures.first, start + matchdata.begin(1)) if matchdata.captures.any? and not matchdata.captures.first.empty?
+      start += matchdata.end(0)
+      text = matchdata.post_match
+    end
+
+    tokens << Token.setup(text, start) unless text.empty?
+
+    tokens
+  end
+end
+
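Token.tokenize splits on whitespace and the captured punctuation characters, emits each punctuation mark as a token of its own, and advances start so every token keeps its offset into the original string. Tracing the loop above:

    require 'rbbt/ner/segment/token'

    tokens = Token.tokenize("This is (a) test.")
    tokens.collect{|t| [t, t.offset]}
    # => [["This", 0], ["is", 5], ["(", 8], ["a", 9], [")", 10], ["test", 12], [".", 16]]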
data/lib/rbbt/ner/{annotations → segment}/transformed.rb RENAMED
@@ -1,7 +1,16 @@
-require 'rbbt/ner/annotations'
+require 'rbbt/ner/segment'
 module Transformed
   attr_accessor :transformation_offset_differences, :transformation_original
 
+  def self.transform(text, segments, replacement = nil, &block)
+    require 'rbbt/util/misc'
+
+    text.extend Transformed
+    text.replace(segments, replacement, &block)
+
+    text
+  end
+
   def self.with_transform(text, segments, replacement)
     require 'rbbt/util/misc'
 
@@ -14,16 +23,7 @@ module Transformed
 
     text.restore(segments, true)
   end
-
-  def self.transform(text, segments, replacement = nil, &block)
-    require 'rbbt/util/misc'
 
-    text.extend Transformed
-    text.replace(segments, replacement, &block)
-
-    text
-  end
-
   def transform_pos(pos)
     return pos if transformation_offset_differences.nil?
     # tranformation_offset_differences are assumed to be sorted in reverse
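transform moves above with_transform but is otherwise unchanged: it extends the text with Transformed and delegates to replace, which masks each segment while recording offset differences; with_transform additionally restores the original text after its block runs. A sketch of the in-place variant, assuming replace substitutes each segment's character range with the replacement string:

    require 'rbbt/ner/segment/transformed'
    require 'rbbt/ner/segment/named_entity'

    text  = "TP53 regulates MDM2"
    genes = [NamedEntity.setup("TP53", 0, "Gene"),
             NamedEntity.setup("MDM2", 15, "Gene")]

    Transformed.transform(text, genes, "[GENE]")
    text # => "[GENE] regulates [GENE]" (assumed masking semantics)
    # transform_pos maps positions in the masked string back to the original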
data/lib/rbbt/ner/token_trieNER.rb CHANGED
@@ -1,7 +1,7 @@
-require 'rbbt'
-require 'rbbt/util/tsv'
-require 'rbbt/ner/annotations'
-require 'rbbt/ner/annotations/token'
+require 'rbbt'
+require 'rbbt/tsv'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/token'
 require 'rbbt/ner/NER'
 
 class TokenTrieNER < NER
@@ -16,15 +16,15 @@ class TokenTrieNER < NER
   def self.prepare_token(token, start, extend_to_token = true, no_clean = false)
     if no_clean
       if extend_to_token
-        Token.annotate(clean(token), start, token)
+        Token.setup(clean(token), start, token)
       else
-
+        token
       end
     else
       if extend_to_token
-        Token.annotate(clean(token), start, token)
+        Token.setup(clean(token), start, token)
       else
-        token
+        clean(token)
       end
     end
   end
@@ -137,6 +137,11 @@ class TokenTrieNER < NER
     hash.send(hash.respond_to?(:through)? :through : :each) do |code, names|
       names = Array === names ? names : [names]
      names.flatten! if Array === names.first and not Token === names.first.first
+
+      if names.empty?
+        names.unshift code unless TSV === hash and not (hash.fields.nil? or hash.fields.empty?)
+      end
+
       names.each do |name|
         next if name.empty? or (String === name and name.length < 2)
 
@@ -167,7 +172,7 @@ class TokenTrieNER < NER
       return index[head]
     end
 
-    return nil unless (not
+    return nil unless (not TokyoCabinet::HDB === index ) and index.include? :PROCS
 
    index[:PROCS].each do |key,value|
       return value if key.call(head)
@@ -225,16 +230,16 @@ class TokenTrieNER < NER
     match_offset = match_tokens.first.offset
     match_tokens.each{|t|
       match << " " * (t.offset - (match_offset + match.length)) if t.offset > (match_offset + match.length)
-      match << (t.respond_to?(:original) ? t.original : t)
+      match << ((t.respond_to?(:original) and not t.original.nil?) ? t.original : t)
     }
 
-    NamedEntity.annotate(match, match_tokens.first.offset, type, codes)
+    NamedEntity.setup(match, match_tokens.first.offset, type, codes)
   end
 
   attr_accessor :index, :longest_match, :type, :slack, :split_at, :no_clean
   def initialize(type = nil, file = nil, options = {})
     options = Misc.add_defaults options, :longest_match => true, :no_clean => false, :slack => nil, :split_at => nil,
-      :
+      :persist => false
     @slack = slack
     @longest_match = options.delete :longest_match
     @split_at = options.delete :split_at
@@ -242,16 +247,15 @@ class TokenTrieNER < NER
 
     file = [] if file.nil?
     file = [file] unless Array === file
-
-
-
-
-
-      @index = TCHash.get persistecen_file, true, :marshal
-    end
+    persist_options = Misc.pull_keys options, :persist
+    @index = Persist.persist_tsv(file, options, persist_options) do |data|
+      data.serializer = :marshal if data.respond_to? :serializer and data.serializer == :type
+
+      @index = data
       file.each do |f|
         merge(f, type)
       end
+
       @index
     end
   end
@@ -259,10 +263,10 @@ class TokenTrieNER < NER
   def merge(new, type = nil)
     case
     when TokenTrieNER === new
+      Log.debug "TokenTrieNER merging other TokenTrieNER"
       TokenTrieNER.merge(@index, new.index)
-    when Hash === new
-      TokenTrieNER.merge(@index, new)
     when TSV === new
+      Log.debug "TokenTrieNER merging TSV"
       old_unnamed = new.unnamed
       old_monitor = new.monitor
       new.unnamed = true
@@ -270,8 +274,12 @@ class TokenTrieNER < NER
       TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
       new.unnamed = old_unnamed
       new.monitor = old_monitor
+    when Hash === new
+      Log.debug "TokenTrieNER merging Hash"
+      TokenTrieNER.merge(@index, new)
     when String === new
-
+      Log.debug "TokenTrieNER merging file: #{ new }"
+      new = TSV.open(new, :flat)
       new.unnamed = true
       new.monitor = {:step => 1000, :desc => "Processing TSV into TokenTrieNER"}
       TokenTrieNER.process(@index, new, type, slack, split_at, no_clean)
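The merge hunks above document the four accepted dictionary sources: another TokenTrieNER, a TSV, a raw Hash, or a file path (now opened with TSV.open :flat, with Log.debug tracing each branch). A usage sketch; genes.tsv is a hypothetical flat file of a code followed by synonyms, and the match method is assumed from the NER interface:

    require 'rbbt/ner/token_trieNER'

    # genes.tsv (flat TSV): TP53 <tab> TP53 <tab> p53
    ner = TokenTrieNER.new("Gene", "genes.tsv")

    matches = ner.match("Expression of p53 was increased")
    matches.first        # => "p53", a NamedEntity segment
    matches.first.offset # => 14
    matches.first.type   # => "Gene"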
data/lib/rbbt/nlp/genia/sentence_splitter.rb CHANGED
@@ -1,3 +1,4 @@
+require 'rbbt/ner/segment'
 module NLP
   def self.returnFeatures(prevWord, delimiter, nextWord)
     if nextWord.match(/__ss__/)
@@ -206,7 +207,7 @@ module NLP
     offsets.collect do |s,e|
       sentence = text[s..e]
       next if sentence.nil?
-      Segment.annotate sentence, s
+      Segment.setup sentence, s
       sentence
     end
 
data/lib/rbbt/nlp/nlp.rb CHANGED
@@ -1,9 +1,9 @@
 require 'rbbt'
 require 'rbbt/util/tmpfile'
-require 'rbbt/
-require 'rbbt/
-require 'rbbt/ner/annotations'
-require 'rbbt/ner/annotations/annotated'
+require 'rbbt/persist'
+require 'rbbt/resource'
+require 'rbbt/ner/segment'
+require 'rbbt/ner/segment/segmented'
 require 'rbbt/nlp/genia/sentence_splitter'
 require 'digest/md5'
 
@@ -11,7 +11,7 @@ require 'digest/md5'
 module NLP
 
   extend LocalPersist
-  self.
+  self.local_persist_dir = '/tmp/crap'
 
   #Rbbt.software.opt.StanfordParser.define_as_install Rbbt.share.install.software.StanfordParser.find
   #Rbbt.software.opt.StanfordParser.produce
@@ -81,44 +81,21 @@ module NLP
       sentence = text[s..e]
       next if sentence.nil?
       #sentence.gsub!(NEW_LINE_MASK, "\n")
-      Segment.annotate sentence, s
+      Segment.setup sentence, s
       sentence
     end
   end
 
   module GdepToken
-
+    extend Annotation
     include Segment
-
-    def self.annotate(token, offset = nil, num = nil, lemma = nil, chunk = nil, pos = nil, bio = nil, link = nil, dep = nil)
-      token.extend GdepToken
-
-      token.offset = offset
-      token.num = num
-      token.lemma = lemma
-      token.chunk = chunk
-      token.pos = pos
-      token.bio = bio
-      token.link = link
-      token.dep = dep
-
-      token
-    end
+    self.annotation :num, :lemma, :chunk, :pos, :bio, :link, :dep
   end
 
   module GdepChunk
-
+    extend Annotation
     include Segment
-
-    def self.annotate(string, offset = nil, type = nil, parts = nil)
-      string.extend GdepChunk
-
-      string.offset = offset
-      string.type = type
-      string.parts = parts
-
-      string
-    end
+    self.annotation :type, :parts
   end
 
   def self.merge_vp_chunks(chunk_list)
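GdepToken and GdepChunk get the same treatment as NamedEntity: the hand-written annotate methods collapse into a self.annotation declaration. The recurring pattern, sketched with a hypothetical module:

    require 'rbbt/annotations'
    require 'rbbt/ner/segment'

    # Annotation generates accessors and a positional setup method for the
    # declared fields; Segment contributes the offset annotation
    module MyTag
      extend Annotation
      include Segment
      self.annotation :label
    end

    word = MyTag.setup("sentence", 10, "NOUN")
    word.offset # => 10
    word.label  # => "NOUN"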
@@ -148,7 +125,7 @@ module NLP
     chunk_start = "B"[0]
     chunk_inside = "I"[0]
 
-    last = GdepToken.annotate("LW")
+    last = GdepToken.setup("LW")
 
     chunk_segments = []
     segment_list.each do |segment|
@@ -159,7 +136,7 @@ module NLP
         cstart = chunk_segments.first.offset
         cend = chunk_segments.last.end
         chunk = sentence[cstart..cend]
-        GdepChunk.annotate(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
+        GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
         chunks << chunk
       end
 
@@ -172,6 +149,15 @@ module NLP
       last = segment
     end
 
+    if chunk_segments.any?
+      cstart = chunk_segments.first.offset
+      cend = chunk_segments.last.end
+      chunk = sentence[cstart..cend]
+      GdepChunk.setup(chunk, cstart, last.chunk.sub(/^.-/,''), chunk_segments)
+      chunks << chunk
+    end
+
+
     chunks
   end
 
@@ -188,7 +174,7 @@ module NLP
     tokens = sentence.split(/\n/).collect do |line|
       next if line.empty?
       num, token, lemma, chunk, pos, bio, link, dep = line.split(/\t/)
-      GdepToken.annotate(token, nil, num, lemma, chunk, pos, bio, link, dep)
+      GdepToken.setup(token, nil, num, lemma, chunk, pos, bio, link, dep)
     end.compact
   end
 end
@@ -214,7 +200,7 @@ module NLP
     Gdep.new.tag(sentence).split(/\n/).collect do |line|
       next if line.empty?
       token, lemma, pos, chunk = line.split(/\t/)
-      GdepToken.annotate(token, nil, nil, lemma, chunk, pos)
+      GdepToken.setup(token, nil, nil, lemma, chunk, pos)
       token
     end.compact
   }
data/test/rbbt/corpus/test_document.rb CHANGED
@@ -2,11 +2,6 @@ require File.join(File.expand_path(File.dirname(__FILE__)), '../../test_helper.rb')
 require 'rbbt/corpus/document'
 require 'test/unit'
 
-$persistence = TSV.new({})
-$tchash_persistence = TCHash.get(Rbbt.tmp.test.document.persistence.find(:user), true, Persistence::TSV::TSVSerializer)
-$global_persistence = TSV.new({}, :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-$tchash_global_persistence = TSV.new(TCHash.get(Rbbt.tmp.test.global.persistence.find(:user), true, Persistence::TSV::StringArraySerializer), :key => "ID", :fields => [ "Start", "End", "Info","Document ID", "Entity Type"])
-
 class Document
   define :sentences do
     require 'rbbt/nlp/nlp'
@@ -14,22 +9,22 @@ class Document
   end
 
   define :tokens do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text)
   end
 
   define :long_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length > 5}
   end
 
   define :short_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length < 5}
   end
 
   define :even_words do
-    require 'rbbt/ner/annotations/token'
+    require 'rbbt/ner/segment/token'
     Token.tokenize(text).select{|tok| tok.length % 2 == 0}
   end
 
@@ -40,17 +35,30 @@ class Document
   define :tokens_again do
     raise "This should be here already"
   end
-
-  persist :sentences
-  persist_in_tsv :tokens
-  persist_in_tsv :long_words, $tchash_persistence, :Literal
-  persist_in_global_tsv :short_words, $global_persistence
-  persist_in_global_tsv :even_words, $tchash_global_persistence
-  persist_in_global_tsv :missing, $tchash_global_persistence
 end
 
 class TestDocument < Test::Unit::TestCase
 
+  def setup
+    global_fields = ["Start", "End", "JSON", "Document ID", "Entity Type"]
+    $persistence = TSV.setup({})
+    $tchash_persistence = Persist.open_tokyocabinet(Rbbt.tmp.test.document.persistence.find(:user), true, :tsv)
+    $global_persistence = TSV.setup({}, :key => "ID", :fields => global_fields)
+    $tchash_global_persistence = TSV.setup(Persist.open_tokyocabinet(Rbbt.tmp.test.global.persistence.find(:user), true, :list), :key => "ID", :fields => global_fields + ["Document ID", "Entity Type"])
+    $tchash_global_persistence.read
+    $tchash_global_persistence.write
+
+    Document.class_eval do
+
+      persist :sentences
+      persist_in_tsv :tokens, :literal
+      persist_in_tsv :long_words, $tchash_persistence, :literal
+      persist_in_global_tsv :short_words, $global_persistence
+      persist_in_global_tsv :even_words, $tchash_global_persistence
+      persist_in_global_tsv :missing, $tchash_global_persistence
+    end
+  end
+
   def test_annotations
 
     text =<<-EOF
|
@@ -127,7 +135,7 @@ another sentence.
|
|
127
135
|
doc.text = text
|
128
136
|
|
129
137
|
sentence = doc.sentences.last
|
130
|
-
Misc.benchmark(
|
138
|
+
Misc.benchmark(1) do
|
131
139
|
doc = Document.new(dir)
|
132
140
|
doc.text = text
|
133
141
|
|
@@ -166,6 +174,15 @@ another sentence.
     assert_equal "another", sentence.tokens[2]
     assert_equal sentence.offset + 0, sentence.tokens[0].offset
 
+    assert_equal 2, sentence.long_words.length
+    doc = Document.new(dir)
+    doc.text = text * 10
+    doc.sentences
+    assert_equal sentence, doc.sentences.last
+
+    sentence = doc.sentences.last
+    doc.load_into sentence, :tokens, :long_words
+
     assert_equal 2, sentence.long_words.length
     assert_equal %w(another sentence), sentence.long_words
     assert_equal sentence.offset + "This is ".length, sentence.long_words[0].offset
@@ -183,15 +200,16 @@ another sentence.
     FileUtils.mkdir_p dir
 
 
-
+    global_persistence = TSV.setup({}, :fields => %w(Start End annotation_types JSON) + ["Document ID", "Entity Type"])
+    doc = Document.new(dir, nil, nil, global_persistence)
     doc.text = text * 10
-    doc.docid = "
-
+    doc.docid = "TEST"
+
     doc.sentences
 
     doc = Document.new(dir)
     doc.text = text * 10
-    doc.docid = "
+    doc.docid = "TEST"
 
 
     sentence = doc.sentences.last
@@ -201,22 +219,6 @@ another sentence.
       assert_equal 3, sentence.even_words.length
     end
   end
-
-  def test_dump
-    text =<<-EOF
-This is a
-sentence. This is
-another sentence.
-    EOF
-
-    TmpFile.with_file do |dir|
-      FileUtils.mkdir_p dir
-
-      doc = Document.new(dir)
-      doc.text = text * 10
-      tsv = Document.tsv(doc.sentences, ["Literal"])
-    end
-  end
 end
 
 