rbbt-text 1.1.9 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rbbt/ner/g_norm_plus.rb +19 -12
- data/lib/rbbt/text/corpus/document.rb +63 -41
- data/lib/rbbt/text/segment.rb +10 -2
- data/lib/rbbt/text/segment/docid.rb +44 -44
- data/lib/rbbt/text/segment/named_entity.rb +1 -0
- data/lib/rbbt/text/segment/transformed.rb +2 -2
- data/test/rbbt/ner/test_g_norm_plus.rb +0 -1
- data/test/rbbt/text/corpus/test_document.rb +39 -9
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '009cfce2ce954c03db5c09d0bd6f5d25bf59d508776d7370bb6bd0fb3a135f36'
|
4
|
+
data.tar.gz: 3d11d2a5934512958d10dbdfad5e22a9a2481b332c985ab1e2c8e92427d6f375
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e9338d4b54d2b66efda11dee3d37366c4f4ae78bde80f0abc1016b34c928e1db857ad73f33ba1da611ad232513498430736c46134a902b3930a8f832afed3e09
|
7
|
+
data.tar.gz: 0cdeeee67636d4e0b0714334b3c187cb0f5ea5c7363fe27fc84d438643a0d6f204413a4dd5d99c8c43d847539320c484fde2b5300b298cf9cc782148d98802ee
|
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
@@ -10,35 +10,39 @@ module GNormPlus
|
|
10
10
|
end
|
11
11
|
|
12
12
|
CONFIG =<<-EOF
|
13
|
-
|
14
13
|
#===Annotation
|
15
14
|
#Attribution setting:
|
16
15
|
#FocusSpecies = Taxonomy ID
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
16
|
+
# All: All species
|
17
|
+
# 9606: Human
|
18
|
+
# 4932: yeast
|
19
|
+
# 7227: Fly
|
20
|
+
# 10090: Mouse
|
21
|
+
# 10116: Rat
|
22
|
+
# 7955: Zebrafish
|
23
|
+
# 3702: Arabidopsis thaliana
|
25
24
|
#open: True
|
26
25
|
#close: False
|
27
26
|
|
28
27
|
[Focus Species]
|
29
|
-
FocusSpecies =
|
28
|
+
FocusSpecies = 9606
|
29
|
+
FilterAntibody = False
|
30
30
|
[Dictionary & Model]
|
31
31
|
DictionaryFolder = ./Dictionary
|
32
32
|
GNRModel = ./Dictionary/GNR.Model
|
33
33
|
SCModel = ./Dictionary/SimConcept.Model
|
34
34
|
GeneIDMatch = True
|
35
|
+
HomologeneID = False
|
35
36
|
Normalization2Protein = False
|
37
|
+
ShowUnNormalizedMention = False
|
36
38
|
DeleteTmp = True
|
39
|
+
IgnoreNER = True
|
37
40
|
EOF
|
38
41
|
|
39
42
|
def self.process(texts)
|
40
43
|
TmpFile.with_file do |tmpdir|
|
41
44
|
Open.mkdir tmpdir
|
45
|
+
|
42
46
|
Misc.in_dir tmpdir do
|
43
47
|
Open.ln_s Rbbt.software.opt.GNormPlus.Dictionary.find, '.'
|
44
48
|
Open.ln_s Rbbt.software.opt.GNormPlus["BioC.dtd"].find, '.'
|
@@ -50,12 +54,12 @@ EOF
|
|
50
54
|
|
51
55
|
texts.each do |name,text|
|
52
56
|
Open.write("input/#{name}.txt") do |f|
|
53
|
-
f.puts "#{name}|a|" << text
|
57
|
+
f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
|
54
58
|
f.puts
|
55
59
|
end
|
56
60
|
end
|
57
61
|
Open.write('config', CONFIG)
|
58
|
-
CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.find}/GNormPlus.jar' 'input' 'output' 'config'")
|
62
|
+
CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
|
59
63
|
|
60
64
|
if texts.respond_to? :key_field
|
61
65
|
key_field = texts.key_field
|
@@ -68,6 +72,9 @@ EOF
|
|
68
72
|
entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
|
69
73
|
tsv[name] = entities
|
70
74
|
end
|
75
|
+
|
76
|
+
raise "GNormPlus failed: no results found" if tsv.size == 0 && texts.size > 0
|
77
|
+
|
71
78
|
tsv
|
72
79
|
end
|
73
80
|
end
|
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'rbbt/text/segment'
|
2
2
|
require 'rbbt/text/segment/segmented'
|
3
|
+
require 'rbbt/text/segment/docid'
|
3
4
|
require 'rbbt/tsv'
|
4
5
|
require 'rbbt/resource/path'
|
5
6
|
require 'rbbt/persist/tsv'
|
@@ -15,6 +16,7 @@ class Corpus
|
|
15
16
|
attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
|
16
17
|
|
17
18
|
attr_accessor :multiple_result
|
19
|
+
|
18
20
|
def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
|
19
21
|
@segments = {}
|
20
22
|
@segment_indices = {}
|
@@ -44,16 +46,22 @@ class Corpus
|
|
44
46
|
end
|
45
47
|
|
46
48
|
def self.define(entity, &block)
|
47
|
-
send :define_method, "produce_#{entity}"
|
49
|
+
send :define_method, "produce_#{entity}" do
|
50
|
+
segments = self.instance_exec &block
|
48
51
|
|
49
|
-
|
52
|
+
segments.each{|s| s.docid = docid }
|
53
|
+
end
|
54
|
+
|
55
|
+
self.class_eval <<-EOC, __FILE__, __LINE__ + 1
|
50
56
|
def load_#{entity}(raw = false)
|
51
57
|
return if segments.include? "#{ entity }"
|
52
58
|
if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
|
53
|
-
|
59
|
+
entities = load_with_persistence_#{entity}(raw)
|
54
60
|
else
|
55
|
-
|
61
|
+
entities = produce_#{entity}
|
56
62
|
end
|
63
|
+
|
64
|
+
segments["#{ entity }"] = entities
|
57
65
|
end
|
58
66
|
|
59
67
|
def #{entity}(raw = false)
|
@@ -77,7 +85,10 @@ class Corpus
|
|
77
85
|
|
78
86
|
def self.define_multiple(entity, &block)
|
79
87
|
send :define_method, "produce_#{entity}" do
|
80
|
-
|
88
|
+
if self.multiple_result && self.multiple_result[entity]
|
89
|
+
segments = self.multiple_result[entity]
|
90
|
+
return segments.each{|s| s.docid = docid }
|
91
|
+
end
|
81
92
|
raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
|
82
93
|
end
|
83
94
|
|
@@ -86,14 +97,16 @@ class Corpus
|
|
86
97
|
self
|
87
98
|
end.send :define_method, name, &block
|
88
99
|
|
89
|
-
self.class_eval <<-EOC, __FILE__, __LINE__
|
100
|
+
self.class_eval <<-EOC, __FILE__, __LINE__ + 1
|
90
101
|
def load_#{entity}(raw = false)
|
91
102
|
return if segments.include? "#{ entity }"
|
92
103
|
if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
|
93
|
-
|
104
|
+
entities = load_with_persistence_#{entity}(raw)
|
94
105
|
else
|
95
|
-
|
106
|
+
entities = produce_#{entity}
|
96
107
|
end
|
108
|
+
|
109
|
+
segments["#{ entity }"] = entities
|
97
110
|
end
|
98
111
|
|
99
112
|
def #{entity}(raw = false)
|
@@ -124,7 +137,7 @@ class Corpus
|
|
124
137
|
missing << doc
|
125
138
|
end
|
126
139
|
end
|
127
|
-
res = self.send("multiple_produce_#{entity.to_s}", missing)
|
140
|
+
res = self.send("multiple_produce_#{entity.to_s}", missing) if missing.any?
|
128
141
|
case res
|
129
142
|
when Array
|
130
143
|
res.each_with_index do |res,i|
|
@@ -142,7 +155,9 @@ class Corpus
|
|
142
155
|
end
|
143
156
|
end
|
144
157
|
end
|
145
|
-
missing.each{|doc|
|
158
|
+
missing.each{|doc|
|
159
|
+
doc.send entity
|
160
|
+
}
|
146
161
|
end
|
147
162
|
|
148
163
|
|
@@ -197,7 +212,7 @@ class Corpus
|
|
197
212
|
FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
|
198
213
|
end
|
199
214
|
|
200
|
-
self.class_eval <<-EOC, __FILE__, __LINE__
|
215
|
+
self.class_eval <<-EOC, __FILE__, __LINE__ + 1
|
201
216
|
def load_with_persistence_#{entity}(raw = false)
|
202
217
|
repo = TSV_REPOS["#{ entity }"]
|
203
218
|
if repo.nil?
|
@@ -253,7 +268,7 @@ class Corpus
|
|
253
268
|
|
254
269
|
FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
|
255
270
|
|
256
|
-
self.class_eval <<-EOC, __FILE__, __LINE__
|
271
|
+
self.class_eval <<-EOC, __FILE__, __LINE__ + 1
|
257
272
|
def load_with_persistence_#{entity}(raw = false)
|
258
273
|
fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
|
259
274
|
|
@@ -261,20 +276,23 @@ class Corpus
|
|
261
276
|
|
262
277
|
begin
|
263
278
|
|
264
|
-
data.
|
279
|
+
if data.respond_to? :persistence_path and String === data.persistence_path
|
280
|
+
data.filter(data.persistence_path + '.filters')
|
281
|
+
end
|
282
|
+
|
283
|
+
keys = data.read_and_close do
|
265
284
|
|
266
|
-
|
285
|
+
fields = data.fields if fields.nil? and data.respond_to? :fields
|
267
286
|
|
287
|
+
data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
|
288
|
+
data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
|
289
|
+
keys = data.keys
|
290
|
+
data.pop_filter if fields.include?("#{entity_field}")
|
291
|
+
data.pop_filter if fields.include?("#{doc_field}")
|
268
292
|
|
269
|
-
|
270
|
-
data.filter(data.persistence_path + '.filters')
|
293
|
+
keys
|
271
294
|
end
|
272
295
|
|
273
|
-
data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
|
274
|
-
data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
|
275
|
-
keys = data.keys
|
276
|
-
data.pop_filter if data.fields.include?("#{entity_field}")
|
277
|
-
data.pop_filter if data.fields.include?("#{doc_field}")
|
278
296
|
|
279
297
|
if keys.empty?
|
280
298
|
segments = produce_#{entity}
|
@@ -289,34 +307,38 @@ class Corpus
|
|
289
307
|
"#{ entity }"
|
290
308
|
end
|
291
309
|
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
310
|
+
keys = data.write_and_close do
|
311
|
+
data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
|
312
|
+
data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
|
313
|
+
keys = tsv.collect do |key, value|
|
314
|
+
data[key] = value
|
315
|
+
key
|
316
|
+
end
|
317
|
+
data.pop_filter if fields.include?("#{entity_field}")
|
318
|
+
data.pop_filter if fields.include?("#{doc_field}")
|
319
|
+
keys
|
298
320
|
end
|
299
|
-
data.pop_filter if data.fields.include?("#{entity_field}")
|
300
|
-
data.pop_filter if data.fields.include?("#{doc_field}")
|
301
|
-
data.read
|
302
321
|
|
303
322
|
else
|
304
|
-
if raw == :check
|
305
|
-
data.close
|
306
|
-
return nil
|
307
|
-
end
|
323
|
+
return nil if raw == :check
|
308
324
|
end
|
309
325
|
|
310
326
|
return data.values if raw
|
311
327
|
|
312
328
|
start_pos = data.identify_field "Start"
|
313
|
-
|
329
|
+
data.read_and_close do
|
330
|
+
data.chunked_values_at(keys).collect{|annotation|
|
331
|
+
begin
|
314
332
|
pos = annotation[start_pos]
|
315
|
-
Segment.load_tsv_values(text, annotation,
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
333
|
+
Segment.load_tsv_values(text, annotation, fields) unless [-1, "-1", [-1], ["-1"]].include?(pos)
|
334
|
+
rescue
|
335
|
+
Log.exception $!
|
336
|
+
iif keys
|
337
|
+
iif [text, annotation]
|
338
|
+
end
|
339
|
+
|
340
|
+
}.compact
|
341
|
+
end
|
320
342
|
ensure
|
321
343
|
data.close
|
322
344
|
end
|
@@ -348,7 +370,7 @@ class Corpus
|
|
348
370
|
segment.segments[name] = annotations
|
349
371
|
class << segment
|
350
372
|
self
|
351
|
-
end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
|
373
|
+
end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__ + 1
|
352
374
|
end
|
353
375
|
|
354
376
|
segment
|
data/lib/rbbt/text/segment.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/fix_width_table'
|
|
3
3
|
|
4
4
|
module Segment
|
5
5
|
extend Annotation
|
6
|
-
self.annotation :offset
|
6
|
+
self.annotation :offset, :docid
|
7
7
|
|
8
8
|
def segment_length
|
9
9
|
begin
|
@@ -325,7 +325,7 @@ module Segment
|
|
325
325
|
tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
|
326
326
|
|
327
327
|
segments.each do |segment|
|
328
|
-
tsv[segment.
|
328
|
+
tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
|
329
329
|
end
|
330
330
|
|
331
331
|
tsv
|
@@ -348,6 +348,14 @@ module Segment
|
|
348
348
|
[offset, self.end] * ".."
|
349
349
|
end
|
350
350
|
|
351
|
+
def segment_id
|
352
|
+
if self.respond_to?(:docid)
|
353
|
+
[docid, locus, Misc.obj2digest(info)] * ":"
|
354
|
+
else
|
355
|
+
Misc.obj2digest(info)
|
356
|
+
end
|
357
|
+
end
|
358
|
+
|
351
359
|
#def ==(other)
|
352
360
|
# self.text == other.text
|
353
361
|
#end
|
@@ -1,46 +1,46 @@
|
|
1
1
|
require 'rbbt/text/segment'
|
2
2
|
|
3
|
-
module SegmentWithDocid
|
4
|
-
extend Annotation
|
5
|
-
|
6
|
-
self.annotation :docid
|
7
|
-
|
8
|
-
def masked?
|
9
|
-
self[0..5] == "MASKED"
|
10
|
-
end
|
11
|
-
|
12
|
-
def mask
|
13
|
-
return self if masked?
|
14
|
-
raise "Cannot mask an array of elements, they must be masked individually" if Array === self
|
15
|
-
raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
|
16
|
-
raise "Cannot mask a segment with no offset" if offset.nil?
|
17
|
-
textual_position = ["MASKED", length] * ":"
|
18
|
-
self.replace(textual_position)
|
19
|
-
self
|
20
|
-
end
|
21
|
-
|
22
|
-
def unmasked_text
|
23
|
-
return self unless masked?
|
24
|
-
tag, length = self.split(":")
|
25
|
-
Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
|
26
|
-
end
|
27
|
-
|
28
|
-
def unmask
|
29
|
-
return self unless masked?
|
30
|
-
self.replace(unmasked_text)
|
31
|
-
self
|
32
|
-
end
|
33
|
-
|
34
|
-
def str_length
|
35
|
-
self.length
|
36
|
-
end
|
37
|
-
|
38
|
-
def masked_length
|
39
|
-
self.split(":").last.to_i
|
40
|
-
end
|
41
|
-
|
42
|
-
def segment_length
|
43
|
-
masked? ? masked_length : str_length
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
3
|
+
#module SegmentWithDocid
|
4
|
+
# extend Annotation
|
5
|
+
#
|
6
|
+
# self.annotation :docid
|
7
|
+
#
|
8
|
+
# def masked?
|
9
|
+
# self[0..5] == "MASKED"
|
10
|
+
# end
|
11
|
+
#
|
12
|
+
# def mask
|
13
|
+
# return self if masked?
|
14
|
+
# raise "Cannot mask an array of elements, they must be masked individually" if Array === self
|
15
|
+
# raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
|
16
|
+
# raise "Cannot mask a segment with no offset" if offset.nil?
|
17
|
+
# textual_position = ["MASKED", length] * ":"
|
18
|
+
# self.replace(textual_position)
|
19
|
+
# self
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# def unmasked_text
|
23
|
+
# return self unless masked?
|
24
|
+
# tag, length = self.split(":")
|
25
|
+
# Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
|
26
|
+
# end
|
27
|
+
#
|
28
|
+
# def unmask
|
29
|
+
# return self unless masked?
|
30
|
+
# self.replace(unmasked_text)
|
31
|
+
# self
|
32
|
+
# end
|
33
|
+
#
|
34
|
+
# def str_length
|
35
|
+
# self.length
|
36
|
+
# end
|
37
|
+
#
|
38
|
+
# def masked_length
|
39
|
+
# self.split(":").last.to_i
|
40
|
+
# end
|
41
|
+
#
|
42
|
+
# def segment_length
|
43
|
+
# masked? ? masked_length : str_length
|
44
|
+
# end
|
45
|
+
#end
|
46
|
+
#
|
@@ -111,10 +111,10 @@ module Transformed
|
|
111
111
|
|
112
112
|
self[updated_begin..updated_end] = new
|
113
113
|
|
114
|
-
@transformed_segments[segment.
|
114
|
+
@transformed_segments[segment.segment_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
|
115
115
|
|
116
116
|
segment.replace original_text
|
117
|
-
stack << segment.
|
117
|
+
stack << segment.segment_id
|
118
118
|
end
|
119
119
|
@transformation_stack << stack
|
120
120
|
end
|
@@ -10,10 +10,6 @@ class TestCorpusDocument < Test::Unit::TestCase
|
|
10
10
|
Segment.align(self.text, words)
|
11
11
|
end
|
12
12
|
|
13
|
-
Open.mkdir Rbbt.tmp.test.annotations.find
|
14
|
-
Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
|
15
|
-
|
16
|
-
|
17
13
|
Corpus::Document.define_multiple :words2 do |documents|
|
18
14
|
documents.collect do |doc|
|
19
15
|
words = doc.text.split(" ")
|
@@ -21,32 +17,66 @@ class TestCorpusDocument < Test::Unit::TestCase
|
|
21
17
|
end
|
22
18
|
end
|
23
19
|
|
20
|
+
Open.mkdir Rbbt.tmp.test.annotations.find
|
21
|
+
|
22
|
+
Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
|
24
23
|
Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
|
25
24
|
end
|
26
25
|
|
27
26
|
def test_words
|
28
27
|
text = "This is a test document"
|
29
|
-
document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc", text)
|
28
|
+
document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", text)
|
30
29
|
assert_equal Segment.sort(document.words), text.split(" ")
|
30
|
+
assert document.words.first.docid
|
31
|
+
assert document.words.first.segment_id.include?("TEST")
|
31
32
|
end
|
32
33
|
|
33
34
|
def test_words_multiple
|
34
35
|
document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
|
35
|
-
document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is
|
36
|
+
document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
|
37
|
+
document3 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc3:3", "This is yet another test document")
|
36
38
|
|
37
|
-
docs = [document1, document2]
|
39
|
+
docs = [document1, document2, document3]
|
38
40
|
|
39
41
|
Corpus::Document.prepare_multiple(docs, :words2)
|
40
|
-
|
42
|
+
|
43
|
+
assert document1.words.first.docid
|
44
|
+
assert document1.words.first.segment_id.include?("TEST")
|
45
|
+
|
41
46
|
assert_equal document1.words2, document1.text.split(" ")
|
42
47
|
assert_equal document2.words2, document2.text.split(" ")
|
48
|
+
assert_equal document3.words2, document3.text.split(" ")
|
43
49
|
|
44
50
|
document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
|
45
|
-
document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is
|
51
|
+
document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
|
46
52
|
|
47
53
|
docs = [document1, document2]
|
48
54
|
|
49
55
|
Corpus::Document.prepare_multiple(docs, :words2)
|
50
56
|
end
|
57
|
+
|
58
|
+
def test_parallel
|
59
|
+
text =<<-EOF
|
60
|
+
This is a test document number
|
61
|
+
EOF
|
62
|
+
|
63
|
+
docs = []
|
64
|
+
100.times do |i|
|
65
|
+
docs << text.chomp + " " + i.to_s
|
66
|
+
end
|
67
|
+
|
68
|
+
Log.with_severity 0 do
|
69
|
+
TSV.traverse docs, :cpus => 10, :bar => true do |doc|
|
70
|
+
hash = Misc.digest(doc)
|
71
|
+
document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
|
72
|
+
assert_equal Segment.sort(document.words), document.text.split(" ")
|
73
|
+
end
|
74
|
+
TSV.traverse docs, :cpus => 10, :bar => true do |doc|
|
75
|
+
hash = Misc.digest(doc)
|
76
|
+
document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
|
77
|
+
assert_equal Segment.sort(document.words), document.text.split(" ")
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
51
81
|
end
|
52
82
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rbbt-text
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.9
|
4
|
+
version: 1.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Miguel Vazquez
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-04-
|
11
|
+
date: 2020-04-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rbbt-util
|