rbbt-text 1.1.9 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rbbt/ner/g_norm_plus.rb +19 -12
- data/lib/rbbt/text/corpus/document.rb +63 -41
- data/lib/rbbt/text/segment.rb +10 -2
- data/lib/rbbt/text/segment/docid.rb +44 -44
- data/lib/rbbt/text/segment/named_entity.rb +1 -0
- data/lib/rbbt/text/segment/transformed.rb +2 -2
- data/test/rbbt/ner/test_g_norm_plus.rb +0 -1
- data/test/rbbt/text/corpus/test_document.rb +39 -9
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: '009cfce2ce954c03db5c09d0bd6f5d25bf59d508776d7370bb6bd0fb3a135f36'
+  data.tar.gz: 3d11d2a5934512958d10dbdfad5e22a9a2481b332c985ab1e2c8e92427d6f375
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e9338d4b54d2b66efda11dee3d37366c4f4ae78bde80f0abc1016b34c928e1db857ad73f33ba1da611ad232513498430736c46134a902b3930a8f832afed3e09
+  data.tar.gz: 0cdeeee67636d4e0b0714334b3c187cb0f5ea5c7363fe27fc84d438643a0d6f204413a4dd5d99c8c43d847539320c484fde2b5300b298cf9cc782148d98802ee
data/lib/rbbt/ner/g_norm_plus.rb
CHANGED
@@ -10,35 +10,39 @@ module GNormPlus
   end
 
   CONFIG =<<-EOF
-
 #===Annotation
 #Attribution setting:
 #FocusSpecies = Taxonomy ID
-#
-#
-#
-#
-#
-#
-#
-#
+# All: All species
+# 9606: Human
+# 4932: yeast
+# 7227: Fly
+# 10090: Mouse
+# 10116: Rat
+# 7955: Zebrafish
+# 3702: Arabidopsis thaliana
 #open: True
 #close: False
 
 [Focus Species]
-FocusSpecies =
+FocusSpecies = 9606
+FilterAntibody = False
 [Dictionary & Model]
 DictionaryFolder = ./Dictionary
 GNRModel = ./Dictionary/GNR.Model
 SCModel = ./Dictionary/SimConcept.Model
 GeneIDMatch = True
+HomologeneID = False
 Normalization2Protein = False
+ShowUnNormalizedMention = False
 DeleteTmp = True
+IgnoreNER = True
 EOF
 
   def self.process(texts)
     TmpFile.with_file do |tmpdir|
       Open.mkdir tmpdir
+
       Misc.in_dir tmpdir do
         Open.ln_s Rbbt.software.opt.GNormPlus.Dictionary.find, '.'
         Open.ln_s Rbbt.software.opt.GNormPlus["BioC.dtd"].find, '.'
@@ -50,12 +54,12 @@ EOF
 
         texts.each do |name,text|
           Open.write("input/#{name}.txt") do |f|
-            f.puts "#{name}|a|" << text
+            f.puts "#{name}|a|" << text.gsub("\n\n", "\n·")
            f.puts
          end
        end
        Open.write('config', CONFIG)
-        CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.find}/GNormPlus.jar' 'input' 'output' 'config'")
+        CMD.cmd_log("java -Xmx20G -Xms20G -jar '#{Rbbt.software.opt.GNormPlus.produce.find}/GNormPlus.jar' 'input' 'output' 'config'")
 
        if texts.respond_to? :key_field
          key_field = texts.key_field
@@ -68,6 +72,9 @@ EOF
          entities = Open.read(file).split("\n")[1..-1].collect{|l| l.gsub(':', '.').split("\t")[1..-1] * ":"}
          tsv[name] = entities
        end
+
+        raise "GNormPlus failed: no results found" if tsv.size == 0 && texts.size > 0
+
        tsv
      end
    end
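For orientation, a minimal usage sketch of the updated entry point. It is not part of the diff: the document name and text are invented, and it assumes a GNormPlus installation already available under Rbbt.software.opt.GNormPlus.

    require 'rbbt/ner/g_norm_plus'

    # Hypothetical input: a hash of document name => text. With this release,
    # blank lines inside a text are collapsed ("\n\n" -> "\n·") before the
    # GNormPlus input files are written.
    texts = { "doc1" => "BRCA1 is a human gene.\n\nIt participates in DNA repair." }

    # Returns a TSV of gene mentions per document; the new guard raises
    # "GNormPlus failed: no results found" when no output is produced.
    mentions = GNormPlus.process(texts)
    mentions.each do |name, entities|
      puts "#{name}: #{entities.inspect}"
    end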
data/lib/rbbt/text/corpus/document.rb
CHANGED
@@ -1,5 +1,6 @@
 require 'rbbt/text/segment'
 require 'rbbt/text/segment/segmented'
+require 'rbbt/text/segment/docid'
 require 'rbbt/tsv'
 require 'rbbt/resource/path'
 require 'rbbt/persist/tsv'
@@ -15,6 +16,7 @@ class Corpus
     attr_accessor :text, :docid, :namespace, :id, :type, :hash, :segments, :segment_indices, :persist_dir, :global_persistence, :corpus
 
     attr_accessor :multiple_result
+
     def initialize(persist_dir = nil, docid = nil, text = nil, global_persistence = nil, corpus = nil)
       @segments = {}
       @segment_indices = {}
@@ -44,16 +46,22 @@ class Corpus
     end
 
     def self.define(entity, &block)
-      send :define_method, "produce_#{entity}"
+      send :define_method, "produce_#{entity}" do
+        segments = self.instance_exec &block
 
-
+        segments.each{|s| s.docid = docid }
+      end
+
+      self.class_eval <<-EOC, __FILE__, __LINE__ + 1
        def load_#{entity}(raw = false)
          return if segments.include? "#{ entity }"
          if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
-
+           entities = load_with_persistence_#{entity}(raw)
         else
-
+           entities = produce_#{entity}
         end
+
+          segments["#{ entity }"] = entities
        end
 
        def #{entity}(raw = false)
@@ -77,7 +85,10 @@ class Corpus
 
     def self.define_multiple(entity, &block)
       send :define_method, "produce_#{entity}" do
-
+        if self.multiple_result && self.multiple_result[entity]
+          segments = self.multiple_result[entity]
+          return segments.each{|s| s.docid = docid }
+        end
         raise MultipleEntity, "Entity #{entity} runs with multiple documents, please prepare beforehand with prepare_multiple: #{self.docid}"
       end
 
@@ -86,14 +97,16 @@ class Corpus
         self
       end.send :define_method, name, &block
 
-      self.class_eval <<-EOC, __FILE__, __LINE__
+      self.class_eval <<-EOC, __FILE__, __LINE__ + 1
        def load_#{entity}(raw = false)
          return if segments.include? "#{ entity }"
          if self.respond_to?("load_with_persistence_#{entity}") and not @persist_dir.nil?
-
+           entities = load_with_persistence_#{entity}(raw)
         else
-
+           entities = produce_#{entity}
         end
+
+          segments["#{ entity }"] = entities
        end
 
        def #{entity}(raw = false)
@@ -124,7 +137,7 @@ class Corpus
             missing << doc
           end
         end
-        res = self.send("multiple_produce_#{entity.to_s}", missing)
+        res = self.send("multiple_produce_#{entity.to_s}", missing) if missing.any?
        case res
        when Array
          res.each_with_index do |res,i|
@@ -142,7 +155,9 @@ class Corpus
            end
          end
        end
-        missing.each{|doc|
+        missing.each{|doc|
+          doc.send entity
+        }
      end
 
 
@@ -197,7 +212,7 @@ class Corpus
       FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields unless fields.nil?
     end
 
-    self.class_eval <<-EOC, __FILE__, __LINE__
+    self.class_eval <<-EOC, __FILE__, __LINE__ + 1
      def load_with_persistence_#{entity}(raw = false)
        repo = TSV_REPOS["#{ entity }"]
        if repo.nil?
@@ -253,7 +268,7 @@ class Corpus
 
     FIELDS_FOR_ENTITY_PERSISTENCE[entity.to_s] = fields
 
-    self.class_eval <<-EOC, __FILE__, __LINE__
+    self.class_eval <<-EOC, __FILE__, __LINE__ + 1
      def load_with_persistence_#{entity}(raw = false)
        fields = FIELDS_FOR_ENTITY_PERSISTENCE["#{ entity }"]
 
@@ -261,20 +276,23 @@ class Corpus
 
        begin
 
-          data.
+          if data.respond_to? :persistence_path and String === data.persistence_path
+            data.filter(data.persistence_path + '.filters')
+          end
+
+          keys = data.read_and_close do
 
-
+            fields = data.fields if fields.nil? and data.respond_to? :fields
 
+            data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
+            data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
+            keys = data.keys
+            data.pop_filter if fields.include?("#{entity_field}")
+            data.pop_filter if fields.include?("#{doc_field}")
 
-
-          data.filter(data.persistence_path + '.filters')
+            keys
          end
 
-          data.add_filter("field:#{ doc_field }", @docid) if data.fields.include?("#{doc_field}")
-          data.add_filter("field:#{ entity_field }", "#{ entity }") if data.fields.include?("#{entity_field}")
-          keys = data.keys
-          data.pop_filter if data.fields.include?("#{entity_field}")
-          data.pop_filter if data.fields.include?("#{doc_field}")
 
        if keys.empty?
          segments = produce_#{entity}
@@ -289,34 +307,38 @@ class Corpus
            "#{ entity }"
          end
 
-
-
-
-
-
-
+          keys = data.write_and_close do
+            data.add_filter("field:#{ doc_field }", @docid) if fields.include?("#{doc_field}")
+            data.add_filter("field:#{ entity_field }", "#{ entity }") if fields.include?("#{entity_field}")
+            keys = tsv.collect do |key, value|
+              data[key] = value
+              key
+            end
+            data.pop_filter if fields.include?("#{entity_field}")
+            data.pop_filter if fields.include?("#{doc_field}")
+            keys
          end
-          data.pop_filter if data.fields.include?("#{entity_field}")
-          data.pop_filter if data.fields.include?("#{doc_field}")
-          data.read
 
        else
-          if raw == :check
-            data.close
-            return nil
-          end
+          return nil if raw == :check
        end
 
        return data.values if raw
 
        start_pos = data.identify_field "Start"
-
+        data.read_and_close do
+          data.chunked_values_at(keys).collect{|annotation|
+            begin
              pos = annotation[start_pos]
-              Segment.load_tsv_values(text, annotation,
-
-
-
-
+              Segment.load_tsv_values(text, annotation, fields) unless [-1, "-1", [-1], ["-1"]].include?(pos)
+            rescue
+              Log.exception $!
+              iif keys
+              iif [text, annotation]
+            end
+
+          }.compact
+        end
      ensure
        data.close
      end
@@ -348,7 +370,7 @@ class Corpus
       segment.segments[name] = annotations
       class << segment
         self
-      end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__
+      end.class_eval "def #{ name }; @segments['#{ name }']; end", __FILE__, __LINE__ + 1
     end
 
     segment
data/lib/rbbt/text/segment.rb
CHANGED
@@ -3,7 +3,7 @@ require 'rbbt/fix_width_table'
 
 module Segment
   extend Annotation
-  self.annotation :offset
+  self.annotation :offset, :docid
 
   def segment_length
     begin
@@ -325,7 +325,7 @@ module Segment
     tsv = TSV.setup({}, :key_field => "ID", :fields => fields, :type => :double)
 
     segments.each do |segment|
-      tsv[segment.
+      tsv[segment.segment_id] = self.tsv_values_for_segment(segment, fields)
     end
 
     tsv
@@ -348,6 +348,14 @@ module Segment
     [offset, self.end] * ".."
   end
 
+  def segment_id
+    if self.respond_to?(:docid)
+      [docid, locus, Misc.obj2digest(info)] * ":"
+    else
+      Misc.obj2digest(info)
+    end
+  end
+
   #def ==(other)
   #  self.text == other.text
   #end
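A small sketch of what the new segment_id yields; the values are illustrative and the Segment.setup call assumes the usual rbbt-util Annotation setup API:

    require 'rbbt/text/segment'

    gene = "BRCA1"
    # :offset and the new :docid annotation are attached to the string.
    Segment.setup(gene, :offset => 10, :docid => "TEST:test_doc:1")

    # With a docid present, segment_id joins docid, locus and a digest of the
    # annotation info, something like "TEST:test_doc:1:10..14:<digest>";
    # without a docid it falls back to the digest alone.
    gene.segment_id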
data/lib/rbbt/text/segment/docid.rb
CHANGED
@@ -1,46 +1,46 @@
 require 'rbbt/text/segment'
 
-module SegmentWithDocid
-  extend Annotation
-
-  self.annotation :docid
-
-  def masked?
-    self[0..5] == "MASKED"
-  end
-
-  def mask
-    return self if masked?
-    raise "Cannot mask an array of elements, they must be masked individually" if Array === self
-    raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
-    raise "Cannot mask a segment with no offset" if offset.nil?
-    textual_position = ["MASKED", length] * ":"
-    self.replace(textual_position)
-    self
-  end
-
-  def unmasked_text
-    return self unless masked?
-    tag, length = self.split(":")
-    Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
-  end
-
-  def unmask
-    return self unless masked?
-    self.replace(unmasked_text)
-    self
-  end
-
-  def str_length
-    self.length
-  end
-
-  def masked_length
-    self.split(":").last.to_i
-  end
-
-  def segment_length
-    masked? ? masked_length : str_length
-  end
-end
-
+#module SegmentWithDocid
+#  extend Annotation
+#
+#  self.annotation :docid
+#
+#  def masked?
+#    self[0..5] == "MASKED"
+#  end
+#
+#  def mask
+#    return self if masked?
+#    raise "Cannot mask an array of elements, they must be masked individually" if Array === self
+#    raise "Cannot mask a segment with no docid" if not self.respond_to? :docid or docid.nil?
+#    raise "Cannot mask a segment with no offset" if offset.nil?
+#    textual_position = ["MASKED", length] * ":"
+#    self.replace(textual_position)
+#    self
+#  end
+#
+#  def unmasked_text
+#    return self unless masked?
+#    tag, length = self.split(":")
+#    Document.setup(docid).text[offset.to_i..(offset.to_i+length.to_i-1)]
+#  end
+#
+#  def unmask
+#    return self unless masked?
+#    self.replace(unmasked_text)
+#    self
+#  end
+#
+#  def str_length
+#    self.length
+#  end
+#
+#  def masked_length
+#    self.split(":").last.to_i
+#  end
+#
+#  def segment_length
+#    masked? ? masked_length : str_length
+#  end
+#end
+#
data/lib/rbbt/text/segment/transformed.rb
CHANGED
@@ -111,10 +111,10 @@ module Transformed
 
       self[updated_begin..updated_end] = new
 
-      @transformed_segments[segment.
+      @transformed_segments[segment.segment_id] = [segment.range, diff, updated_text, updated_range, @transformed_segments.size]
 
       segment.replace original_text
-      stack << segment.
+      stack << segment.segment_id
     end
     @transformation_stack << stack
   end
data/test/rbbt/text/corpus/test_document.rb
CHANGED
@@ -10,10 +10,6 @@ class TestCorpusDocument < Test::Unit::TestCase
     Segment.align(self.text, words)
   end
 
-  Open.mkdir Rbbt.tmp.test.annotations.find
-  Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
-
-
   Corpus::Document.define_multiple :words2 do |documents|
     documents.collect do |doc|
       words = doc.text.split(" ")
@@ -21,32 +17,66 @@ class TestCorpusDocument < Test::Unit::TestCase
     end
   end
 
+  Open.mkdir Rbbt.tmp.test.annotations.find
+
+  Corpus::Document.persist_in_global_tsv(:words, Rbbt.tmp.test.anotations.words.find)
   Corpus::Document.persist_in_global_tsv(:words2, Rbbt.tmp.test.anotations.counts.find)
 end
 
 def test_words
   text = "This is a test document"
-    document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc", text)
+    document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", text)
     assert_equal Segment.sort(document.words), text.split(" ")
+    assert document.words.first.docid
+    assert document.words.first.segment_id.include?("TEST")
   end
 
   def test_words_multiple
     document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
-    document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is
+    document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
+    document3 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc3:3", "This is yet another test document")
 
-    docs = [document1, document2]
+    docs = [document1, document2, document3]
 
     Corpus::Document.prepare_multiple(docs, :words2)
-
+
+    assert document1.words.first.docid
+    assert document1.words.first.segment_id.include?("TEST")
+
     assert_equal document1.words2, document1.text.split(" ")
     assert_equal document2.words2, document2.text.split(" ")
+    assert_equal document3.words2, document3.text.split(" ")
 
     document1 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:1", "This is a test document")
-    document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is
+    document2 = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc2:2", "This is another test document")
 
     docs = [document1, document2]
 
    Corpus::Document.prepare_multiple(docs, :words2)
  end
+
+  def test_parallel
+    text =<<-EOF
+This is a test document number
+    EOF
+
+    docs = []
+    100.times do |i|
+      docs << text.chomp + " " + i.to_s
+    end
+
+    Log.with_severity 0 do
+      TSV.traverse docs, :cpus => 10, :bar => true do |doc|
+        hash = Misc.digest(doc)
+        document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
+        assert_equal Segment.sort(document.words), document.text.split(" ")
+      end
+      TSV.traverse docs, :cpus => 10, :bar => true do |doc|
+        hash = Misc.digest(doc)
+        document = Corpus::Document.new(Rbbt.tmp.test.persist, "TEST:test_doc:test:#{hash}", doc)
+        assert_equal Segment.sort(document.words), document.text.split(" ")
+      end
+    end
+  end
 end
 
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: rbbt-text
 version: !ruby/object:Gem::Version
-  version: 1.1.9
+  version: 1.2.0
 platform: ruby
 authors:
 - Miguel Vazquez
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-04-
+date: 2020-04-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rbbt-util