rbbt-util 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt-util.rb +2 -0
- data/lib/rbbt.rb +1 -0
- data/lib/rbbt/util/R.rb +24 -0
- data/lib/rbbt/util/bed.rb +325 -0
- data/lib/rbbt/util/cmd.rb +2 -1
- data/lib/rbbt/util/data_module.rb +25 -34
- data/lib/rbbt/util/excel2tsv.rb +2 -3
- data/lib/rbbt/util/log.rb +5 -0
- data/lib/rbbt/util/misc.rb +29 -1
- data/lib/rbbt/util/open.rb +1 -0
- data/lib/rbbt/util/persistence.rb +109 -0
- data/lib/rbbt/util/pkg_data.rb +114 -62
- data/lib/rbbt/util/rake.rb +78 -0
- data/lib/rbbt/util/tc_hash.rb +7 -1
- data/lib/rbbt/util/tsv.rb +582 -153
- data/lib/rbbt/util/workflow.rb +1 -2
- data/share/lib/R/util.R +89 -0
- data/test/rbbt/util/test_R.rb +9 -0
- data/test/rbbt/util/test_bed.rb +136 -0
- data/test/rbbt/util/test_data_module.rb +10 -10
- data/test/rbbt/util/test_misc.rb +1 -0
- data/test/rbbt/util/test_persistence.rb +60 -0
- data/test/rbbt/util/test_pkg_data.rb +113 -0
- data/test/rbbt/util/test_rake.rb +54 -0
- data/test/rbbt/util/test_tsv.rb +91 -46
- data/test/rbbt/util/test_workflow.rb +5 -2
- data/test/test_helper.rb +4 -0
- data/test/test_pkg.rb +0 -10
- data/test/test_rbbt.rb +3 -48
- metadata +21 -6
data/lib/rbbt/util/tc_hash.rb
CHANGED
@@ -6,7 +6,13 @@ class TCHash < TokyoCabinet::HDB
|
|
6
6
|
|
7
7
|
Serializer = Marshal
|
8
8
|
|
9
|
-
FIELD_INFO_ENTRIES = {
|
9
|
+
FIELD_INFO_ENTRIES = {
|
10
|
+
:fields => '__tokyocabinet_hash_fields',
|
11
|
+
:key_field => '__tokyocabinet_hash_key_field',
|
12
|
+
:filename => '__tokyocabinet_hash_filename',
|
13
|
+
:type => '__tokyocabinet_hash_type',
|
14
|
+
:case_insensitive => '__tokyocabinet_hash_case_insensitive'
|
15
|
+
}
|
10
16
|
CONNECTIONS = {}
|
11
17
|
|
12
18
|
FIELD_INFO_ENTRIES.each do |entry, key|
|
data/lib/rbbt/util/tsv.rb
CHANGED
@@ -3,17 +3,10 @@ require 'rbbt/util/open'
|
|
3
3
|
require 'rbbt/util/tc_hash'
|
4
4
|
require 'rbbt/util/tmpfile'
|
5
5
|
require 'rbbt/util/log'
|
6
|
+
require 'rbbt/util/persistence'
|
6
7
|
require 'digest'
|
7
8
|
require 'fileutils'
|
8
9
|
|
9
|
-
def add_defaults(options, defaults = {})
|
10
|
-
new_options = options.dup
|
11
|
-
defaults.each do |key, value|
|
12
|
-
new_options[key] = value if new_options[key].nil?
|
13
|
-
end
|
14
|
-
new_options
|
15
|
-
end
|
16
|
-
|
17
10
|
class TSV
|
18
11
|
class FieldNotFoundError < StandardError;end
|
19
12
|
|
@@ -26,8 +19,6 @@ class TSV
|
|
26
19
|
|
27
20
|
#{{{ Persistence
|
28
21
|
|
29
|
-
PersistenceHash = TCHash
|
30
|
-
|
31
22
|
CACHEDIR="/tmp/tsv_persistent_cache"
|
32
23
|
FileUtils.mkdir CACHEDIR unless File.exist? CACHEDIR
|
33
24
|
|
@@ -40,10 +31,7 @@ class TSV
|
|
40
31
|
CACHEDIR
|
41
32
|
end
|
42
33
|
|
43
|
-
|
44
|
-
File.join(CACHEDIR, prefix.gsub(/\s/,'_').gsub(/\//,'>') + Digest::MD5.hexdigest([file, options].inspect))
|
45
|
-
end
|
46
|
-
|
34
|
+
|
47
35
|
#{{{ Headers and Field Stuff
|
48
36
|
|
49
37
|
def self.headers(file, options = {})
|
@@ -130,7 +118,7 @@ class TSV
|
|
130
118
|
end
|
131
119
|
|
132
120
|
each do |key, values|
|
133
|
-
if
|
121
|
+
if type == :double
|
134
122
|
tmp_values = values + [[key]]
|
135
123
|
else
|
136
124
|
tmp_values = values + [key]
|
@@ -144,11 +132,15 @@ class TSV
|
|
144
132
|
new_values = tmp_values.values_at(*new_field_positions)
|
145
133
|
end
|
146
134
|
|
147
|
-
tmp_values[new_key_position]
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
135
|
+
if not Array === tmp_values[new_key_position]
|
136
|
+
yield tmp_values[new_key_position], NamedArray.name(new_values, new_field_names)
|
137
|
+
else
|
138
|
+
tmp_values[new_key_position].each do |new_key|
|
139
|
+
if new_field_names
|
140
|
+
yield new_key, NamedArray.name(new_values, new_field_names)
|
141
|
+
else
|
142
|
+
yield new_key, new_values
|
143
|
+
end
|
152
144
|
end
|
153
145
|
end
|
154
146
|
end
|
@@ -165,7 +157,7 @@ class TSV
|
|
165
157
|
|
166
158
|
def reorder(new_key_field, new_fields = nil, options = {})
|
167
159
|
options = Misc.add_defaults options
|
168
|
-
return TSV.new(
|
160
|
+
return TSV.new(Persistence::TSV.get(options[:persistence_file], false), :case_insensitive => case_insensitive) if options[:persistence_file] and File.exists?(options[:persistence_file])
|
169
161
|
|
170
162
|
new = {}
|
171
163
|
new_key_field, new_fields = through new_key_field, new_fields do |key, values|
|
@@ -181,7 +173,7 @@ class TSV
|
|
181
173
|
end
|
182
174
|
|
183
175
|
if options[:persistence_file]
|
184
|
-
reordered = TSV.new(
|
176
|
+
reordered = TSV.new(Persistence::TSV.get(options[:persistence_file], false), :case_insensitive => case_insensitive)
|
185
177
|
reordered.merge! new
|
186
178
|
else
|
187
179
|
reordered = TSV.new(new, :case_insensitive => case_insensitive)
|
@@ -199,12 +191,12 @@ class TSV
|
|
199
191
|
|
200
192
|
def add_field(name = nil)
|
201
193
|
each do |key, values|
|
202
|
-
self[key] = values
|
194
|
+
self[key] = values + [yield(key, values)]
|
203
195
|
end
|
204
196
|
|
205
|
-
fields
|
206
|
-
|
207
|
-
|
197
|
+
if fields != nil
|
198
|
+
new_fields = fields + [name]
|
199
|
+
self.fields = new_fields
|
208
200
|
end
|
209
201
|
end
|
210
202
|
|
@@ -212,6 +204,9 @@ class TSV
|
|
212
204
|
new = TSV.new({})
|
213
205
|
new.key_field = key_field
|
214
206
|
new.fields = fields.dup
|
207
|
+
new.type = type
|
208
|
+
new.filename = filename + "#Select: #{method.inspect}"
|
209
|
+
new.case_insensitive = case_insensitive
|
215
210
|
|
216
211
|
case
|
217
212
|
when Array === method
|
@@ -222,6 +217,10 @@ class TSV
|
|
222
217
|
through do |key, values|
|
223
218
|
new[key] = values if [key,values].flatten.select{|v| v =~ method}.any?
|
224
219
|
end
|
220
|
+
when String === method
|
221
|
+
through do |key, values|
|
222
|
+
new[key] = values if [key,values].flatten.select{|v| v == method}.any?
|
223
|
+
end
|
225
224
|
when Hash === method
|
226
225
|
key = method.keys.first
|
227
226
|
method = method.values.first
|
@@ -230,89 +229,87 @@ class TSV
|
|
230
229
|
method.each{|item| if values = self[item]; then new[item] = values; end}
|
231
230
|
when Array === method
|
232
231
|
through :main, key do |key, values|
|
233
|
-
new[key] =
|
232
|
+
new[key] = self[key] if (values.flatten & method).any?
|
234
233
|
end
|
235
234
|
when Regexp === method
|
236
235
|
through :main, key do |key, values|
|
237
|
-
new[key] =
|
236
|
+
new[key] = self[key] if values.flatten.select{|v| v =~ method}.any?
|
237
|
+
end
|
238
|
+
when String === method
|
239
|
+
through :main, key do |key, values|
|
240
|
+
new[key] = self[key] if values.flatten.select{|v| v == method}.any?
|
238
241
|
end
|
239
242
|
end
|
240
243
|
end
|
241
244
|
|
245
|
+
|
242
246
|
new
|
243
247
|
end
|
244
248
|
|
245
249
|
def index(options = {})
|
246
|
-
options = Misc.add_defaults options, :order => false
|
250
|
+
options = Misc.add_defaults options, :order => false, :persistence => false
|
247
251
|
|
248
|
-
|
249
|
-
|
250
|
-
|
252
|
+
new, extra = Persistence.persist(filename, :Index, :tsv, options) do |filename, options|
|
253
|
+
new = {}
|
254
|
+
if options[:order]
|
255
|
+
new_key_field, new_fields = through options[:target], options[:others] do |key, values|
|
251
256
|
|
252
|
-
|
253
|
-
|
254
|
-
end
|
257
|
+
values.each_with_index do |list, i|
|
258
|
+
next if list.nil? or list.empty?
|
255
259
|
|
256
|
-
|
257
|
-
if options[:order]
|
258
|
-
new_key_field, new_fields = through options[:field], options[:others] do |key, values|
|
259
|
-
|
260
|
-
values.each_with_index do |list, i|
|
261
|
-
next if list.nil? or list.empty?
|
262
|
-
|
263
|
-
list = [list] unless Array === list
|
260
|
+
list = [list] unless Array === list
|
264
261
|
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
262
|
+
list.each do |value|
|
263
|
+
next if value.nil? or value.empty?
|
264
|
+
value = value.downcase if options[:case_insensitive]
|
265
|
+
new[value] ||= []
|
266
|
+
new[value][i + 1] ||= []
|
267
|
+
new[value][i + 1] << key
|
268
|
+
end
|
272
269
|
new[key] ||= []
|
273
270
|
new[key][0] = key
|
274
|
-
|
271
|
+
end
|
275
272
|
|
276
|
-
|
273
|
+
end
|
277
274
|
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
275
|
+
new.each do |key, values|
|
276
|
+
values.flatten!
|
277
|
+
values.compact!
|
278
|
+
end
|
282
279
|
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
280
|
+
else
|
281
|
+
new_key_field, new_fields = through options[:target], options[:others] do |key, values|
|
282
|
+
new[key] ||= []
|
283
|
+
new[key] << key
|
284
|
+
values.each do |list|
|
285
|
+
next if list.nil?
|
286
|
+
if Array === list
|
287
|
+
list.each do |value|
|
288
|
+
value = value.downcase if options[:case_insensitive]
|
289
|
+
new[value] ||= []
|
290
|
+
new[value] << key
|
291
|
+
end
|
292
|
+
else
|
293
|
+
next if list.empty?
|
294
|
+
value = list
|
291
295
|
value = value.downcase if options[:case_insensitive]
|
292
296
|
new[value] ||= []
|
293
297
|
new[value] << key
|
294
298
|
end
|
295
|
-
else
|
296
|
-
next if list.empty?
|
297
|
-
value = list
|
298
|
-
value = value.downcase if options[:case_insensitive]
|
299
|
-
new[value] ||= []
|
300
|
-
new[value] << key
|
301
299
|
end
|
302
300
|
end
|
303
301
|
end
|
304
|
-
end
|
305
302
|
|
306
|
-
|
307
|
-
index = TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => options[:case_insensitive])
|
308
|
-
index.merge! new
|
309
|
-
else
|
310
|
-
index = TSV.new(new, :case_insensitive => options[:case_insensitive])
|
303
|
+
[new, {:key_field => new_key_field, :fields => new_fields, :type => :double, :case_insensitive => options[:case_insensitive]}]
|
311
304
|
end
|
312
305
|
|
313
|
-
|
314
|
-
|
315
|
-
|
306
|
+
new = TSV.new(new)
|
307
|
+
new.filename = "Index: " + filename + options.inspect
|
308
|
+
new.fields = extra[:fields]
|
309
|
+
new.key_field = extra[:key_field]
|
310
|
+
new.case_insensitive = extra[:case_insensitive]
|
311
|
+
new.type = extra[:type]
|
312
|
+
new
|
316
313
|
end
|
317
314
|
|
318
315
|
def smart_merge(other, match = nil, new_fields = nil)
|
@@ -413,7 +410,7 @@ class TSV
|
|
413
410
|
|
414
411
|
if nofieldinfo
|
415
412
|
next if other[other_code].nil?
|
416
|
-
if
|
413
|
+
if type == :double
|
417
414
|
other_values = [[other_code]] + other[other_code]
|
418
415
|
else
|
419
416
|
other_values = [other_code] + other[other_code]
|
@@ -427,13 +424,13 @@ class TSV
|
|
427
424
|
new_values = values + other_values
|
428
425
|
else
|
429
426
|
if other[other_code].nil?
|
430
|
-
if
|
427
|
+
if type == :double
|
431
428
|
other_values = [[]] * other.fields.length
|
432
429
|
else
|
433
430
|
other_values = [] * other.fields.length
|
434
431
|
end
|
435
432
|
else
|
436
|
-
if
|
433
|
+
if type == :double
|
437
434
|
other_values = other[other_code] + [[other_code]]
|
438
435
|
else
|
439
436
|
other_values = other[other_code] + [other_code]
|
@@ -443,11 +440,11 @@ class TSV
|
|
443
440
|
|
444
441
|
new_values = values.dup
|
445
442
|
|
446
|
-
if
|
443
|
+
if type == :double
|
447
444
|
this_common_field_positions.zip(other_common_field_positions).each do |tpos, opos|
|
448
445
|
new_values_tops = new_values[tpos]
|
449
446
|
|
450
|
-
if other.
|
447
|
+
if other.type == :double
|
451
448
|
new_values_tops += other_values[opos]
|
452
449
|
else
|
453
450
|
new_values_tops += [other_values[opos]]
|
@@ -466,10 +463,58 @@ class TSV
|
|
466
463
|
|
467
464
|
self.fields = self.fields + new_fields unless nofieldinfo
|
468
465
|
end
|
466
|
+
|
467
|
+
|
468
|
+
def self.field_matches(tsv, values)
|
469
|
+
if values.flatten.sort[0..9].compact.collect{|n| n.to_i} == (1..10).to_a
|
470
|
+
return {}
|
471
|
+
end
|
472
|
+
|
473
|
+
key_field = tsv.key_field
|
474
|
+
fields = tsv.fields
|
475
|
+
|
476
|
+
field_values = {}
|
477
|
+
fields.each{|field|
|
478
|
+
field_values[field] = []
|
479
|
+
}
|
480
|
+
|
481
|
+
tsv.through do |key,entry_values|
|
482
|
+
fields.zip(entry_values).each do |field,entry_field_values|
|
483
|
+
field_values[field].concat entry_field_values
|
484
|
+
end
|
485
|
+
end
|
486
|
+
|
487
|
+
field_values.each do |field,field_value_list|
|
488
|
+
field_value_list.replace(values & field_value_list.flatten.uniq)
|
489
|
+
end
|
490
|
+
|
491
|
+
field_values[key_field] = values & tsv.keys
|
492
|
+
|
493
|
+
field_values
|
494
|
+
end
|
495
|
+
|
496
|
+
def field_matches(values)
|
497
|
+
TSV.field_matches(self, values)
|
498
|
+
end
|
499
|
+
|
500
|
+
|
469
501
|
|
470
502
|
#{{{ Helpers
|
471
503
|
|
472
504
|
def self.index(file, options = {})
|
505
|
+
options = Misc.add_defaults options, :data_persistence => true, :persistence => true
|
506
|
+
persistence, persistence_file = Misc.process_options options, :persistence, :persistence_file
|
507
|
+
options[:persistence], options[:persistence_file] = options.values_at :data_persistence, :data_persistence_file
|
508
|
+
options.delete :data_persistence
|
509
|
+
options.delete :data_persistence_file
|
510
|
+
|
511
|
+
index, extra = Persistence.persist(file, :Index, :tsv, options) do |file, options, filename|
|
512
|
+
TSV.new(file, :double, options).index
|
513
|
+
end
|
514
|
+
index
|
515
|
+
end
|
516
|
+
|
517
|
+
def self.index2(file, options = {})
|
473
518
|
opt_data = options.dup
|
474
519
|
opt_index = options.dup
|
475
520
|
opt_data.delete :field
|
@@ -482,7 +527,7 @@ class TSV
|
|
482
527
|
|
483
528
|
if ! opt_index[:persistence_file].nil? && File.exists?(opt_index[:persistence_file])
|
484
529
|
Log.low "Reloading persistent index for #{ file }: #{opt_index[:persistence_file]}"
|
485
|
-
TSV.new(
|
530
|
+
TSV.new(Persistence::TSV.get(opt_index[:persistence_file], false), opt_index)
|
486
531
|
else
|
487
532
|
Log.low "Creating index for #{ file }: #{opt_index[:persistence_file]}"
|
488
533
|
data = TSV.new(file, opt_data)
|
@@ -501,6 +546,23 @@ class TSV
|
|
501
546
|
end
|
502
547
|
|
503
548
|
#{{{ Accesor Methods
|
549
|
+
attr_accessor :filename, :type, :case_insensitive, :key_field, :fields, :data
|
550
|
+
|
551
|
+
def fields
|
552
|
+
return nil if @fields.nil?
|
553
|
+
fields = @fields
|
554
|
+
fields.each do |f| f.extend Field end if Array === fields
|
555
|
+
fields
|
556
|
+
end
|
557
|
+
|
558
|
+
def fields=(new_fields)
|
559
|
+
@fields = new_fields
|
560
|
+
if Persistence::TSV === @data
|
561
|
+
@data.fields = new_fields
|
562
|
+
end
|
563
|
+
end
|
564
|
+
|
565
|
+
|
504
566
|
|
505
567
|
def keys
|
506
568
|
@data.keys
|
@@ -531,6 +593,7 @@ class TSV
|
|
531
593
|
# Read
|
532
594
|
|
533
595
|
def follow(value)
|
596
|
+
return nil if value.nil?
|
534
597
|
if String === value && value =~ /__Ref:(.*)/
|
535
598
|
return self[$1]
|
536
599
|
else
|
@@ -546,7 +609,7 @@ class TSV
|
|
546
609
|
return nil
|
547
610
|
end
|
548
611
|
|
549
|
-
key = key.downcase if @case_insensitive
|
612
|
+
key = key.downcase if @case_insensitive and key !~ /^__Ref:/
|
550
613
|
follow @data[key]
|
551
614
|
end
|
552
615
|
|
@@ -587,23 +650,37 @@ class TSV
|
|
587
650
|
collect.sort_by &block
|
588
651
|
end
|
589
652
|
|
590
|
-
def
|
653
|
+
def values_to_s(values)
|
654
|
+
case
|
655
|
+
when (values.nil? and fields.nil?)
|
656
|
+
"\n"
|
657
|
+
when (values.nil? and not fields.nil?)
|
658
|
+
"\t" << ([""] * fields.length) * "\t" << "\n"
|
659
|
+
when (not Array === values)
|
660
|
+
"\t" << values.to_s << "\n"
|
661
|
+
when Array === values.first
|
662
|
+
"\t" << values.collect{|list| (list || []) * "|"} * "\t" << "\n"
|
663
|
+
else
|
664
|
+
"\t" << values * "\t" << "\n"
|
665
|
+
end
|
666
|
+
end
|
667
|
+
|
668
|
+
def to_s(keys = nil)
|
591
669
|
str = ""
|
592
670
|
|
593
671
|
if fields
|
594
672
|
str << "#" << key_field << "\t" << fields * "\t" << "\n"
|
595
673
|
end
|
596
674
|
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
str << key.dup <<
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
str << key.dup << "\t" << values * "\t" << "\n"
|
675
|
+
if keys.nil?
|
676
|
+
each do |key, values|
|
677
|
+
key = key.to_s if Symbol === key
|
678
|
+
str << key.dup << values_to_s(values)
|
679
|
+
end
|
680
|
+
else
|
681
|
+
keys.zip(values_at(*keys)).each do |key, values|
|
682
|
+
key = key.to_s if Symbol === key
|
683
|
+
str << key.dup << values_to_s(values)
|
607
684
|
end
|
608
685
|
end
|
609
686
|
|
@@ -625,11 +702,301 @@ class TSV
|
|
625
702
|
zipped = zipped.collect{|v| NamedArray.name(v, fields)} if fields
|
626
703
|
zipped
|
627
704
|
end
|
628
|
-
|
629
|
-
def self.parse(data, file, options = {})
|
630
705
|
|
706
|
+
def self.key_order(file, options = {})
|
631
707
|
# Prepare options
|
632
708
|
options = add_defaults options,
|
709
|
+
:sep => "\t",
|
710
|
+
:sep2 => "|",
|
711
|
+
:native => 0,
|
712
|
+
:fix => nil,
|
713
|
+
:exclude => nil,
|
714
|
+
:select => nil,
|
715
|
+
:grep => nil,
|
716
|
+
:case_insensitive => false,
|
717
|
+
:header_hash => '#'
|
718
|
+
|
719
|
+
options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra])
|
720
|
+
|
721
|
+
if String === file and File.exists? file
|
722
|
+
file = File.open(file)
|
723
|
+
end
|
724
|
+
|
725
|
+
#{{{ Process first line
|
726
|
+
|
727
|
+
line = file.gets
|
728
|
+
raise "Empty content" if line.nil?
|
729
|
+
line.chomp!
|
730
|
+
|
731
|
+
if line =~ /^#{options[:header_hash]}/
|
732
|
+
header_fields = parse_fields(line, options[:sep])
|
733
|
+
header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character
|
734
|
+
line = file.gets
|
735
|
+
else
|
736
|
+
header_fields = nil
|
737
|
+
end
|
738
|
+
|
739
|
+
id_pos = Misc.field_position(header_fields, options[:native])
|
740
|
+
|
741
|
+
if options[:extra].nil?
|
742
|
+
extra_pos = nil
|
743
|
+
max_cols = 0
|
744
|
+
else
|
745
|
+
extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) }
|
746
|
+
end
|
747
|
+
|
748
|
+
ids = []
|
749
|
+
#{{{ Process rest
|
750
|
+
while line do
|
751
|
+
line.chomp!
|
752
|
+
|
753
|
+
line = options[:fix].call line if options[:fix]
|
754
|
+
break if not line
|
755
|
+
|
756
|
+
# Select and fix lines
|
757
|
+
if line.empty? or
|
758
|
+
(options[:exclude] and options[:exclude].call(line)) or
|
759
|
+
(options[:select] and not options[:select].call(line))
|
760
|
+
|
761
|
+
line = file.gets
|
762
|
+
next
|
763
|
+
end
|
764
|
+
|
765
|
+
### Process line
|
766
|
+
|
767
|
+
# Chunk fields
|
768
|
+
parts = parse_fields(line, options[:sep])
|
769
|
+
|
770
|
+
# Get next line
|
771
|
+
line = file.gets
|
772
|
+
|
773
|
+
# Get id field
|
774
|
+
next if parts[id_pos].nil? || parts[id_pos].empty?
|
775
|
+
ids << parts[id_pos]
|
776
|
+
end
|
777
|
+
|
778
|
+
ids
|
779
|
+
end
|
780
|
+
|
781
|
+
def self.parse_header(stream, sep, header_hash)
|
782
|
+
fields, key_field = nil
|
783
|
+
options = {}
|
784
|
+
|
785
|
+
line = stream.gets
|
786
|
+
|
787
|
+
if line and line =~ /^#{header_hash}: (.*)/
|
788
|
+
options = Misc.string2hash $1
|
789
|
+
line = stream.gets
|
790
|
+
end
|
791
|
+
|
792
|
+
sep = options[:sep] if options[:sep]
|
793
|
+
|
794
|
+
if line and line =~ /^#{header_hash}/
|
795
|
+
line.chomp!
|
796
|
+
fields = parse_fields(line, sep)
|
797
|
+
key_field = fields.shift
|
798
|
+
key_field = key_field[(0 + header_hash.length)..-1] # Remove initial hash character
|
799
|
+
line = stream.gets
|
800
|
+
end
|
801
|
+
|
802
|
+
raise "Empty content" if line.nil?
|
803
|
+
return key_field, fields, options, line
|
804
|
+
end
|
805
|
+
|
806
|
+
def self.parse(stream, options = {})
|
807
|
+
# Prepare options
|
808
|
+
options = Misc.add_defaults options,
|
809
|
+
:case_insensitive => false,
|
810
|
+
:type => :double,
|
811
|
+
|
812
|
+
:merge => false,
|
813
|
+
:keep_empty => true,
|
814
|
+
:cast => nil,
|
815
|
+
|
816
|
+
:sep => "\t",
|
817
|
+
:sep2 => "|",
|
818
|
+
:header_hash => '#',
|
819
|
+
|
820
|
+
:key => 0,
|
821
|
+
:fields => nil,
|
822
|
+
|
823
|
+
:fix => nil,
|
824
|
+
:exclude => nil,
|
825
|
+
:select => nil,
|
826
|
+
:grep => nil
|
827
|
+
|
828
|
+
|
829
|
+
sep, header_hash =
|
830
|
+
Misc.process_options options, :sep, :header_hash
|
831
|
+
|
832
|
+
key_field, other_fields, more_options, line = TSV.parse_header(stream, sep, header_hash)
|
833
|
+
|
834
|
+
sep = more_options[:sep] if more_options[:sep]
|
835
|
+
options = Misc.add_defaults options, more_options
|
836
|
+
sep2 = Misc.process_options options, :sep2
|
837
|
+
|
838
|
+
key, others =
|
839
|
+
Misc.process_options options, :key, :others
|
840
|
+
|
841
|
+
if key_field.nil?
|
842
|
+
key_pos = key
|
843
|
+
key_field, fields = nil
|
844
|
+
else
|
845
|
+
all_fields = [key_field].concat other_fields
|
846
|
+
|
847
|
+
key_pos = Misc.field_position(all_fields, key)
|
848
|
+
|
849
|
+
if String === others or Symbol === others
|
850
|
+
others = [others]
|
851
|
+
end
|
852
|
+
|
853
|
+
if others.nil?
|
854
|
+
other_pos = (0..(all_fields.length - 1)).to_a
|
855
|
+
other_pos.delete key_pos
|
856
|
+
else
|
857
|
+
other_pos = Misc.field_position(all_fields, *others)
|
858
|
+
end
|
859
|
+
|
860
|
+
key_field = all_fields[key_pos]
|
861
|
+
fields = all_fields.values_at *other_pos
|
862
|
+
end
|
863
|
+
|
864
|
+
case_insensitive, type, merge, keep_empty, cast =
|
865
|
+
Misc.process_options options, :case_insensitive, :type, :merge, :keep_empty, :cast
|
866
|
+
fix, exclude, select, grep =
|
867
|
+
Misc.process_options options, :fix, :exclude, :select, :grep
|
868
|
+
|
869
|
+
#{{{ Process rest
|
870
|
+
data = {}
|
871
|
+
single = type.to_sym != :double
|
872
|
+
max_cols = 0
|
873
|
+
while line do
|
874
|
+
line.chomp!
|
875
|
+
|
876
|
+
line = fix.call line if fix
|
877
|
+
break if not line
|
878
|
+
|
879
|
+
if header_hash and line =~ /^#{header_hash}/
|
880
|
+
line = stream.gets
|
881
|
+
next
|
882
|
+
end
|
883
|
+
|
884
|
+
if line.empty? or
|
885
|
+
(exclude and exclude.call(line)) or
|
886
|
+
(select and not select.call(line))
|
887
|
+
|
888
|
+
line = stream.gets
|
889
|
+
next
|
890
|
+
end
|
891
|
+
|
892
|
+
# Chunk fields
|
893
|
+
parts = parse_fields(line, sep)
|
894
|
+
|
895
|
+
# Get next line
|
896
|
+
line = stream.gets
|
897
|
+
|
898
|
+
# Get id field
|
899
|
+
next if parts[key_pos].nil? || parts[key_pos].empty?
|
900
|
+
|
901
|
+
if single
|
902
|
+
ids = parse_fields(parts[key_pos], sep2)
|
903
|
+
ids.collect!{|id| id.downcase} if case_insensitive
|
904
|
+
|
905
|
+
id = ids.shift
|
906
|
+
ids.each do |id2| data[id2] = "__Ref:#{id}" end
|
907
|
+
|
908
|
+
if key_field.nil?
|
909
|
+
other_pos = (0..(parts.length - 1)).to_a
|
910
|
+
other_pos.delete key_pos
|
911
|
+
end
|
912
|
+
|
913
|
+
extra = parts.values_at(*other_pos).collect{|f| parse_fields(f, sep2).first}
|
914
|
+
extra.collect! do |elem|
|
915
|
+
case
|
916
|
+
when String === cast
|
917
|
+
elem.send(cast)
|
918
|
+
when Proc === cast
|
919
|
+
cast.call elem
|
920
|
+
end
|
921
|
+
end if cast
|
922
|
+
|
923
|
+
max_cols = extra.size if extra.size > (max_cols || 0)
|
924
|
+
case type
|
925
|
+
when :list
|
926
|
+
data[id] = extra unless data.include? id
|
927
|
+
when :flat
|
928
|
+
data[id] = extra.flatten unless data.include? id
|
929
|
+
when :single
|
930
|
+
data[id] = extra.flatten.first unless data.include? id
|
931
|
+
end
|
932
|
+
|
933
|
+
else
|
934
|
+
ids = parse_fields(parts[key_pos], sep2)
|
935
|
+
ids.collect!{|id| id.downcase} if case_insensitive
|
936
|
+
|
937
|
+
id = ids.shift
|
938
|
+
ids.each do |id2| data[id2] = "__Ref:#{id}" end
|
939
|
+
|
940
|
+
if key_field.nil?
|
941
|
+
other_pos = (0..(parts.length - 1)).to_a
|
942
|
+
other_pos.delete key_pos
|
943
|
+
end
|
944
|
+
|
945
|
+
extra = parts.values_at(*other_pos).collect{|f| parse_fields(f, sep2)}
|
946
|
+
extra.collect! do |list|
|
947
|
+
case
|
948
|
+
when String === cast
|
949
|
+
list.collect{|elem| elem.send(cast)}
|
950
|
+
when Proc === cast
|
951
|
+
list.collect{|elem| cast.call elem}
|
952
|
+
end
|
953
|
+
end if cast
|
954
|
+
|
955
|
+
max_cols = extra.size if extra.size > (max_cols || 0)
|
956
|
+
if merge
|
957
|
+
data[id] = extra unless data.include? id
|
958
|
+
else
|
959
|
+
if not data.include? id
|
960
|
+
data[id] = extra
|
961
|
+
else
|
962
|
+
entry = data[id]
|
963
|
+
while entry =~ /__Ref:(.*)/ do entry = data[$1] end
|
964
|
+
extra.each_with_index do |f, i|
|
965
|
+
if f.empty?
|
966
|
+
next unless keep_empty
|
967
|
+
f= [""]
|
968
|
+
end
|
969
|
+
entry[i] ||= []
|
970
|
+
entry[i] = entry[i].concat f
|
971
|
+
end
|
972
|
+
data[id] = entry
|
973
|
+
end
|
974
|
+
end
|
975
|
+
end
|
976
|
+
end
|
977
|
+
|
978
|
+
if keep_empty and max_cols > 0
|
979
|
+
data.each do |key, values|
|
980
|
+
next if values =~ /__Ref:/
|
981
|
+
new_values = values
|
982
|
+
max_cols.times do |i|
|
983
|
+
if type == :double
|
984
|
+
new_values[i] = [""] if new_values[i].nil? or new_values[i].empty?
|
985
|
+
else
|
986
|
+
new_values[i] = "" if new_values[i].nil?
|
987
|
+
end
|
988
|
+
end
|
989
|
+
data[key] = new_values
|
990
|
+
end
|
991
|
+
end
|
992
|
+
|
993
|
+
[data, {:key_field => key_field, :fields => fields, :type => type, :case_insensitive => case_insensitive}]
|
994
|
+
end
|
995
|
+
|
996
|
+
def self.parse2(data, file, options = {})
|
997
|
+
|
998
|
+
# Prepare options
|
999
|
+
options = Misc.add_defaults options,
|
633
1000
|
:sep => "\t",
|
634
1001
|
:sep2 => "|",
|
635
1002
|
:native => 0,
|
@@ -640,18 +1007,19 @@ class TSV
|
|
640
1007
|
:grep => nil,
|
641
1008
|
:single => false,
|
642
1009
|
:unique => false,
|
1010
|
+
:merge => false,
|
643
1011
|
:flatten => false,
|
644
|
-
:overwrite => false,
|
645
1012
|
:keep_empty => true,
|
646
1013
|
:case_insensitive => false,
|
647
1014
|
:header_hash => '#' ,
|
1015
|
+
:cast => nil,
|
648
1016
|
:persistence_file => nil
|
1017
|
+
|
649
1018
|
|
1019
|
+
options[:unique] = options[:uniq] if options[:unique].nil?
|
650
1020
|
options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra])
|
651
1021
|
options[:flatten] = true if options[:single]
|
652
1022
|
|
653
|
-
|
654
|
-
|
655
1023
|
#{{{ Process first line
|
656
1024
|
|
657
1025
|
line = file.gets
|
@@ -680,10 +1048,18 @@ class TSV
|
|
680
1048
|
line.chomp!
|
681
1049
|
|
682
1050
|
line = options[:fix].call line if options[:fix]
|
1051
|
+
break if not line
|
1052
|
+
|
1053
|
+
if options[:header_hash] && line =~ /^#{options[:header_hash]}/
|
1054
|
+
line = file.gets
|
1055
|
+
next
|
1056
|
+
end
|
683
1057
|
|
684
1058
|
# Select and fix lines
|
685
|
-
if
|
1059
|
+
if line.empty? or
|
1060
|
+
(options[:exclude] and options[:exclude].call(line)) or
|
686
1061
|
(options[:select] and not options[:select].call(line))
|
1062
|
+
|
687
1063
|
line = file.gets
|
688
1064
|
next
|
689
1065
|
end
|
@@ -721,35 +1097,48 @@ class TSV
|
|
721
1097
|
extra.flatten! if options[:flatten]
|
722
1098
|
extra = extra.first if options[:single]
|
723
1099
|
|
724
|
-
if options[:
|
725
|
-
|
726
|
-
|
727
|
-
|
1100
|
+
if options[:cast]
|
1101
|
+
if Array === extra[0]
|
1102
|
+
e = extra
|
1103
|
+
else
|
1104
|
+
e = [extra]
|
728
1105
|
end
|
729
1106
|
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
1107
|
+
e.each do |list|
|
1108
|
+
case
|
1109
|
+
when String === options[:cast]
|
1110
|
+
list.collect!{|elem| elem.send(options[:cast])}
|
1111
|
+
when Proc === options[:cast]
|
1112
|
+
list.collect!{|elem| options[:cast].call elem}
|
1113
|
+
end
|
735
1114
|
end
|
1115
|
+
end
|
736
1116
|
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
1117
|
+
main_entry = ids.shift
|
1118
|
+
ids.each do |id| data[id] = "__Ref:#{main_entry}" end
|
1119
|
+
|
1120
|
+
case
|
1121
|
+
when (options[:single] or options[:unique] or not options[:merge])
|
1122
|
+
data[main_entry] = extra unless data.include? main_entry
|
1123
|
+
when options[:flatten]
|
1124
|
+
entry = data[main_entry]
|
1125
|
+
|
1126
|
+
if entry.nil?
|
1127
|
+
data[main_entry] = extra
|
1128
|
+
else
|
1129
|
+
while entry =~ /__Ref:(.*)/ do entry = data[$1] end
|
1130
|
+
if Persistence::TSV === data
|
1131
|
+
data[main_entry] = entry.concat extra
|
743
1132
|
else
|
744
|
-
data[main_entry] ||= []
|
745
1133
|
data[main_entry].concat extra
|
746
1134
|
end
|
1135
|
+
end
|
1136
|
+
else
|
1137
|
+
entry = data[main_entry]
|
1138
|
+
if entry.nil?
|
1139
|
+
data[main_entry] = extra
|
747
1140
|
else
|
748
|
-
entry = data[
|
749
|
-
while entry =~ /__Ref:(.*)/ do
|
750
|
-
entry = data[$1]
|
751
|
-
end
|
752
|
-
|
1141
|
+
while entry =~ /__Ref:(.*)/ do entry = data[$1] end
|
753
1142
|
extra.each_with_index do |fields, i|
|
754
1143
|
if fields.empty?
|
755
1144
|
next unless options[:keep_empty]
|
@@ -758,7 +1147,6 @@ class TSV
|
|
758
1147
|
entry[i] ||= []
|
759
1148
|
entry[i] = entry[i].concat fields
|
760
1149
|
end
|
761
|
-
|
762
1150
|
data[main_entry] = entry
|
763
1151
|
end
|
764
1152
|
end
|
@@ -774,7 +1162,6 @@ class TSV
|
|
774
1162
|
end
|
775
1163
|
end
|
776
1164
|
|
777
|
-
|
778
1165
|
# Save header information
|
779
1166
|
key_field = nil
|
780
1167
|
fields = nil
|
@@ -788,19 +1175,61 @@ class TSV
|
|
788
1175
|
end
|
789
1176
|
end
|
790
1177
|
|
791
|
-
data.read if
|
1178
|
+
data.read if Persistence::TSV === data
|
792
1179
|
|
793
1180
|
[key_field, fields]
|
794
1181
|
end
|
1182
|
+
def initialize(file = {}, type = :double, options = {})
|
1183
|
+
if Hash === type
|
1184
|
+
options = type
|
1185
|
+
type = :double
|
1186
|
+
end
|
795
1187
|
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
1188
|
+
if String === file and file =~/(.*?)#(.*)/ and File.exists? $1
|
1189
|
+
options = Misc.add_defaults options, Misc.string2hash($2)
|
1190
|
+
file = $1
|
1191
|
+
end
|
1192
|
+
|
1193
|
+
options = Misc.add_defaults options, :persistence => false, :case_insensitive => false, :type => type
|
1194
|
+
|
1195
|
+
@filename = Misc.process_options options, :filename
|
1196
|
+
@filename ||= case
|
1197
|
+
when (String === file and File.exists? file)
|
1198
|
+
File.expand_path file
|
1199
|
+
when File === file
|
1200
|
+
File.expand_path file.path
|
1201
|
+
else
|
1202
|
+
Digest::MD5.hexdigest(file.inspect)
|
1203
|
+
end
|
1204
|
+
|
1205
|
+
if block_given?
|
1206
|
+
@data, extra = Persistence.persist(@filename, :TSV, :tsv, options) do |filename, options| yield filename, options end
|
1207
|
+
else
|
1208
|
+
@data, extra = Persistence.persist(@filename, :TSV, :tsv, options) do |filename, options|
|
1209
|
+
data, extra = nil
|
1210
|
+
case
|
1211
|
+
when String === file
|
1212
|
+
File.open(file) do |f|
|
1213
|
+
data, extra = TSV.parse(f, options)
|
1214
|
+
end
|
1215
|
+
when File === file
|
1216
|
+
data, extra = TSV.parse(file, options)
|
1217
|
+
when Hash === file
|
1218
|
+
data = file
|
1219
|
+
extra = {:case_insensitive => options[:case_insensitive], :type => type}
|
1220
|
+
end
|
1221
|
+
|
1222
|
+
[data, extra]
|
1223
|
+
end
|
1224
|
+
end
|
1225
|
+
|
1226
|
+
@type = extra[:type]
|
1227
|
+
@key_field = extra[:key_field]
|
1228
|
+
@fields = extra[:fields]
|
1229
|
+
@case_insensitive = extra[:case_insensitive]
|
801
1230
|
end
|
802
1231
|
|
803
|
-
def
|
1232
|
+
def initialize2(file = {}, options = {})
|
804
1233
|
options = Misc.add_defaults options
|
805
1234
|
options[:persistence] = true if options[:persistence_file]
|
806
1235
|
|
@@ -817,7 +1246,7 @@ class TSV
|
|
817
1246
|
Log.low "Copying TSV"
|
818
1247
|
@filename = file.filename
|
819
1248
|
|
820
|
-
if options[:persistence] and not
|
1249
|
+
if options[:persistence] and not Persistence::TSV === file.data
|
821
1250
|
persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
|
822
1251
|
Log.low "Making persistance #{ persistence_file }"
|
823
1252
|
@data = TCHash.get(persistence_file)
|
@@ -834,7 +1263,7 @@ class TSV
|
|
834
1263
|
@list = file.list
|
835
1264
|
return self
|
836
1265
|
when Hash === file
|
837
|
-
Log.low "Encapsulating Hash"
|
1266
|
+
Log.low "Encapsulating Hash in TSV object"
|
838
1267
|
@filename = "Hash:" + Digest::MD5.hexdigest(file.inspect)
|
839
1268
|
if options[:persistence]
|
840
1269
|
persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
|
@@ -845,9 +1274,9 @@ class TSV
|
|
845
1274
|
@data = file
|
846
1275
|
end
|
847
1276
|
return self
|
848
|
-
when
|
849
|
-
Log.low "Encapsulating
|
850
|
-
@filename = "
|
1277
|
+
when Persistence::TSV === file
|
1278
|
+
Log.low "Encapsulating Persistence::TSV"
|
1279
|
+
@filename = "Persistence::TSV:" + Digest::MD5.hexdigest(file.inspect)
|
851
1280
|
@data = file
|
852
1281
|
@key_field = file.key_field
|
853
1282
|
@fields = file.fields
|
@@ -860,7 +1289,7 @@ class TSV
|
|
860
1289
|
when StringIO
|
861
1290
|
else
|
862
1291
|
raise "File #{file} not found"
|
863
|
-
|
1292
|
+
end
|
864
1293
|
|
865
1294
|
if options[:persistence]
|
866
1295
|
options.delete :persistence
|
@@ -868,11 +1297,11 @@ class TSV
|
|
868
1297
|
|
869
1298
|
if File.exists? persistence_file
|
870
1299
|
Log.low "Loading Persistence for #{ @filename } in #{persistence_file}"
|
871
|
-
@data =
|
1300
|
+
@data = Persistence::TSV.get(persistence_file, false)
|
872
1301
|
@key_field = @data.key_field
|
873
1302
|
@fields = @data.fields
|
874
1303
|
else
|
875
|
-
@data =
|
1304
|
+
@data = Persistence::TSV.get(persistence_file, true)
|
876
1305
|
file = Open.grep(file, options[:grep]) if options[:grep]
|
877
1306
|
|
878
1307
|
Log.low "Persistent Parsing for #{ @filename } in #{persistence_file}"
|
@@ -897,17 +1326,17 @@ end
|
|
897
1326
|
#{{{ CacheHelper
|
898
1327
|
require 'rbbt/util/cachehelper'
|
899
1328
|
module CacheHelper
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
|
911
|
-
|
1329
|
+
def self.tsv_cache(name, key = [])
|
1330
|
+
cache_file = CacheHelper.build_filename name, key
|
1331
|
+
|
1332
|
+
if File.exists? cache_file
|
1333
|
+
Log.debug "TSV cache file '#{cache_file}' found"
|
1334
|
+
hash = TCHash.get(cache_file)
|
1335
|
+
TSV.new(hash)
|
1336
|
+
else
|
1337
|
+
Log.debug "Producing TSV cache file '#{cache_file}'"
|
1338
|
+
data = yield
|
1339
|
+
TSV.new(data, :persistence_file => cache_file)
|
1340
|
+
end
|
912
1341
|
end
|
913
1342
|
end
|