rbbt-util 1.1.0 → 1.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -6,7 +6,13 @@ class TCHash < TokyoCabinet::HDB
6
6
 
7
7
  Serializer = Marshal
8
8
 
9
- FIELD_INFO_ENTRIES = {:fields => '__tokyocabinet_hash_fields', :key_field => '__tokyocabinet_hash_native_field'}
9
+ FIELD_INFO_ENTRIES = {
10
+ :fields => '__tokyocabinet_hash_fields',
11
+ :key_field => '__tokyocabinet_hash_key_field',
12
+ :filename => '__tokyocabinet_hash_filename',
13
+ :type => '__tokyocabinet_hash_type',
14
+ :case_insensitive => '__tokyocabinet_hash_case_insensitive'
15
+ }
10
16
  CONNECTIONS = {}
11
17
 
12
18
  FIELD_INFO_ENTRIES.each do |entry, key|
data/lib/rbbt/util/tsv.rb CHANGED
@@ -3,17 +3,10 @@ require 'rbbt/util/open'
3
3
  require 'rbbt/util/tc_hash'
4
4
  require 'rbbt/util/tmpfile'
5
5
  require 'rbbt/util/log'
6
+ require 'rbbt/util/persistence'
6
7
  require 'digest'
7
8
  require 'fileutils'
8
9
 
9
- def add_defaults(options, defaults = {})
10
- new_options = options.dup
11
- defaults.each do |key, value|
12
- new_options[key] = value if new_options[key].nil?
13
- end
14
- new_options
15
- end
16
-
17
10
  class TSV
18
11
  class FieldNotFoundError < StandardError;end
19
12
 
@@ -26,8 +19,6 @@ class TSV
26
19
 
27
20
  #{{{ Persistence
28
21
 
29
- PersistenceHash = TCHash
30
-
31
22
  CACHEDIR="/tmp/tsv_persistent_cache"
32
23
  FileUtils.mkdir CACHEDIR unless File.exist? CACHEDIR
33
24
 
@@ -40,10 +31,7 @@ class TSV
40
31
  CACHEDIR
41
32
  end
42
33
 
43
- def self.get_persistence_file(file, prefix, options = {})
44
- File.join(CACHEDIR, prefix.gsub(/\s/,'_').gsub(/\//,'>') + Digest::MD5.hexdigest([file, options].inspect))
45
- end
46
-
34
+
47
35
  #{{{ Headers and Field Stuff
48
36
 
49
37
  def self.headers(file, options = {})
@@ -130,7 +118,7 @@ class TSV
130
118
  end
131
119
 
132
120
  each do |key, values|
133
- if list
121
+ if type == :double
134
122
  tmp_values = values + [[key]]
135
123
  else
136
124
  tmp_values = values + [key]
@@ -144,11 +132,15 @@ class TSV
144
132
  new_values = tmp_values.values_at(*new_field_positions)
145
133
  end
146
134
 
147
- tmp_values[new_key_position].each do |new_key|
148
- if new_field_names
149
- yield new_key, NamedArray.name(new_values, new_field_names)
150
- else
151
- yield new_key, new_values
135
+ if not Array === tmp_values[new_key_position]
136
+ yield tmp_values[new_key_position], NamedArray.name(new_values, new_field_names)
137
+ else
138
+ tmp_values[new_key_position].each do |new_key|
139
+ if new_field_names
140
+ yield new_key, NamedArray.name(new_values, new_field_names)
141
+ else
142
+ yield new_key, new_values
143
+ end
152
144
  end
153
145
  end
154
146
  end
@@ -165,7 +157,7 @@ class TSV
165
157
 
166
158
  def reorder(new_key_field, new_fields = nil, options = {})
167
159
  options = Misc.add_defaults options
168
- return TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => case_insensitive) if options[:persistence_file] and File.exists?(options[:persistence_file])
160
+ return TSV.new(Persistence::TSV.get(options[:persistence_file], false), :case_insensitive => case_insensitive) if options[:persistence_file] and File.exists?(options[:persistence_file])
169
161
 
170
162
  new = {}
171
163
  new_key_field, new_fields = through new_key_field, new_fields do |key, values|
@@ -181,7 +173,7 @@ class TSV
181
173
  end
182
174
 
183
175
  if options[:persistence_file]
184
- reordered = TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => case_insensitive)
176
+ reordered = TSV.new(Persistence::TSV.get(options[:persistence_file], false), :case_insensitive => case_insensitive)
185
177
  reordered.merge! new
186
178
  else
187
179
  reordered = TSV.new(new, :case_insensitive => case_insensitive)
@@ -199,12 +191,12 @@ class TSV
199
191
 
200
192
  def add_field(name = nil)
201
193
  each do |key, values|
202
- self[key] = values << yield(key, values)
194
+ self[key] = values + [yield(key, values)]
203
195
  end
204
196
 
205
- fields << name if list
206
- if PersistenceHash === @data
207
- @data.fields = fields
197
+ if fields != nil
198
+ new_fields = fields + [name]
199
+ self.fields = new_fields
208
200
  end
209
201
  end
210
202
 
@@ -212,6 +204,9 @@ class TSV
212
204
  new = TSV.new({})
213
205
  new.key_field = key_field
214
206
  new.fields = fields.dup
207
+ new.type = type
208
+ new.filename = filename + "#Select: #{method.inspect}"
209
+ new.case_insensitive = case_insensitive
215
210
 
216
211
  case
217
212
  when Array === method
@@ -222,6 +217,10 @@ class TSV
222
217
  through do |key, values|
223
218
  new[key] = values if [key,values].flatten.select{|v| v =~ method}.any?
224
219
  end
220
+ when String === method
221
+ through do |key, values|
222
+ new[key] = values if [key,values].flatten.select{|v| v == method}.any?
223
+ end
225
224
  when Hash === method
226
225
  key = method.keys.first
227
226
  method = method.values.first
@@ -230,89 +229,87 @@ class TSV
230
229
  method.each{|item| if values = self[item]; then new[item] = values; end}
231
230
  when Array === method
232
231
  through :main, key do |key, values|
233
- new[key] = values if (values.flatten & method).any?
232
+ new[key] = self[key] if (values.flatten & method).any?
234
233
  end
235
234
  when Regexp === method
236
235
  through :main, key do |key, values|
237
- new[key] = values if values.flatten.select{|v| v =~ method}.any?
236
+ new[key] = self[key] if values.flatten.select{|v| v =~ method}.any?
237
+ end
238
+ when String === method
239
+ through :main, key do |key, values|
240
+ new[key] = self[key] if values.flatten.select{|v| v == method}.any?
238
241
  end
239
242
  end
240
243
  end
241
244
 
245
+
242
246
  new
243
247
  end
244
248
 
245
249
  def index(options = {})
246
- options = Misc.add_defaults options, :order => false
250
+ options = Misc.add_defaults options, :order => false, :persistence => false
247
251
 
248
- if options[:persistence] and ! options[:persistence_file]
249
- options[:persistence_file] = TSV.get_persistence_file(filename, "index:#{ filename }_#{options[:field]}:", options)
250
- end
252
+ new, extra = Persistence.persist(filename, :Index, :tsv, options) do |filename, options|
253
+ new = {}
254
+ if options[:order]
255
+ new_key_field, new_fields = through options[:target], options[:others] do |key, values|
251
256
 
252
- if options[:persistence_file] and File.exists?(options[:persistence_file])
253
- return TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => options[:case_insensitive])
254
- end
257
+ values.each_with_index do |list, i|
258
+ next if list.nil? or list.empty?
255
259
 
256
- new = {}
257
- if options[:order]
258
- new_key_field, new_fields = through options[:field], options[:others] do |key, values|
259
-
260
- values.each_with_index do |list, i|
261
- next if list.nil? or list.empty?
262
-
263
- list = [list] unless Array === list
260
+ list = [list] unless Array === list
264
261
 
265
- list.each do |value|
266
- next if value.nil? or value.empty?
267
- value = value.downcase if options[:case_insensitive]
268
- new[value] ||= []
269
- new[value][i + 1] ||= []
270
- new[value][i + 1] << key
271
- end
262
+ list.each do |value|
263
+ next if value.nil? or value.empty?
264
+ value = value.downcase if options[:case_insensitive]
265
+ new[value] ||= []
266
+ new[value][i + 1] ||= []
267
+ new[value][i + 1] << key
268
+ end
272
269
  new[key] ||= []
273
270
  new[key][0] = key
274
- end
271
+ end
275
272
 
276
- end
273
+ end
277
274
 
278
- new.each do |key, values|
279
- values.flatten!
280
- values.compact!
281
- end
275
+ new.each do |key, values|
276
+ values.flatten!
277
+ values.compact!
278
+ end
282
279
 
283
- else
284
- new_key_field, new_fields = through options[:field], options[:others] do |key, values|
285
- new[key] ||= []
286
- new[key] << key
287
- values.each do |list|
288
- next if list.nil?
289
- if Array === list
290
- list.each do |value|
280
+ else
281
+ new_key_field, new_fields = through options[:target], options[:others] do |key, values|
282
+ new[key] ||= []
283
+ new[key] << key
284
+ values.each do |list|
285
+ next if list.nil?
286
+ if Array === list
287
+ list.each do |value|
288
+ value = value.downcase if options[:case_insensitive]
289
+ new[value] ||= []
290
+ new[value] << key
291
+ end
292
+ else
293
+ next if list.empty?
294
+ value = list
291
295
  value = value.downcase if options[:case_insensitive]
292
296
  new[value] ||= []
293
297
  new[value] << key
294
298
  end
295
- else
296
- next if list.empty?
297
- value = list
298
- value = value.downcase if options[:case_insensitive]
299
- new[value] ||= []
300
- new[value] << key
301
299
  end
302
300
  end
303
301
  end
304
- end
305
302
 
306
- if options[:persistence_file]
307
- index = TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => options[:case_insensitive])
308
- index.merge! new
309
- else
310
- index = TSV.new(new, :case_insensitive => options[:case_insensitive])
303
+ [new, {:key_field => new_key_field, :fields => new_fields, :type => :double, :case_insensitive => options[:case_insensitive]}]
311
304
  end
312
305
 
313
- index.key_field = new_key_field
314
- index.fields = new_fields
315
- index
306
+ new = TSV.new(new)
307
+ new.filename = "Index: " + filename + options.inspect
308
+ new.fields = extra[:fields]
309
+ new.key_field = extra[:key_field]
310
+ new.case_insensitive = extra[:case_insensitive]
311
+ new.type = extra[:type]
312
+ new
316
313
  end
317
314
 
318
315
  def smart_merge(other, match = nil, new_fields = nil)
@@ -413,7 +410,7 @@ class TSV
413
410
 
414
411
  if nofieldinfo
415
412
  next if other[other_code].nil?
416
- if list
413
+ if type == :double
417
414
  other_values = [[other_code]] + other[other_code]
418
415
  else
419
416
  other_values = [other_code] + other[other_code]
@@ -427,13 +424,13 @@ class TSV
427
424
  new_values = values + other_values
428
425
  else
429
426
  if other[other_code].nil?
430
- if list
427
+ if type == :double
431
428
  other_values = [[]] * other.fields.length
432
429
  else
433
430
  other_values = [] * other.fields.length
434
431
  end
435
432
  else
436
- if list
433
+ if type == :double
437
434
  other_values = other[other_code] + [[other_code]]
438
435
  else
439
436
  other_values = other[other_code] + [other_code]
@@ -443,11 +440,11 @@ class TSV
443
440
 
444
441
  new_values = values.dup
445
442
 
446
- if list
443
+ if type == :double
447
444
  this_common_field_positions.zip(other_common_field_positions).each do |tpos, opos|
448
445
  new_values_tops = new_values[tpos]
449
446
 
450
- if other.list
447
+ if other.type == :double
451
448
  new_values_tops += other_values[opos]
452
449
  else
453
450
  new_values_tops += [other_values[opos]]
@@ -466,10 +463,58 @@ class TSV
466
463
 
467
464
  self.fields = self.fields + new_fields unless nofieldinfo
468
465
  end
466
+
467
+
468
+ def self.field_matches(tsv, values)
469
+ if values.flatten.sort[0..9].compact.collect{|n| n.to_i} == (1..10).to_a
470
+ return {}
471
+ end
472
+
473
+ key_field = tsv.key_field
474
+ fields = tsv.fields
475
+
476
+ field_values = {}
477
+ fields.each{|field|
478
+ field_values[field] = []
479
+ }
480
+
481
+ tsv.through do |key,entry_values|
482
+ fields.zip(entry_values).each do |field,entry_field_values|
483
+ field_values[field].concat entry_field_values
484
+ end
485
+ end
486
+
487
+ field_values.each do |field,field_value_list|
488
+ field_value_list.replace(values & field_value_list.flatten.uniq)
489
+ end
490
+
491
+ field_values[key_field] = values & tsv.keys
492
+
493
+ field_values
494
+ end
495
+
496
+ def field_matches(values)
497
+ TSV.field_matches(self, values)
498
+ end
499
+
500
+
469
501
 
470
502
  #{{{ Helpers
471
503
 
472
504
  def self.index(file, options = {})
505
+ options = Misc.add_defaults options, :data_persistence => true, :persistence => true
506
+ persistence, persistence_file = Misc.process_options options, :persistence, :persistence_file
507
+ options[:persistence], options[:persistence_file] = options.values_at :data_persistence, :data_persistence_file
508
+ options.delete :data_persistence
509
+ options.delete :data_persistence_file
510
+
511
+ index, extra = Persistence.persist(file, :Index, :tsv, options) do |file, options, filename|
512
+ TSV.new(file, :double, options).index
513
+ end
514
+ index
515
+ end
516
+
517
+ def self.index2(file, options = {})
473
518
  opt_data = options.dup
474
519
  opt_index = options.dup
475
520
  opt_data.delete :field
@@ -482,7 +527,7 @@ class TSV
482
527
 
483
528
  if ! opt_index[:persistence_file].nil? && File.exists?(opt_index[:persistence_file])
484
529
  Log.low "Reloading persistent index for #{ file }: #{opt_index[:persistence_file]}"
485
- TSV.new(PersistenceHash.get(opt_index[:persistence_file], false), opt_index)
530
+ TSV.new(Persistence::TSV.get(opt_index[:persistence_file], false), opt_index)
486
531
  else
487
532
  Log.low "Creating index for #{ file }: #{opt_index[:persistence_file]}"
488
533
  data = TSV.new(file, opt_data)
@@ -501,6 +546,23 @@ class TSV
501
546
  end
502
547
 
503
548
  #{{{ Accesor Methods
549
+ attr_accessor :filename, :type, :case_insensitive, :key_field, :fields, :data
550
+
551
+ def fields
552
+ return nil if @fields.nil?
553
+ fields = @fields
554
+ fields.each do |f| f.extend Field end if Array === fields
555
+ fields
556
+ end
557
+
558
+ def fields=(new_fields)
559
+ @fields = new_fields
560
+ if Persistence::TSV === @data
561
+ @data.fields = new_fields
562
+ end
563
+ end
564
+
565
+
504
566
 
505
567
  def keys
506
568
  @data.keys
@@ -531,6 +593,7 @@ class TSV
531
593
  # Read
532
594
 
533
595
  def follow(value)
596
+ return nil if value.nil?
534
597
  if String === value && value =~ /__Ref:(.*)/
535
598
  return self[$1]
536
599
  else
@@ -546,7 +609,7 @@ class TSV
546
609
  return nil
547
610
  end
548
611
 
549
- key = key.downcase if @case_insensitive
612
+ key = key.downcase if @case_insensitive and key !~ /^__Ref:/
550
613
  follow @data[key]
551
614
  end
552
615
 
@@ -587,23 +650,37 @@ class TSV
587
650
  collect.sort_by &block
588
651
  end
589
652
 
590
- def to_s
653
+ def values_to_s(values)
654
+ case
655
+ when (values.nil? and fields.nil?)
656
+ "\n"
657
+ when (values.nil? and not fields.nil?)
658
+ "\t" << ([""] * fields.length) * "\t" << "\n"
659
+ when (not Array === values)
660
+ "\t" << values.to_s << "\n"
661
+ when Array === values.first
662
+ "\t" << values.collect{|list| (list || []) * "|"} * "\t" << "\n"
663
+ else
664
+ "\t" << values * "\t" << "\n"
665
+ end
666
+ end
667
+
668
+ def to_s(keys = nil)
591
669
  str = ""
592
670
 
593
671
  if fields
594
672
  str << "#" << key_field << "\t" << fields * "\t" << "\n"
595
673
  end
596
674
 
597
- each do |key, values|
598
- case
599
- when values.nil?
600
- str << key.dup << "\n"
601
- when (not Array === values)
602
- str << key.dup << "\t" << values.to_s << "\n"
603
- when Array === values.first
604
- str << key.dup << "\t" << values.collect{|list| (list || []) * "|"} * "\t" << "\n"
605
- else
606
- str << key.dup << "\t" << values * "\t" << "\n"
675
+ if keys.nil?
676
+ each do |key, values|
677
+ key = key.to_s if Symbol === key
678
+ str << key.dup << values_to_s(values)
679
+ end
680
+ else
681
+ keys.zip(values_at(*keys)).each do |key, values|
682
+ key = key.to_s if Symbol === key
683
+ str << key.dup << values_to_s(values)
607
684
  end
608
685
  end
609
686
 
@@ -625,11 +702,301 @@ class TSV
625
702
  zipped = zipped.collect{|v| NamedArray.name(v, fields)} if fields
626
703
  zipped
627
704
  end
628
-
629
- def self.parse(data, file, options = {})
630
705
 
706
+ def self.key_order(file, options = {})
631
707
  # Prepare options
632
708
  options = add_defaults options,
709
+ :sep => "\t",
710
+ :sep2 => "|",
711
+ :native => 0,
712
+ :fix => nil,
713
+ :exclude => nil,
714
+ :select => nil,
715
+ :grep => nil,
716
+ :case_insensitive => false,
717
+ :header_hash => '#'
718
+
719
+ options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra])
720
+
721
+ if String === file and File.exists? file
722
+ file = File.open(file)
723
+ end
724
+
725
+ #{{{ Process first line
726
+
727
+ line = file.gets
728
+ raise "Empty content" if line.nil?
729
+ line.chomp!
730
+
731
+ if line =~ /^#{options[:header_hash]}/
732
+ header_fields = parse_fields(line, options[:sep])
733
+ header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character
734
+ line = file.gets
735
+ else
736
+ header_fields = nil
737
+ end
738
+
739
+ id_pos = Misc.field_position(header_fields, options[:native])
740
+
741
+ if options[:extra].nil?
742
+ extra_pos = nil
743
+ max_cols = 0
744
+ else
745
+ extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) }
746
+ end
747
+
748
+ ids = []
749
+ #{{{ Process rest
750
+ while line do
751
+ line.chomp!
752
+
753
+ line = options[:fix].call line if options[:fix]
754
+ break if not line
755
+
756
+ # Select and fix lines
757
+ if line.empty? or
758
+ (options[:exclude] and options[:exclude].call(line)) or
759
+ (options[:select] and not options[:select].call(line))
760
+
761
+ line = file.gets
762
+ next
763
+ end
764
+
765
+ ### Process line
766
+
767
+ # Chunk fields
768
+ parts = parse_fields(line, options[:sep])
769
+
770
+ # Get next line
771
+ line = file.gets
772
+
773
+ # Get id field
774
+ next if parts[id_pos].nil? || parts[id_pos].empty?
775
+ ids << parts[id_pos]
776
+ end
777
+
778
+ ids
779
+ end
780
+
781
+ def self.parse_header(stream, sep, header_hash)
782
+ fields, key_field = nil
783
+ options = {}
784
+
785
+ line = stream.gets
786
+
787
+ if line and line =~ /^#{header_hash}: (.*)/
788
+ options = Misc.string2hash $1
789
+ line = stream.gets
790
+ end
791
+
792
+ sep = options[:sep] if options[:sep]
793
+
794
+ if line and line =~ /^#{header_hash}/
795
+ line.chomp!
796
+ fields = parse_fields(line, sep)
797
+ key_field = fields.shift
798
+ key_field = key_field[(0 + header_hash.length)..-1] # Remove initial hash character
799
+ line = stream.gets
800
+ end
801
+
802
+ raise "Empty content" if line.nil?
803
+ return key_field, fields, options, line
804
+ end
805
+
806
+ def self.parse(stream, options = {})
807
+ # Prepare options
808
+ options = Misc.add_defaults options,
809
+ :case_insensitive => false,
810
+ :type => :double,
811
+
812
+ :merge => false,
813
+ :keep_empty => true,
814
+ :cast => nil,
815
+
816
+ :sep => "\t",
817
+ :sep2 => "|",
818
+ :header_hash => '#',
819
+
820
+ :key => 0,
821
+ :fields => nil,
822
+
823
+ :fix => nil,
824
+ :exclude => nil,
825
+ :select => nil,
826
+ :grep => nil
827
+
828
+
829
+ sep, header_hash =
830
+ Misc.process_options options, :sep, :header_hash
831
+
832
+ key_field, other_fields, more_options, line = TSV.parse_header(stream, sep, header_hash)
833
+
834
+ sep = more_options[:sep] if more_options[:sep]
835
+ options = Misc.add_defaults options, more_options
836
+ sep2 = Misc.process_options options, :sep2
837
+
838
+ key, others =
839
+ Misc.process_options options, :key, :others
840
+
841
+ if key_field.nil?
842
+ key_pos = key
843
+ key_field, fields = nil
844
+ else
845
+ all_fields = [key_field].concat other_fields
846
+
847
+ key_pos = Misc.field_position(all_fields, key)
848
+
849
+ if String === others or Symbol === others
850
+ others = [others]
851
+ end
852
+
853
+ if others.nil?
854
+ other_pos = (0..(all_fields.length - 1)).to_a
855
+ other_pos.delete key_pos
856
+ else
857
+ other_pos = Misc.field_position(all_fields, *others)
858
+ end
859
+
860
+ key_field = all_fields[key_pos]
861
+ fields = all_fields.values_at *other_pos
862
+ end
863
+
864
+ case_insensitive, type, merge, keep_empty, cast =
865
+ Misc.process_options options, :case_insensitive, :type, :merge, :keep_empty, :cast
866
+ fix, exclude, select, grep =
867
+ Misc.process_options options, :fix, :exclude, :select, :grep
868
+
869
+ #{{{ Process rest
870
+ data = {}
871
+ single = type.to_sym != :double
872
+ max_cols = 0
873
+ while line do
874
+ line.chomp!
875
+
876
+ line = fix.call line if fix
877
+ break if not line
878
+
879
+ if header_hash and line =~ /^#{header_hash}/
880
+ line = stream.gets
881
+ next
882
+ end
883
+
884
+ if line.empty? or
885
+ (exclude and exclude.call(line)) or
886
+ (select and not select.call(line))
887
+
888
+ line = stream.gets
889
+ next
890
+ end
891
+
892
+ # Chunk fields
893
+ parts = parse_fields(line, sep)
894
+
895
+ # Get next line
896
+ line = stream.gets
897
+
898
+ # Get id field
899
+ next if parts[key_pos].nil? || parts[key_pos].empty?
900
+
901
+ if single
902
+ ids = parse_fields(parts[key_pos], sep2)
903
+ ids.collect!{|id| id.downcase} if case_insensitive
904
+
905
+ id = ids.shift
906
+ ids.each do |id2| data[id2] = "__Ref:#{id}" end
907
+
908
+ if key_field.nil?
909
+ other_pos = (0..(parts.length - 1)).to_a
910
+ other_pos.delete key_pos
911
+ end
912
+
913
+ extra = parts.values_at(*other_pos).collect{|f| parse_fields(f, sep2).first}
914
+ extra.collect! do |elem|
915
+ case
916
+ when String === cast
917
+ elem.send(cast)
918
+ when Proc === cast
919
+ cast.call elem
920
+ end
921
+ end if cast
922
+
923
+ max_cols = extra.size if extra.size > (max_cols || 0)
924
+ case type
925
+ when :list
926
+ data[id] = extra unless data.include? id
927
+ when :flat
928
+ data[id] = extra.flatten unless data.include? id
929
+ when :single
930
+ data[id] = extra.flatten.first unless data.include? id
931
+ end
932
+
933
+ else
934
+ ids = parse_fields(parts[key_pos], sep2)
935
+ ids.collect!{|id| id.downcase} if case_insensitive
936
+
937
+ id = ids.shift
938
+ ids.each do |id2| data[id2] = "__Ref:#{id}" end
939
+
940
+ if key_field.nil?
941
+ other_pos = (0..(parts.length - 1)).to_a
942
+ other_pos.delete key_pos
943
+ end
944
+
945
+ extra = parts.values_at(*other_pos).collect{|f| parse_fields(f, sep2)}
946
+ extra.collect! do |list|
947
+ case
948
+ when String === cast
949
+ list.collect{|elem| elem.send(cast)}
950
+ when Proc === cast
951
+ list.collect{|elem| cast.call elem}
952
+ end
953
+ end if cast
954
+
955
+ max_cols = extra.size if extra.size > (max_cols || 0)
956
+ if merge
957
+ data[id] = extra unless data.include? id
958
+ else
959
+ if not data.include? id
960
+ data[id] = extra
961
+ else
962
+ entry = data[id]
963
+ while entry =~ /__Ref:(.*)/ do entry = data[$1] end
964
+ extra.each_with_index do |f, i|
965
+ if f.empty?
966
+ next unless keep_empty
967
+ f= [""]
968
+ end
969
+ entry[i] ||= []
970
+ entry[i] = entry[i].concat f
971
+ end
972
+ data[id] = entry
973
+ end
974
+ end
975
+ end
976
+ end
977
+
978
+ if keep_empty and max_cols > 0
979
+ data.each do |key, values|
980
+ next if values =~ /__Ref:/
981
+ new_values = values
982
+ max_cols.times do |i|
983
+ if type == :double
984
+ new_values[i] = [""] if new_values[i].nil? or new_values[i].empty?
985
+ else
986
+ new_values[i] = "" if new_values[i].nil?
987
+ end
988
+ end
989
+ data[key] = new_values
990
+ end
991
+ end
992
+
993
+ [data, {:key_field => key_field, :fields => fields, :type => type, :case_insensitive => case_insensitive}]
994
+ end
995
+
996
+ def self.parse2(data, file, options = {})
997
+
998
+ # Prepare options
999
+ options = Misc.add_defaults options,
633
1000
  :sep => "\t",
634
1001
  :sep2 => "|",
635
1002
  :native => 0,
@@ -640,18 +1007,19 @@ class TSV
640
1007
  :grep => nil,
641
1008
  :single => false,
642
1009
  :unique => false,
1010
+ :merge => false,
643
1011
  :flatten => false,
644
- :overwrite => false,
645
1012
  :keep_empty => true,
646
1013
  :case_insensitive => false,
647
1014
  :header_hash => '#' ,
1015
+ :cast => nil,
648
1016
  :persistence_file => nil
1017
+
649
1018
 
1019
+ options[:unique] = options[:uniq] if options[:unique].nil?
650
1020
  options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra])
651
1021
  options[:flatten] = true if options[:single]
652
1022
 
653
-
654
-
655
1023
  #{{{ Process first line
656
1024
 
657
1025
  line = file.gets
@@ -680,10 +1048,18 @@ class TSV
680
1048
  line.chomp!
681
1049
 
682
1050
  line = options[:fix].call line if options[:fix]
1051
+ break if not line
1052
+
1053
+ if options[:header_hash] && line =~ /^#{options[:header_hash]}/
1054
+ line = file.gets
1055
+ next
1056
+ end
683
1057
 
684
1058
  # Select and fix lines
685
- if (options[:exclude] and options[:exclude].call(line)) or
1059
+ if line.empty? or
1060
+ (options[:exclude] and options[:exclude].call(line)) or
686
1061
  (options[:select] and not options[:select].call(line))
1062
+
687
1063
  line = file.gets
688
1064
  next
689
1065
  end
@@ -721,35 +1097,48 @@ class TSV
721
1097
  extra.flatten! if options[:flatten]
722
1098
  extra = extra.first if options[:single]
723
1099
 
724
- if options[:overwrite]
725
- main_entry = ids.shift
726
- ids.each do |id|
727
- data[id] = "__Ref:#{main_entry}"
1100
+ if options[:cast]
1101
+ if Array === extra[0]
1102
+ e = extra
1103
+ else
1104
+ e = [extra]
728
1105
  end
729
1106
 
730
- data[main_entry] = extra
731
- else
732
- main_entry = ids.shift
733
- ids.each do |id|
734
- data[id] = "__Ref:#{main_entry}"
1107
+ e.each do |list|
1108
+ case
1109
+ when String === options[:cast]
1110
+ list.collect!{|elem| elem.send(options[:cast])}
1111
+ when Proc === options[:cast]
1112
+ list.collect!{|elem| options[:cast].call elem}
1113
+ end
735
1114
  end
1115
+ end
736
1116
 
737
- case
738
- when (options[:single] or options[:unique])
739
- data[main_entry] ||= extra
740
- when options[:flatten]
741
- if PersistenceHash === data
742
- data[main_entry] = (data[main_entry] || []).concat extra
1117
+ main_entry = ids.shift
1118
+ ids.each do |id| data[id] = "__Ref:#{main_entry}" end
1119
+
1120
+ case
1121
+ when (options[:single] or options[:unique] or not options[:merge])
1122
+ data[main_entry] = extra unless data.include? main_entry
1123
+ when options[:flatten]
1124
+ entry = data[main_entry]
1125
+
1126
+ if entry.nil?
1127
+ data[main_entry] = extra
1128
+ else
1129
+ while entry =~ /__Ref:(.*)/ do entry = data[$1] end
1130
+ if Persistence::TSV === data
1131
+ data[main_entry] = entry.concat extra
743
1132
  else
744
- data[main_entry] ||= []
745
1133
  data[main_entry].concat extra
746
1134
  end
1135
+ end
1136
+ else
1137
+ entry = data[main_entry]
1138
+ if entry.nil?
1139
+ data[main_entry] = extra
747
1140
  else
748
- entry = data[main_entry] || []
749
- while entry =~ /__Ref:(.*)/ do
750
- entry = data[$1]
751
- end
752
-
1141
+ while entry =~ /__Ref:(.*)/ do entry = data[$1] end
753
1142
  extra.each_with_index do |fields, i|
754
1143
  if fields.empty?
755
1144
  next unless options[:keep_empty]
@@ -758,7 +1147,6 @@ class TSV
758
1147
  entry[i] ||= []
759
1148
  entry[i] = entry[i].concat fields
760
1149
  end
761
-
762
1150
  data[main_entry] = entry
763
1151
  end
764
1152
  end
@@ -774,7 +1162,6 @@ class TSV
774
1162
  end
775
1163
  end
776
1164
 
777
-
778
1165
  # Save header information
779
1166
  key_field = nil
780
1167
  fields = nil
@@ -788,19 +1175,61 @@ class TSV
788
1175
  end
789
1176
  end
790
1177
 
791
- data.read if PersistenceHash === data
1178
+ data.read if Persistence::TSV === data
792
1179
 
793
1180
  [key_field, fields]
794
1181
  end
1182
+ def initialize(file = {}, type = :double, options = {})
1183
+ if Hash === type
1184
+ options = type
1185
+ type = :double
1186
+ end
795
1187
 
796
- attr_accessor :data, :key_field, :fields, :list, :case_insensitive, :filename
797
- def fields
798
- fields = @fields
799
- fields.each do |f| f.extend Field end if Array === fields
800
- fields
1188
+ if String === file and file =~/(.*?)#(.*)/ and File.exists? $1
1189
+ options = Misc.add_defaults options, Misc.string2hash($2)
1190
+ file = $1
1191
+ end
1192
+
1193
+ options = Misc.add_defaults options, :persistence => false, :case_insensitive => false, :type => type
1194
+
1195
+ @filename = Misc.process_options options, :filename
1196
+ @filename ||= case
1197
+ when (String === file and File.exists? file)
1198
+ File.expand_path file
1199
+ when File === file
1200
+ File.expand_path file.path
1201
+ else
1202
+ Digest::MD5.hexdigest(file.inspect)
1203
+ end
1204
+
1205
+ if block_given?
1206
+ @data, extra = Persistence.persist(@filename, :TSV, :tsv, options) do |filename, options| yield filename, options end
1207
+ else
1208
+ @data, extra = Persistence.persist(@filename, :TSV, :tsv, options) do |filename, options|
1209
+ data, extra = nil
1210
+ case
1211
+ when String === file
1212
+ File.open(file) do |f|
1213
+ data, extra = TSV.parse(f, options)
1214
+ end
1215
+ when File === file
1216
+ data, extra = TSV.parse(file, options)
1217
+ when Hash === file
1218
+ data = file
1219
+ extra = {:case_insensitive => options[:case_insensitive], :type => type}
1220
+ end
1221
+
1222
+ [data, extra]
1223
+ end
1224
+ end
1225
+
1226
+ @type = extra[:type]
1227
+ @key_field = extra[:key_field]
1228
+ @fields = extra[:fields]
1229
+ @case_insensitive = extra[:case_insensitive]
801
1230
  end
802
1231
 
803
- def initialize(file = {}, options = {})
1232
+ def initialize2(file = {}, options = {})
804
1233
  options = Misc.add_defaults options
805
1234
  options[:persistence] = true if options[:persistence_file]
806
1235
 
@@ -817,7 +1246,7 @@ class TSV
817
1246
  Log.low "Copying TSV"
818
1247
  @filename = file.filename
819
1248
 
820
- if options[:persistence] and not PersistenceHash === file.data
1249
+ if options[:persistence] and not Persistence::TSV === file.data
821
1250
  persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
822
1251
  Log.low "Making persistance #{ persistence_file }"
823
1252
  @data = TCHash.get(persistence_file)
@@ -834,7 +1263,7 @@ class TSV
834
1263
  @list = file.list
835
1264
  return self
836
1265
  when Hash === file
837
- Log.low "Encapsulating Hash"
1266
+ Log.low "Encapsulating Hash in TSV object"
838
1267
  @filename = "Hash:" + Digest::MD5.hexdigest(file.inspect)
839
1268
  if options[:persistence]
840
1269
  persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
@@ -845,9 +1274,9 @@ class TSV
845
1274
  @data = file
846
1275
  end
847
1276
  return self
848
- when PersistenceHash === file
849
- Log.low "Encapsulating PersistenceHash"
850
- @filename = "PersistenceHash:" + Digest::MD5.hexdigest(file.inspect)
1277
+ when Persistence::TSV === file
1278
+ Log.low "Encapsulating Persistence::TSV"
1279
+ @filename = "Persistence::TSV:" + Digest::MD5.hexdigest(file.inspect)
851
1280
  @data = file
852
1281
  @key_field = file.key_field
853
1282
  @fields = file.fields
@@ -860,7 +1289,7 @@ class TSV
860
1289
  when StringIO
861
1290
  else
862
1291
  raise "File #{file} not found"
863
- end
1292
+ end
864
1293
 
865
1294
  if options[:persistence]
866
1295
  options.delete :persistence
@@ -868,11 +1297,11 @@ class TSV
868
1297
 
869
1298
  if File.exists? persistence_file
870
1299
  Log.low "Loading Persistence for #{ @filename } in #{persistence_file}"
871
- @data = PersistenceHash.get(persistence_file, false)
1300
+ @data = Persistence::TSV.get(persistence_file, false)
872
1301
  @key_field = @data.key_field
873
1302
  @fields = @data.fields
874
1303
  else
875
- @data = PersistenceHash.get(persistence_file, true)
1304
+ @data = Persistence::TSV.get(persistence_file, true)
876
1305
  file = Open.grep(file, options[:grep]) if options[:grep]
877
1306
 
878
1307
  Log.low "Persistent Parsing for #{ @filename } in #{persistence_file}"
@@ -897,17 +1326,17 @@ end
897
1326
  #{{{ CacheHelper
898
1327
  require 'rbbt/util/cachehelper'
899
1328
  module CacheHelper
900
- def self.tsv_cache(name, key = [])
901
- cache_file = CacheHelper.build_filename name, key
902
-
903
- if File.exists? cache_file
904
- Log.debug "TSV cache file '#{cache_file}' found"
905
- hash = TCHash.get(cache_file)
906
- TSV.new(hash)
907
- else
908
- Log.debug "Producing TSV cache file '#{cache_file}'"
909
- data = yield
910
- TSV.new(data, :persistence_file => cache_file)
911
- end
1329
+ def self.tsv_cache(name, key = [])
1330
+ cache_file = CacheHelper.build_filename name, key
1331
+
1332
+ if File.exists? cache_file
1333
+ Log.debug "TSV cache file '#{cache_file}' found"
1334
+ hash = TCHash.get(cache_file)
1335
+ TSV.new(hash)
1336
+ else
1337
+ Log.debug "Producing TSV cache file '#{cache_file}'"
1338
+ data = yield
1339
+ TSV.new(data, :persistence_file => cache_file)
1340
+ end
912
1341
  end
913
1342
  end