rbbt-util 4.3.0 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  require 'rbbt/util/cmd'
2
2
  module TSV
3
3
  class Parser
4
- attr_accessor :header_hash, :sep, :sep2, :type, :key_position, :field_positions, :cast, :key_field, :fields, :fix, :select, :serializer, :straight, :take_all, :zipped, :namespace
4
+ attr_accessor :header_hash, :sep, :sep2, :type, :key_position, :field_positions, :cast, :key_field, :fields, :fix, :select, :serializer, :straight, :take_all, :zipped, :namespace, :first_line
5
5
 
6
6
  class SKIP_LINE < Exception; end
7
7
  class END_PARSING < Exception; end
@@ -48,7 +48,7 @@ module TSV
48
48
 
49
49
  def process(line)
50
50
  l = line.chomp
51
- raise Parser::SKIP_LINE if Proc === @select and not @select.call l
51
+ raise Parser::SKIP_LINE if l[0] == "#"[0] or (Proc === @select and not @select.call l)
52
52
  l = @fix.call l if Proc === @fix
53
53
  raise Parser::END_PARSING unless l
54
54
  l
@@ -75,7 +75,7 @@ module TSV
75
75
  def get_values_single(parts)
76
76
  return parts.shift, parts.first if field_positions.nil? and key_position.nil?
77
77
  key = parts[key_position]
78
- value = parts[field_positions.nil? ? 0 : field_positions.first]
78
+ value = parts[(field_positions.nil? or field_positions.empty?) ? 0 : field_positions.first]
79
79
  [key, value]
80
80
  end
81
81
 
data/lib/rbbt/tsv/util.rb CHANGED
@@ -1,29 +1,6 @@
1
1
  require 'rbbt/resource/path'
2
2
  module TSV
3
3
 
4
- def self.field_match_counts(file, values)
5
- fields = TSV.parse_header(Open.open(file)).all_fields
6
-
7
- counts = {}
8
- TmpFile.with_file do |tmpfile|
9
- if Array === values
10
- Open.write(tmpfile, values * "\n")
11
- else
12
- FileUtils.ln_s values, tmpfile
13
- end
14
-
15
- fields.each_with_index do |field,i|
16
- counts[field] = begin
17
- CMD.cmd("cat #{ file } |grep -v ^#|cut -f #{i + 1}|tr '|' '\\n' |sort -u |grep [[:alpha:]]|grep -f #{tmpfile} -F -w").read.count("\n")
18
- rescue
19
- 0
20
- end
21
- end
22
- end
23
-
24
- counts
25
- end
26
-
27
4
  def self.field_match_counts(file, values, options = {})
28
5
  options = Misc.add_defaults options, :persist_prefix => "Field_Matches"
29
6
  persist_options = Misc.pull_keys options, :persist
@@ -45,12 +22,12 @@ module TSV
45
22
  end
46
23
 
47
24
  path = Persist.persistence_path(filename, persist_options)
48
- TmpFile.with_file(values * "\n") do |value_file|
49
- cmd = "cat '#{ path }' | grep -w -F -f '#{ value_file }' |cut -f 2 |sort|uniq -c|sed 's/^ *//;s/ /\t/'"
25
+ TmpFile.with_file(values.uniq * "\n") do |value_file|
26
+ cmd = "cat '#{ path }' | sed 's/\\t/\\tHEADERNOMATCH/' | grep -w -F -f '#{ value_file }' |cut -f 2 | sed 's/HEADERNOMATCH//' | sort|uniq -c|sed 's/^ *//;s/ /\t/'"
50
27
  begin
51
28
  TSV.open(CMD.cmd(cmd), :key_field => 1, :type => :single, :cast => :to_i)
52
29
  rescue
53
- TSV.setup({nil => 0}, :type => :single, :cast => :to_i)
30
+ TSV.setup({}, :type => :single, :cast => :to_i)
54
31
  end
55
32
  end
56
33
  end
@@ -67,12 +44,12 @@ module TSV
67
44
  filename
68
45
  end
69
46
 
70
- def self.get_stream(file)
47
+ def self.get_stream(file, open_options = {})
71
48
  case
72
49
  when Path === file
73
- file.open
50
+ file.open(open_options)
74
51
  when String === file
75
- File.open(file)
52
+ Open.open(file, open_options)
76
53
  when file.respond_to?(:gets)
77
54
  file
78
55
  else
data/lib/rbbt/util/R.rb CHANGED
@@ -22,9 +22,26 @@ module R
22
22
  CMD.cmd('R --vanilla --slave --quiet', options.merge(:in => cmd))
23
23
  end
24
24
 
25
+ def self.interactive(init_file, options = {})
26
+ CMD.cmd("env R_PROFILE='#{init_file}' xterm R")
27
+ end
28
+
29
+ def self.interactive(script, options = {})
30
+ TmpFile.with_file do |init_file|
31
+ Open.write(init_file) do |file|
32
+ profile = File.join(ENV["HOME"], ".Rprofile")
33
+ file.puts "source('#{profile}');\n" if File.exists? profile
34
+ file.puts "source('#{R::UTIL}');\n"
35
+ file.puts script
36
+ end
37
+ CMD.cmd("env R_PROFILE='#{init_file}' xterm R")
38
+ end
39
+ end
40
+
25
41
  end
26
42
 
27
43
  module TSV
44
+
28
45
  def R(script, open_options = {})
29
46
  TmpFile.with_file do |f|
30
47
  Open.write(f, self.to_s)
@@ -32,11 +49,19 @@ module TSV
32
49
  <<-EOF
33
50
  data = rbbt.tsv('#{f}');
34
51
  #{script.strip}
35
- rbbt.tsv.write('#{f}', data);
52
+ if (! is.null(data)){ rbbt.tsv.write('#{f}', data); }
36
53
  EOF
37
54
  ).read)
38
55
  open_options = Misc.add_defaults open_options, :type => :list
39
- TSV.open(f, open_options)
56
+ TSV.open(f, open_options) unless open_options[:ignore_output]
57
+ end
58
+ end
59
+
60
+ def R_interactive(open_options = {})
61
+ TmpFile.with_file do |f|
62
+ Open.write(f, self.to_s)
63
+ R.interactive("data_file = '#{f}';\n")
40
64
  end
41
65
  end
42
66
  end
67
+
@@ -1,4 +1,3 @@
1
- require 'rbbt/util/log'
2
1
  require 'set'
3
2
 
4
3
  module ChainMethods
@@ -33,7 +32,7 @@ module ChainMethods
33
32
 
34
33
  class << base; self; end.module_eval do
35
34
  methods.each do |new_method|
36
- original = new_method.sub(prefix.to_s + '_', '')
35
+ original = new_method.to_s.sub(prefix.to_s + '_', '')
37
36
  clean_method = prefix.to_s + '_clean_' + original
38
37
 
39
38
  original = "[]" if original == "get_brackets"
@@ -4,12 +4,125 @@ require 'rbbt/resource/path'
4
4
  require 'rbbt/annotations'
5
5
  require 'net/smtp'
6
6
  require 'narray'
7
+ require 'digest/md5'
7
8
 
8
9
  module Misc
9
10
  class FieldNotFoundError < StandardError;end
10
11
 
12
+ COLOR_LIST = %w(red green blue black yellow pink purple)
13
+ def self.colors_for(list)
14
+ unused = COLOR_LIST.dup
15
+
16
+ used = {}
17
+ colors = list.collect do |elem|
18
+ if used.include? elem
19
+ used[elem]
20
+ else
21
+ color = unused.shift
22
+ used[elem]=color
23
+ color
24
+ end
25
+ end
26
+
27
+ [colors, used]
28
+ end
29
+
30
+ def self.total_length(ranges)
31
+ processed = []
32
+ last = nil
33
+ ranges.sort_by{|range| range.begin }.each do |range|
34
+ if last.nil? or range.begin > last
35
+ processed << range
36
+ last = range.end
37
+ else
38
+ new_processed = []
39
+ processed.each do |processed_range|
40
+ if processed_range.end < range.begin
41
+ new_processed << processed_range
42
+ else
43
+ eend = [range.end, processed_range.end].max
44
+ new_processed << (processed_range.begin..eend)
45
+ break
46
+ end
47
+ end
48
+ processed = new_processed
49
+ last = range.end if range.end > last
50
+ end
51
+ end
52
+
53
+ processed.inject(0) do |total,range| total += range.end - range.begin + 1 end
54
+ end
55
+
56
+ def self.random_sample_in_range(total, size)
57
+ p = Set.new
58
+
59
+ if size > total / 10
60
+ template = (0..total - 1).to_a
61
+ size.times do |i|
62
+ pos = (rand * (total - i)).floor
63
+ if pos == template.length - 1
64
+ v = template.pop
65
+ else
66
+ v, n = template[pos], template[-1]
67
+ template.pop
68
+ template[pos] = n
69
+ end
70
+ p << v
71
+ end
72
+ else
73
+ size.times do
74
+ pos = nil
75
+ while pos.nil?
76
+ pos = (rand * total).floor
77
+ if p.include? pos
78
+ pos = nil
79
+ end
80
+ end
81
+ p << pos
82
+ end
83
+ end
84
+ p
85
+ end
86
+
87
+ def self.sample(ary, size, replacement = false)
88
+ total = ary.length
89
+ p = random_sample_in_range(total, size)
90
+ ary.values_at *p
91
+ end
92
+
93
+ Log2Multiplier = 1.0 / Math.log(2.0)
94
+ def self.log2(x)
95
+ Math.log(x) * Log2Multiplier
96
+ end
97
+
98
+ def self.prepare_entity(entity, field, options = {})
99
+ return entity unless String === entity or Array === entity
100
+ options ||= {}
101
+ dup_array = options.delete :dup_array
102
+ entity = Entity.formats[field].setup(((entity.frozen? and not entity.nil?) ? entity.dup : ((Array === entity and dup_array) ? entity.collect{|e| e.nil? ? e : e.dup} : entity) ), options.merge({:format => field})) if defined?(Entity) and Entity.respond_to?(:formats) and Entity.formats.include? field
103
+ entity
104
+ end
105
+
11
106
  ARRAY_MAX_LENGTH = 10000
12
107
  STRING_MAX_LENGTH = ARRAY_MAX_LENGTH * 10
108
+
109
+ def self.sanitize_filename(filename, length = 200)
110
+ if filename.length > length
111
+ if filename =~ /(\..{2,4})$/
112
+ extension = $1
113
+ else
114
+ extension = ''
115
+ end
116
+
117
+ post_fix = " TRUNCATED at #{length} (#{filename.length})" + extension
118
+
119
+ filename = filename[0..(length - post_fix.length - 1)] << post_fix
120
+ else
121
+ filename
122
+ end
123
+ filename
124
+ end
125
+
13
126
  def self.remove_long_items(obj)
14
127
  case
15
128
  when (Array === obj and obj.length > ARRAY_MAX_LENGTH)
@@ -460,11 +573,14 @@ end
460
573
  end
461
574
 
462
575
  def self.lock(file, *args)
576
+ return yield file, *args if file.nil?
463
577
  FileUtils.mkdir_p File.dirname(File.expand_path(file)) unless File.exists? File.dirname(File.expand_path(file))
464
578
 
465
579
  res = nil
466
580
 
467
- Lockfile.new(file + '.lock') do
581
+ lockfile = Lockfile.new(File.expand_path(file + '.lock'))
582
+
583
+ lockfile.lock do
468
584
  res = yield file, *args
469
585
  end
470
586
 
@@ -500,16 +616,18 @@ end
500
616
  end
501
617
 
502
618
  def self.fixutf8(string)
503
- if string.respond_to?(:valid_encoding?) and ! string.valid_encoding?
619
+ return string if (string.respond_to? :valid_encoding? and string.valid_encoding?) or
620
+ (string.respond_to? :valid_encoding and string.valid_encoding)
621
+ if string.respond_to?(:encode)
622
+ string.encode("UTF-16BE", :invalid => :replace, :undef => :replace, :replace => "?").encode('UTF-8')
623
+ else
504
624
  @@ic ||= Iconv.new('UTF-8//IGNORE', 'UTF-8')
505
625
  @@ic.iconv(string)
506
- else
507
- string
508
626
  end
509
627
  end
510
628
 
511
629
  def self.sensiblewrite(path, content)
512
- Misc.lock path do
630
+ Misc.lock path + '.sensible_write' do
513
631
  begin
514
632
  case
515
633
  when String === content
@@ -520,10 +638,10 @@ end
520
638
  File.open(path, 'w') do |f| end
521
639
  end
522
640
  rescue Interrupt
523
- FileUtils.rm_f path
641
+ FileUtils.rm_f path if File.exists? path
524
642
  raise "Interrupted (Ctrl-c)"
525
643
  rescue Exception
526
- FileUtils.rm_f path
644
+ FileUtils.rm_f path if File.exists? path
527
645
  raise $!
528
646
  end
529
647
  end
@@ -551,9 +669,15 @@ end
551
669
  Digest::MD5.hexdigest(text)
552
670
  end
553
671
 
672
+ HASH2MD5_MAX_STRING_LENGTH = 1000
673
+ HASH2MD5_MAX_ARRAY_LENGTH = 100
554
674
  def self.hash2md5(hash)
555
675
  str = ""
556
- hash.keys.sort_by{|k| k.to_s}.each do |k|
676
+ keys = hash.keys
677
+ keys = keys.clean_annotations if keys.respond_to? :clean_annotations
678
+ keys = keys.sort_by{|k| k.to_s}
679
+
680
+ keys.each do |k|
557
681
  next if k == :monitor or k == "monitor" or k == :in_situ_persistence or k == "in_situ_persistence"
558
682
  v = hash[k]
559
683
  case
@@ -565,10 +689,14 @@ end
565
689
  str << k.to_s << "=>" << hash2md5(v)
566
690
  when Symbol === v
567
691
  str << k.to_s << "=>" << v.to_s
692
+ when (String === v and v.length > HASH2MD5_MAX_STRING_LENGTH)
693
+ str << k.to_s << "=>" << v[0..HASH2MD5_MAX_STRING_LENGTH]
568
694
  when String === v
569
- str << k.to_s << "=>" << v[0..10000]
695
+ str << k.to_s << "=>" << v
696
+ when (Array === v and v.length > HASH2MD5_MAX_ARRAY_LENGTH)
697
+ str << k.to_s << "=>[" << v[0..HASH2MD5_MAX_ARRAY_LENGTH] * "," << "]"
570
698
  when Array === v
571
- str << k.to_s << "=>[" << v[0..1000] * "," << "]"
699
+ str << k.to_s << "=>[" << v * "," << "]"
572
700
  else
573
701
  v_ins = v.inspect
574
702
 
@@ -585,7 +713,7 @@ end
585
713
  if str.empty?
586
714
  ""
587
715
  else
588
- Digest::MD5.hexdigest(str)
716
+ digest(str)
589
717
  end
590
718
  end
591
719
 
@@ -696,6 +824,7 @@ end
696
824
  end
697
825
 
698
826
  def self.zip_fields(array)
827
+ return [] if array.empty?
699
828
  array[0].zip(*array[1..-1])
700
829
  end
701
830
 
@@ -766,13 +895,13 @@ module IndiferentHash
766
895
  end
767
896
 
768
897
  module PDF2Text
769
- def self.pdftotext(filename)
898
+ def self.pdftotext(filename, options = {})
770
899
  require 'rbbt/util/cmd'
771
900
  require 'rbbt/util/tmpfile'
772
901
  require 'rbbt/util/open'
773
902
 
774
903
 
775
- TmpFile.with_file(Open.open(filename, :nocache => true).read) do |pdf_file|
904
+ TmpFile.with_file(Open.open(filename, options.merge(:nocache => true)).read) do |pdf_file|
776
905
  CMD.cmd("pdftotext #{pdf_file} -", :pipe => false, :stderr => true)
777
906
  end
778
907
  end
@@ -7,13 +7,13 @@ module NamedArray
7
7
  self.chain_prefix = :named_array
8
8
  attr_accessor :fields
9
9
  attr_accessor :key
10
- attr_accessor :namespace
10
+ attr_accessor :entity_options
11
11
 
12
- def self.setup(array, fields, key = nil, namespace = nil)
12
+ def self.setup(array, fields, key = nil, entity_options = nil)
13
13
  array.extend NamedArray unless NamedArray === array
14
14
  array.fields = fields
15
15
  array.key = key
16
- array.namespace = namespace
16
+ array.entity_options = entity_options
17
17
  array
18
18
  end
19
19
 
@@ -47,14 +47,14 @@ module NamedArray
47
47
  return elem if @fields.nil? or @fields.empty?
48
48
 
49
49
  field = NamedArray === @fields ? @fields.named_array_clean_get_brackets(pos) : @fields[pos]
50
- elem = Entity.formats[field].setup((elem.frozen? ? elem.dup : elem), (namespace ? {:namespace => namespace, :organism => namespace} : {}).merge({:format => field})) if defined?(Entity) and Entity.respond_to?(:formats) and Entity.formats.include?(field) and not field == elem
50
+ elem = Misc.prepare_entity(elem, field, entity_options)
51
51
  elem
52
52
  end
53
53
 
54
54
  def named_array_each(&block)
55
55
  if defined?(Entity) and not @fields.nil? and not @fields.empty?
56
56
  @fields.zip(self).each do |field,elem|
57
- elem = Entity.formats[field].setup((elem.frozen? ? elem.dup : elem), (namespace ? {:namespace => namespace, :organism => namespace} : {}).merge({:format => field})) if defined?(Entity) and Entity.respond_to?(:formats) and Entity.formats.include?(field) and not field == elem
57
+ elem = Misc.prepare_entity(elem, field, entity_options)
58
58
  yield(elem)
59
59
  elem
60
60
  end
@@ -174,6 +174,8 @@ module Open
174
174
  wget_options[:cookies] = options.delete(:cookies)
175
175
 
176
176
  io = case
177
+ when (IO === url or StringIO === url)
178
+ url
177
179
  when (not remote?(url))
178
180
  file_open(url, options[:grep])
179
181
  when (options[:nocache] and options[:nocache] != :update)
@@ -189,8 +191,8 @@ module Open
189
191
  io.close
190
192
  file_open(in_cache(url, wget_options), options[:grep])
191
193
  end
192
- io = unzip(io) if (zip?(url) and not options[:noz]) or options[:zip]
193
- io = gunzip(io) if (gzip?(url) and not options[:noz]) or options[:gzip]
194
+ io = unzip(io) if ((String === url and zip?(url)) and not options[:noz]) or options[:zip]
195
+ io = gunzip(io) if ((String === url and gzip?(url)) and not options[:noz]) or options[:gzip]
194
196
 
195
197
  if block_given?
196
198
  yield io
@@ -214,12 +216,14 @@ module Open
214
216
  f = open(file, options)
215
217
 
216
218
  if block_given?
219
+ res = []
217
220
  while not f.eof?
218
221
  l = f.gets
219
- l = fixutf8(l) if l.respond_to?(:valid_encoding?) && ! l.valid_encoding?
220
- yield l
222
+ l = Misc.fixutf8(l)
223
+ res << yield(l)
221
224
  end
222
225
  f.close
226
+ res
223
227
  else
224
228
  text = Misc.fixutf8(f.read)
225
229
  f.close unless f.closed?