rbbt-util 4.3.0 → 4.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,7 +1,7 @@
1
1
  require 'rbbt/util/cmd'
2
2
  module TSV
3
3
  class Parser
4
- attr_accessor :header_hash, :sep, :sep2, :type, :key_position, :field_positions, :cast, :key_field, :fields, :fix, :select, :serializer, :straight, :take_all, :zipped, :namespace
4
+ attr_accessor :header_hash, :sep, :sep2, :type, :key_position, :field_positions, :cast, :key_field, :fields, :fix, :select, :serializer, :straight, :take_all, :zipped, :namespace, :first_line
5
5
 
6
6
  class SKIP_LINE < Exception; end
7
7
  class END_PARSING < Exception; end
@@ -48,7 +48,7 @@ module TSV
48
48
 
49
49
  def process(line)
50
50
  l = line.chomp
51
- raise Parser::SKIP_LINE if Proc === @select and not @select.call l
51
+ raise Parser::SKIP_LINE if l[0] == "#"[0] or (Proc === @select and not @select.call l)
52
52
  l = @fix.call l if Proc === @fix
53
53
  raise Parser::END_PARSING unless l
54
54
  l
@@ -75,7 +75,7 @@ module TSV
75
75
  def get_values_single(parts)
76
76
  return parts.shift, parts.first if field_positions.nil? and key_position.nil?
77
77
  key = parts[key_position]
78
- value = parts[field_positions.nil? ? 0 : field_positions.first]
78
+ value = parts[(field_positions.nil? or field_positions.empty?) ? 0 : field_positions.first]
79
79
  [key, value]
80
80
  end
81
81
 
data/lib/rbbt/tsv/util.rb CHANGED
@@ -1,29 +1,6 @@
1
1
  require 'rbbt/resource/path'
2
2
  module TSV
3
3
 
4
- def self.field_match_counts(file, values)
5
- fields = TSV.parse_header(Open.open(file)).all_fields
6
-
7
- counts = {}
8
- TmpFile.with_file do |tmpfile|
9
- if Array === values
10
- Open.write(tmpfile, values * "\n")
11
- else
12
- FileUtils.ln_s values, tmpfile
13
- end
14
-
15
- fields.each_with_index do |field,i|
16
- counts[field] = begin
17
- CMD.cmd("cat #{ file } |grep -v ^#|cut -f #{i + 1}|tr '|' '\\n' |sort -u |grep [[:alpha:]]|grep -f #{tmpfile} -F -w").read.count("\n")
18
- rescue
19
- 0
20
- end
21
- end
22
- end
23
-
24
- counts
25
- end
26
-
27
4
  def self.field_match_counts(file, values, options = {})
28
5
  options = Misc.add_defaults options, :persist_prefix => "Field_Matches"
29
6
  persist_options = Misc.pull_keys options, :persist
@@ -45,12 +22,12 @@ module TSV
45
22
  end
46
23
 
47
24
  path = Persist.persistence_path(filename, persist_options)
48
- TmpFile.with_file(values * "\n") do |value_file|
49
- cmd = "cat '#{ path }' | grep -w -F -f '#{ value_file }' |cut -f 2 |sort|uniq -c|sed 's/^ *//;s/ /\t/'"
25
+ TmpFile.with_file(values.uniq * "\n") do |value_file|
26
+ cmd = "cat '#{ path }' | sed 's/\\t/\\tHEADERNOMATCH/' | grep -w -F -f '#{ value_file }' |cut -f 2 | sed 's/HEADERNOMATCH//' | sort|uniq -c|sed 's/^ *//;s/ /\t/'"
50
27
  begin
51
28
  TSV.open(CMD.cmd(cmd), :key_field => 1, :type => :single, :cast => :to_i)
52
29
  rescue
53
- TSV.setup({nil => 0}, :type => :single, :cast => :to_i)
30
+ TSV.setup({}, :type => :single, :cast => :to_i)
54
31
  end
55
32
  end
56
33
  end
@@ -67,12 +44,12 @@ module TSV
67
44
  filename
68
45
  end
69
46
 
70
- def self.get_stream(file)
47
+ def self.get_stream(file, open_options = {})
71
48
  case
72
49
  when Path === file
73
- file.open
50
+ file.open(open_options)
74
51
  when String === file
75
- File.open(file)
52
+ Open.open(file, open_options)
76
53
  when file.respond_to?(:gets)
77
54
  file
78
55
  else
data/lib/rbbt/util/R.rb CHANGED
@@ -22,9 +22,26 @@ module R
22
22
  CMD.cmd('R --vanilla --slave --quiet', options.merge(:in => cmd))
23
23
  end
24
24
 
25
+ def self.interactive(init_file, options = {})
26
+ CMD.cmd("env R_PROFILE='#{init_file}' xterm R")
27
+ end
28
+
29
+ def self.interactive(script, options = {})
30
+ TmpFile.with_file do |init_file|
31
+ Open.write(init_file) do |file|
32
+ profile = File.join(ENV["HOME"], ".Rprofile")
33
+ file.puts "source('#{profile}');\n" if File.exists? profile
34
+ file.puts "source('#{R::UTIL}');\n"
35
+ file.puts script
36
+ end
37
+ CMD.cmd("env R_PROFILE='#{init_file}' xterm R")
38
+ end
39
+ end
40
+
25
41
  end
26
42
 
27
43
  module TSV
44
+
28
45
  def R(script, open_options = {})
29
46
  TmpFile.with_file do |f|
30
47
  Open.write(f, self.to_s)
@@ -32,11 +49,19 @@ module TSV
32
49
  <<-EOF
33
50
  data = rbbt.tsv('#{f}');
34
51
  #{script.strip}
35
- rbbt.tsv.write('#{f}', data);
52
+ if (! is.null(data)){ rbbt.tsv.write('#{f}', data); }
36
53
  EOF
37
54
  ).read)
38
55
  open_options = Misc.add_defaults open_options, :type => :list
39
- TSV.open(f, open_options)
56
+ TSV.open(f, open_options) unless open_options[:ignore_output]
57
+ end
58
+ end
59
+
60
+ def R_interactive(open_options = {})
61
+ TmpFile.with_file do |f|
62
+ Open.write(f, self.to_s)
63
+ R.interactive("data_file = '#{f}';\n")
40
64
  end
41
65
  end
42
66
  end
67
+
@@ -1,4 +1,3 @@
1
- require 'rbbt/util/log'
2
1
  require 'set'
3
2
 
4
3
  module ChainMethods
@@ -33,7 +32,7 @@ module ChainMethods
33
32
 
34
33
  class << base; self; end.module_eval do
35
34
  methods.each do |new_method|
36
- original = new_method.sub(prefix.to_s + '_', '')
35
+ original = new_method.to_s.sub(prefix.to_s + '_', '')
37
36
  clean_method = prefix.to_s + '_clean_' + original
38
37
 
39
38
  original = "[]" if original == "get_brackets"
@@ -4,12 +4,125 @@ require 'rbbt/resource/path'
4
4
  require 'rbbt/annotations'
5
5
  require 'net/smtp'
6
6
  require 'narray'
7
+ require 'digest/md5'
7
8
 
8
9
  module Misc
9
10
  class FieldNotFoundError < StandardError;end
10
11
 
12
+ COLOR_LIST = %w(red green blue black yellow pink purple)
13
+ def self.colors_for(list)
14
+ unused = COLOR_LIST.dup
15
+
16
+ used = {}
17
+ colors = list.collect do |elem|
18
+ if used.include? elem
19
+ used[elem]
20
+ else
21
+ color = unused.shift
22
+ used[elem]=color
23
+ color
24
+ end
25
+ end
26
+
27
+ [colors, used]
28
+ end
29
+
30
+ def self.total_length(ranges)
31
+ processed = []
32
+ last = nil
33
+ ranges.sort_by{|range| range.begin }.each do |range|
34
+ if last.nil? or range.begin > last
35
+ processed << range
36
+ last = range.end
37
+ else
38
+ new_processed = []
39
+ processed.each do |processed_range|
40
+ if processed_range.end < range.begin
41
+ new_processed << processed_range
42
+ else
43
+ eend = [range.end, processed_range.end].max
44
+ new_processed << (processed_range.begin..eend)
45
+ break
46
+ end
47
+ end
48
+ processed = new_processed
49
+ last = range.end if range.end > last
50
+ end
51
+ end
52
+
53
+ processed.inject(0) do |total,range| total += range.end - range.begin + 1 end
54
+ end
55
+
56
+ def self.random_sample_in_range(total, size)
57
+ p = Set.new
58
+
59
+ if size > total / 10
60
+ template = (0..total - 1).to_a
61
+ size.times do |i|
62
+ pos = (rand * (total - i)).floor
63
+ if pos == template.length - 1
64
+ v = template.pop
65
+ else
66
+ v, n = template[pos], template[-1]
67
+ template.pop
68
+ template[pos] = n
69
+ end
70
+ p << v
71
+ end
72
+ else
73
+ size.times do
74
+ pos = nil
75
+ while pos.nil?
76
+ pos = (rand * total).floor
77
+ if p.include? pos
78
+ pos = nil
79
+ end
80
+ end
81
+ p << pos
82
+ end
83
+ end
84
+ p
85
+ end
86
+
87
+ def self.sample(ary, size, replacement = false)
88
+ total = ary.length
89
+ p = random_sample_in_range(total, size)
90
+ ary.values_at *p
91
+ end
92
+
93
+ Log2Multiplier = 1.0 / Math.log(2.0)
94
+ def self.log2(x)
95
+ Math.log(x) * Log2Multiplier
96
+ end
97
+
98
+ def self.prepare_entity(entity, field, options = {})
99
+ return entity unless String === entity or Array === entity
100
+ options ||= {}
101
+ dup_array = options.delete :dup_array
102
+ entity = Entity.formats[field].setup(((entity.frozen? and not entity.nil?) ? entity.dup : ((Array === entity and dup_array) ? entity.collect{|e| e.nil? ? e : e.dup} : entity) ), options.merge({:format => field})) if defined?(Entity) and Entity.respond_to?(:formats) and Entity.formats.include? field
103
+ entity
104
+ end
105
+
11
106
  ARRAY_MAX_LENGTH = 10000
12
107
  STRING_MAX_LENGTH = ARRAY_MAX_LENGTH * 10
108
+
109
+ def self.sanitize_filename(filename, length = 200)
110
+ if filename.length > length
111
+ if filename =~ /(\..{2,4})$/
112
+ extension = $1
113
+ else
114
+ extension = ''
115
+ end
116
+
117
+ post_fix = " TRUNCATED at #{length} (#{filename.length})" + extension
118
+
119
+ filename = filename[0..(length - post_fix.length - 1)] << post_fix
120
+ else
121
+ filename
122
+ end
123
+ filename
124
+ end
125
+
13
126
  def self.remove_long_items(obj)
14
127
  case
15
128
  when (Array === obj and obj.length > ARRAY_MAX_LENGTH)
@@ -460,11 +573,14 @@ end
460
573
  end
461
574
 
462
575
  def self.lock(file, *args)
576
+ return yield file, *args if file.nil?
463
577
  FileUtils.mkdir_p File.dirname(File.expand_path(file)) unless File.exists? File.dirname(File.expand_path(file))
464
578
 
465
579
  res = nil
466
580
 
467
- Lockfile.new(file + '.lock') do
581
+ lockfile = Lockfile.new(File.expand_path(file + '.lock'))
582
+
583
+ lockfile.lock do
468
584
  res = yield file, *args
469
585
  end
470
586
 
@@ -500,16 +616,18 @@ end
500
616
  end
501
617
 
502
618
  def self.fixutf8(string)
503
- if string.respond_to?(:valid_encoding?) and ! string.valid_encoding?
619
+ return string if (string.respond_to? :valid_encoding? and string.valid_encoding?) or
620
+ (string.respond_to? :valid_encoding and string.valid_encoding)
621
+ if string.respond_to?(:encode)
622
+ string.encode("UTF-16BE", :invalid => :replace, :undef => :replace, :replace => "?").encode('UTF-8')
623
+ else
504
624
  @@ic ||= Iconv.new('UTF-8//IGNORE', 'UTF-8')
505
625
  @@ic.iconv(string)
506
- else
507
- string
508
626
  end
509
627
  end
510
628
 
511
629
  def self.sensiblewrite(path, content)
512
- Misc.lock path do
630
+ Misc.lock path + '.sensible_write' do
513
631
  begin
514
632
  case
515
633
  when String === content
@@ -520,10 +638,10 @@ end
520
638
  File.open(path, 'w') do |f| end
521
639
  end
522
640
  rescue Interrupt
523
- FileUtils.rm_f path
641
+ FileUtils.rm_f path if File.exists? path
524
642
  raise "Interrupted (Ctrl-c)"
525
643
  rescue Exception
526
- FileUtils.rm_f path
644
+ FileUtils.rm_f path if File.exists? path
527
645
  raise $!
528
646
  end
529
647
  end
@@ -551,9 +669,15 @@ end
551
669
  Digest::MD5.hexdigest(text)
552
670
  end
553
671
 
672
+ HASH2MD5_MAX_STRING_LENGTH = 1000
673
+ HASH2MD5_MAX_ARRAY_LENGTH = 100
554
674
  def self.hash2md5(hash)
555
675
  str = ""
556
- hash.keys.sort_by{|k| k.to_s}.each do |k|
676
+ keys = hash.keys
677
+ keys = keys.clean_annotations if keys.respond_to? :clean_annotations
678
+ keys = keys.sort_by{|k| k.to_s}
679
+
680
+ keys.each do |k|
557
681
  next if k == :monitor or k == "monitor" or k == :in_situ_persistence or k == "in_situ_persistence"
558
682
  v = hash[k]
559
683
  case
@@ -565,10 +689,14 @@ end
565
689
  str << k.to_s << "=>" << hash2md5(v)
566
690
  when Symbol === v
567
691
  str << k.to_s << "=>" << v.to_s
692
+ when (String === v and v.length > HASH2MD5_MAX_STRING_LENGTH)
693
+ str << k.to_s << "=>" << v[0..HASH2MD5_MAX_STRING_LENGTH]
568
694
  when String === v
569
- str << k.to_s << "=>" << v[0..10000]
695
+ str << k.to_s << "=>" << v
696
+ when (Array === v and v.length > HASH2MD5_MAX_ARRAY_LENGTH)
697
+ str << k.to_s << "=>[" << v[0..HASH2MD5_MAX_ARRAY_LENGTH] * "," << "]"
570
698
  when Array === v
571
- str << k.to_s << "=>[" << v[0..1000] * "," << "]"
699
+ str << k.to_s << "=>[" << v * "," << "]"
572
700
  else
573
701
  v_ins = v.inspect
574
702
 
@@ -585,7 +713,7 @@ end
585
713
  if str.empty?
586
714
  ""
587
715
  else
588
- Digest::MD5.hexdigest(str)
716
+ digest(str)
589
717
  end
590
718
  end
591
719
 
@@ -696,6 +824,7 @@ end
696
824
  end
697
825
 
698
826
  def self.zip_fields(array)
827
+ return [] if array.empty?
699
828
  array[0].zip(*array[1..-1])
700
829
  end
701
830
 
@@ -766,13 +895,13 @@ module IndiferentHash
766
895
  end
767
896
 
768
897
  module PDF2Text
769
- def self.pdftotext(filename)
898
+ def self.pdftotext(filename, options = {})
770
899
  require 'rbbt/util/cmd'
771
900
  require 'rbbt/util/tmpfile'
772
901
  require 'rbbt/util/open'
773
902
 
774
903
 
775
- TmpFile.with_file(Open.open(filename, :nocache => true).read) do |pdf_file|
904
+ TmpFile.with_file(Open.open(filename, options.merge(:nocache => true)).read) do |pdf_file|
776
905
  CMD.cmd("pdftotext #{pdf_file} -", :pipe => false, :stderr => true)
777
906
  end
778
907
  end
@@ -7,13 +7,13 @@ module NamedArray
7
7
  self.chain_prefix = :named_array
8
8
  attr_accessor :fields
9
9
  attr_accessor :key
10
- attr_accessor :namespace
10
+ attr_accessor :entity_options
11
11
 
12
- def self.setup(array, fields, key = nil, namespace = nil)
12
+ def self.setup(array, fields, key = nil, entity_options = nil)
13
13
  array.extend NamedArray unless NamedArray === array
14
14
  array.fields = fields
15
15
  array.key = key
16
- array.namespace = namespace
16
+ array.entity_options = entity_options
17
17
  array
18
18
  end
19
19
 
@@ -47,14 +47,14 @@ module NamedArray
47
47
  return elem if @fields.nil? or @fields.empty?
48
48
 
49
49
  field = NamedArray === @fields ? @fields.named_array_clean_get_brackets(pos) : @fields[pos]
50
- elem = Entity.formats[field].setup((elem.frozen? ? elem.dup : elem), (namespace ? {:namespace => namespace, :organism => namespace} : {}).merge({:format => field})) if defined?(Entity) and Entity.respond_to?(:formats) and Entity.formats.include?(field) and not field == elem
50
+ elem = Misc.prepare_entity(elem, field, entity_options)
51
51
  elem
52
52
  end
53
53
 
54
54
  def named_array_each(&block)
55
55
  if defined?(Entity) and not @fields.nil? and not @fields.empty?
56
56
  @fields.zip(self).each do |field,elem|
57
- elem = Entity.formats[field].setup((elem.frozen? ? elem.dup : elem), (namespace ? {:namespace => namespace, :organism => namespace} : {}).merge({:format => field})) if defined?(Entity) and Entity.respond_to?(:formats) and Entity.formats.include?(field) and not field == elem
57
+ elem = Misc.prepare_entity(elem, field, entity_options)
58
58
  yield(elem)
59
59
  elem
60
60
  end
@@ -174,6 +174,8 @@ module Open
174
174
  wget_options[:cookies] = options.delete(:cookies)
175
175
 
176
176
  io = case
177
+ when (IO === url or StringIO === url)
178
+ url
177
179
  when (not remote?(url))
178
180
  file_open(url, options[:grep])
179
181
  when (options[:nocache] and options[:nocache] != :update)
@@ -189,8 +191,8 @@ module Open
189
191
  io.close
190
192
  file_open(in_cache(url, wget_options), options[:grep])
191
193
  end
192
- io = unzip(io) if (zip?(url) and not options[:noz]) or options[:zip]
193
- io = gunzip(io) if (gzip?(url) and not options[:noz]) or options[:gzip]
194
+ io = unzip(io) if ((String === url and zip?(url)) and not options[:noz]) or options[:zip]
195
+ io = gunzip(io) if ((String === url and gzip?(url)) and not options[:noz]) or options[:gzip]
194
196
 
195
197
  if block_given?
196
198
  yield io
@@ -214,12 +216,14 @@ module Open
214
216
  f = open(file, options)
215
217
 
216
218
  if block_given?
219
+ res = []
217
220
  while not f.eof?
218
221
  l = f.gets
219
- l = fixutf8(l) if l.respond_to?(:valid_encoding?) && ! l.valid_encoding?
220
- yield l
222
+ l = Misc.fixutf8(l)
223
+ res << yield(l)
221
224
  end
222
225
  f.close
226
+ res
223
227
  else
224
228
  text = Misc.fixutf8(f.read)
225
229
  f.close unless f.closed?