rbbt-util 4.3.0 → 4.4.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +11 -6
- data/bin/rbbt_exec.rb +9 -1
- data/lib/rbbt/annotations.rb +78 -22
- data/lib/rbbt/persist.rb +14 -6
- data/lib/rbbt/persist/tsv.rb +10 -4
- data/lib/rbbt/resource.rb +1 -0
- data/lib/rbbt/resource/path.rb +43 -2
- data/lib/rbbt/resource/util.rb +3 -3
- data/lib/rbbt/tsv.rb +8 -2
- data/lib/rbbt/tsv/accessor.rb +29 -28
- data/lib/rbbt/tsv/attach.rb +8 -3
- data/lib/rbbt/tsv/attach/util.rb +3 -0
- data/lib/rbbt/tsv/excel.rb +91 -0
- data/lib/rbbt/tsv/filter.rb +17 -7
- data/lib/rbbt/tsv/manipulate.rb +26 -11
- data/lib/rbbt/tsv/parser.rb +3 -3
- data/lib/rbbt/tsv/util.rb +6 -29
- data/lib/rbbt/util/R.rb +27 -2
- data/lib/rbbt/util/chain_methods.rb +1 -2
- data/lib/rbbt/util/misc.rb +142 -13
- data/lib/rbbt/util/named_array.rb +5 -5
- data/lib/rbbt/util/open.rb +8 -4
- data/lib/rbbt/workflow.rb +5 -40
- data/lib/rbbt/workflow/accessor.rb +11 -0
- data/lib/rbbt/workflow/annotate.rb +8 -2
- data/lib/rbbt/workflow/step.rb +21 -2
- data/lib/rbbt/workflow/task.rb +3 -3
- data/share/lib/R/util.R +48 -15
- data/test/rbbt/resource/test_path.rb +15 -0
- data/test/rbbt/test_annotations.rb +11 -1
- data/test/rbbt/test_tsv.rb +15 -2
- data/test/rbbt/tsv/test_index.rb +27 -24
- data/test/rbbt/tsv/test_util.rb +1 -0
- data/test/rbbt/util/test_misc.rb +20 -0
- data/test/rbbt/util/test_open.rb +3 -5
- data/test/rbbt/workflow/test_step.rb +2 -2
- metadata +24 -9
data/lib/rbbt/tsv/parser.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'rbbt/util/cmd'
|
2
2
|
module TSV
|
3
3
|
class Parser
|
4
|
-
attr_accessor :header_hash, :sep, :sep2, :type, :key_position, :field_positions, :cast, :key_field, :fields, :fix, :select, :serializer, :straight, :take_all, :zipped, :namespace
|
4
|
+
attr_accessor :header_hash, :sep, :sep2, :type, :key_position, :field_positions, :cast, :key_field, :fields, :fix, :select, :serializer, :straight, :take_all, :zipped, :namespace, :first_line
|
5
5
|
|
6
6
|
class SKIP_LINE < Exception; end
|
7
7
|
class END_PARSING < Exception; end
|
@@ -48,7 +48,7 @@ module TSV
|
|
48
48
|
|
49
49
|
def process(line)
|
50
50
|
l = line.chomp
|
51
|
-
raise Parser::SKIP_LINE if Proc === @select and not @select.call l
|
51
|
+
raise Parser::SKIP_LINE if l[0] == "#"[0] or (Proc === @select and not @select.call l)
|
52
52
|
l = @fix.call l if Proc === @fix
|
53
53
|
raise Parser::END_PARSING unless l
|
54
54
|
l
|
@@ -75,7 +75,7 @@ module TSV
|
|
75
75
|
def get_values_single(parts)
|
76
76
|
return parts.shift, parts.first if field_positions.nil? and key_position.nil?
|
77
77
|
key = parts[key_position]
|
78
|
-
value = parts[field_positions.nil? ? 0 : field_positions.first]
|
78
|
+
value = parts[(field_positions.nil? or field_positions.empty?) ? 0 : field_positions.first]
|
79
79
|
[key, value]
|
80
80
|
end
|
81
81
|
|
data/lib/rbbt/tsv/util.rb
CHANGED
@@ -1,29 +1,6 @@
|
|
1
1
|
require 'rbbt/resource/path'
|
2
2
|
module TSV
|
3
3
|
|
4
|
-
def self.field_match_counts(file, values)
|
5
|
-
fields = TSV.parse_header(Open.open(file)).all_fields
|
6
|
-
|
7
|
-
counts = {}
|
8
|
-
TmpFile.with_file do |tmpfile|
|
9
|
-
if Array === values
|
10
|
-
Open.write(tmpfile, values * "\n")
|
11
|
-
else
|
12
|
-
FileUtils.ln_s values, tmpfile
|
13
|
-
end
|
14
|
-
|
15
|
-
fields.each_with_index do |field,i|
|
16
|
-
counts[field] = begin
|
17
|
-
CMD.cmd("cat #{ file } |grep -v ^#|cut -f #{i + 1}|tr '|' '\\n' |sort -u |grep [[:alpha:]]|grep -f #{tmpfile} -F -w").read.count("\n")
|
18
|
-
rescue
|
19
|
-
0
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
counts
|
25
|
-
end
|
26
|
-
|
27
4
|
def self.field_match_counts(file, values, options = {})
|
28
5
|
options = Misc.add_defaults options, :persist_prefix => "Field_Matches"
|
29
6
|
persist_options = Misc.pull_keys options, :persist
|
@@ -45,12 +22,12 @@ module TSV
|
|
45
22
|
end
|
46
23
|
|
47
24
|
path = Persist.persistence_path(filename, persist_options)
|
48
|
-
TmpFile.with_file(values * "\n") do |value_file|
|
49
|
-
cmd = "cat '#{ path }' | grep -w -F -f '#{ value_file }' |cut -f 2 |sort|uniq -c|sed 's/^ *//;s/ /\t/'"
|
25
|
+
TmpFile.with_file(values.uniq * "\n") do |value_file|
|
26
|
+
cmd = "cat '#{ path }' | sed 's/\\t/\\tHEADERNOMATCH/' | grep -w -F -f '#{ value_file }' |cut -f 2 | sed 's/HEADERNOMATCH//' | sort|uniq -c|sed 's/^ *//;s/ /\t/'"
|
50
27
|
begin
|
51
28
|
TSV.open(CMD.cmd(cmd), :key_field => 1, :type => :single, :cast => :to_i)
|
52
29
|
rescue
|
53
|
-
TSV.setup({
|
30
|
+
TSV.setup({}, :type => :single, :cast => :to_i)
|
54
31
|
end
|
55
32
|
end
|
56
33
|
end
|
@@ -67,12 +44,12 @@ module TSV
|
|
67
44
|
filename
|
68
45
|
end
|
69
46
|
|
70
|
-
def self.get_stream(file)
|
47
|
+
def self.get_stream(file, open_options = {})
|
71
48
|
case
|
72
49
|
when Path === file
|
73
|
-
file.open
|
50
|
+
file.open(open_options)
|
74
51
|
when String === file
|
75
|
-
|
52
|
+
Open.open(file, open_options)
|
76
53
|
when file.respond_to?(:gets)
|
77
54
|
file
|
78
55
|
else
|
data/lib/rbbt/util/R.rb
CHANGED
@@ -22,9 +22,26 @@ module R
|
|
22
22
|
CMD.cmd('R --vanilla --slave --quiet', options.merge(:in => cmd))
|
23
23
|
end
|
24
24
|
|
25
|
+
def self.interactive(init_file, options = {})
|
26
|
+
CMD.cmd("env R_PROFILE='#{init_file}' xterm R")
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.interactive(script, options = {})
|
30
|
+
TmpFile.with_file do |init_file|
|
31
|
+
Open.write(init_file) do |file|
|
32
|
+
profile = File.join(ENV["HOME"], ".Rprofile")
|
33
|
+
file.puts "source('#{profile}');\n" if File.exists? profile
|
34
|
+
file.puts "source('#{R::UTIL}');\n"
|
35
|
+
file.puts script
|
36
|
+
end
|
37
|
+
CMD.cmd("env R_PROFILE='#{init_file}' xterm R")
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
25
41
|
end
|
26
42
|
|
27
43
|
module TSV
|
44
|
+
|
28
45
|
def R(script, open_options = {})
|
29
46
|
TmpFile.with_file do |f|
|
30
47
|
Open.write(f, self.to_s)
|
@@ -32,11 +49,19 @@ module TSV
|
|
32
49
|
<<-EOF
|
33
50
|
data = rbbt.tsv('#{f}');
|
34
51
|
#{script.strip}
|
35
|
-
rbbt.tsv.write('#{f}', data);
|
52
|
+
if (! is.null(data)){ rbbt.tsv.write('#{f}', data); }
|
36
53
|
EOF
|
37
54
|
).read)
|
38
55
|
open_options = Misc.add_defaults open_options, :type => :list
|
39
|
-
TSV.open(f, open_options)
|
56
|
+
TSV.open(f, open_options) unless open_options[:ignore_output]
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
def R_interactive(open_options = {})
|
61
|
+
TmpFile.with_file do |f|
|
62
|
+
Open.write(f, self.to_s)
|
63
|
+
R.interactive("data_file = '#{f}';\n")
|
40
64
|
end
|
41
65
|
end
|
42
66
|
end
|
67
|
+
|
data/lib/rbbt/util/chain_methods.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
require 'rbbt/util/log'
|
2
1
|
require 'set'
|
3
2
|
|
4
3
|
module ChainMethods
|
@@ -33,7 +32,7 @@ module ChainMethods
|
|
33
32
|
|
34
33
|
class << base; self; end.module_eval do
|
35
34
|
methods.each do |new_method|
|
36
|
-
original = new_method.sub(prefix.to_s + '_', '')
|
35
|
+
original = new_method.to_s.sub(prefix.to_s + '_', '')
|
37
36
|
clean_method = prefix.to_s + '_clean_' + original
|
38
37
|
|
39
38
|
original = "[]" if original == "get_brackets"
|
data/lib/rbbt/util/misc.rb
CHANGED
@@ -4,12 +4,125 @@ require 'rbbt/resource/path'
|
|
4
4
|
require 'rbbt/annotations'
|
5
5
|
require 'net/smtp'
|
6
6
|
require 'narray'
|
7
|
+
require 'digest/md5'
|
7
8
|
|
8
9
|
module Misc
|
9
10
|
class FieldNotFoundError < StandardError;end
|
10
11
|
|
12
|
+
COLOR_LIST = %w(red green blue black yellow pink purple)
|
13
|
+
def self.colors_for(list)
|
14
|
+
unused = COLOR_LIST.dup
|
15
|
+
|
16
|
+
used = {}
|
17
|
+
colors = list.collect do |elem|
|
18
|
+
if used.include? elem
|
19
|
+
used[elem]
|
20
|
+
else
|
21
|
+
color = unused.shift
|
22
|
+
used[elem]=color
|
23
|
+
color
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
[colors, used]
|
28
|
+
end
|
29
|
+
|
30
|
+
def self.total_length(ranges)
|
31
|
+
processed = []
|
32
|
+
last = nil
|
33
|
+
ranges.sort_by{|range| range.begin }.each do |range|
|
34
|
+
if last.nil? or range.begin > last
|
35
|
+
processed << range
|
36
|
+
last = range.end
|
37
|
+
else
|
38
|
+
new_processed = []
|
39
|
+
processed.each do |processed_range|
|
40
|
+
if processed_range.end < range.begin
|
41
|
+
new_processed << processed_range
|
42
|
+
else
|
43
|
+
eend = [range.end, processed_range.end].max
|
44
|
+
new_processed << (processed_range.begin..eend)
|
45
|
+
break
|
46
|
+
end
|
47
|
+
end
|
48
|
+
processed = new_processed
|
49
|
+
last = range.end if range.end > last
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
processed.inject(0) do |total,range| total += range.end - range.begin + 1 end
|
54
|
+
end
|
55
|
+
|
56
|
+
def self.random_sample_in_range(total, size)
|
57
|
+
p = Set.new
|
58
|
+
|
59
|
+
if size > total / 10
|
60
|
+
template = (0..total - 1).to_a
|
61
|
+
size.times do |i|
|
62
|
+
pos = (rand * (total - i)).floor
|
63
|
+
if pos == template.length - 1
|
64
|
+
v = template.pop
|
65
|
+
else
|
66
|
+
v, n = template[pos], template[-1]
|
67
|
+
template.pop
|
68
|
+
template[pos] = n
|
69
|
+
end
|
70
|
+
p << v
|
71
|
+
end
|
72
|
+
else
|
73
|
+
size.times do
|
74
|
+
pos = nil
|
75
|
+
while pos.nil?
|
76
|
+
pos = (rand * total).floor
|
77
|
+
if p.include? pos
|
78
|
+
pos = nil
|
79
|
+
end
|
80
|
+
end
|
81
|
+
p << pos
|
82
|
+
end
|
83
|
+
end
|
84
|
+
p
|
85
|
+
end
|
86
|
+
|
87
|
+
def self.sample(ary, size, replacement = false)
|
88
|
+
total = ary.length
|
89
|
+
p = random_sample_in_range(total, size)
|
90
|
+
ary.values_at *p
|
91
|
+
end
|
92
|
+
|
93
|
+
Log2Multiplier = 1.0 / Math.log(2.0)
|
94
|
+
def self.log2(x)
|
95
|
+
Math.log(x) * Log2Multiplier
|
96
|
+
end
|
97
|
+
|
98
|
+
def self.prepare_entity(entity, field, options = {})
|
99
|
+
return entity unless String === entity or Array === entity
|
100
|
+
options ||= {}
|
101
|
+
dup_array = options.delete :dup_array
|
102
|
+
entity = Entity.formats[field].setup(((entity.frozen? and not entity.nil?) ? entity.dup : ((Array === entity and dup_array) ? entity.collect{|e| e.nil? ? e : e.dup} : entity) ), options.merge({:format => field})) if defined?(Entity) and Entity.respond_to?(:formats) and Entity.formats.include? field
|
103
|
+
entity
|
104
|
+
end
|
105
|
+
|
11
106
|
ARRAY_MAX_LENGTH = 10000
|
12
107
|
STRING_MAX_LENGTH = ARRAY_MAX_LENGTH * 10
|
108
|
+
|
109
|
+
def self.sanitize_filename(filename, length = 200)
|
110
|
+
if filename.length > length
|
111
|
+
if filename =~ /(\..{2,4})$/
|
112
|
+
extension = $1
|
113
|
+
else
|
114
|
+
extension = ''
|
115
|
+
end
|
116
|
+
|
117
|
+
post_fix = " TRUNCATED at #{length} (#{filename.length})" + extension
|
118
|
+
|
119
|
+
filename = filename[0..(length - post_fix.length - 1)] << post_fix
|
120
|
+
else
|
121
|
+
filename
|
122
|
+
end
|
123
|
+
filename
|
124
|
+
end
|
125
|
+
|
13
126
|
def self.remove_long_items(obj)
|
14
127
|
case
|
15
128
|
when (Array === obj and obj.length > ARRAY_MAX_LENGTH)
|
@@ -460,11 +573,14 @@ end
|
|
460
573
|
end
|
461
574
|
|
462
575
|
def self.lock(file, *args)
|
576
|
+
return yield file, *args if file.nil?
|
463
577
|
FileUtils.mkdir_p File.dirname(File.expand_path(file)) unless File.exists? File.dirname(File.expand_path(file))
|
464
578
|
|
465
579
|
res = nil
|
466
580
|
|
467
|
-
Lockfile.new(file + '.lock')
|
581
|
+
lockfile = Lockfile.new(File.expand_path(file + '.lock'))
|
582
|
+
|
583
|
+
lockfile.lock do
|
468
584
|
res = yield file, *args
|
469
585
|
end
|
470
586
|
|
@@ -500,16 +616,18 @@ end
|
|
500
616
|
end
|
501
617
|
|
502
618
|
def self.fixutf8(string)
|
503
|
-
if string.respond_to?
|
619
|
+
return string if (string.respond_to? :valid_encoding? and string.valid_encoding?) or
|
620
|
+
(string.respond_to? :valid_encoding and string.valid_encoding)
|
621
|
+
if string.respond_to?(:encode)
|
622
|
+
string.encode("UTF-16BE", :invalid => :replace, :undef => :replace, :replace => "?").encode('UTF-8')
|
623
|
+
else
|
504
624
|
@@ic ||= Iconv.new('UTF-8//IGNORE', 'UTF-8')
|
505
625
|
@@ic.iconv(string)
|
506
|
-
else
|
507
|
-
string
|
508
626
|
end
|
509
627
|
end
|
510
628
|
|
511
629
|
def self.sensiblewrite(path, content)
|
512
|
-
Misc.lock path do
|
630
|
+
Misc.lock path + '.sensible_write' do
|
513
631
|
begin
|
514
632
|
case
|
515
633
|
when String === content
|
@@ -520,10 +638,10 @@ end
|
|
520
638
|
File.open(path, 'w') do |f| end
|
521
639
|
end
|
522
640
|
rescue Interrupt
|
523
|
-
FileUtils.rm_f path
|
641
|
+
FileUtils.rm_f path if File.exists? path
|
524
642
|
raise "Interrupted (Ctrl-c)"
|
525
643
|
rescue Exception
|
526
|
-
FileUtils.rm_f path
|
644
|
+
FileUtils.rm_f path if File.exists? path
|
527
645
|
raise $!
|
528
646
|
end
|
529
647
|
end
|
@@ -551,9 +669,15 @@ end
|
|
551
669
|
Digest::MD5.hexdigest(text)
|
552
670
|
end
|
553
671
|
|
672
|
+
HASH2MD5_MAX_STRING_LENGTH = 1000
|
673
|
+
HASH2MD5_MAX_ARRAY_LENGTH = 100
|
554
674
|
def self.hash2md5(hash)
|
555
675
|
str = ""
|
556
|
-
hash.keys
|
676
|
+
keys = hash.keys
|
677
|
+
keys = keys.clean_annotations if keys.respond_to? :clean_annotations
|
678
|
+
keys = keys.sort_by{|k| k.to_s}
|
679
|
+
|
680
|
+
keys.each do |k|
|
557
681
|
next if k == :monitor or k == "monitor" or k == :in_situ_persistence or k == "in_situ_persistence"
|
558
682
|
v = hash[k]
|
559
683
|
case
|
@@ -565,10 +689,14 @@ end
|
|
565
689
|
str << k.to_s << "=>" << hash2md5(v)
|
566
690
|
when Symbol === v
|
567
691
|
str << k.to_s << "=>" << v.to_s
|
692
|
+
when (String === v and v.length > HASH2MD5_MAX_STRING_LENGTH)
|
693
|
+
str << k.to_s << "=>" << v[0..HASH2MD5_MAX_STRING_LENGTH]
|
568
694
|
when String === v
|
569
|
-
str << k.to_s << "=>" << v
|
695
|
+
str << k.to_s << "=>" << v
|
696
|
+
when (Array === v and v.length > HASH2MD5_MAX_ARRAY_LENGTH)
|
697
|
+
str << k.to_s << "=>[" << v[0..HASH2MD5_MAX_ARRAY_LENGTH] * "," << "]"
|
570
698
|
when Array === v
|
571
|
-
str << k.to_s << "=>[" << v
|
699
|
+
str << k.to_s << "=>[" << v * "," << "]"
|
572
700
|
else
|
573
701
|
v_ins = v.inspect
|
574
702
|
|
@@ -585,7 +713,7 @@ end
|
|
585
713
|
if str.empty?
|
586
714
|
""
|
587
715
|
else
|
588
|
-
|
716
|
+
digest(str)
|
589
717
|
end
|
590
718
|
end
|
591
719
|
|
@@ -696,6 +824,7 @@ end
|
|
696
824
|
end
|
697
825
|
|
698
826
|
def self.zip_fields(array)
|
827
|
+
return [] if array.empty?
|
699
828
|
array[0].zip(*array[1..-1])
|
700
829
|
end
|
701
830
|
|
@@ -766,13 +895,13 @@ module IndiferentHash
|
|
766
895
|
end
|
767
896
|
|
768
897
|
module PDF2Text
|
769
|
-
def self.pdftotext(filename)
|
898
|
+
def self.pdftotext(filename, options = {})
|
770
899
|
require 'rbbt/util/cmd'
|
771
900
|
require 'rbbt/util/tmpfile'
|
772
901
|
require 'rbbt/util/open'
|
773
902
|
|
774
903
|
|
775
|
-
TmpFile.with_file(Open.open(filename, :nocache => true).read) do |pdf_file|
|
904
|
+
TmpFile.with_file(Open.open(filename, options.merge(:nocache => true)).read) do |pdf_file|
|
776
905
|
CMD.cmd("pdftotext #{pdf_file} -", :pipe => false, :stderr => true)
|
777
906
|
end
|
778
907
|
end
|
data/lib/rbbt/util/named_array.rb
CHANGED
@@ -7,13 +7,13 @@ module NamedArray
|
|
7
7
|
self.chain_prefix = :named_array
|
8
8
|
attr_accessor :fields
|
9
9
|
attr_accessor :key
|
10
|
-
attr_accessor :
|
10
|
+
attr_accessor :entity_options
|
11
11
|
|
12
|
-
def self.setup(array, fields, key = nil,
|
12
|
+
def self.setup(array, fields, key = nil, entity_options = nil)
|
13
13
|
array.extend NamedArray unless NamedArray === array
|
14
14
|
array.fields = fields
|
15
15
|
array.key = key
|
16
|
-
array.
|
16
|
+
array.entity_options = entity_options
|
17
17
|
array
|
18
18
|
end
|
19
19
|
|
@@ -47,14 +47,14 @@ module NamedArray
|
|
47
47
|
return elem if @fields.nil? or @fields.empty?
|
48
48
|
|
49
49
|
field = NamedArray === @fields ? @fields.named_array_clean_get_brackets(pos) : @fields[pos]
|
50
|
-
elem =
|
50
|
+
elem = Misc.prepare_entity(elem, field, entity_options)
|
51
51
|
elem
|
52
52
|
end
|
53
53
|
|
54
54
|
def named_array_each(&block)
|
55
55
|
if defined?(Entity) and not @fields.nil? and not @fields.empty?
|
56
56
|
@fields.zip(self).each do |field,elem|
|
57
|
-
elem =
|
57
|
+
elem = Misc.prepare_entity(elem, field, entity_options)
|
58
58
|
yield(elem)
|
59
59
|
elem
|
60
60
|
end
|
data/lib/rbbt/util/open.rb
CHANGED
@@ -174,6 +174,8 @@ module Open
|
|
174
174
|
wget_options[:cookies] = options.delete(:cookies)
|
175
175
|
|
176
176
|
io = case
|
177
|
+
when (IO === url or StringIO === url)
|
178
|
+
url
|
177
179
|
when (not remote?(url))
|
178
180
|
file_open(url, options[:grep])
|
179
181
|
when (options[:nocache] and options[:nocache] != :update)
|
@@ -189,8 +191,8 @@ module Open
|
|
189
191
|
io.close
|
190
192
|
file_open(in_cache(url, wget_options), options[:grep])
|
191
193
|
end
|
192
|
-
io = unzip(io) if (zip?(url) and not options[:noz]) or options[:zip]
|
193
|
-
io = gunzip(io) if (gzip?(url) and not options[:noz]) or options[:gzip]
|
194
|
+
io = unzip(io) if ((String === url and zip?(url)) and not options[:noz]) or options[:zip]
|
195
|
+
io = gunzip(io) if ((String === url and gzip?(url)) and not options[:noz]) or options[:gzip]
|
194
196
|
|
195
197
|
if block_given?
|
196
198
|
yield io
|
@@ -214,12 +216,14 @@ module Open
|
|
214
216
|
f = open(file, options)
|
215
217
|
|
216
218
|
if block_given?
|
219
|
+
res = []
|
217
220
|
while not f.eof?
|
218
221
|
l = f.gets
|
219
|
-
l = fixutf8(l)
|
220
|
-
yield
|
222
|
+
l = Misc.fixutf8(l)
|
223
|
+
res << yield(l)
|
221
224
|
end
|
222
225
|
f.close
|
226
|
+
res
|
223
227
|
else
|
224
228
|
text = Misc.fixutf8(f.read)
|
225
229
|
f.close unless f.closed?
|