rbbt-util 1.0.1 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/rbbt/util/cmd.rb +39 -11
- data/lib/rbbt/util/data_module.rb +1 -0
- data/lib/rbbt/util/excel2tsv.rb +2 -1
- data/lib/rbbt/util/log.rb +2 -2
- data/lib/rbbt/util/misc.rb +26 -0
- data/lib/rbbt/util/open.rb +19 -3
- data/lib/rbbt/util/pkg_software.rb +2 -1
- data/lib/rbbt/util/tc_hash.rb +10 -10
- data/lib/rbbt/util/tsv.rb +691 -582
- data/lib/rbbt/util/workflow.rb +4 -0
- data/test/rbbt/util/test_cmd.rb +14 -0
- data/test/rbbt/util/test_misc.rb +12 -0
- data/test/rbbt/util/test_tc_hash.rb +1 -1
- data/test/rbbt/util/test_tsv.rb +110 -0
- data/test/rbbt/util/test_workflow.rb +11 -0
- metadata +7 -4
data/lib/rbbt/util/cmd.rb
CHANGED
@@ -3,20 +3,27 @@ require 'rbbt/util/log'
|
|
3
3
|
require 'stringio'
|
4
4
|
|
5
5
|
module CMD
|
6
|
-
class CMDError <
|
6
|
+
class CMDError < RBBTError;end
|
7
7
|
|
8
8
|
module SmartIO
|
9
|
-
def self.tie(io, pid = nil, post = nil)
|
9
|
+
def self.tie(io, pid = nil, cmd = "", post = nil)
|
10
10
|
io.instance_eval{
|
11
11
|
@pid = pid
|
12
|
+
@cmd = cmd
|
12
13
|
@post = post
|
13
14
|
alias original_close close
|
14
15
|
def close
|
15
16
|
begin
|
16
|
-
Process.waitpid(@pid
|
17
|
+
Process.waitpid(@pid) if @pid
|
17
18
|
rescue
|
18
19
|
end
|
19
20
|
|
21
|
+
if $? and not $?.success?
|
22
|
+
Log.debug "Raising exception"
|
23
|
+
exception = CMDError.new "Command [#{@pid}] #{@cmd} failed with error status #{$?.exitstatus}"
|
24
|
+
raise exception
|
25
|
+
end
|
26
|
+
|
20
27
|
@post.call if @post
|
21
28
|
original_close
|
22
29
|
end
|
@@ -107,6 +114,8 @@ module CMD
|
|
107
114
|
sout.last.close
|
108
115
|
serr.last.close
|
109
116
|
|
117
|
+
Log.debug "CMD: [#{pid}] #{cmd}"
|
118
|
+
|
110
119
|
case
|
111
120
|
when String === in_content
|
112
121
|
sin.last.write in_content
|
@@ -120,20 +129,39 @@ module CMD
|
|
120
129
|
end
|
121
130
|
end
|
122
131
|
|
123
|
-
|
124
|
-
|
125
|
-
|
132
|
+
if pipe
|
133
|
+
Thread.new do
|
134
|
+
while l = serr.first.gets
|
135
|
+
Log.log l, stderr if Integer === stderr
|
136
|
+
end
|
137
|
+
serr.first.close
|
126
138
|
end
|
127
|
-
serr.first.close
|
128
|
-
end
|
129
139
|
|
130
|
-
|
131
|
-
SmartIO.tie sout.first, pid, post
|
140
|
+
SmartIO.tie sout.first, pid, cmd, post
|
132
141
|
sout.first
|
142
|
+
|
133
143
|
else
|
144
|
+
err = ""
|
145
|
+
Thread.new do
|
146
|
+
while l = serr.first.gets
|
147
|
+
err << l if Integer === stderr
|
148
|
+
end
|
149
|
+
serr.first.close
|
150
|
+
end
|
151
|
+
|
134
152
|
out = StringIO.new sout.first.read
|
135
|
-
SmartIO.tie out
|
153
|
+
SmartIO.tie out, pid, cmd, post
|
154
|
+
|
136
155
|
Process.waitpid pid
|
156
|
+
|
157
|
+
if not $?.success?
|
158
|
+
exception = CMDError.new "Command [#{pid}] #{cmd} failed with error status #{$?.exitstatus}"
|
159
|
+
exception.info = err if Integer === stderr and stderr >= Log.severity
|
160
|
+
raise exception
|
161
|
+
else
|
162
|
+
Log.log err, stderr if Integer === stderr
|
163
|
+
end
|
164
|
+
|
137
165
|
out
|
138
166
|
end
|
139
167
|
end
|
data/lib/rbbt/util/excel2tsv.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'rbbt/util/tsv'
|
2
|
+
require 'rbbt/util/open'
|
2
3
|
require 'spreadsheet'
|
3
4
|
|
4
5
|
class TSV
|
@@ -8,7 +9,7 @@ class TSV
|
|
8
9
|
header = true unless header == false
|
9
10
|
sheet ||= 0
|
10
11
|
TmpFile.with_file do |filename|
|
11
|
-
workbook = Spreadsheet.open
|
12
|
+
workbook = Spreadsheet.open Open.open(file)
|
12
13
|
sheet = workbook.worksheet sheet
|
13
14
|
|
14
15
|
rows = []
|
data/lib/rbbt/util/log.rb
CHANGED
data/lib/rbbt/util/misc.rb
CHANGED
@@ -1,7 +1,25 @@
|
|
1
1
|
require 'iconv'
|
2
|
+
|
3
|
+
class RBBTError < StandardError
|
4
|
+
attr_accessor :info
|
5
|
+
|
6
|
+
alias old_to_s to_s
|
7
|
+
def to_s
|
8
|
+
str = old_to_s
|
9
|
+
if info
|
10
|
+
str << "\n" << "Additional Info:\n---\n" << info << "---"
|
11
|
+
end
|
12
|
+
str
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
2
16
|
module Misc
|
3
17
|
class FieldNotFoundError < StandardError;end
|
4
18
|
|
19
|
+
def self.this_dir
|
20
|
+
File.expand_path(File.dirname(caller[0]))
|
21
|
+
end
|
22
|
+
|
5
23
|
def self.env_add(var, value, sep = ":", prepend = true)
|
6
24
|
ENV[var] ||= ""
|
7
25
|
return if ENV[var] =~ /(#{sep}|^)#{Regexp.quote value}(#{sep}|$)/
|
@@ -113,6 +131,14 @@ module Misc
|
|
113
131
|
end
|
114
132
|
end
|
115
133
|
|
134
|
+
module PDF2Text
|
135
|
+
def self.pdf2text(filename)
|
136
|
+
TmpFile.with_file(Open.read(filename)) do |pdf|
|
137
|
+
CMD.cmd("pdftotext #{pdf} -", :pipe => false, :stderr => true)
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
116
142
|
class NamedArray < Array
|
117
143
|
attr_accessor :fields
|
118
144
|
|
data/lib/rbbt/util/open.rb
CHANGED
@@ -34,6 +34,7 @@ module Open
|
|
34
34
|
end
|
35
35
|
|
36
36
|
def self.wget(url, options = {})
|
37
|
+
Log.low "WGET:\n -URL: #{ url }\n -OPTIONS: #{options.inspect}"
|
37
38
|
options = Misc.add_defaults options, "--user-agent=" => 'firefox', :pipe => true
|
38
39
|
|
39
40
|
wait(options[:nice], options[:nice_key]) if options[:nice]
|
@@ -42,7 +43,18 @@ module Open
|
|
42
43
|
|
43
44
|
pipe = options.delete(:pipe)
|
44
45
|
quiet = options.delete(:quiet)
|
45
|
-
|
46
|
+
post = options.delete(:post)
|
47
|
+
cookies = options.delete(:cookies)
|
48
|
+
|
49
|
+
options["--quiet"] = quiet if options["--quiet"].nil?
|
50
|
+
options["--post-data="] ||= post if post
|
51
|
+
|
52
|
+
if cookies
|
53
|
+
options["--save-cookies"] = cookies
|
54
|
+
options["--load-cookies"] = cookies
|
55
|
+
options["--keep-session-cookies"] = true
|
56
|
+
end
|
57
|
+
|
46
58
|
|
47
59
|
stderr = case
|
48
60
|
when options['stderr']
|
@@ -52,6 +64,7 @@ module Open
|
|
52
64
|
else
|
53
65
|
nil
|
54
66
|
end
|
67
|
+
|
55
68
|
begin
|
56
69
|
CMD.cmd("wget '#{ url }'", options.merge(
|
57
70
|
'-O' => '-',
|
@@ -141,6 +154,9 @@ module Open
|
|
141
154
|
wget_options = options[:wget_options] || {}
|
142
155
|
wget_options[:nice] = options.delete(:nice)
|
143
156
|
wget_options[:nice_key] = options.delete(:nice_key)
|
157
|
+
wget_options[:quiet] = options.delete(:quiet)
|
158
|
+
wget_options[:post] = options.delete(:post)
|
159
|
+
wget_options[:cookies] = options.delete(:cookies)
|
144
160
|
|
145
161
|
io = case
|
146
162
|
when (not remote?(url))
|
@@ -155,8 +171,8 @@ module Open
|
|
155
171
|
io.close
|
156
172
|
file_open(in_cache(url), options[:grep])
|
157
173
|
end
|
158
|
-
io = unzip(io) if zip? url and not options[:noz]
|
159
|
-
io = gunzip(io) if gzip? url and not options[:noz]
|
174
|
+
io = unzip(io) if (zip? url and not options[:noz]) or options[:zip]
|
175
|
+
io = gunzip(io) if (gzip? url and not options[:noz]) or options[:gzip]
|
160
176
|
|
161
177
|
io
|
162
178
|
end
|
@@ -70,6 +70,7 @@ module PKGSoftware
|
|
70
70
|
if not File.exists?(path)
|
71
71
|
sharedir ||= PKGSoftware.get_caller_sharedir
|
72
72
|
get_pkg(pkg.to_s, path, get, sharedir)
|
73
|
+
setup_env(software_dir)
|
73
74
|
end
|
74
75
|
|
75
76
|
SOFTWARE[pkg.to_s] = path
|
@@ -80,7 +81,6 @@ module PKGSoftware
|
|
80
81
|
SOFTWARE[pkg.to_s]
|
81
82
|
end
|
82
83
|
|
83
|
-
|
84
84
|
def setup_env(software_dir)
|
85
85
|
Misc.env_add 'PATH', bin_dir
|
86
86
|
|
@@ -127,4 +127,5 @@ module PKGSoftware
|
|
127
127
|
|
128
128
|
CMD.cmd(File.join(opt_dir, '.post_install'))
|
129
129
|
end
|
130
|
+
|
130
131
|
end
|
data/lib/rbbt/util/tc_hash.rb
CHANGED
@@ -39,7 +39,7 @@ class TCHash < TokyoCabinet::HDB
|
|
39
39
|
alias original_keys keys
|
40
40
|
def keys
|
41
41
|
list = self.original_keys
|
42
|
-
indexes = FIELD_INFO_ENTRIES.values.collect do |field| list.index(field) end.compact
|
42
|
+
indexes = FIELD_INFO_ENTRIES.values.collect do |field| list.index(field) end.compact.sort.reverse
|
43
43
|
indexes.each do |index| list.delete_at index end
|
44
44
|
list
|
45
45
|
end
|
@@ -48,19 +48,12 @@ class TCHash < TokyoCabinet::HDB
|
|
48
48
|
def values
|
49
49
|
values = self.original_values
|
50
50
|
keys = self.original_keys
|
51
|
-
indexes = FIELD_INFO_ENTRIES.values.collect do |field| keys.index(field) end.compact
|
51
|
+
indexes = FIELD_INFO_ENTRIES.values.collect do |field| keys.index(field) end.compact.sort.reverse
|
52
52
|
indexes.each do |index| values.delete_at index end
|
53
53
|
|
54
54
|
values.collect{|v| Serializer.load(v)}
|
55
55
|
end
|
56
56
|
|
57
|
-
def merge!(data)
|
58
|
-
new_data = {}
|
59
|
-
data.each do |key, values|
|
60
|
-
self[key] = values
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
64
57
|
# This version of each fixes a problem in ruby 1.9. It also
|
65
58
|
# removes the special entries
|
66
59
|
def each19(&block)
|
@@ -77,10 +70,17 @@ class TCHash < TokyoCabinet::HDB
|
|
77
70
|
|
78
71
|
def collect
|
79
72
|
res = []
|
80
|
-
self.each{|k, v| res <<
|
73
|
+
self.each{|k, v| res << yield(k,v)}
|
81
74
|
res
|
82
75
|
end
|
83
76
|
|
77
|
+
def merge!(data)
|
78
|
+
new_data = {}
|
79
|
+
data.each do |key, values|
|
80
|
+
self[key] = values
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
84
|
alias original_open open
|
85
85
|
def open(write = false)
|
86
86
|
flags = write ? TokyoCabinet::HDB::OWRITER | TokyoCabinet::HDB::OCREAT : TokyoCabinet::BDB::OREADER
|
data/lib/rbbt/util/tsv.rb
CHANGED
@@ -2,6 +2,7 @@ require 'rbbt/util/misc'
|
|
2
2
|
require 'rbbt/util/open'
|
3
3
|
require 'rbbt/util/tc_hash'
|
4
4
|
require 'rbbt/util/tmpfile'
|
5
|
+
require 'rbbt/util/log'
|
5
6
|
require 'digest'
|
6
7
|
require 'fileutils'
|
7
8
|
|
@@ -16,6 +17,13 @@ end
|
|
16
17
|
class TSV
|
17
18
|
class FieldNotFoundError < StandardError;end
|
18
19
|
|
20
|
+
module Field
|
21
|
+
def ==(string)
|
22
|
+
return false unless String === string
|
23
|
+
self.sub(/#.*/,'').casecmp(string.sub(/#.*/,'')) == 0
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
19
27
|
#{{{ Persistence
|
20
28
|
|
21
29
|
PersistenceHash = TCHash
|
@@ -36,14 +44,7 @@ class TSV
|
|
36
44
|
File.join(CACHEDIR, prefix.gsub(/\s/,'_').gsub(/\//,'>') + Digest::MD5.hexdigest([file, options].inspect))
|
37
45
|
end
|
38
46
|
|
39
|
-
|
40
|
-
def self.log(message)
|
41
|
-
STDERR.puts message if @debug == true
|
42
|
-
end
|
43
|
-
|
44
|
-
def self.debug=(value)
|
45
|
-
@debug = value
|
46
|
-
end
|
47
|
+
#{{{ Headers and Field Stuff
|
47
48
|
|
48
49
|
def self.headers(file, options = {})
|
49
50
|
if file =~ /(.*)#(.*)/ and File.exists? $1
|
@@ -63,742 +64,850 @@ class TSV
|
|
63
64
|
end
|
64
65
|
end
|
65
66
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
@data.keys
|
67
|
+
def self.fields_include(key_field, fields, field)
|
68
|
+
return true if key_field == field or fields.include? field
|
69
|
+
return false
|
70
70
|
end
|
71
71
|
|
72
|
-
def
|
73
|
-
|
72
|
+
def self.field_positions(key_field, fields, *selected)
|
73
|
+
selected.collect do |sel|
|
74
|
+
case
|
75
|
+
when (sel.nil? or sel == :main or sel == key_field)
|
76
|
+
-1
|
77
|
+
when Integer === sel
|
78
|
+
sel
|
79
|
+
else
|
80
|
+
Misc.field_position fields, sel
|
81
|
+
end
|
82
|
+
end
|
74
83
|
end
|
75
84
|
|
76
|
-
def
|
77
|
-
|
85
|
+
def fields_include(field)
|
86
|
+
return TSV.fields_include key_field, fields, field
|
78
87
|
end
|
79
88
|
|
80
|
-
|
89
|
+
def field_positions(*selected)
|
90
|
+
return nil if selected.nil? or selected == [nil]
|
91
|
+
TSV.field_positions(key_field, fields, *selected)
|
92
|
+
end
|
81
93
|
|
82
|
-
def
|
83
|
-
|
84
|
-
|
94
|
+
def fields_at(*positions)
|
95
|
+
return nil if fields.nil?
|
96
|
+
return nil if positions.nil? or positions == [nil]
|
97
|
+
(fields + [key_field]).values_at(*positions)
|
85
98
|
end
|
86
99
|
|
100
|
+
#{{{ Iteration, Merging, etc
|
101
|
+
def through(new_key_field = nil, new_fields = nil, &block)
|
102
|
+
new_key_position = (field_positions(new_key_field) || [-1]).first
|
103
|
+
new_fields = [new_fields] if String === new_fields
|
87
104
|
|
88
|
-
|
89
|
-
new_data.each do |key, value|
|
90
|
-
self[key] = value
|
91
|
-
end
|
92
|
-
end
|
105
|
+
if new_key_position == -1
|
93
106
|
|
94
|
-
|
107
|
+
if new_fields.nil? or new_fields == fields
|
108
|
+
each &block
|
109
|
+
return [key_field, fields]
|
110
|
+
else
|
111
|
+
new_field_positions = field_positions(*new_fields)
|
112
|
+
each do |key, values|
|
113
|
+
if values.nil?
|
114
|
+
yield key, nil
|
115
|
+
else
|
116
|
+
yield key, values.values_at(*new_field_positions)
|
117
|
+
end
|
118
|
+
end
|
119
|
+
return [key_field, fields_at(*new_field_positions)]
|
120
|
+
end
|
95
121
|
|
96
|
-
def follow(value)
|
97
|
-
if String === value && value =~ /__Ref:(.*)/
|
98
|
-
return self[$1]
|
99
122
|
else
|
100
|
-
|
101
|
-
|
123
|
+
new_field_positions = field_positions(*new_fields)
|
124
|
+
|
125
|
+
new_field_names = fields_at(*new_field_positions)
|
126
|
+
if new_field_names.nil? and fields
|
127
|
+
new_field_names = fields.dup
|
128
|
+
new_field_names.delete_at new_key_position
|
129
|
+
new_field_names.unshift key_field
|
130
|
+
end
|
131
|
+
|
132
|
+
each do |key, values|
|
133
|
+
if list
|
134
|
+
tmp_values = values + [[key]]
|
135
|
+
else
|
136
|
+
tmp_values = values + [key]
|
137
|
+
end
|
138
|
+
|
139
|
+
if new_field_positions.nil?
|
140
|
+
new_values = values.dup
|
141
|
+
new_values.delete_at new_key_position
|
142
|
+
new_values.unshift [key]
|
143
|
+
else
|
144
|
+
new_values = tmp_values.values_at(*new_field_positions)
|
145
|
+
end
|
146
|
+
|
147
|
+
tmp_values[new_key_position].each do |new_key|
|
148
|
+
if new_field_names
|
149
|
+
yield new_key, NamedArray.name(new_values, new_field_names)
|
150
|
+
else
|
151
|
+
yield new_key, new_values
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|
155
|
+
return [(fields_at(new_key_position) || [nil]).first, new_field_names]
|
102
156
|
end
|
103
157
|
end
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
return nil
|
158
|
+
|
159
|
+
def process(field)
|
160
|
+
through do |key, values|
|
161
|
+
values[field].replace yield(values[field], key, values) unless values[field].nil?
|
109
162
|
end
|
110
|
-
|
111
|
-
key = key.downcase if @case_insensitive
|
112
|
-
follow @data[key]
|
113
163
|
end
|
114
164
|
|
115
|
-
def values_at(*keys)
|
116
|
-
keys.collect{|k|
|
117
|
-
self[k]
|
118
|
-
}
|
119
|
-
end
|
120
165
|
|
121
|
-
def
|
122
|
-
|
123
|
-
|
124
|
-
end
|
125
|
-
end
|
166
|
+
def reorder(new_key_field, new_fields = nil, options = {})
|
167
|
+
options = Misc.add_defaults options
|
168
|
+
return TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => case_insensitive) if options[:persistence_file] and File.exists?(options[:persistence_file])
|
126
169
|
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
else
|
134
|
-
@data.collect do |key, value|
|
135
|
-
[key, follow(value)]
|
170
|
+
new = {}
|
171
|
+
new_key_field, new_fields = through new_key_field, new_fields do |key, values|
|
172
|
+
if new[key].nil?
|
173
|
+
new[key] = values
|
174
|
+
else
|
175
|
+
new[key] = new[key].zip(values)
|
136
176
|
end
|
137
177
|
end
|
138
|
-
end
|
139
178
|
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
value = NamedArray.name value, fields if fields
|
144
|
-
[key, value]
|
145
|
-
}
|
146
|
-
end
|
179
|
+
new.each do |key,values|
|
180
|
+
values.each{|list| list.flatten! if Array === list}
|
181
|
+
end
|
147
182
|
|
148
|
-
|
149
|
-
|
150
|
-
|
183
|
+
if options[:persistence_file]
|
184
|
+
reordered = TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => case_insensitive)
|
185
|
+
reordered.merge! new
|
186
|
+
else
|
187
|
+
reordered = TSV.new(new, :case_insensitive => case_insensitive)
|
188
|
+
end
|
151
189
|
|
152
|
-
|
153
|
-
|
154
|
-
def self.parse_fields(io, delimiter = "\t")
|
155
|
-
return [] if io.nil?
|
156
|
-
fields = io.split(delimiter, -1)
|
157
|
-
fields
|
158
|
-
end
|
190
|
+
reordered.key_field = new_key_field
|
191
|
+
reordered.fields = new_fields
|
159
192
|
|
160
|
-
|
161
|
-
return [] if list.nil? || list.empty?
|
162
|
-
fields ||= list.fields if list.respond_to? :fields
|
163
|
-
zipped = list[0].zip(*list[1..-1])
|
164
|
-
zipped = zipped.collect{|v| NamedArray.name(v, fields)} if fields
|
165
|
-
zipped
|
193
|
+
reordered
|
166
194
|
end
|
167
|
-
|
168
|
-
def self.parse(data, file, options = {})
|
169
195
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
:sep2 => "|",
|
174
|
-
:native => 0,
|
175
|
-
:extra => nil,
|
176
|
-
:fix => nil,
|
177
|
-
:exclude => nil,
|
178
|
-
:select => nil,
|
179
|
-
:grep => nil,
|
180
|
-
:single => false,
|
181
|
-
:unique => false,
|
182
|
-
:flatten => false,
|
183
|
-
:overwrite => false,
|
184
|
-
:keep_empty => true,
|
185
|
-
:case_insensitive => false,
|
186
|
-
:header_hash => '#' ,
|
187
|
-
:persistence_file => nil
|
196
|
+
def slice(new_fields, options = {})
|
197
|
+
reorder(:main, new_fields)
|
198
|
+
end
|
188
199
|
|
189
|
-
|
190
|
-
|
200
|
+
def add_field(name = nil)
|
201
|
+
each do |key, values|
|
202
|
+
self[key] = values << yield(key, values)
|
203
|
+
end
|
191
204
|
|
205
|
+
fields << name if list
|
206
|
+
if PersistenceHash === @data
|
207
|
+
@data.fields = fields
|
208
|
+
end
|
209
|
+
end
|
192
210
|
|
211
|
+
def select(method)
|
212
|
+
new = TSV.new({})
|
213
|
+
new.key_field = key_field
|
214
|
+
new.fields = fields.dup
|
215
|
+
|
216
|
+
case
|
217
|
+
when Array === method
|
218
|
+
through do |key, values|
|
219
|
+
new[key] = values if ([key,values].flatten & method).any?
|
220
|
+
end
|
221
|
+
when Regexp === method
|
222
|
+
through do |key, values|
|
223
|
+
new[key] = values if [key,values].flatten.select{|v| v =~ method}.any?
|
224
|
+
end
|
225
|
+
when Hash === method
|
226
|
+
key = method.keys.first
|
227
|
+
method = method.values.first
|
228
|
+
case
|
229
|
+
when (Array === method and (:main == key or key_field == key))
|
230
|
+
method.each{|item| if values = self[item]; then new[item] = values; end}
|
231
|
+
when Array === method
|
232
|
+
through :main, key do |key, values|
|
233
|
+
new[key] = values if (values.flatten & method).any?
|
234
|
+
end
|
235
|
+
when Regexp === method
|
236
|
+
through :main, key do |key, values|
|
237
|
+
new[key] = values if values.flatten.select{|v| v =~ method}.any?
|
238
|
+
end
|
239
|
+
end
|
240
|
+
end
|
193
241
|
|
194
|
-
|
242
|
+
new
|
243
|
+
end
|
195
244
|
|
196
|
-
|
197
|
-
|
198
|
-
line.chomp!
|
245
|
+
def index(options = {})
|
246
|
+
options = Misc.add_defaults options, :order => false
|
199
247
|
|
200
|
-
if
|
201
|
-
|
202
|
-
header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character
|
203
|
-
line = file.gets
|
204
|
-
else
|
205
|
-
header_fields = nil
|
248
|
+
if options[:persistence] and ! options[:persistence_file]
|
249
|
+
options[:persistence_file] = TSV.get_persistence_file(filename, "index:#{ filename }_#{options[:field]}:", options)
|
206
250
|
end
|
207
|
-
|
208
|
-
id_pos = Misc.field_position(header_fields, options[:native])
|
209
251
|
|
210
|
-
if options[:
|
211
|
-
|
212
|
-
max_cols = 0
|
213
|
-
else
|
214
|
-
extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) }
|
252
|
+
if options[:persistence_file] and File.exists?(options[:persistence_file])
|
253
|
+
return TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => options[:case_insensitive])
|
215
254
|
end
|
216
255
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
line = options[:fix].call line if options[:fix]
|
256
|
+
new = {}
|
257
|
+
if options[:order]
|
258
|
+
new_key_field, new_fields = through options[:field], options[:others] do |key, values|
|
222
259
|
|
223
|
-
|
224
|
-
|
225
|
-
(options[:select] and not options[:select].call(line))
|
226
|
-
line = file.gets
|
227
|
-
next
|
228
|
-
end
|
260
|
+
values.each_with_index do |list, i|
|
261
|
+
next if list.nil? or list.empty?
|
229
262
|
|
230
|
-
|
263
|
+
list = [list] unless Array === list
|
231
264
|
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
ids.collect!{|id| id.downcase } if options[:case_insensitive]
|
242
|
-
|
243
|
-
# Get extra fields
|
244
|
-
|
245
|
-
if options[:extra].nil? and not (options[:flatten] or options[:single])
|
246
|
-
extra = parts
|
247
|
-
extra.delete_at(id_pos)
|
248
|
-
max_cols = extra.size if extra.size > (max_cols || 0)
|
249
|
-
else
|
250
|
-
if extra_pos.nil?
|
251
|
-
extra = parts
|
252
|
-
extra.delete_at id_pos
|
253
|
-
else
|
254
|
-
extra = parts.values_at(*extra_pos)
|
255
|
-
end
|
256
|
-
end
|
257
|
-
|
258
|
-
extra.collect!{|value| parse_fields(value, options[:sep2])}
|
259
|
-
extra.collect!{|values| values.first} if options[:unique]
|
260
|
-
extra.flatten! if options[:flatten]
|
261
|
-
extra = extra.first if options[:single]
|
262
|
-
|
263
|
-
if options[:overwrite]
|
264
|
-
main_entry = ids.shift
|
265
|
-
ids.each do |id|
|
266
|
-
data[id] = "__Ref:#{main_entry}"
|
265
|
+
list.each do |value|
|
266
|
+
next if value.nil? or value.empty?
|
267
|
+
value = value.downcase if options[:case_insensitive]
|
268
|
+
new[value] ||= []
|
269
|
+
new[value][i + 1] ||= []
|
270
|
+
new[value][i + 1] << key
|
271
|
+
end
|
272
|
+
new[key] ||= []
|
273
|
+
new[key][0] = key
|
267
274
|
end
|
268
275
|
|
269
|
-
|
270
|
-
else
|
271
|
-
main_entry = ids.shift
|
272
|
-
ids.each do |id|
|
273
|
-
data[id] = "__Ref:#{main_entry}"
|
274
|
-
end
|
276
|
+
end
|
275
277
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
if PersistenceHash === data
|
281
|
-
data[main_entry] = (data[main_entry] || []).concat extra
|
282
|
-
else
|
283
|
-
data[main_entry] ||= []
|
284
|
-
data[main_entry].concat extra
|
285
|
-
end
|
286
|
-
else
|
287
|
-
entry = data[main_entry] || []
|
288
|
-
while entry =~ /__Ref:(.*)/ do
|
289
|
-
entry = data[$1]
|
290
|
-
end
|
278
|
+
new.each do |key, values|
|
279
|
+
values.flatten!
|
280
|
+
values.compact!
|
281
|
+
end
|
291
282
|
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
283
|
+
else
|
284
|
+
new_key_field, new_fields = through options[:field], options[:others] do |key, values|
|
285
|
+
new[key] ||= []
|
286
|
+
new[key] << key
|
287
|
+
values.each do |list|
|
288
|
+
next if list.nil?
|
289
|
+
if Array === list
|
290
|
+
list.each do |value|
|
291
|
+
value = value.downcase if options[:case_insensitive]
|
292
|
+
new[value] ||= []
|
293
|
+
new[value] << key
|
296
294
|
end
|
297
|
-
|
298
|
-
|
295
|
+
else
|
296
|
+
next if list.empty?
|
297
|
+
value = list
|
298
|
+
value = value.downcase if options[:case_insensitive]
|
299
|
+
new[value] ||= []
|
300
|
+
new[value] << key
|
299
301
|
end
|
300
|
-
|
301
|
-
data[main_entry] = entry
|
302
|
-
end
|
303
|
-
end
|
304
|
-
end
|
305
|
-
|
306
|
-
if options[:keep_empty] and not max_cols.nil?
|
307
|
-
data.each do |key,values|
|
308
|
-
new_values = values
|
309
|
-
max_cols.times do |i|
|
310
|
-
new_values[i] ||= [""]
|
311
302
|
end
|
312
|
-
data[key] = new_values
|
313
303
|
end
|
314
304
|
end
|
315
305
|
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
key_field = header_fields[id_pos]
|
322
|
-
if extra_pos.nil?
|
323
|
-
fields = header_fields
|
324
|
-
fields.delete_at(id_pos)
|
325
|
-
else
|
326
|
-
fields = header_fields.values_at(*extra_pos)
|
327
|
-
end
|
306
|
+
if options[:persistence_file]
|
307
|
+
index = TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => options[:case_insensitive])
|
308
|
+
index.merge! new
|
309
|
+
else
|
310
|
+
index = TSV.new(new, :case_insensitive => options[:case_insensitive])
|
328
311
|
end
|
329
312
|
|
330
|
-
|
331
|
-
|
332
|
-
|
313
|
+
index.key_field = new_key_field
|
314
|
+
index.fields = new_fields
|
315
|
+
index
|
333
316
|
end
|
334
317
|
|
335
|
-
|
336
|
-
def initialize(file = {}, options = {})
|
337
|
-
@case_insensitive = options[:case_insensitive] == true
|
338
|
-
@list = ! (options[:flatten] == true || options[:single] == true || options[:unique] == true)
|
339
|
-
|
340
|
-
case
|
341
|
-
when TSV === file
|
342
|
-
@filename = file.filename
|
343
|
-
@data = file.data
|
344
|
-
@key_field = file.key_field
|
345
|
-
@fields = file.fields
|
346
|
-
@case_insensitive = file.case_insensitive
|
347
|
-
@list = file.is_list
|
348
|
-
return self
|
349
|
-
when (Hash === file or PersistenceHash === file)
|
350
|
-
@filename = "Hash:" + Digest::MD5.hexdigest(file.inspect)
|
351
|
-
@data = file
|
352
|
-
return self
|
353
|
-
when File === file
|
354
|
-
@filename = File.expand_path file.path
|
355
|
-
when String === file && File.exists?(file)
|
356
|
-
@filename = File.expand_path file
|
357
|
-
file = Open.open(file)
|
358
|
-
when StringIO
|
359
|
-
else
|
360
|
-
raise "File #{file} not found"
|
361
|
-
end
|
318
|
+
def smart_merge(other, match = nil, new_fields = nil)
|
362
319
|
|
363
|
-
|
364
|
-
|
365
|
-
|
320
|
+
new_fields = [new_fields] if String === new_fields
|
321
|
+
if self.fields and other.fields
|
322
|
+
common_fields = ([self.key_field] + self.fields) & ([other.key_field] + other.fields)
|
323
|
+
new_fields ||= ([other.key_field] + other.fields) - ([self.key_field] + self.fields)
|
366
324
|
|
367
|
-
if
|
368
|
-
|
369
|
-
@data = PersistenceHash.get(persistence_file, false)
|
370
|
-
@key_field = @data.key_field
|
371
|
-
@fields = @data.fields
|
372
|
-
else
|
373
|
-
@data = PersistenceHash.get(persistence_file, true)
|
374
|
-
file = Open.grep(file, options[:grep]) if options[:grep]
|
325
|
+
common_fields.delete match if String === match
|
326
|
+
common_fields.delete_at match if Integer === match
|
375
327
|
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
@data.fields = @fields
|
380
|
-
@data.read
|
381
|
-
end
|
328
|
+
this_common_field_positions = self.field_positions *common_fields
|
329
|
+
other_common_field_positions = other.field_positions *common_fields
|
330
|
+
other_new_field_positions = other.field_positions *new_fields
|
382
331
|
else
|
383
|
-
|
384
|
-
@data = {}
|
385
|
-
file = Open.grep(file, options[:grep]) if options[:grep]
|
386
|
-
@key_field, @fields = TSV.parse(@data, file, options)
|
332
|
+
nofieldinfo = true
|
387
333
|
end
|
388
334
|
|
389
|
-
|
390
|
-
|
391
|
-
|
335
|
+
case
|
336
|
+
when TSV === match
|
337
|
+
match_index = match
|
338
|
+
matching_code_position = nil
|
392
339
|
|
340
|
+
when Array === match
|
341
|
+
match_index = match.first
|
342
|
+
matching_code_position = field_positions(match.last).first
|
393
343
|
|
394
|
-
|
395
|
-
|
344
|
+
when match =~ /^through:(.*)/
|
345
|
+
through = $1
|
346
|
+
if through =~ /(.*)#using:(.*)/
|
347
|
+
through = $1
|
348
|
+
matching_code_position = field_positions($2).first
|
349
|
+
else
|
350
|
+
matching_code_position = nil
|
351
|
+
end
|
352
|
+
index_fields = TSV.headers(through)
|
353
|
+
target_field = index_fields.select{|field| other.fields_include field}.first
|
354
|
+
Log.debug "Target Field: #{ target_field }"
|
355
|
+
match_index = TSV.open_file(through).index(:field => target_field)
|
396
356
|
|
397
|
-
|
398
|
-
|
357
|
+
when field_positions(match).first
|
358
|
+
matching_code_position = field_positions(match).first
|
359
|
+
match_index = nil
|
399
360
|
end
|
400
361
|
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
str << key.dup << "\t" << values.to_s << "\n"
|
407
|
-
when Array === values.first
|
408
|
-
str << key.dup << "\t" << values.collect{|list| (list || []) * "|"} * "\t" << "\n"
|
409
|
-
else
|
410
|
-
str << key.dup << "\t" << values * "\t" << "\n"
|
362
|
+
if matching_code_position.nil? and match_index.fields
|
363
|
+
match_index.fields.each do |field|
|
364
|
+
if matching_code_position = field_positions(field).first
|
365
|
+
break
|
366
|
+
end
|
411
367
|
end
|
412
368
|
end
|
413
369
|
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
return true if field == key_field or fields.include? field
|
421
|
-
return false
|
422
|
-
end
|
370
|
+
if match_index and match_index.key_field == other.key_field
|
371
|
+
other_index = nil
|
372
|
+
else
|
373
|
+
other_index = (match === String and other.fields_include(match)) ?
|
374
|
+
other.index(:other => match, :order => true) : other.index(:order => true)
|
375
|
+
end
|
423
376
|
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
-1
|
429
|
-
when Integer === sel
|
430
|
-
sel
|
377
|
+
each do |key,values|
|
378
|
+
Log.debug "Key: #{ key }. Values: #{values * ", "}"
|
379
|
+
if matching_code_position.nil? or matching_code_position == -1
|
380
|
+
matching_codes = [key]
|
431
381
|
else
|
432
|
-
|
382
|
+
matching_codes = values[matching_code_position]
|
383
|
+
matching_codes = [matching_codes] unless matching_codes.nil? or Array === matching_codes
|
433
384
|
end
|
434
|
-
|
435
|
-
end
|
436
|
-
|
437
|
-
def fields_include(field)
|
438
|
-
return TSV.fields_include key_field, fields, field
|
439
|
-
end
|
440
|
-
|
441
|
-
def field_positions(*selected)
|
442
|
-
return nil if selected.nil? or selected == [nil]
|
443
|
-
TSV.field_positions(key_field, fields, *selected)
|
444
|
-
end
|
385
|
+
Log.debug "Matching codes: #{matching_codes}"
|
445
386
|
|
446
|
-
|
447
|
-
return nil if fields.nil?
|
448
|
-
return nil if positions.nil? or positions == [nil]
|
449
|
-
(fields + [key_field]).values_at(*positions)
|
450
|
-
end
|
387
|
+
next if matching_codes.nil?
|
451
388
|
|
452
|
-
|
453
|
-
|
389
|
+
matching_codes.each do |matching_code|
|
390
|
+
if match_index
|
391
|
+
if match_index[matching_code]
|
392
|
+
matching_code_fix = match_index[matching_code].first
|
393
|
+
else
|
394
|
+
matching_code_fix = nil
|
395
|
+
end
|
396
|
+
else
|
397
|
+
matching_code_fix = matching_code
|
398
|
+
end
|
454
399
|
|
455
|
-
|
400
|
+
Log.debug "Matching code (fix): #{matching_code_fix}"
|
401
|
+
next if matching_code_fix.nil?
|
456
402
|
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
each do |key, values|
|
463
|
-
yield key, values.values_at(*new_field_positions)
|
403
|
+
if other_index
|
404
|
+
Log.debug "Using other_index"
|
405
|
+
other_codes = other_index[matching_code_fix]
|
406
|
+
else
|
407
|
+
other_codes = matching_code_fix
|
464
408
|
end
|
465
|
-
|
466
|
-
end
|
409
|
+
Log.debug "Other codes: #{other_codes}"
|
467
410
|
|
468
|
-
|
469
|
-
|
411
|
+
next if other_codes.nil? or other_codes.empty?
|
412
|
+
other_code = other_codes.first
|
470
413
|
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
414
|
+
if nofieldinfo
|
415
|
+
next if other[other_code].nil?
|
416
|
+
if list
|
417
|
+
other_values = [[other_code]] + other[other_code]
|
418
|
+
else
|
419
|
+
other_values = [other_code] + other[other_code]
|
420
|
+
end
|
421
|
+
other_values.delete_if do |list|
|
422
|
+
list = [list] unless Array === list
|
423
|
+
list.collect{|e| case_insensitive ? e.downcase : e }.
|
424
|
+
select{|e| case_insensitive ? e == matching_code.downcase : e == matching_code }.any?
|
425
|
+
end
|
477
426
|
|
478
|
-
|
479
|
-
if list
|
480
|
-
tmp_values = values + [[key]]
|
427
|
+
new_values = values + other_values
|
481
428
|
else
|
482
|
-
|
483
|
-
|
429
|
+
if other[other_code].nil?
|
430
|
+
if list
|
431
|
+
other_values = [[]] * other.fields.length
|
432
|
+
else
|
433
|
+
other_values = [] * other.fields.length
|
434
|
+
end
|
435
|
+
else
|
436
|
+
if list
|
437
|
+
other_values = other[other_code] + [[other_code]]
|
438
|
+
else
|
439
|
+
other_values = other[other_code] + [other_code]
|
440
|
+
end
|
441
|
+
end
|
442
|
+
|
484
443
|
|
485
|
-
if new_field_positions.nil?
|
486
444
|
new_values = values.dup
|
487
|
-
new_values.delete_at new_key_position
|
488
|
-
new_values.unshift [key]
|
489
|
-
else
|
490
|
-
new_values = tmp_values.values_at(*new_field_positions)
|
491
|
-
end
|
492
445
|
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
446
|
+
if list
|
447
|
+
this_common_field_positions.zip(other_common_field_positions).each do |tpos, opos|
|
448
|
+
new_values_tops = new_values[tpos]
|
449
|
+
|
450
|
+
if other.list
|
451
|
+
new_values_tops += other_values[opos]
|
452
|
+
else
|
453
|
+
new_values_tops += [other_values[opos]]
|
454
|
+
end
|
455
|
+
|
456
|
+
new_values[tpos] = new_values_tops.uniq
|
457
|
+
end
|
498
458
|
end
|
459
|
+
|
460
|
+
new_values.concat other_values.values_at *other_new_field_positions
|
499
461
|
end
|
462
|
+
|
463
|
+
self[key] = new_values
|
500
464
|
end
|
501
|
-
return [(fields_at(new_key_position) || [nil]).first, new_field_names]
|
502
465
|
end
|
466
|
+
|
467
|
+
self.fields = self.fields + new_fields unless nofieldinfo
|
468
|
+
end
|
469
|
+
|
470
|
+
#{{{ Helpers
|
471
|
+
|
472
|
+
# Builds, or reloads from disk, an index for +file+.
#
# +options+ is copied twice:
# * the data copy drops +:field+ and +:persistence+, and re-enables
#   +:persistence+ only when +:data_persistence+ is given;
# * the index copy drops +:persistence+ and, when +:persistence+ was
#   requested, gains a +:persistence_file+ from +get_persistence_file+.
#
# Returns a TSV wrapping the stored PersistenceHash when the index
# persistence file already exists; otherwise the result of calling +index+
# on a TSV freshly loaded from +file+.
def self.index(file, options = {})
  opt_data  = options.dup
  opt_index = options.dup
  opt_data.delete :field
  opt_data.delete :persistence
  opt_index.delete :persistence

  opt_data[:persistence] = true if options[:data_persistence]

  if options[:persistence]
    opt_index.merge! :persistence_file => get_persistence_file(file, "index:#{ file }_#{options[:field]}:", opt_index)
  end

  persistence_file = opt_index[:persistence_file]

  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  if ! persistence_file.nil? && File.exist?(persistence_file)
    Log.low "Reloading persistent index for #{ file }: #{persistence_file}"
    TSV.new(PersistenceHash.get(persistence_file, false), opt_index)
  else
    Log.low "Creating index for #{ file }: #{persistence_file}"
    data = TSV.new(file, opt_data)
    data.index(opt_index)
  end
end
|
492
|
+
|
493
|
+
# Opens +file+ as a TSV. Everything after the first '#' in +file+ is handed
# to Misc.string2hash and used as the options; without a '#' the options
# are empty.
def self.open_file(file)
  options = {}

  if file =~ /(.*?)#(.*)/
    file, options = $1, Misc.string2hash($2.to_s)
  end

  TSV.new(file, options)
end
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
502
|
+
|
503
|
+
#{{{ Accessor Methods
|
504
|
+
|
505
|
+
# Returns the keys of the underlying @data store.
def keys
  @data.keys
end
|
510
508
|
|
509
|
+
# Returns the raw values of the underlying @data store (entries are not
# passed through +follow+).
def values
  @data.values
end
|
511
512
|
|
512
|
-
def
|
513
|
-
|
514
|
-
|
513
|
+
# Returns the number of entries in the underlying @data store.
def size
  @data.size
end
|
515
516
|
|
516
|
-
|
517
|
-
new_key_field, new_fields = through new_key_field, new_fields do |key, values|
|
518
|
-
if new[key].nil?
|
519
|
-
new[key] = values
|
520
|
-
else
|
521
|
-
new[key] = new[key].zip(values)
|
522
|
-
end
|
523
|
-
end
|
517
|
+
# Write
|
524
518
|
|
525
|
-
|
526
|
-
|
519
|
+
# Stores +value+ under +key+ in @data. The key is downcased first when the
# table is case insensitive.
def []=(key, value)
  stored_key = @case_insensitive ? key.downcase : key
  @data[stored_key] = value
end
|
523
|
+
|
524
|
+
|
525
|
+
# Copies every key/value pair yielded by +new_data.each+ into this object
# through +[]=+ (so key normalisation done there applies).
def merge!(new_data)
  new_data.each { |k, v| self[k] = v }
end
|
528
530
|
|
529
|
-
|
530
|
-
|
531
|
-
|
531
|
+
# Read
|
532
|
+
|
533
|
+
# Resolves a stored value.
#
# * A String containing "__Ref:<key>" is a reference: the entry stored
#   under <key> is looked up through +[]+ and returned.
# * An Array is wrapped with NamedArray.name when +fields+ is set.
# * Anything else is returned unchanged.
def follow(value)
  if String === value && value =~ /__Ref:(.*)/
    self[$1]
  elsif Array === value && fields
    NamedArray.name(value, fields)
  else
    value
  end
end
|
535
541
|
|
536
|
-
|
537
|
-
|
542
|
+
# Looks up +key+.
#
# An Array key is first tried verbatim against @data (returned raw, without
# +follow+); failing that, each element is looked up in turn and the first
# non-nil result wins, or nil when none match.
#
# Any other key is downcased when the table is case insensitive and the
# stored value is resolved through +follow+.
def [](key)
  if Array === key
    direct = @data[key]
    return direct if direct != nil

    key.each do |candidate|
      found = self[candidate]
      return found unless found.nil?
    end
    return nil
  end

  lookup = @case_insensitive ? key.downcase : key
  follow @data[lookup]
end
|
541
552
|
|
542
|
-
def
|
543
|
-
|
553
|
+
# Returns the result of +[]+ for each of the given keys, in order.
def values_at(*keys)
  keys.map { |k| self[k] }
end
|
545
558
|
|
546
|
-
def
|
547
|
-
|
548
|
-
|
549
|
-
if options[:persistence] and ! options[:persistence_file]
|
550
|
-
options[:persistence_file] = TSV.get_persistence_file(filename, "index:#{ filename }_#{options[:field]}:", options)
|
559
|
+
# Calls +block+ with (key, value) for every entry of @data, where the value
# has been resolved through +follow+.
def each(&block)
  @data.each { |k, raw| block.call(k, follow(raw)) }
end
|
552
564
|
|
553
|
-
|
554
|
-
|
565
|
+
# Maps over every entry of @data after resolving its value through +follow+.
#
# With a block, yields (key, followed_value) and returns an Array of the
# block's results. Without a block, returns an Array of
# [key, followed_value] pairs.
def collect
  if block_given?
    # The block's result is returned as-is; the original assigned it to two
    # unused locals first, which had no effect on the result.
    @data.collect { |key, value| yield key, follow(value) }
  else
    @data.collect { |key, value| [key, follow(value)] }
  end
end
|
556
577
|
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
list = [list] unless Array === list
|
565
|
-
|
566
|
-
list.each do |value|
|
567
|
-
next if value.nil? or value.empty?
|
568
|
-
value = value.downcase if options[:case_insensitive]
|
569
|
-
new[value] ||= []
|
570
|
-
new[value][i + 1] ||= []
|
571
|
-
new[value][i + 1] << key
|
572
|
-
end
|
573
|
-
new[key] ||= []
|
574
|
-
new[key][0] = key
|
575
|
-
end
|
578
|
+
# Sorts the [key, value] pairs produced by +collect+ (optionally with
# +block+ as comparator) and, when +fields+ is set, wraps each value with
# NamedArray.name. Returns an Array of [key, value] pairs.
def sort(&block)
  collect.sort(&block).collect do |key, value|
    value = NamedArray.name(value, fields) if fields
    [key, value]
  end
end
|
576
585
|
|
577
|
-
|
586
|
+
# Sorts the [key, value] pairs produced by +collect+ by the result of
# +block+.
def sort_by(&block)
  collect.sort_by(&block)
end
|
578
589
|
|
579
|
-
|
580
|
-
|
581
|
-
values.compact!
|
582
|
-
end
|
590
|
+
# Serialises the table as tab-separated text.
#
# When +fields+ is set, a header line "#<key_field>\t<field>\t..." comes
# first. Then one line per entry:
# * nil value          -> just the key
# * non-Array value    -> key, tab, value.to_s
# * Array of Arrays    -> each inner list joined with "|", lists joined by tab
# * flat Array         -> elements joined by tab
def to_s
  str = ""

  if fields
    # key_field.to_s: a table with fields but no key field name would
    # otherwise raise TypeError on `String << nil`.
    str << "#" << key_field.to_s << "\t" << fields * "\t" << "\n"
  end

  each do |key, values|
    case
    when values.nil?
      str << key << "\n"
    when (not Array === values)
      str << key << "\t" << values.to_s << "\n"
    when Array === values.first
      str << key << "\t" << values.collect { |list| (list || []) * "|" } * "\t" << "\n"
    else
      str << key << "\t" << values * "\t" << "\n"
    end
  end

  str
end
|
618
612
|
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
613
|
+
#{{{ Parsing
|
614
|
+
|
615
|
+
# Splits the string +io+ on +delimiter+, keeping trailing empty fields
# (split limit -1). Returns [] when +io+ is nil.
def self.parse_fields(io, delimiter = "\t")
  io.nil? ? [] : io.split(delimiter, -1)
end
|
625
620
|
|
626
|
-
|
627
|
-
|
621
|
+
# Transposes a list of per-field value lists into per-row tuples by zipping
# the first list with the rest. Returns [] for a nil or empty +list+.
#
# +fields+ defaults to +list.fields+ when +list+ responds to it; when field
# names are available each tuple is wrapped with NamedArray.name.
def self.zip_fields(list, fields = nil)
  return [] if list.nil? || list.empty?

  fields ||= list.fields if list.respond_to? :fields

  head, tail = list[0], list[1..-1]
  rows = head.zip(*tail)
  return rows unless fields

  rows.collect { |row| NamedArray.name(row, fields) }
end
|
628
|
+
|
629
|
+
# Parses delimited text from +file+ (any object answering +gets+) into
# +data+ (a Hash-like store) and returns [key_field, fields] taken from the
# header line, or [nil, nil] when there is no header.
#
# Options (defaults in the code below):
# :sep / :sep2::      field separator / separator for values inside a field
# :native::           position or name of the id column, resolved through
#                     Misc.field_position
# :extra::            columns to keep (nil keeps every non-id column)
# :fix::              callable applied to each line before it is examined
# :exclude, :select:: callables used to drop / keep lines
# :single, :unique, :flatten:: shape of the stored values
# :overwrite::        later lines replace earlier ones for the same id
# :keep_empty::       store [""] for empty fields and pad short rows
# :case_insensitive:: ids are downcased
# :header_hash::      prefix that marks the header line
#
# When the id field holds several ids, the first is the main entry and the
# rest are stored as "__Ref:<main>" strings.
#
# Raises RuntimeError "Empty content" when +file+ yields no lines.
def self.parse(data, file, options = {})

  # Prepare options
  options = add_defaults options,
    :sep => "\t",
    :sep2 => "|",
    :native => 0,
    :extra => nil,
    :fix => nil,
    :exclude => nil,
    :select => nil,
    :grep => nil,
    :single => false,
    :unique => false,
    :flatten => false,
    :overwrite => false,
    :keep_empty => true,
    :case_insensitive => false,
    :header_hash => '#' ,
    :persistence_file => nil

  options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra])
  options[:flatten] = true if options[:single]

  #{{{ Process first line

  line = file.gets
  raise "Empty content" if line.nil?
  line.chomp!

  # The header prefix is matched literally; quoting keeps prefixes that
  # contain regexp metacharacters from being read as a pattern.
  if line =~ /^#{Regexp.quote(options[:header_hash])}/
    header_fields = parse_fields(line, options[:sep])
    header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character
    line = file.gets
  else
    header_fields = nil
  end

  id_pos = Misc.field_position(header_fields, options[:native])

  if options[:extra].nil?
    extra_pos = nil
    max_cols = 0
  else
    extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) }
  end

  #{{{ Process rest
  while line do
    line.chomp!

    line = options[:fix].call line if options[:fix]

    # Select and fix lines
    if (options[:exclude] and options[:exclude].call(line)) or
      (options[:select] and not options[:select].call(line))
      line = file.gets
      next
    end

    ### Process line

    # Chunk fields
    parts = parse_fields(line, options[:sep])

    # Get next line (fetched now so the `next` statements below can skip
    # straight to it)
    line = file.gets

    # Get id field
    next if parts[id_pos].nil? || parts[id_pos].empty?
    ids = parse_fields(parts[id_pos], options[:sep2])
    ids.collect!{|id| id.downcase } if options[:case_insensitive]

    # Get extra fields
    if extra_pos.nil?
      extra = parts
      extra.delete_at(id_pos)
      # The widest row is only tracked for the list-of-lists layout.
      unless options[:flatten] or options[:single]
        max_cols = extra.size if extra.size > (max_cols || 0)
      end
    else
      extra = parts.values_at(*extra_pos)
    end

    extra.collect!{|value| parse_fields(value, options[:sep2])}
    extra.collect!{|values| values.first} if options[:unique]
    extra.flatten! if options[:flatten]
    extra = extra.first if options[:single]

    # First id is the main entry; the others become references to it.
    main_entry = ids.shift
    ids.each do |id|
      data[id] = "__Ref:#{main_entry}"
    end

    if options[:overwrite]
      data[main_entry] = extra
    else
      case
      when (options[:single] or options[:unique])
        data[main_entry] ||= extra
      when options[:flatten]
        if PersistenceHash === data
          # Read, extend, and assign back rather than mutating in place.
          data[main_entry] = (data[main_entry] || []).concat extra
        else
          data[main_entry] ||= []
          data[main_entry].concat extra
        end
      else
        entry = data[main_entry] || []
        # Only Strings can be references. The explicit type check avoids
        # calling =~ on an Array (Object#=~ was removed in Ruby 3.2).
        while String === entry && entry =~ /__Ref:(.*)/ do
          entry = data[$1]
        end

        extra.each_with_index do |fields, i|
          if fields.empty?
            next unless options[:keep_empty]
            fields = [""]
          end
          entry[i] ||= []
          entry[i] = entry[i].concat fields
        end

        data[main_entry] = entry
      end
    end
  end

  # Pad every row up to the widest row seen.
  if options[:keep_empty] and not max_cols.nil?
    data.each do |key,values|
      new_values = values
      max_cols.times do |i|
        new_values[i] ||= [""]
      end
      data[key] = new_values
    end
  end

  # Save header information
  key_field = nil
  fields = nil
  if header_fields && header_fields.any?
    key_field = header_fields[id_pos]
    if extra_pos.nil?
      fields = header_fields
      fields.delete_at(id_pos)
    else
      fields = header_fields.values_at(*extra_pos)
    end
  end

  data.read if PersistenceHash === data

  [key_field, fields]
end
|
781
795
|
|
782
|
-
|
796
|
+
attr_accessor :data, :key_field, :fields, :list, :case_insensitive, :filename
|
797
|
+
# Returns @fields. When it is an Array, every element is first extended
# with the Field module.
def fields
  names = @fields
  return names unless Array === names

  names.each { |name| name.extend Field }
end
|
783
802
|
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
803
|
+
# Builds a TSV from one of several sources:
#
# * a TSV            - shares (or, with :persistence, copies into a TCHash)
#                      its data and copies its metadata;
# * a Hash           - wrapped directly, or copied into a TCHash when
#                      :persistence is set;
# * a PersistenceHash - wrapped directly, taking key_field/fields from it;
# * a File, an existing file path, or any object answering +gets+
#                      (e.g. a StringIO) - parsed with TSV.parse, through a
#                      persistence file when :persistence is set.
#
# A String of the form "path#options" is split at the first '#'; the option
# part is merged into +options+ through Misc.add_defaults.
#
# Raises RuntimeError "File ... not found" for any other input, such as a
# path that does not exist.
def initialize(file = {}, options = {})
  options = Misc.add_defaults options
  options[:persistence] = true if options[:persistence_file]

  if String === file && file =~ /(.*?)#(.*)/
    file, file_options = $1, $2
    options = Misc.add_defaults file_options, options
  end

  @case_insensitive = options[:case_insensitive] == true
  @list = ! (options[:flatten] == true || options[:single] == true || options[:unique] == true)

  case
  when TSV === file
    Log.low "Copying TSV"
    @filename = file.filename

    if options[:persistence] and not PersistenceHash === file.data
      persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
      Log.low "Making persistance #{ persistence_file }"
      @data = TCHash.get(persistence_file)
      @data.merge! file
      @data.key_field = file.key_field
      @data.fields = file.fields
    else
      @data = file.data
    end

    @key_field = file.key_field
    @fields = file.fields
    @case_insensitive = file.case_insensitive
    @list = file.list
    return self
  when Hash === file
    Log.low "Encapsulating Hash"
    @filename = "Hash:" + Digest::MD5.hexdigest(file.inspect)
    if options[:persistence]
      persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
      Log.low "Making persistance #{ persistence_file }"
      @data = TCHash.get(persistence_file)
      @data.merge! file
    else
      @data = file
    end
    return self
  when PersistenceHash === file
    Log.low "Encapsulating PersistenceHash"
    @filename = "PersistenceHash:" + Digest::MD5.hexdigest(file.inspect)
    @data = file
    @key_field = file.key_field
    @fields = file.fields
    return self
  when File === file
    @filename = File.expand_path file.path
  when String === file && File.exist?(file)
    @filename = File.expand_path file
    file = Open.open(file)
  when file.respond_to?(:gets)
    # StringIO and other stream-like inputs: parsed as-is, @filename stays
    # unset. The original `when StringIO` in this subject-less case tested
    # the constant itself, which is always truthy, so the else branch below
    # could never run.
  else
    raise "File #{file} not found"
  end

  if options[:persistence]
    options.delete :persistence
    persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)

    if File.exist? persistence_file
      Log.low "Loading Persistence for #{ @filename } in #{persistence_file}"
      @data = PersistenceHash.get(persistence_file, false)
      @key_field = @data.key_field
      @fields = @data.fields
    else
      @data = PersistenceHash.get(persistence_file, true)
      file = Open.grep(file, options[:grep]) if options[:grep]

      Log.low "Persistent Parsing for #{ @filename } in #{persistence_file}"
      @key_field, @fields = TSV.parse(@data, file, options.merge(:persistence_file => persistence_file))
      @data.key_field = @key_field
      @data.fields = @fields
      @data.read
    end
  else
    Log.low "Non-persistent parsing for #{ @filename }"
    @data = {}
    file = Open.grep(file, options[:grep]) if options[:grep]
    @key_field, @fields = TSV.parse(@data, file, options)
  end

  file.close
  @case_insensitive = options[:case_insensitive] == true
end
|
803
894
|
|
804
895
|
end
|
896
|
+
|
897
|
+
#{{{ CacheHelper
|
898
|
+
require 'rbbt/util/cachehelper'
|
899
|
+
module CacheHelper
  # Returns a TSV backed by the cache file that CacheHelper.build_filename
  # derives from +name+ and +key+.
  #
  # When the cache file exists it is opened with TCHash.get and wrapped in a
  # TSV. Otherwise the block is called to produce the data, which is turned
  # into a TSV using the cache file as its :persistence_file.
  def self.tsv_cache(name, key = [])
    cache_file = CacheHelper.build_filename name, key

    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    if File.exist? cache_file
      Log.debug "TSV cache file '#{cache_file}' found"
      hash = TCHash.get(cache_file)
      TSV.new(hash)
    else
      Log.debug "Producing TSV cache file '#{cache_file}'"
      data = yield
      TSV.new(data, :persistence_file => cache_file)
    end
  end
end
|