rbbt-util 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rbbt/util/cmd.rb CHANGED
@@ -3,20 +3,27 @@ require 'rbbt/util/log'
3
3
  require 'stringio'
4
4
 
5
5
  module CMD
6
- class CMDError < StandardError;end
6
+ class CMDError < RBBTError;end
7
7
 
8
8
  module SmartIO
9
- def self.tie(io, pid = nil, post = nil)
9
+ def self.tie(io, pid = nil, cmd = "", post = nil)
10
10
  io.instance_eval{
11
11
  @pid = pid
12
+ @cmd = cmd
12
13
  @post = post
13
14
  alias original_close close
14
15
  def close
15
16
  begin
16
- Process.waitpid(@pid, Process::WNOHANG) if @pid
17
+ Process.waitpid(@pid) if @pid
17
18
  rescue
18
19
  end
19
20
 
21
+ if $? and not $?.success?
22
+ Log.debug "Raising exception"
23
+ exception = CMDError.new "Command [#{@pid}] #{@cmd} failed with error status #{$?.exitstatus}"
24
+ raise exception
25
+ end
26
+
20
27
  @post.call if @post
21
28
  original_close
22
29
  end
@@ -107,6 +114,8 @@ module CMD
107
114
  sout.last.close
108
115
  serr.last.close
109
116
 
117
+ Log.debug "CMD: [#{pid}] #{cmd}"
118
+
110
119
  case
111
120
  when String === in_content
112
121
  sin.last.write in_content
@@ -120,20 +129,39 @@ module CMD
120
129
  end
121
130
  end
122
131
 
123
- Thread.new do
124
- while l = serr.first.gets
125
- Log.log l, stderr if Integer === stderr
132
+ if pipe
133
+ Thread.new do
134
+ while l = serr.first.gets
135
+ Log.log l, stderr if Integer === stderr
136
+ end
137
+ serr.first.close
126
138
  end
127
- serr.first.close
128
- end
129
139
 
130
- if pipe
131
- SmartIO.tie sout.first, pid, post
140
+ SmartIO.tie sout.first, pid, cmd, post
132
141
  sout.first
142
+
133
143
  else
144
+ err = ""
145
+ Thread.new do
146
+ while l = serr.first.gets
147
+ err << l if Integer === stderr
148
+ end
149
+ serr.first.close
150
+ end
151
+
134
152
  out = StringIO.new sout.first.read
135
- SmartIO.tie out
153
+ SmartIO.tie out, pid, cmd, post
154
+
136
155
  Process.waitpid pid
156
+
157
+ if not $?.success?
158
+ exception = CMDError.new "Command [#{pid}] #{cmd} failed with error status #{$?.exitstatus}"
159
+ exception.info = err if Integer === stderr and stderr >= Log.severity
160
+ raise exception
161
+ else
162
+ Log.log err, stderr if Integer === stderr
163
+ end
164
+
137
165
  out
138
166
  end
139
167
  end
@@ -38,6 +38,7 @@ module DataModule
38
38
  pkg_module.add_datafiles filename => ['', self.to_s, sharedir]
39
39
  rescue
40
40
  Log.debug $!.message
41
+ Log.debug $!.backtrace * "\n"
41
42
  old_method_missing name, *args, &block
42
43
  end
43
44
 
@@ -1,4 +1,5 @@
1
1
  require 'rbbt/util/tsv'
2
+ require 'rbbt/util/open'
2
3
  require 'spreadsheet'
3
4
 
4
5
  class TSV
@@ -8,7 +9,7 @@ class TSV
8
9
  header = true unless header == false
9
10
  sheet ||= 0
10
11
  TmpFile.with_file do |filename|
11
- workbook = Spreadsheet.open File.open(file)
12
+ workbook = Spreadsheet.open Open.open(file)
12
13
  sheet = workbook.worksheet sheet
13
14
 
14
15
  rows = []
data/lib/rbbt/util/log.rb CHANGED
@@ -5,11 +5,11 @@ module Log
5
5
  MEDIUM = 2
6
6
  HIGH = 3
7
7
 
8
- def severity=(severity)
8
+ def self.severity=(severity)
9
9
  @@severity = severity
10
10
  end
11
11
 
12
- def severity
12
+ def self.severity
13
13
  @@severity
14
14
  end
15
15
 
@@ -1,7 +1,25 @@
1
1
  require 'iconv'
2
+
3
# Base error class carrying an optional block of additional information
# (for example captured stderr output) that is appended to the message.
class RBBTError < StandardError
  # Extra detail shown after the message by #to_s; nil when absent.
  attr_accessor :info

  alias old_to_s to_s

  # Returns the plain message, followed by an "Additional Info" section
  # when #info is set.
  #
  # A new string is built on every call. The string returned by
  # Exception#to_s is the stored message object itself, so appending to it
  # in place would grow the message on each call and fail outright for
  # frozen messages.
  def to_s
    base = old_to_s
    return base unless info
    "#{base}\nAdditional Info:\n---\n#{info}---"
  end
end
15
+
2
16
  module Misc
3
17
  class FieldNotFoundError < StandardError;end
4
18
 
19
+ def self.this_dir
20
+ File.expand_path(File.dirname(caller[0]))
21
+ end
22
+
5
23
  def self.env_add(var, value, sep = ":", prepend = true)
6
24
  ENV[var] ||= ""
7
25
  return if ENV[var] =~ /(#{sep}|^)#{Regexp.quote value}(#{sep}|$)/
@@ -113,6 +131,14 @@ module Misc
113
131
  end
114
132
  end
115
133
 
134
+ module PDF2Text
135
+ def self.pdf2text(filename)
136
+ TmpFile.with_file(Open.read(filename)) do |pdf|
137
+ CMD.cmd("pdftotext #{pdf} -", :pipe => false, :stderr => true)
138
+ end
139
+ end
140
+ end
141
+
116
142
  class NamedArray < Array
117
143
  attr_accessor :fields
118
144
 
@@ -34,6 +34,7 @@ module Open
34
34
  end
35
35
 
36
36
  def self.wget(url, options = {})
37
+ Log.low "WGET:\n -URL: #{ url }\n -OPTIONS: #{options.inspect}"
37
38
  options = Misc.add_defaults options, "--user-agent=" => 'firefox', :pipe => true
38
39
 
39
40
  wait(options[:nice], options[:nice_key]) if options[:nice]
@@ -42,7 +43,18 @@ module Open
42
43
 
43
44
  pipe = options.delete(:pipe)
44
45
  quiet = options.delete(:quiet)
45
- options["--quiet"] = quiet if options["--quiet"].nil?
46
+ post = options.delete(:post)
47
+ cookies = options.delete(:cookies)
48
+
49
+ options["--quiet"] = quiet if options["--quiet"].nil?
50
+ options["--post-data="] ||= post if post
51
+
52
+ if cookies
53
+ options["--save-cookies"] = cookies
54
+ options["--load-cookies"] = cookies
55
+ options["--keep-session-cookies"] = true
56
+ end
57
+
46
58
 
47
59
  stderr = case
48
60
  when options['stderr']
@@ -52,6 +64,7 @@ module Open
52
64
  else
53
65
  nil
54
66
  end
67
+
55
68
  begin
56
69
  CMD.cmd("wget '#{ url }'", options.merge(
57
70
  '-O' => '-',
@@ -141,6 +154,9 @@ module Open
141
154
  wget_options = options[:wget_options] || {}
142
155
  wget_options[:nice] = options.delete(:nice)
143
156
  wget_options[:nice_key] = options.delete(:nice_key)
157
+ wget_options[:quiet] = options.delete(:quiet)
158
+ wget_options[:post] = options.delete(:post)
159
+ wget_options[:cookies] = options.delete(:cookies)
144
160
 
145
161
  io = case
146
162
  when (not remote?(url))
@@ -155,8 +171,8 @@ module Open
155
171
  io.close
156
172
  file_open(in_cache(url), options[:grep])
157
173
  end
158
- io = unzip(io) if zip? url and not options[:noz]
159
- io = gunzip(io) if gzip? url and not options[:noz]
174
+ io = unzip(io) if (zip? url and not options[:noz]) or options[:zip]
175
+ io = gunzip(io) if (gzip? url and not options[:noz]) or options[:gzip]
160
176
 
161
177
  io
162
178
  end
@@ -70,6 +70,7 @@ module PKGSoftware
70
70
  if not File.exists?(path)
71
71
  sharedir ||= PKGSoftware.get_caller_sharedir
72
72
  get_pkg(pkg.to_s, path, get, sharedir)
73
+ setup_env(software_dir)
73
74
  end
74
75
 
75
76
  SOFTWARE[pkg.to_s] = path
@@ -80,7 +81,6 @@ module PKGSoftware
80
81
  SOFTWARE[pkg.to_s]
81
82
  end
82
83
 
83
-
84
84
  def setup_env(software_dir)
85
85
  Misc.env_add 'PATH', bin_dir
86
86
 
@@ -127,4 +127,5 @@ module PKGSoftware
127
127
 
128
128
  CMD.cmd(File.join(opt_dir, '.post_install'))
129
129
  end
130
+
130
131
  end
@@ -39,7 +39,7 @@ class TCHash < TokyoCabinet::HDB
39
39
  alias original_keys keys
40
40
  def keys
41
41
  list = self.original_keys
42
- indexes = FIELD_INFO_ENTRIES.values.collect do |field| list.index(field) end.compact
42
+ indexes = FIELD_INFO_ENTRIES.values.collect do |field| list.index(field) end.compact.sort.reverse
43
43
  indexes.each do |index| list.delete_at index end
44
44
  list
45
45
  end
@@ -48,19 +48,12 @@ class TCHash < TokyoCabinet::HDB
48
48
  def values
49
49
  values = self.original_values
50
50
  keys = self.original_keys
51
- indexes = FIELD_INFO_ENTRIES.values.collect do |field| keys.index(field) end.compact
51
+ indexes = FIELD_INFO_ENTRIES.values.collect do |field| keys.index(field) end.compact.sort.reverse
52
52
  indexes.each do |index| values.delete_at index end
53
53
 
54
54
  values.collect{|v| Serializer.load(v)}
55
55
  end
56
56
 
57
- def merge!(data)
58
- new_data = {}
59
- data.each do |key, values|
60
- self[key] = values
61
- end
62
- end
63
-
64
57
  # This version of each fixes a problem in ruby 1.9. It also
65
58
  # removes the special entries
66
59
  def each19(&block)
@@ -77,10 +70,17 @@ class TCHash < TokyoCabinet::HDB
77
70
 
78
71
  def collect
79
72
  res = []
80
- self.each{|k, v| res << [k,v]}
73
+ self.each{|k, v| res << yield(k,v)}
81
74
  res
82
75
  end
83
76
 
77
# Stores every key/value pair of +data+ in this object through #[]=.
#
# data - any object whose #each yields key/value pairs (e.g. a Hash).
#
# Returns whatever data.each returns (the Hash itself for a Hash).
def merge!(data)
  data.each do |key, values|
    self[key] = values
  end
end
83
+
84
84
  alias original_open open
85
85
  def open(write = false)
86
86
  flags = write ? TokyoCabinet::HDB::OWRITER | TokyoCabinet::HDB::OCREAT : TokyoCabinet::BDB::OREADER
data/lib/rbbt/util/tsv.rb CHANGED
@@ -2,6 +2,7 @@ require 'rbbt/util/misc'
2
2
  require 'rbbt/util/open'
3
3
  require 'rbbt/util/tc_hash'
4
4
  require 'rbbt/util/tmpfile'
5
+ require 'rbbt/util/log'
5
6
  require 'digest'
6
7
  require 'fileutils'
7
8
 
@@ -16,6 +17,13 @@ end
16
17
  class TSV
17
18
  class FieldNotFoundError < StandardError;end
18
19
 
20
+ module Field
21
+ def ==(string)
22
+ return false unless String === string
23
+ self.sub(/#.*/,'').casecmp(string.sub(/#.*/,'')) == 0
24
+ end
25
+ end
26
+
19
27
  #{{{ Persistence
20
28
 
21
29
  PersistenceHash = TCHash
@@ -36,14 +44,7 @@ class TSV
36
44
  File.join(CACHEDIR, prefix.gsub(/\s/,'_').gsub(/\//,'>') + Digest::MD5.hexdigest([file, options].inspect))
37
45
  end
38
46
 
39
- @debug = ENV['TSV_DEBUG'] == "true"
40
- def self.log(message)
41
- STDERR.puts message if @debug == true
42
- end
43
-
44
- def self.debug=(value)
45
- @debug = value
46
- end
47
+ #{{{ Headers and Field Stuff
47
48
 
48
49
  def self.headers(file, options = {})
49
50
  if file =~ /(.*)#(.*)/ and File.exists? $1
@@ -63,742 +64,850 @@ class TSV
63
64
  end
64
65
  end
65
66
 
66
- #{{{ Accesor Methods
67
-
68
- def keys
69
- @data.keys
67
# Tells whether +field+ names either the key field or one of +fields+.
#
# key_field - name of the key column; compared with key_field == field so
#             that a custom == on the key field object is honoured.
# fields    - list of the remaining field names, or nil when no header
#             information is available.
# field     - the name to look for.
#
# Returns true or false. A nil +fields+ is treated as an empty list
# instead of raising NoMethodError.
def self.fields_include(key_field, fields, field)
  return true if key_field == field
  return false if fields.nil?
  fields.include? field
end
71
71
 
72
- def values
73
- @data.values
72
+ def self.field_positions(key_field, fields, *selected)
73
+ selected.collect do |sel|
74
+ case
75
+ when (sel.nil? or sel == :main or sel == key_field)
76
+ -1
77
+ when Integer === sel
78
+ sel
79
+ else
80
+ Misc.field_position fields, sel
81
+ end
82
+ end
74
83
  end
75
84
 
76
- def size
77
- @data.size
85
+ def fields_include(field)
86
+ return TSV.fields_include key_field, fields, field
78
87
  end
79
88
 
80
- # Write
89
+ def field_positions(*selected)
90
+ return nil if selected.nil? or selected == [nil]
91
+ TSV.field_positions(key_field, fields, *selected)
92
+ end
81
93
 
82
- def []=(key, value)
83
- key = key.downcase if @case_insensitive
84
- @data[key] = value
94
+ def fields_at(*positions)
95
+ return nil if fields.nil?
96
+ return nil if positions.nil? or positions == [nil]
97
+ (fields + [key_field]).values_at(*positions)
85
98
  end
86
99
 
100
+ #{{{ Iteration, Merging, etc
101
+ def through(new_key_field = nil, new_fields = nil, &block)
102
+ new_key_position = (field_positions(new_key_field) || [-1]).first
103
+ new_fields = [new_fields] if String === new_fields
87
104
 
88
- def merge!(new_data)
89
- new_data.each do |key, value|
90
- self[key] = value
91
- end
92
- end
105
+ if new_key_position == -1
93
106
 
94
- # Read
107
+ if new_fields.nil? or new_fields == fields
108
+ each &block
109
+ return [key_field, fields]
110
+ else
111
+ new_field_positions = field_positions(*new_fields)
112
+ each do |key, values|
113
+ if values.nil?
114
+ yield key, nil
115
+ else
116
+ yield key, values.values_at(*new_field_positions)
117
+ end
118
+ end
119
+ return [key_field, fields_at(*new_field_positions)]
120
+ end
95
121
 
96
- def follow(value)
97
- if String === value && value =~ /__Ref:(.*)/
98
- return self[$1]
99
122
  else
100
- value = NamedArray.name value, fields if Array === value and fields
101
- value
123
+ new_field_positions = field_positions(*new_fields)
124
+
125
+ new_field_names = fields_at(*new_field_positions)
126
+ if new_field_names.nil? and fields
127
+ new_field_names = fields.dup
128
+ new_field_names.delete_at new_key_position
129
+ new_field_names.unshift key_field
130
+ end
131
+
132
+ each do |key, values|
133
+ if list
134
+ tmp_values = values + [[key]]
135
+ else
136
+ tmp_values = values + [key]
137
+ end
138
+
139
+ if new_field_positions.nil?
140
+ new_values = values.dup
141
+ new_values.delete_at new_key_position
142
+ new_values.unshift [key]
143
+ else
144
+ new_values = tmp_values.values_at(*new_field_positions)
145
+ end
146
+
147
+ tmp_values[new_key_position].each do |new_key|
148
+ if new_field_names
149
+ yield new_key, NamedArray.name(new_values, new_field_names)
150
+ else
151
+ yield new_key, new_values
152
+ end
153
+ end
154
+ end
155
+ return [(fields_at(new_key_position) || [nil]).first, new_field_names]
102
156
  end
103
157
  end
104
- def [](key)
105
- if Array === key
106
- return @data[key] if @data[key] != nil
107
- key.each{|k| v = self[k]; return v unless v.nil?}
108
- return nil
158
+
159
+ def process(field)
160
+ through do |key, values|
161
+ values[field].replace yield(values[field], key, values) unless values[field].nil?
109
162
  end
110
-
111
- key = key.downcase if @case_insensitive
112
- follow @data[key]
113
163
  end
114
164
 
115
- def values_at(*keys)
116
- keys.collect{|k|
117
- self[k]
118
- }
119
- end
120
165
 
121
- def each(&block)
122
- @data.each do |key, value|
123
- block.call(key, follow(value))
124
- end
125
- end
166
+ def reorder(new_key_field, new_fields = nil, options = {})
167
+ options = Misc.add_defaults options
168
+ return TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => case_insensitive) if options[:persistence_file] and File.exists?(options[:persistence_file])
126
169
 
127
- def collect
128
- if block_given?
129
- @data.collect do |key, value|
130
- value = follow(value)
131
- key, values = yield key, value
132
- end
133
- else
134
- @data.collect do |key, value|
135
- [key, follow(value)]
170
+ new = {}
171
+ new_key_field, new_fields = through new_key_field, new_fields do |key, values|
172
+ if new[key].nil?
173
+ new[key] = values
174
+ else
175
+ new[key] = new[key].zip(values)
136
176
  end
137
177
  end
138
- end
139
178
 
140
- def sort(&block)
141
- collect.sort(&block).collect{|p|
142
- key, value = p
143
- value = NamedArray.name value, fields if fields
144
- [key, value]
145
- }
146
- end
179
+ new.each do |key,values|
180
+ values.each{|list| list.flatten! if Array === list}
181
+ end
147
182
 
148
- def sort_by(&block)
149
- collect.sort_by &block
150
- end
183
+ if options[:persistence_file]
184
+ reordered = TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => case_insensitive)
185
+ reordered.merge! new
186
+ else
187
+ reordered = TSV.new(new, :case_insensitive => case_insensitive)
188
+ end
151
189
 
152
- #{{{ Parsing
153
-
154
- def self.parse_fields(io, delimiter = "\t")
155
- return [] if io.nil?
156
- fields = io.split(delimiter, -1)
157
- fields
158
- end
190
+ reordered.key_field = new_key_field
191
+ reordered.fields = new_fields
159
192
 
160
- def self.zip_fields(list, fields = nil)
161
- return [] if list.nil? || list.empty?
162
- fields ||= list.fields if list.respond_to? :fields
163
- zipped = list[0].zip(*list[1..-1])
164
- zipped = zipped.collect{|v| NamedArray.name(v, fields)} if fields
165
- zipped
193
+ reordered
166
194
  end
167
-
168
- def self.parse(data, file, options = {})
169
195
 
170
- # Prepare options
171
- options = add_defaults options,
172
- :sep => "\t",
173
- :sep2 => "|",
174
- :native => 0,
175
- :extra => nil,
176
- :fix => nil,
177
- :exclude => nil,
178
- :select => nil,
179
- :grep => nil,
180
- :single => false,
181
- :unique => false,
182
- :flatten => false,
183
- :overwrite => false,
184
- :keep_empty => true,
185
- :case_insensitive => false,
186
- :header_hash => '#' ,
187
- :persistence_file => nil
196
# Builds a copy restricted to +new_fields+, keeping the current key field.
#
# new_fields - field name or list of field names to keep.
# options    - forwarded unchanged to #reorder; previously they were
#              accepted but silently dropped.
#
# Returns the value of #reorder.
def slice(new_fields, options = {})
  reorder(:main, new_fields, options)
end
188
199
 
189
- options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra])
190
- options[:flatten] = true if options[:single]
200
+ def add_field(name = nil)
201
+ each do |key, values|
202
+ self[key] = values << yield(key, values)
203
+ end
191
204
 
205
+ fields << name if list
206
+ if PersistenceHash === @data
207
+ @data.fields = fields
208
+ end
209
+ end
192
210
 
211
+ def select(method)
212
+ new = TSV.new({})
213
+ new.key_field = key_field
214
+ new.fields = fields.dup
215
+
216
+ case
217
+ when Array === method
218
+ through do |key, values|
219
+ new[key] = values if ([key,values].flatten & method).any?
220
+ end
221
+ when Regexp === method
222
+ through do |key, values|
223
+ new[key] = values if [key,values].flatten.select{|v| v =~ method}.any?
224
+ end
225
+ when Hash === method
226
+ key = method.keys.first
227
+ method = method.values.first
228
+ case
229
+ when (Array === method and (:main == key or key_field == key))
230
+ method.each{|item| if values = self[item]; then new[item] = values; end}
231
+ when Array === method
232
+ through :main, key do |key, values|
233
+ new[key] = values if (values.flatten & method).any?
234
+ end
235
+ when Regexp === method
236
+ through :main, key do |key, values|
237
+ new[key] = values if values.flatten.select{|v| v =~ method}.any?
238
+ end
239
+ end
240
+ end
193
241
 
194
- #{{{ Process first line
242
+ new
243
+ end
195
244
 
196
- line = file.gets
197
- raise "Empty content" if line.nil?
198
- line.chomp!
245
+ def index(options = {})
246
+ options = Misc.add_defaults options, :order => false
199
247
 
200
- if line =~ /^#{options[:header_hash]}/
201
- header_fields = parse_fields(line, options[:sep])
202
- header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character
203
- line = file.gets
204
- else
205
- header_fields = nil
248
+ if options[:persistence] and ! options[:persistence_file]
249
+ options[:persistence_file] = TSV.get_persistence_file(filename, "index:#{ filename }_#{options[:field]}:", options)
206
250
  end
207
-
208
- id_pos = Misc.field_position(header_fields, options[:native])
209
251
 
210
- if options[:extra].nil?
211
- extra_pos = nil
212
- max_cols = 0
213
- else
214
- extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) }
252
+ if options[:persistence_file] and File.exists?(options[:persistence_file])
253
+ return TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => options[:case_insensitive])
215
254
  end
216
255
 
217
- #{{{ Process rest
218
- while line do
219
- line.chomp!
220
-
221
- line = options[:fix].call line if options[:fix]
256
+ new = {}
257
+ if options[:order]
258
+ new_key_field, new_fields = through options[:field], options[:others] do |key, values|
222
259
 
223
- # Select and fix lines
224
- if (options[:exclude] and options[:exclude].call(line)) or
225
- (options[:select] and not options[:select].call(line))
226
- line = file.gets
227
- next
228
- end
260
+ values.each_with_index do |list, i|
261
+ next if list.nil? or list.empty?
229
262
 
230
- ### Process line
263
+ list = [list] unless Array === list
231
264
 
232
- # Chunk fields
233
- parts = parse_fields(line, options[:sep])
234
-
235
- # Get next line
236
- line = file.gets
237
-
238
- # Get id field
239
- next if parts[id_pos].nil? || parts[id_pos].empty?
240
- ids = parse_fields(parts[id_pos], options[:sep2])
241
- ids.collect!{|id| id.downcase } if options[:case_insensitive]
242
-
243
- # Get extra fields
244
-
245
- if options[:extra].nil? and not (options[:flatten] or options[:single])
246
- extra = parts
247
- extra.delete_at(id_pos)
248
- max_cols = extra.size if extra.size > (max_cols || 0)
249
- else
250
- if extra_pos.nil?
251
- extra = parts
252
- extra.delete_at id_pos
253
- else
254
- extra = parts.values_at(*extra_pos)
255
- end
256
- end
257
-
258
- extra.collect!{|value| parse_fields(value, options[:sep2])}
259
- extra.collect!{|values| values.first} if options[:unique]
260
- extra.flatten! if options[:flatten]
261
- extra = extra.first if options[:single]
262
-
263
- if options[:overwrite]
264
- main_entry = ids.shift
265
- ids.each do |id|
266
- data[id] = "__Ref:#{main_entry}"
265
+ list.each do |value|
266
+ next if value.nil? or value.empty?
267
+ value = value.downcase if options[:case_insensitive]
268
+ new[value] ||= []
269
+ new[value][i + 1] ||= []
270
+ new[value][i + 1] << key
271
+ end
272
+ new[key] ||= []
273
+ new[key][0] = key
267
274
  end
268
275
 
269
- data[main_entry] = extra
270
- else
271
- main_entry = ids.shift
272
- ids.each do |id|
273
- data[id] = "__Ref:#{main_entry}"
274
- end
276
+ end
275
277
 
276
- case
277
- when (options[:single] or options[:unique])
278
- data[main_entry] ||= extra
279
- when options[:flatten]
280
- if PersistenceHash === data
281
- data[main_entry] = (data[main_entry] || []).concat extra
282
- else
283
- data[main_entry] ||= []
284
- data[main_entry].concat extra
285
- end
286
- else
287
- entry = data[main_entry] || []
288
- while entry =~ /__Ref:(.*)/ do
289
- entry = data[$1]
290
- end
278
+ new.each do |key, values|
279
+ values.flatten!
280
+ values.compact!
281
+ end
291
282
 
292
- extra.each_with_index do |fields, i|
293
- if fields.empty?
294
- next unless options[:keep_empty]
295
- fields = [""]
283
+ else
284
+ new_key_field, new_fields = through options[:field], options[:others] do |key, values|
285
+ new[key] ||= []
286
+ new[key] << key
287
+ values.each do |list|
288
+ next if list.nil?
289
+ if Array === list
290
+ list.each do |value|
291
+ value = value.downcase if options[:case_insensitive]
292
+ new[value] ||= []
293
+ new[value] << key
296
294
  end
297
- entry[i] ||= []
298
- entry[i] = entry[i].concat fields
295
+ else
296
+ next if list.empty?
297
+ value = list
298
+ value = value.downcase if options[:case_insensitive]
299
+ new[value] ||= []
300
+ new[value] << key
299
301
  end
300
-
301
- data[main_entry] = entry
302
- end
303
- end
304
- end
305
-
306
- if options[:keep_empty] and not max_cols.nil?
307
- data.each do |key,values|
308
- new_values = values
309
- max_cols.times do |i|
310
- new_values[i] ||= [""]
311
302
  end
312
- data[key] = new_values
313
303
  end
314
304
  end
315
305
 
316
-
317
- # Save header information
318
- key_field = nil
319
- fields = nil
320
- if header_fields && header_fields.any?
321
- key_field = header_fields[id_pos]
322
- if extra_pos.nil?
323
- fields = header_fields
324
- fields.delete_at(id_pos)
325
- else
326
- fields = header_fields.values_at(*extra_pos)
327
- end
306
+ if options[:persistence_file]
307
+ index = TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => options[:case_insensitive])
308
+ index.merge! new
309
+ else
310
+ index = TSV.new(new, :case_insensitive => options[:case_insensitive])
328
311
  end
329
312
 
330
- data.read if PersistenceHash === data
331
-
332
- [key_field, fields]
313
+ index.key_field = new_key_field
314
+ index.fields = new_fields
315
+ index
333
316
  end
334
317
 
335
- attr_accessor :data, :key_field, :fields, :list, :case_insensitive, :filename
336
- def initialize(file = {}, options = {})
337
- @case_insensitive = options[:case_insensitive] == true
338
- @list = ! (options[:flatten] == true || options[:single] == true || options[:unique] == true)
339
-
340
- case
341
- when TSV === file
342
- @filename = file.filename
343
- @data = file.data
344
- @key_field = file.key_field
345
- @fields = file.fields
346
- @case_insensitive = file.case_insensitive
347
- @list = file.is_list
348
- return self
349
- when (Hash === file or PersistenceHash === file)
350
- @filename = "Hash:" + Digest::MD5.hexdigest(file.inspect)
351
- @data = file
352
- return self
353
- when File === file
354
- @filename = File.expand_path file.path
355
- when String === file && File.exists?(file)
356
- @filename = File.expand_path file
357
- file = Open.open(file)
358
- when StringIO
359
- else
360
- raise "File #{file} not found"
361
- end
318
+ def smart_merge(other, match = nil, new_fields = nil)
362
319
 
363
- if options[:persistence]
364
- options.delete :persistence
365
- persistence_file = TSV.get_persistence_file @filename, "file:#{ @filename }:", options
320
+ new_fields = [new_fields] if String === new_fields
321
+ if self.fields and other.fields
322
+ common_fields = ([self.key_field] + self.fields) & ([other.key_field] + other.fields)
323
+ new_fields ||= ([other.key_field] + other.fields) - ([self.key_field] + self.fields)
366
324
 
367
- if File.exists? persistence_file
368
- TSV.log "Loading Persistence for #{ @filename } in #{persistence_file}"
369
- @data = PersistenceHash.get(persistence_file, false)
370
- @key_field = @data.key_field
371
- @fields = @data.fields
372
- else
373
- @data = PersistenceHash.get(persistence_file, true)
374
- file = Open.grep(file, options[:grep]) if options[:grep]
325
+ common_fields.delete match if String === match
326
+ common_fields.delete_at match if Integer === match
375
327
 
376
- TSV.log "Persistent Parsing for #{ @filename } in #{persistence_file}"
377
- @key_field, @fields = TSV.parse(@data, file, options.merge(:persistence_file => persistence_file))
378
- @data.key_field = @key_field
379
- @data.fields = @fields
380
- @data.read
381
- end
328
+ this_common_field_positions = self.field_positions *common_fields
329
+ other_common_field_positions = other.field_positions *common_fields
330
+ other_new_field_positions = other.field_positions *new_fields
382
331
  else
383
- TSV.log "Non-persistent parsing for #{ @filename }"
384
- @data = {}
385
- file = Open.grep(file, options[:grep]) if options[:grep]
386
- @key_field, @fields = TSV.parse(@data, file, options)
332
+ nofieldinfo = true
387
333
  end
388
334
 
389
- file.close
390
- @case_insensitive = options[:case_insensitive] == true
391
- end
335
+ case
336
+ when TSV === match
337
+ match_index = match
338
+ matching_code_position = nil
392
339
 
340
+ when Array === match
341
+ match_index = match.first
342
+ matching_code_position = field_positions(match.last).first
393
343
 
394
- def to_s
395
- str = ""
344
+ when match =~ /^through:(.*)/
345
+ through = $1
346
+ if through =~ /(.*)#using:(.*)/
347
+ through = $1
348
+ matching_code_position = field_positions($2).first
349
+ else
350
+ matching_code_position = nil
351
+ end
352
+ index_fields = TSV.headers(through)
353
+ target_field = index_fields.select{|field| other.fields_include field}.first
354
+ Log.debug "Target Field: #{ target_field }"
355
+ match_index = TSV.open_file(through).index(:field => target_field)
396
356
 
397
- if fields
398
- str << "#" << key_field << "\t" << fields * "\t" << "\n"
357
+ when field_positions(match).first
358
+ matching_code_position = field_positions(match).first
359
+ match_index = nil
399
360
  end
400
361
 
401
- each do |key, values|
402
- case
403
- when values.nil?
404
- str << key.dup << "\n"
405
- when (not Array === values)
406
- str << key.dup << "\t" << values.to_s << "\n"
407
- when Array === values.first
408
- str << key.dup << "\t" << values.collect{|list| (list || []) * "|"} * "\t" << "\n"
409
- else
410
- str << key.dup << "\t" << values * "\t" << "\n"
362
+ if matching_code_position.nil? and match_index.fields
363
+ match_index.fields.each do |field|
364
+ if matching_code_position = field_positions(field).first
365
+ break
366
+ end
411
367
  end
412
368
  end
413
369
 
414
- str
415
- end
416
-
417
- #{{{ New
418
-
419
- def self.fields_include(key_field, fields, field)
420
- return true if field == key_field or fields.include? field
421
- return false
422
- end
370
+ if match_index and match_index.key_field == other.key_field
371
+ other_index = nil
372
+ else
373
+ other_index = (match === String and other.fields_include(match)) ?
374
+ other.index(:other => match, :order => true) : other.index(:order => true)
375
+ end
423
376
 
424
- def self.field_positions(key_field, fields, *selected)
425
- selected.collect do |sel|
426
- case
427
- when (sel.nil? or sel == :main or sel == key_field)
428
- -1
429
- when Integer === sel
430
- sel
377
+ each do |key,values|
378
+ Log.debug "Key: #{ key }. Values: #{values * ", "}"
379
+ if matching_code_position.nil? or matching_code_position == -1
380
+ matching_codes = [key]
431
381
  else
432
- Misc.field_position fields, sel
382
+ matching_codes = values[matching_code_position]
383
+ matching_codes = [matching_codes] unless matching_codes.nil? or Array === matching_codes
433
384
  end
434
- end
435
- end
436
-
437
- def fields_include(field)
438
- return TSV.fields_include key_field, fields, field
439
- end
440
-
441
- def field_positions(*selected)
442
- return nil if selected.nil? or selected == [nil]
443
- TSV.field_positions(key_field, fields, *selected)
444
- end
385
+ Log.debug "Matching codes: #{matching_codes}"
445
386
 
446
- def fields_at(*positions)
447
- return nil if fields.nil?
448
- return nil if positions.nil? or positions == [nil]
449
- (fields + [key_field]).values_at(*positions)
450
- end
387
+ next if matching_codes.nil?
451
388
 
452
- def through(new_key_field = nil, new_fields = nil, &block)
453
- new_key_position = (field_positions(new_key_field) || [-1]).first
389
+ matching_codes.each do |matching_code|
390
+ if match_index
391
+ if match_index[matching_code]
392
+ matching_code_fix = match_index[matching_code].first
393
+ else
394
+ matching_code_fix = nil
395
+ end
396
+ else
397
+ matching_code_fix = matching_code
398
+ end
454
399
 
455
- if new_key_position == -1
400
+ Log.debug "Matching code (fix): #{matching_code_fix}"
401
+ next if matching_code_fix.nil?
456
402
 
457
- if new_fields.nil? or new_fields == fields
458
- each &block
459
- return [key_field, fields]
460
- else
461
- new_field_positions = field_positions(*new_fields)
462
- each do |key, values|
463
- yield key, values.values_at(*new_field_positions)
403
+ if other_index
404
+ Log.debug "Using other_index"
405
+ other_codes = other_index[matching_code_fix]
406
+ else
407
+ other_codes = matching_code_fix
464
408
  end
465
- return [key_field, fields_at(*new_field_positions)]
466
- end
409
+ Log.debug "Other codes: #{other_codes}"
467
410
 
468
- else
469
- new_field_positions = field_positions(*new_fields)
411
+ next if other_codes.nil? or other_codes.empty?
412
+ other_code = other_codes.first
470
413
 
471
- new_field_names = fields_at(*new_field_positions)
472
- if new_field_names.nil? and fields
473
- new_field_names = fields.dup
474
- new_field_names.delete_at new_key_position
475
- new_field_names.unshift key_field
476
- end
414
+ if nofieldinfo
415
+ next if other[other_code].nil?
416
+ if list
417
+ other_values = [[other_code]] + other[other_code]
418
+ else
419
+ other_values = [other_code] + other[other_code]
420
+ end
421
+ other_values.delete_if do |list|
422
+ list = [list] unless Array === list
423
+ list.collect{|e| case_insensitive ? e.downcase : e }.
424
+ select{|e| case_insensitive ? e == matching_code.downcase : e == matching_code }.any?
425
+ end
477
426
 
478
- each do |key, values|
479
- if list
480
- tmp_values = values + [[key]]
427
+ new_values = values + other_values
481
428
  else
482
- tmp_values = values + [key]
483
- end
429
+ if other[other_code].nil?
430
+ if list
431
+ other_values = [[]] * other.fields.length
432
+ else
433
+ other_values = [] * other.fields.length
434
+ end
435
+ else
436
+ if list
437
+ other_values = other[other_code] + [[other_code]]
438
+ else
439
+ other_values = other[other_code] + [other_code]
440
+ end
441
+ end
442
+
484
443
 
485
- if new_field_positions.nil?
486
444
  new_values = values.dup
487
- new_values.delete_at new_key_position
488
- new_values.unshift [key]
489
- else
490
- new_values = tmp_values.values_at(*new_field_positions)
491
- end
492
445
 
493
- tmp_values[new_key_position].each do |new_key|
494
- if new_field_names
495
- yield new_key, NamedArray.name(new_values, new_field_names)
496
- else
497
- yield new_key, new_values
446
+ if list
447
+ this_common_field_positions.zip(other_common_field_positions).each do |tpos, opos|
448
+ new_values_tops = new_values[tpos]
449
+
450
+ if other.list
451
+ new_values_tops += other_values[opos]
452
+ else
453
+ new_values_tops += [other_values[opos]]
454
+ end
455
+
456
+ new_values[tpos] = new_values_tops.uniq
457
+ end
498
458
  end
459
+
460
+ new_values.concat other_values.values_at *other_new_field_positions
499
461
  end
462
+
463
+ self[key] = new_values
500
464
  end
501
- return [(fields_at(new_key_position) || [nil]).first, new_field_names]
502
465
  end
466
+
467
+ self.fields = self.fields + new_fields unless nofieldinfo
468
+ end
469
+
470
+ #{{{ Helpers
471
+
472
+ def self.index(file, options = {})
473
+ opt_data = options.dup
474
+ opt_index = options.dup
475
+ opt_data.delete :field
476
+ opt_data.delete :persistence
477
+ opt_index.delete :persistence
478
+
479
+ opt_data[:persistence] = true if options[:data_persistence]
480
+
481
+ opt_index.merge! :persistence_file => get_persistence_file(file, "index:#{ file }_#{options[:field]}:", opt_index) if options[:persistence]
482
+
483
+ if ! opt_index[:persistence_file].nil? && File.exists?(opt_index[:persistence_file])
484
+ Log.low "Reloading persistent index for #{ file }: #{opt_index[:persistence_file]}"
485
+ TSV.new(PersistenceHash.get(opt_index[:persistence_file], false), opt_index)
486
+ else
487
+ Log.low "Creating index for #{ file }: #{opt_index[:persistence_file]}"
488
+ data = TSV.new(file, opt_data)
489
+ data.index(opt_index)
490
+ end
491
+ end
492
+
493
+ def self.open_file(file)
494
+ if file =~ /(.*?)#(.*)/
495
+ file, options = $1, Misc.string2hash($2.to_s)
496
+ else
497
+ options = {}
498
+ end
499
+
500
+ TSV.new(file, options)
503
501
  end
504
-
505
- def process(field)
506
- through do |key, values|
507
- values[field].replace yield(values[field], key, values) unless values[field].nil?
508
- end
502
+
503
+ #{{{ Accesor Methods
504
+
505
+ def keys
506
+ @data.keys
509
507
  end
510
508
 
509
+ def values
510
+ @data.values
511
+ end
511
512
 
512
- def reorder(new_key_field, new_fields = nil, options = {})
513
- options = Misc.add_defaults options
514
- return TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => case_insensitive) if options[:persistence_file] and File.exists?(options[:persistence_file])
513
+ def size
514
+ @data.size
515
+ end
515
516
 
516
- new = {}
517
- new_key_field, new_fields = through new_key_field, new_fields do |key, values|
518
- if new[key].nil?
519
- new[key] = values
520
- else
521
- new[key] = new[key].zip(values)
522
- end
523
- end
517
+ # Write
524
518
 
525
- new.each do |key,values|
526
- values.each{|list| list.flatten! if Array === list}
519
+ def []=(key, value)
520
+ key = key.downcase if @case_insensitive
521
+ @data[key] = value
522
+ end
523
+
524
+
525
+ def merge!(new_data)
526
+ new_data.each do |key, value|
527
+ self[key] = value
527
528
  end
529
+ end
528
530
 
529
- if options[:persistence_file]
530
- reordered = TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => case_insensitive)
531
- reordered.merge! new
531
+ # Read
532
+
533
+ def follow(value)
534
+ if String === value && value =~ /__Ref:(.*)/
535
+ return self[$1]
532
536
  else
533
- reordered = TSV.new(new, :case_insensitive => case_insensitive)
537
+ value = NamedArray.name value, fields if Array === value and fields
538
+ value
534
539
  end
540
+ end
535
541
 
536
- reordered.key_field = new_key_field
537
- reordered.fields = new_fields
542
+ def [](key)
543
+ if Array === key
544
+ return @data[key] if @data[key] != nil
545
+ key.each{|k| v = self[k]; return v unless v.nil?}
546
+ return nil
547
+ end
538
548
 
539
- reordered
549
+ key = key.downcase if @case_insensitive
550
+ follow @data[key]
540
551
  end
541
552
 
542
- def slice(new_fields, options = {})
543
- reorder(:main, new_fields)
553
+ def values_at(*keys)
554
+ keys.collect{|k|
555
+ self[k]
556
+ }
544
557
  end
545
558
 
546
- def index(options = {})
547
- options = Misc.add_defaults options, :order => false
548
-
549
- if options[:persistence] and ! options[:persistence_file]
550
- options[:persistence_file] = TSV.get_persistence_file(filename, "index:#{ filename }_#{options[:field]}:", options)
559
+ def each(&block)
560
+ @data.each do |key, value|
561
+ block.call(key, follow(value))
551
562
  end
563
+ end
552
564
 
553
- if options[:persistence_file] and File.exists?(options[:persistence_file])
554
- return TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => options[:case_insensitive])
565
+ def collect
566
+ if block_given?
567
+ @data.collect do |key, value|
568
+ value = follow(value)
569
+ key, values = yield key, value
570
+ end
571
+ else
572
+ @data.collect do |key, value|
573
+ [key, follow(value)]
574
+ end
555
575
  end
576
+ end
556
577
 
557
- new = {}
558
- if options[:order]
559
- new_key_field, new_fields = through options[:field], options[:others] do |key, values|
560
-
561
- values.each_with_index do |list, i|
562
- next if list.nil? or list.empty?
563
-
564
- list = [list] unless Array === list
565
-
566
- list.each do |value|
567
- next if value.nil? or value.empty?
568
- value = value.downcase if options[:case_insensitive]
569
- new[value] ||= []
570
- new[value][i + 1] ||= []
571
- new[value][i + 1] << key
572
- end
573
- new[key] ||= []
574
- new[key][0] = key
575
- end
578
+ def sort(&block)
579
+ collect.sort(&block).collect{|p|
580
+ key, value = p
581
+ value = NamedArray.name value, fields if fields
582
+ [key, value]
583
+ }
584
+ end
576
585
 
577
- end
586
+ def sort_by(&block)
587
+ collect.sort_by &block
588
+ end
578
589
 
579
- new.each do |key, values|
580
- values.flatten!
581
- values.compact!
582
- end
590
+ def to_s
591
+ str = ""
583
592
 
584
- else
585
- new_key_field, new_fields = through options[:field], options[:others] do |key, values|
586
- new[key] ||= []
587
- new[key] << key
588
- values.each do |list|
589
- next if list.nil?
590
- if Array === list
591
- list.each do |value|
592
- value = value.downcase if options[:case_insensitive]
593
- new[value] ||= []
594
- new[value] << key
595
- end
596
- else
597
- next if list.empty?
598
- value = list
599
- value = value.downcase if options[:case_insensitive]
600
- new[value] ||= []
601
- new[value] << key
602
- end
603
- end
604
- end
593
+ if fields
594
+ str << "#" << key_field << "\t" << fields * "\t" << "\n"
605
595
  end
606
596
 
607
- if options[:persistence_file]
608
- index = TSV.new(PersistenceHash.get(options[:persistence_file], false), :case_insensitive => options[:case_insensitive])
609
- index.merge! new
610
- else
611
- index = TSV.new(new, :case_insensitive => options[:case_insensitive])
597
+ each do |key, values|
598
+ case
599
+ when values.nil?
600
+ str << key.dup << "\n"
601
+ when (not Array === values)
602
+ str << key.dup << "\t" << values.to_s << "\n"
603
+ when Array === values.first
604
+ str << key.dup << "\t" << values.collect{|list| (list || []) * "|"} * "\t" << "\n"
605
+ else
606
+ str << key.dup << "\t" << values * "\t" << "\n"
607
+ end
612
608
  end
613
609
 
614
- index.key_field = new_key_field
615
- index.fields = new_fields
616
- index
610
+ str
617
611
  end
618
612
 
619
- def smart_merge(other, match = nil, new_fields = nil)
620
-
621
- new_fields = [new_fields] if String === new_fields
622
- if self.fields and other.fields
623
- common_fields = ([self.key_field] + self.fields) & ([other.key_field] + other.fields)
624
- new_fields ||= ([other.key_field] + other.fields) - ([self.key_field] + self.fields)
613
+ #{{{ Parsing
614
+
615
+ def self.parse_fields(io, delimiter = "\t")
616
+ return [] if io.nil?
617
+ fields = io.split(delimiter, -1)
618
+ fields
619
+ end
625
620
 
626
- common_fields.delete match if String === match
627
- common_fields.delete_at match if Integer === match
621
+ def self.zip_fields(list, fields = nil)
622
+ return [] if list.nil? || list.empty?
623
+ fields ||= list.fields if list.respond_to? :fields
624
+ zipped = list[0].zip(*list[1..-1])
625
+ zipped = zipped.collect{|v| NamedArray.name(v, fields)} if fields
626
+ zipped
627
+ end
628
+
629
+ def self.parse(data, file, options = {})
628
630
 
629
- this_common_field_positions = self.field_positions *common_fields
630
- other_common_field_positions = other.field_positions *common_fields
631
- other_new_field_positions = other.field_positions *new_fields
632
- else
633
- nofieldinfo = true
634
- end
631
+ # Prepare options
632
+ options = add_defaults options,
633
+ :sep => "\t",
634
+ :sep2 => "|",
635
+ :native => 0,
636
+ :extra => nil,
637
+ :fix => nil,
638
+ :exclude => nil,
639
+ :select => nil,
640
+ :grep => nil,
641
+ :single => false,
642
+ :unique => false,
643
+ :flatten => false,
644
+ :overwrite => false,
645
+ :keep_empty => true,
646
+ :case_insensitive => false,
647
+ :header_hash => '#' ,
648
+ :persistence_file => nil
635
649
 
636
- case
637
- when TSV === match
638
- match_index = match
639
- matching_code_position = nil
650
+ options[:extra] = [options[:extra]] if options[:extra] != nil && ! (Array === options[:extra])
651
+ options[:flatten] = true if options[:single]
640
652
 
641
- when Array === match
642
- match_index = match.first
643
- matching_code_position = field_positions(match.last).first
644
653
 
645
- when match =~ /^through:(.*)/
646
- through = $1
647
- if through =~ /(.*)#using:(.*)/
648
- through = $1
649
- matching_code_position = field_positions($2).first
650
- else
651
- matching_code_position = nil
652
- end
653
- index_fields = TSV.headers(through)
654
- target_field = index_fields.select{|field| other.fields_include field}.first
655
- Log.debug "Target Field: #{ target_field }"
656
- match_index = TSV.open_file(through).index(:field => target_field)
657
654
 
658
- when field_positions(match).first
659
- matching_code_position = field_positions(match).first
660
- match_index = nil
661
- end
655
+ #{{{ Process first line
662
656
 
663
- if matching_code_position.nil? and match_index.fields
664
- match_index.fields.each do |field|
665
- if matching_code_position = field_positions(field).first
666
- break
667
- end
668
- end
657
+ line = file.gets
658
+ raise "Empty content" if line.nil?
659
+ line.chomp!
660
+
661
+ if line =~ /^#{options[:header_hash]}/
662
+ header_fields = parse_fields(line, options[:sep])
663
+ header_fields[0] = header_fields[0][(0 + options[:header_hash].length)..-1] # Remove initial hash character
664
+ line = file.gets
665
+ else
666
+ header_fields = nil
669
667
  end
668
+
669
+ id_pos = Misc.field_position(header_fields, options[:native])
670
670
 
671
- if match_index and match_index.key_field == other.key_field
672
- other_index = nil
671
+ if options[:extra].nil?
672
+ extra_pos = nil
673
+ max_cols = 0
673
674
  else
674
- other_index = (match === String and other.fields_include(match)) ?
675
- other.index(:other => match, :order => true) : other.index(:order => true)
675
+ extra_pos = options[:extra].collect{|pos| Misc.field_position(header_fields, pos) }
676
676
  end
677
677
 
678
- each do |key,values|
679
- Log.debug "Key: #{ key }. Values: #{values * ", "}"
680
- if matching_code_position.nil? or matching_code_position == -1
681
- matching_codes = [key]
682
- else
683
- matching_codes = values[matching_code_position]
684
- matching_codes = [matching_codes] unless matching_codes.nil? or Array === matching_codes
678
+ #{{{ Process rest
679
+ while line do
680
+ line.chomp!
681
+
682
+ line = options[:fix].call line if options[:fix]
683
+
684
+ # Select and fix lines
685
+ if (options[:exclude] and options[:exclude].call(line)) or
686
+ (options[:select] and not options[:select].call(line))
687
+ line = file.gets
688
+ next
685
689
  end
686
- Log.debug "Matching codes: #{matching_codes}"
687
690
 
688
- next if matching_codes.nil?
691
+ ### Process line
689
692
 
690
- matching_codes.each do |matching_code|
691
- if match_index
692
- if match_index[matching_code]
693
- matching_code_fix = match_index[matching_code].first
694
- else
695
- matching_code_fix = nil
696
- end
693
+ # Chunk fields
694
+ parts = parse_fields(line, options[:sep])
695
+
696
+ # Get next line
697
+ line = file.gets
698
+
699
+ # Get id field
700
+ next if parts[id_pos].nil? || parts[id_pos].empty?
701
+ ids = parse_fields(parts[id_pos], options[:sep2])
702
+ ids.collect!{|id| id.downcase } if options[:case_insensitive]
703
+
704
+ # Get extra fields
705
+
706
+ if options[:extra].nil? and not (options[:flatten] or options[:single])
707
+ extra = parts
708
+ extra.delete_at(id_pos)
709
+ max_cols = extra.size if extra.size > (max_cols || 0)
710
+ else
711
+ if extra_pos.nil?
712
+ extra = parts
713
+ extra.delete_at id_pos
697
714
  else
698
- matching_code_fix = matching_code
715
+ extra = parts.values_at(*extra_pos)
699
716
  end
717
+ end
700
718
 
701
- Log.debug "Matching code (fix): #{matching_code_fix}"
702
- next if matching_code_fix.nil?
719
+ extra.collect!{|value| parse_fields(value, options[:sep2])}
720
+ extra.collect!{|values| values.first} if options[:unique]
721
+ extra.flatten! if options[:flatten]
722
+ extra = extra.first if options[:single]
703
723
 
704
- if other_index
705
- Log.debug "Using other_index"
706
- other_codes = other_index[matching_code_fix]
707
- else
708
- other_codes = matching_code_fix
724
+ if options[:overwrite]
725
+ main_entry = ids.shift
726
+ ids.each do |id|
727
+ data[id] = "__Ref:#{main_entry}"
709
728
  end
710
- Log.debug "Other codes: #{other_codes}"
711
729
 
712
- next if other_codes.nil? or other_codes.empty?
713
- other_code = other_codes.first
730
+ data[main_entry] = extra
731
+ else
732
+ main_entry = ids.shift
733
+ ids.each do |id|
734
+ data[id] = "__Ref:#{main_entry}"
735
+ end
714
736
 
715
- if nofieldinfo
716
- next if other[other_code].nil?
717
- if list
718
- other_values = [[other_code]] + other[other_code]
737
+ case
738
+ when (options[:single] or options[:unique])
739
+ data[main_entry] ||= extra
740
+ when options[:flatten]
741
+ if PersistenceHash === data
742
+ data[main_entry] = (data[main_entry] || []).concat extra
719
743
  else
720
- other_values = [other_code] + other[other_code]
721
- end
722
- other_values.delete_if do |list|
723
- list = [list] unless Array === list
724
- list.collect{|e| case_insensitive ? e.downcase : e }.
725
- select{|e| case_insensitive ? e == matching_code.downcase : e == matching_code }.any?
744
+ data[main_entry] ||= []
745
+ data[main_entry].concat extra
726
746
  end
727
-
728
- new_values = values + other_values
729
747
  else
730
- if other[other_code].nil?
731
- if list
732
- other_values = [[]] * other.fields.length
733
- else
734
- other_values = [] * other.fields.length
735
- end
736
- else
737
- if list
738
- other_values = other[other_code] + [[other_code]]
739
- else
740
- other_values = other[other_code] + [other_code]
741
- end
748
+ entry = data[main_entry] || []
749
+ while entry =~ /__Ref:(.*)/ do
750
+ entry = data[$1]
742
751
  end
743
-
744
-
745
- new_values = values.dup
746
-
747
- if list
748
- this_common_field_positions.zip(other_common_field_positions).each do |tpos, opos|
749
- new_values_tops = new_values[tpos]
750
-
751
- if other.list
752
- new_values_tops += other_values[opos]
753
- else
754
- new_values_tops += [other_values[opos]]
755
- end
756
752
 
757
- new_values[tpos] = new_values_tops.uniq
753
+ extra.each_with_index do |fields, i|
754
+ if fields.empty?
755
+ next unless options[:keep_empty]
756
+ fields = [""]
758
757
  end
758
+ entry[i] ||= []
759
+ entry[i] = entry[i].concat fields
759
760
  end
760
761
 
761
- new_values.concat other_values.values_at *other_new_field_positions
762
+ data[main_entry] = entry
762
763
  end
764
+ end
765
+ end
763
766
 
764
- self[key] = new_values
767
+ if options[:keep_empty] and not max_cols.nil?
768
+ data.each do |key,values|
769
+ new_values = values
770
+ max_cols.times do |i|
771
+ new_values[i] ||= [""]
772
+ end
773
+ data[key] = new_values
765
774
  end
766
775
  end
767
776
 
768
- self.fields = self.fields + new_fields unless nofieldinfo
769
- end
770
777
 
771
- #{{{ Helpers
778
+ # Save header information
779
+ key_field = nil
780
+ fields = nil
781
+ if header_fields && header_fields.any?
782
+ key_field = header_fields[id_pos]
783
+ if extra_pos.nil?
784
+ fields = header_fields
785
+ fields.delete_at(id_pos)
786
+ else
787
+ fields = header_fields.values_at(*extra_pos)
788
+ end
789
+ end
772
790
 
773
- def self.index(file, options = {})
774
- opt_data = options.dup
775
- opt_index = options.dup
776
- opt_data.delete :field
777
- opt_data.delete :persistence
778
- opt_index.delete :persistence
791
+ data.read if PersistenceHash === data
779
792
 
780
- opt_data[:persistence] = true if options[:data_persistence]
793
+ [key_field, fields]
794
+ end
781
795
 
782
- opt_index.merge! :persistence_file => get_persistence_file(file, "index:#{ file }_#{options[:field]}:", opt_index) if options[:persistence]
796
+ attr_accessor :data, :key_field, :fields, :list, :case_insensitive, :filename
797
+ def fields
798
+ fields = @fields
799
+ fields.each do |f| f.extend Field end if Array === fields
800
+ fields
801
+ end
783
802
 
784
- if ! opt_index[:persistence_file].nil? && File.exists?(opt_index[:persistence_file])
785
- TSV.log "Reloading persistent index for #{ file }: #{opt_index[:persistence_file]}"
786
- TSV.new(PersistenceHash.get(opt_index[:persistence_file], false), opt_index)
787
- else
788
- TSV.log "Creating index for #{ file }: #{opt_index[:persistence_file]}"
789
- data = TSV.new(file, opt_data)
790
- data.index(opt_index)
803
+ def initialize(file = {}, options = {})
804
+ options = Misc.add_defaults options
805
+ options[:persistence] = true if options[:persistence_file]
806
+
807
+ if String === file && file =~ /(.*?)#(.*)/
808
+ file, file_options = $1, $2
809
+ options = Misc.add_defaults file_options, options
791
810
  end
792
- end
793
811
 
794
- def self.open_file(file)
795
- if file =~ /(.*?)#(.*)/
796
- file, options = $1, Misc.string2hash($2.to_s)
812
+ @case_insensitive = options[:case_insensitive] == true
813
+ @list = ! (options[:flatten] == true || options[:single] == true || options[:unique] == true)
814
+
815
+ case
816
+ when TSV === file
817
+ Log.low "Copying TSV"
818
+ @filename = file.filename
819
+
820
+ if options[:persistence] and not PersistenceHash === file.data
821
+ persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
822
+ Log.low "Making persistance #{ persistence_file }"
823
+ @data = TCHash.get(persistence_file)
824
+ @data.merge! file
825
+ @data.key_field = file.key_field
826
+ @data.fields = file.fields
827
+ else
828
+ @data = file.data
829
+ end
830
+
831
+ @key_field = file.key_field
832
+ @fields = file.fields
833
+ @case_insensitive = file.case_insensitive
834
+ @list = file.list
835
+ return self
836
+ when Hash === file
837
+ Log.low "Encapsulating Hash"
838
+ @filename = "Hash:" + Digest::MD5.hexdigest(file.inspect)
839
+ if options[:persistence]
840
+ persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
841
+ Log.low "Making persistance #{ persistence_file }"
842
+ @data = TCHash.get(persistence_file)
843
+ @data.merge! file
844
+ else
845
+ @data = file
846
+ end
847
+ return self
848
+ when PersistenceHash === file
849
+ Log.low "Encapsulating PersistenceHash"
850
+ @filename = "PersistenceHash:" + Digest::MD5.hexdigest(file.inspect)
851
+ @data = file
852
+ @key_field = file.key_field
853
+ @fields = file.fields
854
+ return self
855
+ when File === file
856
+ @filename = File.expand_path file.path
857
+ when String === file && File.exists?(file)
858
+ @filename = File.expand_path file
859
+ file = Open.open(file)
860
+ when StringIO
861
+ else
862
+ raise "File #{file} not found"
863
+ end
864
+
865
+ if options[:persistence]
866
+ options.delete :persistence
867
+ persistence_file = options.delete(:persistence_file) || TSV.get_persistence_file(@filename, "file:#{ @filename }:", options)
868
+
869
+ if File.exists? persistence_file
870
+ Log.low "Loading Persistence for #{ @filename } in #{persistence_file}"
871
+ @data = PersistenceHash.get(persistence_file, false)
872
+ @key_field = @data.key_field
873
+ @fields = @data.fields
874
+ else
875
+ @data = PersistenceHash.get(persistence_file, true)
876
+ file = Open.grep(file, options[:grep]) if options[:grep]
877
+
878
+ Log.low "Persistent Parsing for #{ @filename } in #{persistence_file}"
879
+ @key_field, @fields = TSV.parse(@data, file, options.merge(:persistence_file => persistence_file))
880
+ @data.key_field = @key_field
881
+ @data.fields = @fields
882
+ @data.read
883
+ end
797
884
  else
798
- options = {}
885
+ Log.low "Non-persistent parsing for #{ @filename }"
886
+ @data = {}
887
+ file = Open.grep(file, options[:grep]) if options[:grep]
888
+ @key_field, @fields = TSV.parse(@data, file, options)
799
889
  end
800
890
 
801
- TSV.new(file, options)
891
+ file.close
892
+ @case_insensitive = options[:case_insensitive] == true
802
893
  end
803
894
 
804
895
  end
896
+
897
+ #{{{ CacheHelper
898
+ require 'rbbt/util/cachehelper'
899
+ module CacheHelper
900
+ def self.tsv_cache(name, key = [])
901
+ cache_file = CacheHelper.build_filename name, key
902
+
903
+ if File.exists? cache_file
904
+ Log.debug "TSV cache file '#{cache_file}' found"
905
+ hash = TCHash.get(cache_file)
906
+ TSV.new(hash)
907
+ else
908
+ Log.debug "Producing TSV cache file '#{cache_file}'"
909
+ data = yield
910
+ TSV.new(data, :persistence_file => cache_file)
911
+ end
912
+ end
913
+ end