scout-gear 7.2.0 → 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vimproject +51 -6
- data/VERSION +1 -1
- data/bin/scout +6 -3
- data/lib/rbbt-scout.rb +1 -0
- data/lib/scout/cmd.rb +1 -1
- data/lib/scout/concurrent_stream.rb +33 -29
- data/lib/scout/config.rb +1 -1
- data/lib/scout/exceptions.rb +1 -0
- data/lib/scout/log/color.rb +4 -2
- data/lib/scout/log/progress/report.rb +1 -1
- data/lib/scout/log/progress/util.rb +71 -2
- data/lib/scout/log/progress.rb +1 -1
- data/lib/scout/log/trap.rb +107 -0
- data/lib/scout/log.rb +56 -21
- data/lib/scout/meta_extension.rb +13 -6
- data/lib/scout/misc/digest.rb +1 -1
- data/lib/scout/misc/format.rb +12 -0
- data/lib/scout/misc/helper.rb +31 -0
- data/lib/scout/misc/insist.rb +1 -1
- data/lib/scout/misc/monitor.rb +12 -1
- data/lib/scout/misc/system.rb +10 -0
- data/lib/scout/misc.rb +1 -0
- data/lib/scout/named_array.rb +65 -3
- data/lib/scout/open/lock/lockfile.rb +587 -0
- data/lib/scout/open/lock.rb +28 -2
- data/lib/scout/open/remote.rb +4 -0
- data/lib/scout/open/stream.rb +111 -42
- data/lib/scout/open/util.rb +13 -3
- data/lib/scout/path/find.rb +9 -1
- data/lib/scout/path/util.rb +35 -0
- data/lib/scout/persist/serialize.rb +18 -5
- data/lib/scout/persist.rb +60 -30
- data/lib/scout/resource/path.rb +53 -0
- data/lib/scout/resource/produce.rb +0 -8
- data/lib/scout/resource/util.rb +2 -1
- data/lib/scout/semaphore.rb +8 -1
- data/lib/scout/tmpfile.rb +7 -8
- data/lib/scout/tsv/attach.rb +177 -0
- data/lib/scout/tsv/change_id.rb +40 -0
- data/lib/scout/tsv/dumper.rb +85 -54
- data/lib/scout/tsv/index.rb +188 -20
- data/lib/scout/tsv/open.rb +182 -0
- data/lib/scout/tsv/parser.rb +200 -118
- data/lib/scout/tsv/path.rb +5 -6
- data/lib/scout/tsv/persist/adapter.rb +26 -37
- data/lib/scout/tsv/persist/fix_width_table.rb +327 -0
- data/lib/scout/tsv/persist/serialize.rb +117 -0
- data/lib/scout/tsv/persist/tokyocabinet.rb +6 -3
- data/lib/scout/tsv/persist.rb +4 -2
- data/lib/scout/tsv/transformer.rb +141 -0
- data/lib/scout/tsv/traverse.rb +136 -37
- data/lib/scout/tsv/util/filter.rb +312 -0
- data/lib/scout/tsv/util/process.rb +73 -0
- data/lib/scout/tsv/util/reorder.rb +81 -0
- data/lib/scout/tsv/util/select.rb +265 -0
- data/lib/scout/tsv/util/unzip.rb +86 -0
- data/lib/scout/tsv/util.rb +126 -19
- data/lib/scout/tsv.rb +28 -5
- data/lib/scout/work_queue/socket.rb +6 -1
- data/lib/scout/work_queue/worker.rb +5 -2
- data/lib/scout/work_queue.rb +15 -8
- data/lib/scout/workflow/definition.rb +29 -2
- data/lib/scout/workflow/step/dependencies.rb +24 -4
- data/lib/scout/workflow/step/info.rb +40 -5
- data/lib/scout/workflow/step/progress.rb +14 -0
- data/lib/scout/workflow/step/provenance.rb +8 -7
- data/lib/scout/workflow/step/status.rb +45 -0
- data/lib/scout/workflow/step.rb +104 -33
- data/lib/scout/workflow/task/inputs.rb +14 -20
- data/lib/scout/workflow/task.rb +86 -47
- data/lib/scout/workflow/usage.rb +10 -6
- data/scout-gear.gemspec +30 -3
- data/scout_commands/workflow/task +37 -9
- data/scout_commands/workflow/task_old +2 -2
- data/test/scout/open/test_stream.rb +61 -59
- data/test/scout/path/test_find.rb +10 -1
- data/test/scout/resource/test_produce.rb +15 -0
- data/test/scout/test_meta_extension.rb +25 -0
- data/test/scout/test_named_array.rb +18 -0
- data/test/scout/test_persist.rb +67 -0
- data/test/scout/test_tmpfile.rb +1 -1
- data/test/scout/test_tsv.rb +222 -3
- data/test/scout/test_work_queue.rb +21 -18
- data/test/scout/tsv/persist/test_adapter.rb +11 -1
- data/test/scout/tsv/persist/test_fix_width_table.rb +134 -0
- data/test/scout/tsv/persist/test_tokyocabinet.rb +29 -1
- data/test/scout/tsv/test_attach.rb +227 -0
- data/test/scout/tsv/test_change_id.rb +98 -0
- data/test/scout/tsv/test_dumper.rb +1 -1
- data/test/scout/tsv/test_index.rb +127 -3
- data/test/scout/tsv/test_open.rb +167 -0
- data/test/scout/tsv/test_parser.rb +45 -3
- data/test/scout/tsv/test_persist.rb +9 -0
- data/test/scout/tsv/test_transformer.rb +108 -0
- data/test/scout/tsv/test_traverse.rb +195 -3
- data/test/scout/tsv/test_util.rb +24 -0
- data/test/scout/tsv/util/test_filter.rb +188 -0
- data/test/scout/tsv/util/test_process.rb +47 -0
- data/test/scout/tsv/util/test_reorder.rb +94 -0
- data/test/scout/tsv/util/test_select.rb +58 -0
- data/test/scout/tsv/util/test_unzip.rb +112 -0
- data/test/scout/work_queue/test_socket.rb +0 -1
- data/test/scout/work_queue/test_worker.rb +63 -6
- data/test/scout/workflow/step/test_load.rb +3 -3
- data/test/scout/workflow/step/test_status.rb +31 -0
- data/test/scout/workflow/task/test_inputs.rb +14 -14
- data/test/scout/workflow/test_step.rb +13 -13
- data/test/scout/workflow/test_task.rb +168 -32
- data/test/scout/workflow/test_usage.rb +33 -6
- data/test/test_helper.rb +3 -1
- metadata +29 -2
data/lib/scout/tmpfile.rb
CHANGED
@@ -93,19 +93,18 @@ module TmpFile
|
|
93
93
|
end
|
94
94
|
end
|
95
95
|
|
96
|
+
SLASH_REPLACE = '·'
|
96
97
|
def self.tmp_for_file(file, tmp_options = {}, other_options = {})
|
97
|
-
tmp_for_file = IndiferentHash.process_options tmp_options, :file
|
98
|
+
tmp_for_file, prefix, key, persistence_dir = IndiferentHash.process_options tmp_options, :file, :prefix, :key, :dir
|
98
99
|
return tmp_for_file unless tmp_for_file.nil?
|
99
100
|
|
100
|
-
prefix = IndiferentHash.process_options tmp_options, :prefix
|
101
|
-
|
102
101
|
if prefix.nil?
|
103
|
-
perfile = file.to_s.
|
102
|
+
perfile = file.to_s.sub(/\.b?gz$/,'')
|
104
103
|
else
|
105
|
-
perfile = prefix.to_s + ":" + file.to_s.
|
104
|
+
perfile = prefix.to_s + ":" + file.to_s.sub(/\.b?gz$/,'')
|
106
105
|
end
|
107
106
|
|
108
|
-
perfile
|
107
|
+
perfile += "[#{ key }]" if key
|
109
108
|
|
110
109
|
if other_options.include? :filters
|
111
110
|
other_options[:filters].each do |match,value|
|
@@ -113,10 +112,10 @@ module TmpFile
|
|
113
112
|
end
|
114
113
|
end
|
115
114
|
|
116
|
-
persistence_dir =
|
115
|
+
persistence_dir = TmpFile.tmpdir if persistence_dir.nil?
|
117
116
|
Path.setup(persistence_dir) unless Path === persistence_dir
|
118
117
|
|
119
|
-
filename = perfile.gsub(/\s/,'_').gsub(
|
118
|
+
filename = perfile.gsub(/\s/,'_').gsub('/', SLASH_REPLACE)
|
120
119
|
clean_options = other_options.dup
|
121
120
|
clean_options.delete :unnamed
|
122
121
|
clean_options.delete "unnamed"
|
@@ -0,0 +1,177 @@
|
|
1
|
+
module TSV
|
2
|
+
|
3
|
+
# Decide which field of +source+ is matched against which field of +other+.
#
# Resolution order when +match_key+ is not given:
# 1. the first field name both tables share verbatim;
# 2. the first +source+ field that +other+ can identify;
# 3. the first +other+ field that +source+ can identify;
# 4. the key field of +source+.
#
# A caller-supplied +other_key+ is kept unless one of the probing steps
# actually finds a match (it is no longer reset to nil by a failed probe).
#
# @param source [#all_fields, #identify_field, #key_field]
# @param other [#all_fields, #identify_field, #key_field]
# @param match_key [Object, nil] field of +source+ to match on
# @param other_key [Object, nil] field of +other+ to match against
# @return [Array] <tt>[match_key, other_key]</tt>; either element is +:key+
#   when it equals the key field of its table
def self.match_keys(source, other, match_key: nil, other_key: nil)
  match_key = (source.all_fields & other.all_fields).first if match_key.nil?

  if match_key.nil?
    source.all_fields.each do |field|
      found = other.identify_field(field)
      next unless found

      other_key = found == :key ? other.key_field : found
      match_key = field
      break
    end
  end

  if match_key.nil?
    other.all_fields.each do |field|
      found = source.identify_field(field)
      next unless found

      match_key = found
      other_key = field
      break
    end
  end

  match_key = source.key_field if match_key.nil?

  other_key = other.identify_field(match_key) if other_key.nil?
  other_key = other.key_field if other_key.nil?

  match_key = :key if match_key == source.key_field
  other_key = :key if other_key == other.key_field

  [match_key, other_key]
end
|
40
|
+
|
41
|
+
# Add fields taken from +other+ to the entries of +source+.
#
# +source+ is wrapped in a TSV::Transformer unless it already is a TSV or a
# TSV::Parser; +other+ is opened with TSV.open unless it already is a TSV.
# The pair of fields used to relate both tables comes from TSV.match_keys.
#
# @param target [:stream, nil, Object] only used when +source+ is a
#   Transformer: selects the dumper (+:stream+ -> TSV::Dumper, +nil+ -> an
#   in-memory TSV, anything else is used as given)
# @param fields [String, Array, nil] fields of +other+ to attach; defaults to
#   all fields of +other+ except its matching key and the key of +source+
# @param match_key [Object, nil] forwarded to TSV.match_keys
# @param other_key [Object, nil] forwarded to TSV.match_keys
# @param one2one [Boolean] forwarded to +other.reorder+
# @param complete [Boolean] when true and matching on the key, entries of
#   +other+ that are missing in +source+ are added as well
# @param insitu [Boolean, nil] modify the traversed value arrays in place;
#   defaults to true when +source+ is a TSV
# @param persist_input [Boolean] forwarded to TSV.open as +persist+
# @param bar [Object] progress bar setting; +true+ uses the log message
# @return [Object] +source+ (possibly the Transformer wrapping the input)
def self.attach(source, other, target: nil, fields: nil, match_key: nil, other_key: nil, one2one: true, complete: false, insitu: nil, persist_input: false, bar: nil)
  source = TSV::Transformer.new source unless TSV === source || TSV::Parser === source
  other = TSV.open other, persist: persist_input unless TSV === other

  fields = [fields] if String === fields

  match_key, other_key = TSV.match_keys(source, other, match_key: match_key, other_key: other_key)

  if TSV::Transformer === source
    source.dumper = case target
                    when :stream
                      TSV::Dumper.new(source.options.merge(sep: "\t"))
                    when nil
                      TSV.setup({}, **source.options.dup)
                    else
                      target
                    end
  end

  other.with_unnamed do
    source.with_unnamed do

      other_key_name = other_key == :key ? other.key_field : other_key
      other_key_name = other.fields[other_key_name] if Integer === other_key
      fields = other.all_fields - [other_key_name, source.key_field] if fields.nil?

      # Re-key +other+ so that it can be looked up by the matching field.
      if other_key != :key
        other = other.reorder other_key, fields, one2one: one2one
      end

      other_field_positions = other.identify_field(fields)

      log_message = "Attach #{Log.fingerprint fields - source.fields} to #{Log.fingerprint source} (#{[match_key, other_key] * "=~"})"
      Log.debug log_message
      bar = log_message if TrueClass === bar

      source.fields = (source.fields + fields).uniq

      # Position in +source+ of every attached field, in the order of +fields+.
      overlaps = source.identify_field(fields)

      # Placeholder used when +other+ has no entry for a key.
      empty_other_values = case source.type
                           when :list
                             [nil] * other.fields.length
                           when :flat
                             []
                           when :double
                             [[]] * other.fields.length
                           end

      insitu = TSV === source ? true : false if insitu.nil?

      match_key_pos = source.identify_field(match_key)
      source.traverse bar: bar, unnamed: true do |orig_key,current_values|
        keys = (match_key == :key || match_key_pos == :key) ? [orig_key] : current_values[match_key_pos]
        keys = [keys] unless Array === keys

        current_values = current_values.dup unless insitu
        keys.each do |current_key|
          other_values = other[current_key]

          # Bring the values of +other+ into the shape used by +source+.
          if other_values.nil?
            other_values = empty_other_values
          elsif other.type == :flat
            other_values = [other_values]
          elsif other.type == :list && source.type == :double
            other_values = other_values.collect{|v| [v] }
          elsif other.type == :double && source.type == :list
            other_values = other_values.collect{|v| v.first }
          end

          other_values = other_values.values_at *other_field_positions

          other_values.zip(overlaps).each do |v,overlap|
            if source.type == :list
              # Only fill positions that are still empty.
              current_values[overlap] = v if current_values[overlap].nil? || String === current_values[overlap] && current_values[overlap].empty?
            else
              current_values[overlap] ||= []
              current_values[overlap].concat(v - current_values[overlap])
            end
          end
        end
        source[orig_key] = current_values unless insitu
        nil
      end

      if complete && match_key == :key
        other.each do |extra_key,other_values|
          next if source.include?(extra_key)

          if other.type == :flat
            other_values = [other_values]
          elsif other.type == :list && source.type == :double
            other_values = other_values.collect{|v| [v] }
          elsif other.type == :double && source.type == :list
            other_values = other_values.collect{|v| v.first }
          end

          # Keep only the attached fields, in the same order as +overlaps+
          # (mirrors the main loop; without it the zip below misaligns when
          # +fields+ is a subset of the fields of +other+).
          other_values = other_values.values_at *other_field_positions

          new_values = case source.type
                       when :list
                         [nil] * source.fields.length
                       when :flat
                         []
                       when :double
                         source.fields.length.times.collect{ [] }
                       end

          other_values.zip(overlaps).each do |v,overlap|
            if source.type == :list
              # The emptiness test is on the slot being filled, not on +v+.
              new_values[overlap] = v if new_values[overlap].nil? || String === new_values[overlap] && new_values[overlap].empty?
            else
              new_values[overlap].concat v
            end
          end
          source[extra_key] = new_values
        end
      end
    end
  end

  source
end
|
173
|
+
|
174
|
+
# Instance-level shortcut: calls TSV.attach with this object as the source.
# All positional and keyword arguments are passed through unchanged.
def attach(*positional, **options)
  TSV.attach(self, *positional, **options)
end
|
177
|
+
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
module TSV
|
2
|
+
# Re-key +source+ so that +new_key_field+ becomes its key field.
#
# When +new_key_field+ is not a field of +source+ and an identifiers table is
# available (given explicitly or exposed by +source.identifiers+), the field
# is first attached from that table and the re-keying is applied to the
# result. Otherwise the entries are traversed through a TSV::Transformer.
#
# @param source [Object, String] table to re-key; a String is wrapped in a
#   TSV::Parser
# @param new_key_field [Object] field that becomes the key
# @param identifiers [Object, nil] table used to translate identifiers;
#   defaults to +source.identifiers+ when the source responds to it
# @param one2one [Boolean] forwarded to the traversal
# @param stream [Boolean] return the Transformer instead of its +tsv+
# @param keep [Boolean] keep the old key as the first field
# @param persist_identifiers [Object] forwarded to +attach+ as +persist_input+
# @return [Object] the Transformer when +stream+ is true, else +transformer.tsv+
def self.change_key(source, new_key_field, identifiers: nil, one2one: false, stream: false, keep: false, persist_identifiers: nil)
  source = TSV::Parser.new source if String === source

  # The default has to be resolved before the check below; inside the branch
  # +identifiers+ is always set, so a fallback placed there can never apply.
  identifiers = source.identifiers if identifiers.nil? && source.respond_to?(:identifiers)

  if identifiers && source.identify_field(new_key_field, strict: true).nil?
    translated = source.attach(identifiers, fields: [new_key_field], insitu: false, one2one: true, persist_input: persist_identifiers)
    return translated.change_key(new_key_field, keep: keep, stream: stream, one2one: one2one)
  end

  fields = source.fields.dup - [new_key_field]
  fields.unshift source.key_field if keep
  transformer = TSV::Transformer.new source
  transformer.key_field = new_key_field
  transformer.fields = fields
  transformer.traverse key_field: new_key_field, fields: fields, one2one: one2one, unnamed: true do |k,v|
    [k, v]
  end

  stream ? transformer : transformer.tsv
end
|
22
|
+
|
23
|
+
# Instance-level shortcut: calls TSV.change_key with this object as the
# source, passing every argument through unchanged.
def change_key(*positional, **options)
  TSV.change_key(self, *positional, **options)
end
|
26
|
+
|
27
|
+
# Replace the field +source_id+ of +source+ with +new_id+, translating the
# values through an identifiers table.
#
# The new field is attached from +identifiers+ and the result is sliced so
# that +new_id+ takes the position +source_id+ had among the fields.
#
# @param source [Object, String] table to modify; a String is wrapped in a
#   TSV::Parser
# @param source_id [Object] existing field to replace; must be one of
#   +source.fields+
# @param new_id [Object] field to put in its place
# @param identifiers [Object, nil] translation table; defaults to
#   +source.identifiers+
# @param one2one [Boolean] accepted for interface compatibility; not used by
#   this method at present
# @param insitu [Boolean] forwarded to +attach+
# @raise [ArgumentError] when +source_id+ is not among the fields of +source+
# @return [Object] the result of slicing the attached table
def self.change_id(source, source_id, new_id, identifiers: nil, one2one: false, insitu: false)
  source = TSV::Parser.new source if String === source

  identifiers = source.identifiers if identifiers.nil?

  new_fields = source.fields.dup
  position = new_fields.index(source_id)
  # Fail with a readable message instead of the TypeError raised by indexing
  # an array with nil.
  raise ArgumentError, "Field #{source_id.inspect} not found among fields #{new_fields.inspect}" if position.nil?
  new_fields[position] = new_id

  source.attach(identifiers, fields: [new_id], insitu: insitu).slice(new_fields)
end
|
36
|
+
|
37
|
+
# Instance-level shortcut: calls TSV.change_id with this object as the
# source, passing every argument through unchanged.
def change_id(*positional, **options)
  TSV.change_id(self, *positional, **options)
end
|
40
|
+
end
|
data/lib/scout/tsv/dumper.rb
CHANGED
@@ -1,38 +1,14 @@
|
|
1
1
|
module TSV
|
2
2
|
class Dumper
|
3
|
-
def self.header_lines(key_field, fields, entry_hash = nil)
|
4
|
-
if Hash === entry_hash
|
5
|
-
sep = entry_hash[:sep] ? entry_hash[:sep] : "\t"
|
6
|
-
preamble = entry_hash[:preamble]
|
7
|
-
header_hash = entry_hash[:header_hash]
|
8
|
-
end
|
9
|
-
|
10
|
-
header_hash = "#" if header_hash.nil?
|
11
|
-
|
12
|
-
preamble = "#: " << Misc.hash2string(entry_hash.merge(:key_field => nil, :fields => nil)) << "\n" if preamble.nil? and entry_hash and entry_hash.values.compact.any?
|
13
|
-
|
14
|
-
str = ""
|
15
|
-
str << preamble.strip << "\n" if preamble and not preamble.empty?
|
16
|
-
if fields
|
17
|
-
if fields.empty?
|
18
|
-
str << header_hash << (key_field || "ID").to_s << "\n"
|
19
|
-
else
|
20
|
-
str << header_hash << (key_field || "ID").to_s << sep << (fields * sep) << "\n"
|
21
|
-
end
|
22
|
-
end
|
23
|
-
|
24
|
-
str
|
25
|
-
end
|
26
|
-
|
27
3
|
def self.header(options={})
|
28
|
-
key_field, fields, sep, header_hash, preamble = IndiferentHash.process_options options,
|
29
|
-
:key_field, :fields, :sep, :header_hash, :preamble,
|
4
|
+
key_field, fields, sep, header_hash, preamble, unnamed = IndiferentHash.process_options options,
|
5
|
+
:key_field, :fields, :sep, :header_hash, :preamble, :unnamed,
|
30
6
|
:sep => "\t", :header_hash => "#", :preamble => true
|
31
7
|
|
32
|
-
if fields.nil?
|
8
|
+
if fields.nil?
|
33
9
|
fields_str = nil
|
34
10
|
else
|
35
|
-
fields_str = "#{header_hash}#{key_field}#{sep}#{fields*sep}"
|
11
|
+
fields_str = "#{header_hash}#{key_field || "Id"}#{sep}#{fields*sep}"
|
36
12
|
end
|
37
13
|
|
38
14
|
if preamble && options.values.compact.any?
|
@@ -45,31 +21,62 @@ module TSV
|
|
45
21
|
end
|
46
22
|
|
47
23
|
|
48
|
-
attr_accessor :options
|
24
|
+
attr_accessor :options, :initialized, :type, :sep
|
49
25
|
# Set up a dumper from an options hash, or from a TSV / TSV::Parser, in which
# case its +options+ are used with +sep+ reset to nil.
#
# +:sep+ (default tab) and +:type+ (default +:double+) are pulled out through
# IndiferentHash.process_options; the hash itself is kept as @options. A pipe
# is opened and both ends are registered with ConcurrentStream as a pair.
def initialize(options = {})
  if TSV::Parser === options || TSV === options
    options = options.options.merge(sep: nil)
  end

  @sep, @type = IndiferentHash.process_options(options,
                                               :sep, :type,
                                               :sep => "\t", :type => :double)
  @options = options
  @sout, @sin = Open.pipe
  @initialized = false
  @mutex = Mutex.new
  ConcurrentStream.setup(@sin, pair: @sout)
  ConcurrentStream.setup(@sout, pair: @sin)
end
|
58
37
|
|
59
|
-
def
|
60
|
-
|
61
|
-
|
38
|
+
# Key field name, read from the +:key_field+ entry of the dumper options.
def key_field
  @options[:key_field]
end
|
41
|
+
|
42
|
+
# Field names, read from the +:fields+ entry of the dumper options.
def fields
  @options[:fields]
end
|
63
45
|
|
64
|
-
def
|
46
|
+
# Store +key_field+ under the +:key_field+ entry of the dumper options.
def key_field=(key_field)
  @options[:key_field] = key_field
end
|
49
|
+
|
50
|
+
# Store +fields+ under the +:fields+ entry of the dumper options.
def fields=(fields)
  @options[:fields] = fields
end
|
53
|
+
|
54
|
+
# Key field followed by the regular fields, or nil when no fields are set.
def all_fields
  if fields.nil?
    nil
  else
    [key_field] + fields
  end
end
|
65
58
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
@
|
71
|
-
|
72
|
-
|
59
|
+
|
60
|
+
# Mark the dumper as initialized and write the header to the input end of the
# pipe. The header is built by Dumper.header from the stored options plus the
# dumper's type, separator and the +preamble+ flag; nothing is written when it
# is nil or empty. The write happens under the dumper mutex.
def init(preamble: true)
  header_options = @options.merge(type: @type, sep: @sep, preamble: preamble)
  header = Dumper.header(header_options)
  @mutex.synchronize do
    @initialized = true
    @sin.puts header if header && !header.empty?
  end
end
|
67
|
+
|
68
|
+
# Write one entry to the input end of the pipe, under the dumper mutex.
#
# The key is converted with +to_s+ unless it already is a String. The value
# is rendered according to the dumper type:
# * +:single+ - +value.to_s+
# * +:list+, +:flat+ - values joined with the separator
# * +:double+ - each Array joined with "|", then joined with the separator
# Any other type writes nothing.
def add(key, value)
  @mutex.synchronize do
    key = key.to_s unless String === key

    line = case @type
           when :single
             key + @sep + value.to_s
           when :list, :flat
             key + @sep + value * @sep
           when :double
             key + @sep + value.collect{|v| Array === v ? v * "|" : v } * @sep
           end

    @sin.puts line unless line.nil?
  end
end
|
75
82
|
|
@@ -85,23 +92,47 @@ module TSV
|
|
85
92
|
# Forward an abort, with the optional +exception+, to the input end of the
# pipe.
def abort(exception=nil)
  @sin.abort(exception)
end
|
95
|
+
|
96
|
+
# Open the dumper's stream as a TSV; extra arguments go to TSV.open as given.
def tsv(*open_args)
  TSV.open(stream, *open_args)
end
|
99
|
+
|
100
|
+
# Short description of the dumper: "Dumper:{...}" wrapping the Log
# fingerprint of all its fields (an empty list when no fields are set).
#
# Built with interpolation rather than by appending to a string literal, so
# it also works when string literals are frozen.
def fingerprint
  "Dumper:{#{Log.fingerprint(all_fields || [])}}"
end
|
103
|
+
|
104
|
+
# String used when digesting the dumper; same as its fingerprint.
def digest_str
  fingerprint
end
|
107
|
+
|
108
|
+
# Inspect representation of the dumper; same as its fingerprint.
def inspect
  fingerprint
end
|
88
111
|
end
|
89
112
|
|
90
|
-
def
|
91
|
-
|
92
|
-
dumper = TSV::Dumper.new self.extension_attr_hash
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
dumper.
|
113
|
+
# Dump this object through a TSV::Dumper and return the dumper's stream.
#
# The dumper is created from +extension_attr_hash+ merged with +options+
# (+:preamble+, default true, is extracted first and passed to +init+). A
# thread writes the header and every entry, then closes the dumper; any
# error inside the thread aborts the dumper with that error. The method
# waits until the thread has set its name, then registers the thread on the
# stream through ConcurrentStream.setup.
def dumper_stream(options = {})
  preamble = IndiferentHash.process_options options, :preamble, :preamble => true
  dumper = TSV::Dumper.new self.extension_attr_hash.merge(options)

  producer = Thread.new do
    begin
      Thread.current.report_on_exception = true
      Thread.current["name"] = "Dumper thread"
      dumper.init(preamble: preamble)
      self.each { |key, values| dumper.add key, values }
      dumper.close
    rescue
      dumper.abort($!)
    end
  end

  # Wait until the producer thread has started running.
  Thread.pass until producer["name"]

  stream = dumper.stream
  ConcurrentStream.setup(stream, :threads => [producer])
  stream
end
|
103
134
|
|
104
|
-
def to_s
|
105
|
-
|
135
|
+
# Full dump as a String: reads the stream produced by +dumper_stream+ with
# the same +options+.
def to_s(options = {})
  stream = dumper_stream(options)
  stream.read
end
|
107
138
|
end
|
data/lib/scout/tsv/index.rb
CHANGED
@@ -1,12 +1,16 @@
|
|
1
1
|
require_relative 'parser'
|
2
|
+
require_relative 'transformer'
|
3
|
+
require_relative 'persist/fix_width_table'
|
2
4
|
module TSV
|
3
|
-
def self.index(tsv_file, target: 0, order: true, **kwargs)
|
4
|
-
persist, type = IndiferentHash.process_options kwargs,
|
5
|
-
:persist, :persist_type,
|
5
|
+
def self.index(tsv_file, target: 0, fields: nil, order: true, bar: nil, **kwargs)
|
6
|
+
persist, type, persist_update, data_persist = IndiferentHash.process_options kwargs,
|
7
|
+
:persist, :persist_type, :persist_update, :data_persist,
|
6
8
|
:persist => false, :persist_type => "HDB"
|
7
9
|
kwargs.delete :type
|
8
10
|
|
9
|
-
|
11
|
+
fields = :all if fields.nil?
|
12
|
+
|
13
|
+
Persist.persist(tsv_file, type, kwargs.merge(target: target, fields: fields, persist: persist, update: persist_update, :prefix => "Index", :other_options => kwargs)) do |filename|
|
10
14
|
if filename
|
11
15
|
index = ScoutCabinet.open(filename, true, type)
|
12
16
|
TSV.setup(index, :type => :single)
|
@@ -15,35 +19,199 @@ module TSV
|
|
15
19
|
index = TSV.setup({}, :type => :single)
|
16
20
|
end
|
17
21
|
|
18
|
-
|
22
|
+
tsv_file = TSV.open(tsv_file, persist: true) if data_persist && ! TSV === tsv_file
|
23
|
+
|
24
|
+
bar = "Index #{Log.fingerprint tsv_file} target #{Log.fingerprint target}" if TrueClass === bar
|
25
|
+
|
19
26
|
if order
|
20
27
|
tmp_index = {}
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
28
|
+
include_self = fields == :all || (Array === fields) && fields.include?(target)
|
29
|
+
target_key_field, source_field_names = Open.traverse tsv_file, key_field: target, fields: fields, type: :double, unnamed: true, bar: bar, **kwargs do |k,values|
|
30
|
+
tmp_index[k] ||= [[k]] if include_self
|
31
|
+
values.each_with_index do |list,i|
|
32
|
+
i += 1 if include_self
|
33
|
+
list.each do |e|
|
34
|
+
tmp_index[e] ||= []
|
35
|
+
tmp_index[e][i] ||= []
|
36
|
+
tmp_index[e][i] << k
|
29
37
|
end
|
30
38
|
end
|
31
39
|
end
|
32
40
|
tmp_index.each do |e,list|
|
33
41
|
index[e] = list.flatten.compact.uniq.first
|
34
42
|
end
|
43
|
+
|
44
|
+
index.key_field = source_field_names * ","
|
45
|
+
index.fields = [target_key_field]
|
46
|
+
|
47
|
+
tmp_index = {}
|
48
|
+
|
35
49
|
else
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
50
|
+
target_key_field, source_field_names = Open.traverse tsv_file, key_field: target, fields: fields, type: :flat, unnamed: true, bar: bar, **kwargs do |k,values|
|
51
|
+
values.each do |e|
|
52
|
+
index[e] = k unless index.include?(e)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
index.key_field = source_field_names * ","
|
57
|
+
index.fields = [target_key_field]
|
58
|
+
end
|
59
|
+
|
60
|
+
|
61
|
+
index
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
# Instance-level shortcut: calls TSV.index with this object as the data
# source, passing arguments and block through unchanged.
def index(*positional, **options, &blk)
  TSV.index(self, *positional, **options, &blk)
end
|
68
|
+
|
69
|
+
# Build a FixWidthTable holding, for every key, the range given by
# +start_field+ and +end_field+.
#
# +:persist+, +:persist_type+ (default +:fwt+), +:persist_update+ and
# +:data_persist+ are taken out of +kwargs+; +:type+ is discarded; what is
# left goes to TSV.traverse. The work runs inside Persist.persist with the
# "RangeIndex" prefix.
#
# @param tsv_file [Object] data source handed to TSV.traverse
# @param start_field [Object] field holding range starts
# @param end_field [Object] field holding range ends
# @param key_field [Object] field used as key (default +:key+)
# @return [Object] whatever Persist.persist returns for the block; the block
#   itself yields the FixWidthTable after +read+
def self.range_index(tsv_file, start_field = nil, end_field = nil, key_field: :key, **kwargs)
  persist, type, persist_update, data_persist = IndiferentHash.process_options kwargs,
    :persist, :persist_type, :persist_update, :data_persist,
    :persist => false, :persist_type => :fwt
  kwargs.delete :type

  Persist.persist(tsv_file, type, kwargs.merge(:persist => persist, :prefix => "RangeIndex", :other_options => kwargs, update: persist_update)) do |filename|

    # Parentheses are required: `! TSV === tsv_file` parses as
    # `(!TSV) === tsv_file`, which is always false, so the data was never
    # opened with persistence.
    tsv_file = TSV.open(tsv_file, persist: true) if data_persist && !(TSV === tsv_file)

    max_key_size = 0
    index_data = []
    TSV.traverse tsv_file, key_field: key_field, fields: [start_field, end_field], **kwargs do |key, values|
      key_size = key.length
      max_key_size = key_size if key_size > max_key_size

      start_pos, end_pos = values
      if Array === start_pos
        # Several ranges for the same key: pair starts with ends by position.
        start_pos.zip(end_pos).each do |s,e|
          index_data << [key, [s.to_i, e.to_i]]
        end
      else
        index_data << [key, [start_pos.to_i, end_pos.to_i]]
      end
    end

    # Without a persistence file the table is requested as :memory.
    filename = :memory if filename.nil?
    index = FixWidthTable.get(filename, max_key_size, true)
    index.add_range index_data
    index.read
    index
  end
end
|
102
|
+
|
103
|
+
# Instance-level shortcut: calls TSV.range_index with this object as the data
# source, passing arguments and block through unchanged.
def range_index(*positional, **options, &blk)
  TSV.range_index(self, *positional, **options, &blk)
end
|
106
|
+
|
107
|
+
# Build a FixWidthTable holding, for every key, the position(s) found in
# +pos_field+.
#
# +:persist+, +:persist_type+ (default +:fwt+), +:persist_update+ and
# +:data_persist+ are taken out of +kwargs+; +:type+ is discarded; what is
# left goes to TSV.traverse, which is asked for +:single+ values cast with
# +to_i+. The work runs inside Persist.persist.
#
# @param tsv_file [Object] data source handed to TSV.traverse
# @param pos_field [Object] field holding the positions
# @param key_field [Object] field used as key (default +:key+)
# @return [Object] whatever Persist.persist returns for the block; the block
#   itself yields the FixWidthTable after +read+
def self.pos_index(tsv_file, pos_field = nil, key_field: :key, **kwargs)
  persist, type, persist_update, data_persist = IndiferentHash.process_options kwargs,
    :persist, :persist_type, :persist_update, :data_persist,
    :persist => false, :persist_type => :fwt
  kwargs.delete :type

  # NOTE(review): the prefix is the same "RangeIndex" used by range_index;
  # if the persistence path depends only on prefix, file and options, both
  # indices could share a cache entry - confirm and consider a distinct prefix.
  Persist.persist(tsv_file, type, kwargs.merge(:persist => persist, update: persist_update, :prefix => "RangeIndex", :other_options => kwargs)) do |filename|

    # Parentheses are required: `! TSV === tsv_file` parses as
    # `(!TSV) === tsv_file`, which is always false.
    tsv_file = TSV.open(tsv_file, persist: true) if data_persist && !(TSV === tsv_file)

    max_key_size = 0
    index_data = []
    TSV.traverse tsv_file, key_field: key_field, fields: [pos_field], type: :single, cast: :to_i, **kwargs do |key, pos|
      key_size = key.length
      max_key_size = key_size if key_size > max_key_size

      if Array === pos
        # One entry per position (previously referenced the undefined
        # +end_pos+ and +index_pos+ and raised NameError).
        pos.each do |p|
          index_data << [key, p]
        end
      else
        index_data << [key, pos]
      end
    end

    # Without a persistence file the table is requested as :memory.
    filename = :memory if filename.nil?
    index = FixWidthTable.get(filename, max_key_size, false)
    index.add_point index_data
    index.read
    index
  end
end
|
139
|
+
|
140
|
+
# Instance-level shortcut: calls TSV.pos_index with this object as the data
# source, passing arguments and block through unchanged.
def pos_index(*positional, **options, &blk)
  TSV.pos_index(self, *positional, **options, &blk)
end
|
143
|
+
|
144
|
+
|
145
|
+
#def range_index(start_field = nil, end_field = nil, options = {})
|
146
|
+
# start_field ||= "Start"
|
147
|
+
# end_field ||= "End"
|
148
|
+
|
149
|
+
# options = Misc.add_defaults options,
|
150
|
+
# :persist => false, :persist_file => nil, :persist_update => false
|
151
|
+
|
152
|
+
# persist_options = Misc.pull_keys options, :persist
|
153
|
+
# persist_options[:prefix] ||= "RangeIndex[#{start_field}-#{end_field}]"
|
154
|
+
|
155
|
+
# Persist.persist(filename || self.object_id.to_s, :fwt, persist_options) do
|
156
|
+
# max_key_size = 0
|
157
|
+
# index_data = []
|
158
|
+
# with_unnamed do
|
159
|
+
# with_monitor :desc => "Creating Index Data", :step => 10000 do
|
160
|
+
# through :key, [start_field, end_field] do |key, values|
|
161
|
+
# key_size = key.length
|
162
|
+
# max_key_size = key_size if key_size > max_key_size
|
163
|
+
|
164
|
+
# start_pos, end_pos = values
|
165
|
+
# if Array === start_pos
|
166
|
+
# start_pos.zip(end_pos).each do |s,e|
|
167
|
+
# index_data << [key, [s.to_i, e.to_i]]
|
168
|
+
# end
|
169
|
+
# else
|
170
|
+
# index_data << [key, [start_pos.to_i, end_pos.to_i]]
|
171
|
+
# end
|
172
|
+
# end
|
173
|
+
# end
|
174
|
+
# end
|
175
|
+
|
176
|
+
# index = FixWidthTable.get(:memory, max_key_size, true)
|
177
|
+
# index.add_range index_data
|
178
|
+
# index.read
|
179
|
+
# index
|
180
|
+
# end
|
181
|
+
#end
|
182
|
+
|
183
|
+
#def self.range_index(file, start_field = nil, end_field = nil, options = {})
|
184
|
+
# start_field ||= "Start"
|
185
|
+
# end_field ||= "End"
|
186
|
+
|
187
|
+
# data_options = Misc.pull_keys options, :data
|
188
|
+
# filename = case
|
189
|
+
# when (String === file or Path === file)
|
190
|
+
# file
|
191
|
+
# when file.respond_to?(:filename)
|
192
|
+
# file.filename
|
193
|
+
# else
|
194
|
+
# file.object_id.to_s
|
195
|
+
# end
|
196
|
+
# persist_options = Misc.pull_keys options, :persist
|
197
|
+
# persist_options[:prefix] ||= "StaticRangeIndex[#{start_field}-#{end_field}]"
|
198
|
+
|
199
|
+
# filters = Misc.process_options options, :filters
|
200
|
+
|
201
|
+
# if filters
|
202
|
+
# filename += ":Filtered[#{filters.collect{|f| f * "="} * ", "}]"
|
203
|
+
# end
|
204
|
+
|
205
|
+
# Persist.persist(filename, :fwt, persist_options) do
|
206
|
+
# tsv = TSV.open(file, data_options)
|
207
|
+
# if filters
|
208
|
+
# tsv.filter
|
209
|
+
# filters.each do |match, value|
|
210
|
+
# tsv.add_filter match, value
|
211
|
+
# end
|
212
|
+
# end
|
213
|
+
|
214
|
+
# tsv.range_index(start_field, end_field, options)
|
215
|
+
# end
|
216
|
+
#end
|
49
217
|
end
|