td 0.10.38 → 0.10.39
Sign up to get free protection for your applications and to get access to all the features.
- data/ChangeLog +5 -0
- data/lib/td/command/acl.rb +1 -1
- data/lib/td/command/bulk_import.rb +134 -4
- data/lib/td/command/list.rb +2 -0
- data/lib/td/file_reader.rb +324 -0
- data/lib/td/version.rb +1 -1
- metadata +3 -2
data/ChangeLog
CHANGED
data/lib/td/command/acl.rb
CHANGED
@@ -35,7 +35,7 @@ module Command
|
|
35
35
|
|
36
36
|
client.grant_access_control(subject, action, scope, grant_option)
|
37
37
|
|
38
|
-
$stderr.puts "Access control [#{subject} #{action} #{scope}] is created #{grant_option
|
38
|
+
$stderr.puts "Access control [#{subject} #{action} #{scope}] is created #{grant_option ? 'with' : 'without'} grant option."
|
39
39
|
end
|
40
40
|
|
41
41
|
def acl_revoke(op)
|
@@ -85,17 +85,48 @@ module Command
|
|
85
85
|
end
|
86
86
|
|
87
87
|
def bulk_import_upload_part(op)
|
88
|
-
|
88
|
+
retry_limit = 10
|
89
|
+
retry_wait = 1
|
89
90
|
|
90
|
-
|
91
|
+
name, part_name, path = op.cmd_parse
|
91
92
|
|
92
|
-
File.open(path, "rb") {|
|
93
|
-
|
93
|
+
File.open(path, "rb") {|io|
|
94
|
+
bulk_import_upload_impl(name, part_name, io, io.size, retry_limit, retry_wait)
|
94
95
|
}
|
95
96
|
|
96
97
|
$stderr.puts "Part '#{part_name}' is uploaded."
|
97
98
|
end
|
98
99
|
|
100
|
+
def bulk_import_upload_parts(op)
|
101
|
+
retry_limit = 10
|
102
|
+
retry_wait = 1
|
103
|
+
suffix_count = 0
|
104
|
+
part_prefix = ""
|
105
|
+
|
106
|
+
op.on('-P', '--prefix NAME', 'add prefix to parts name') {|s|
|
107
|
+
part_prefix = s
|
108
|
+
}
|
109
|
+
op.on('-s', '--use-suffix COUNT', 'use COUNT number of . (dots) in the source file name to the parts name', Integer) {|i|
|
110
|
+
suffix_count = i
|
111
|
+
}
|
112
|
+
|
113
|
+
name, *files = op.cmd_parse
|
114
|
+
|
115
|
+
files.each {|ifname|
|
116
|
+
basename = File.basename(ifname)
|
117
|
+
part_name = part_prefix + basename.split('.')[0..suffix_count].join('.')
|
118
|
+
|
119
|
+
File.open(ifname, "rb") {|io|
|
120
|
+
size = io.size
|
121
|
+
$stderr.puts "Uploading '#{ifname}' -> '#{part_name}'... (#{size} bytes)"
|
122
|
+
|
123
|
+
bulk_import_upload_impl(name, part_name, io, size, retry_limit, retry_wait)
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
$stderr.puts "done."
|
128
|
+
end
|
129
|
+
|
99
130
|
def bulk_import_delete_part(op)
|
100
131
|
name, part_name = op.cmd_parse
|
101
132
|
|
@@ -203,6 +234,105 @@ module Command
|
|
203
234
|
$stderr.puts "Bulk import session '#{name}' is unfrozen."
|
204
235
|
end
|
205
236
|
|
237
|
+
|
238
|
+
PART_SPLIT_SIZE = 16*1024*1024
|
239
|
+
|
240
|
+
def bulk_import_prepare_part(op)
|
241
|
+
outdir = nil
|
242
|
+
split_size_kb = PART_SPLIT_SIZE / 1024 # kb
|
243
|
+
|
244
|
+
require 'td/file_reader'
|
245
|
+
reader = FileReader.new
|
246
|
+
reader.init_optparse(op)
|
247
|
+
|
248
|
+
op.on('-s', '--split-size SIZE_IN_KB', "size of each parts (default: #{split_size_kb})", Integer) {|i|
|
249
|
+
split_size_kb = i
|
250
|
+
}
|
251
|
+
op.on('-o', '--output DIR', 'output directory') {|s|
|
252
|
+
outdir = s
|
253
|
+
}
|
254
|
+
|
255
|
+
*files = op.cmd_parse
|
256
|
+
|
257
|
+
unless outdir
|
258
|
+
$stderr.puts "-o, --output DIR option is required."
|
259
|
+
exit 1
|
260
|
+
end
|
261
|
+
|
262
|
+
split_size = split_size_kb * 1024
|
263
|
+
|
264
|
+
require 'fileutils'
|
265
|
+
FileUtils.mkdir_p(outdir)
|
266
|
+
|
267
|
+
require 'json'
|
268
|
+
require 'msgpack'
|
269
|
+
require 'zlib'
|
270
|
+
|
271
|
+
error = Proc.new {|reason,data|
|
272
|
+
begin
|
273
|
+
$stderr.puts "#{reason}: #{data.to_json}"
|
274
|
+
rescue
|
275
|
+
$stderr.puts "#{reason}"
|
276
|
+
end
|
277
|
+
}
|
278
|
+
|
279
|
+
files.each {|ifname|
|
280
|
+
$stderr.puts "Processing #{ifname}..."
|
281
|
+
record_num = 0
|
282
|
+
|
283
|
+
basename = File.basename(ifname).split('.').first
|
284
|
+
File.open(ifname) {|io|
|
285
|
+
of_index = 0
|
286
|
+
out = nil
|
287
|
+
zout = nil
|
288
|
+
begin
|
289
|
+
reader.parse(io, error) {|record|
|
290
|
+
if zout == nil
|
291
|
+
ofname = "#{basename}_#{of_index}.msgpack.gz"
|
292
|
+
$stderr.puts " Preparing part \"#{basename}_#{of_index}\"..."
|
293
|
+
out = File.open("#{outdir}/#{ofname}", 'wb')
|
294
|
+
zout = Zlib::GzipWriter.new(out)
|
295
|
+
|
296
|
+
t = record['time']
|
297
|
+
$stderr.puts " sample: #{Time.at(t).utc} #{record.to_json}"
|
298
|
+
end
|
299
|
+
|
300
|
+
zout.write(record.to_msgpack)
|
301
|
+
record_num += 1
|
302
|
+
|
303
|
+
if out.size > split_size
|
304
|
+
zout.close
|
305
|
+
of_index += 1
|
306
|
+
out = nil
|
307
|
+
zout = nil
|
308
|
+
end
|
309
|
+
}
|
310
|
+
ensure
|
311
|
+
if zout
|
312
|
+
zout.close
|
313
|
+
zout = nil
|
314
|
+
end
|
315
|
+
end
|
316
|
+
$stderr.puts " #{ifname}: #{record_num} entries."
|
317
|
+
}
|
318
|
+
}
|
319
|
+
end
|
320
|
+
|
321
|
+
private
|
322
|
+
def bulk_import_upload_impl(name, part_name, io, size, retry_limit, retry_wait)
|
323
|
+
begin
|
324
|
+
client = get_client
|
325
|
+
client.bulk_import_upload_part(name, part_name, io, size)
|
326
|
+
rescue
|
327
|
+
if retry_limit > 0
|
328
|
+
retry_limit -= 1
|
329
|
+
$stderr.puts "#{$!}; retrying '#{part_name}'..."
|
330
|
+
sleep retry_wait
|
331
|
+
retry
|
332
|
+
end
|
333
|
+
raise
|
334
|
+
end
|
335
|
+
end
|
206
336
|
end
|
207
337
|
end
|
208
338
|
|
data/lib/td/command/list.rb
CHANGED
@@ -228,7 +228,9 @@ module List
|
|
228
228
|
add_list 'bulk_import:list', %w[], 'List bulk import sessions', 'bulk_import:list'
|
229
229
|
add_list 'bulk_import:show', %w[name], 'Show list of uploaded parts', 'bulk_import:show'
|
230
230
|
add_list 'bulk_import:create', %w[name db table], 'Create a new bulk import session to the the table', 'bulk_import:create logs_201201 example_db event_logs'
|
231
|
+
add_list 'bulk_import:prepare_part', %w[files_], 'Convert files into part file format', 'bulk_import:prepare_part logs/*.csv --format csv --columns time,uid,price,count --time-column "time" -o parts/'
|
231
232
|
add_list 'bulk_import:upload_part', %w[name id path.msgpack.gz], 'Upload or re-upload a file into a bulk import session', 'bulk_import:upload_part logs_201201 01h data-201201-01.msgpack.gz'
|
233
|
+
add_list 'bulk_import:upload_parts', %w[name files_], 'Upload or re-upload files into a bulk import session', 'bulk_import:upload_parts parts/* --prefix logs_'
|
232
234
|
add_list 'bulk_import:delete_part', %w[name id], 'Delete a uploaded file from a bulk import session', 'bulk_import:delete_part logs_201201 01h'
|
233
235
|
add_list 'bulk_import:perform', %w[name], 'Start to validate and convert uploaded files', 'bulk_import:perform logs_201201'
|
234
236
|
add_list 'bulk_import:error_records', %w[name], 'Show records which did not pass validations', 'bulk_import:error_records logs_201201'
|
@@ -0,0 +1,324 @@
|
|
1
|
+
|
2
|
+
module TreasureData
|
3
|
+
class FileReader
|
4
|
+
|
5
|
+
class MessagePackParsingReader
|
6
|
+
def initialize(io, error, opts)
|
7
|
+
require 'msgpack'
|
8
|
+
@io = io
|
9
|
+
@error = error
|
10
|
+
@u = MessagePack::Unpacker.new(@io)
|
11
|
+
end
|
12
|
+
|
13
|
+
def next
|
14
|
+
@u.next
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
class LineReader
|
19
|
+
def initialize(io, error, opts)
|
20
|
+
if encoding = opts[:encoding]
|
21
|
+
io.external_encoding = encoding
|
22
|
+
end
|
23
|
+
#@delimiter = opts[:line_delimiter_expr] || /\r?\n/
|
24
|
+
@io = io
|
25
|
+
@error = error
|
26
|
+
end
|
27
|
+
|
28
|
+
def next_row
|
29
|
+
@io.readline($/).chomp
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
class DelimiterParser
|
34
|
+
def initialize(reader, error, opts)
|
35
|
+
@reader = reader
|
36
|
+
@delimiter_expr = opts[:delimiter_expr]
|
37
|
+
@null_expr = opts[:null_expr]
|
38
|
+
# TODO
|
39
|
+
#@escape_char = opts[:escape_char]
|
40
|
+
#@quote_char = opts[:quote_char]
|
41
|
+
end
|
42
|
+
|
43
|
+
def next
|
44
|
+
row = @reader.next_row
|
45
|
+
array = row.split(@delimiter_expr)
|
46
|
+
array.map! {|x|
|
47
|
+
@null_expr =~ x ? nil : x
|
48
|
+
}
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
class JSONParser
|
53
|
+
def initialize(reader, error, opts)
|
54
|
+
@reader = reader
|
55
|
+
@error = error
|
56
|
+
end
|
57
|
+
|
58
|
+
def next
|
59
|
+
while true
|
60
|
+
line = @reader.next_row
|
61
|
+
begin
|
62
|
+
return JSON.parse(line)
|
63
|
+
rescue
|
64
|
+
@error.call("invalid json format: #{$!}", line)
|
65
|
+
next
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
#class ApacheParser
|
72
|
+
# REGEXP = /^([^ ]*) [^ ]* ([^ ]*) \[([^\]]*)\] "(\S+)(?: +([^ ]*) +\S*)?" ([^ ]*) ([^ ]*)(?: "([^\"]*)" "([^\"]*)")?$/
|
73
|
+
#
|
74
|
+
# def initialize(reader, error, opts)
|
75
|
+
# @reader = reader
|
76
|
+
# end
|
77
|
+
#
|
78
|
+
# def next
|
79
|
+
# while true
|
80
|
+
# m = REGEXP.match(@reader.next_row)
|
81
|
+
# if m
|
82
|
+
# h = {
|
83
|
+
# 'host' => m[1],
|
84
|
+
# 'user' => m[2],
|
85
|
+
# 'time' => m[3],
|
86
|
+
# 'method' => m[4],
|
87
|
+
# 'path' => m[5],
|
88
|
+
# 'code' => m[6],
|
89
|
+
# 'size' => m[7].to_i,
|
90
|
+
# 'referer' => m[8],
|
91
|
+
# 'agent' => m[9],
|
92
|
+
# }
|
93
|
+
# return h
|
94
|
+
# end
|
95
|
+
# end
|
96
|
+
# end
|
97
|
+
#end
|
98
|
+
|
99
|
+
class AutoTypeConvertParserFilter
|
100
|
+
def initialize(parser, error)
|
101
|
+
@parser = parser
|
102
|
+
end
|
103
|
+
|
104
|
+
def next
|
105
|
+
array = @parser.next
|
106
|
+
array.map! {|s|
|
107
|
+
# nil.to_i == 0 != nil.to_s
|
108
|
+
i = s.to_i
|
109
|
+
i.to_s == s ? i : s
|
110
|
+
}
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
class HashBuilder
|
115
|
+
def initialize(parser, error, columns)
|
116
|
+
@parser = parser
|
117
|
+
@columns = columns
|
118
|
+
end
|
119
|
+
|
120
|
+
def next
|
121
|
+
array = @parser.next
|
122
|
+
# FIXME error handling
|
123
|
+
Hash[@columns.zip(array)]
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
class TimeParserFilter
|
128
|
+
def initialize(parser, error, opts)
|
129
|
+
require 'time'
|
130
|
+
@parser = parser
|
131
|
+
@error = error
|
132
|
+
@time_column = opts[:time_column]
|
133
|
+
unless @time_column
|
134
|
+
raise '-t, --time-column NAME option is required'
|
135
|
+
end
|
136
|
+
@time_format = opts[:time_format]
|
137
|
+
end
|
138
|
+
|
139
|
+
def next
|
140
|
+
while true
|
141
|
+
row = @parser.next
|
142
|
+
tval = row[@time_column]
|
143
|
+
|
144
|
+
unless tval
|
145
|
+
@error.call("time column '#{@time_column}' is missing", row)
|
146
|
+
next
|
147
|
+
end
|
148
|
+
|
149
|
+
begin
|
150
|
+
if tf = @time_format
|
151
|
+
row['time'] = parse_time(tval, tf).to_i
|
152
|
+
elsif tval.is_a?(Integer)
|
153
|
+
row['time'] = tval
|
154
|
+
else
|
155
|
+
row['time'] = Time.parse(tval).to_i
|
156
|
+
end
|
157
|
+
return row
|
158
|
+
|
159
|
+
rescue
|
160
|
+
@error.call("invalid time format '#{tval}': #{$!}", row)
|
161
|
+
next
|
162
|
+
end
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
if Time.respond_to?(:strptime)
|
167
|
+
def parse_time(value, format)
|
168
|
+
Time.strptime(value, format)
|
169
|
+
end
|
170
|
+
else
|
171
|
+
def parse_time(value, format)
|
172
|
+
Time.parse(DateTime.strptime(value, format).to_s)
|
173
|
+
end
|
174
|
+
end
|
175
|
+
end
|
176
|
+
|
177
|
+
def initialize
|
178
|
+
@format = "text"
|
179
|
+
@default_opts = {
|
180
|
+
:delimiter_expr => /\t|,/,
|
181
|
+
#:line_delimiter_expr => /\r?\n/,
|
182
|
+
:null_expr => /\A(?:\\N|\-|)\z/,
|
183
|
+
#:quote_char => "\"",
|
184
|
+
}
|
185
|
+
@opts = {}
|
186
|
+
@parser_class = nil
|
187
|
+
end
|
188
|
+
|
189
|
+
attr_reader :default_opts, :opts
|
190
|
+
attr_accessor :parser_class
|
191
|
+
|
192
|
+
def init_optparse(op)
|
193
|
+
op.on('-f', '--format NAME', "source file format") {|s|
|
194
|
+
set_format_template(s)
|
195
|
+
}
|
196
|
+
op.on('-h', '--columns NAME,NAME,...', 'column names') {|s|
|
197
|
+
@opts[:column_names] = s.split(',')
|
198
|
+
}
|
199
|
+
op.on('-H', '--column-header', 'first line includes column names', TrueClass) {|b|
|
200
|
+
@opts[:column_header] = b
|
201
|
+
}
|
202
|
+
op.on('-d', '--delimiter REGEX', "delimiter between columns (default: #{@default_opts[:delimiter_expr].inspect[1..-2]}") {|s|
|
203
|
+
@opts[:delimiter_expr] = Regexp.new(s)
|
204
|
+
}
|
205
|
+
#op.on('-D', '--line-delimiter REGEX', "delimiter between rows (default: #{@default_opts[:line_delimiter_expr].inspect[1..-2]})") {|s|
|
206
|
+
# @opts[:line_delimiter_expr] = Regexp.new(s)
|
207
|
+
#}
|
208
|
+
op.on('-N', '--null REGEX', "null expression (default: #{@default_opts[:null_expr].inspect[1..-2]}") {|s|
|
209
|
+
@opts[:null_expr] = Regexp.new(s)
|
210
|
+
}
|
211
|
+
# TODO
|
212
|
+
#op.on('-E', '--escape CHAR', "escape character (default: no escape character)") {|s|
|
213
|
+
# @opts[:escape_char] = s
|
214
|
+
#}
|
215
|
+
#op.on('-Q', '--quote CHAR', "quote character (default: #{@default_opts[:quote_char]}") {|s|
|
216
|
+
# @opts[:quote_char] = s
|
217
|
+
#}
|
218
|
+
op.on('-S', '--all-string', 'disable automatic type conversion', TrueClass) {|b|
|
219
|
+
@opts[:all_string] = b
|
220
|
+
}
|
221
|
+
op.on('-t', '--time-column NAME', 'name of the time column (default: auto detect)') {|s|
|
222
|
+
@opts[:time_column] = s
|
223
|
+
}
|
224
|
+
op.on('-T', '--time-format FORMAT', 'strftime(3) format of the time column') {|s|
|
225
|
+
@opts[:time_format] = s
|
226
|
+
}
|
227
|
+
op.on('-e', '--encoding NAME', "text encoding") {|s|
|
228
|
+
@opts[:encoding] = s
|
229
|
+
}
|
230
|
+
op.on('-C', '--compress NAME', 'compression format name [plain, gzip] (default: auto detect)') {|s|
|
231
|
+
@opts[:compress] = s
|
232
|
+
}
|
233
|
+
end
|
234
|
+
|
235
|
+
def set_format_template(name)
|
236
|
+
case name
|
237
|
+
when 'csv'
|
238
|
+
@format = 'text'
|
239
|
+
@opts[:delimiter_expr] = /,/
|
240
|
+
when 'tsv'
|
241
|
+
@format = 'text'
|
242
|
+
@opts[:delimiter_expr] = /\t/
|
243
|
+
#when 'apache'
|
244
|
+
# @format = 'apache'
|
245
|
+
# @opts[:column_names] = ['host', 'user', 'time', 'method', 'path', 'code', 'size', 'referer', 'agent']
|
246
|
+
# @opts[:null_expr] = /\A(?:\-|)\z/
|
247
|
+
# @opts[:time_column] = 'time'
|
248
|
+
# @opts[:time_format] = '%d/%b/%Y:%H:%M:%S %z'
|
249
|
+
when 'msgpack'
|
250
|
+
@format = 'msgpack'
|
251
|
+
when 'json'
|
252
|
+
@format = 'json'
|
253
|
+
else
|
254
|
+
raise "Unknown format: #{name}"
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
def compose_factory
|
259
|
+
opts = @default_opts.merge(@opts)
|
260
|
+
case @format
|
261
|
+
when 'text'
|
262
|
+
Proc.new {|io,error|
|
263
|
+
reader = LineReader.new(io, error, opts)
|
264
|
+
parser = DelimiterParser.new(reader, error, opts)
|
265
|
+
if opts[:column_header]
|
266
|
+
column_names = parser.next
|
267
|
+
elsif opts[:column_names]
|
268
|
+
column_names = opts[:column_names]
|
269
|
+
else
|
270
|
+
raise "--column-header or --columns option is required"
|
271
|
+
end
|
272
|
+
unless opts[:all_string]
|
273
|
+
parser = AutoTypeConvertParserFilter.new(parser, error)
|
274
|
+
end
|
275
|
+
parser = HashBuilder.new(parser, error, column_names)
|
276
|
+
parser = TimeParserFilter.new(parser, error, opts)
|
277
|
+
}
|
278
|
+
|
279
|
+
#when 'apache'
|
280
|
+
|
281
|
+
when 'json'
|
282
|
+
Proc.new {|io,error|
|
283
|
+
reader = LineReader.new(io, error, opts)
|
284
|
+
parser = JSONParser.new(reader, error, opts)
|
285
|
+
if opts[:column_header]
|
286
|
+
column_names = parser.next
|
287
|
+
elsif opts[:column_names]
|
288
|
+
column_names = opts[:column_names]
|
289
|
+
end
|
290
|
+
if column_names
|
291
|
+
parser = HashBuilder.new(parser, error, column_names)
|
292
|
+
end
|
293
|
+
parser = TimeParserFilter.new(parser, error, opts)
|
294
|
+
}
|
295
|
+
|
296
|
+
when 'msgpack'
|
297
|
+
Proc.new {|io,error|
|
298
|
+
parser = MessagePackParsingReader.new(io, error, opts)
|
299
|
+
if opts[:column_header]
|
300
|
+
column_names = parser.next
|
301
|
+
elsif opts[:column_names]
|
302
|
+
column_names = opts[:column_names]
|
303
|
+
end
|
304
|
+
if column_names
|
305
|
+
parser = HashBuilder.new(parser, error, column_names)
|
306
|
+
end
|
307
|
+
parser = TimeParserFilter.new(parser, error, opts)
|
308
|
+
}
|
309
|
+
end
|
310
|
+
end
|
311
|
+
|
312
|
+
def parse(io, error, &block)
|
313
|
+
factory = compose_factory
|
314
|
+
parser = factory.call(io, error)
|
315
|
+
begin
|
316
|
+
while record = parser.next
|
317
|
+
block.call(record)
|
318
|
+
end
|
319
|
+
rescue EOFError
|
320
|
+
end
|
321
|
+
end
|
322
|
+
|
323
|
+
end
|
324
|
+
end
|
data/lib/td/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: td
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.38
|
4
|
+
version: 0.10.39
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08-
|
12
|
+
date: 2012-08-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: msgpack
|
@@ -163,6 +163,7 @@ files:
|
|
163
163
|
- lib/td/compat_gzip_reader.rb
|
164
164
|
- lib/td/config.rb
|
165
165
|
- lib/td/distribution.rb
|
166
|
+
- lib/td/file_reader.rb
|
166
167
|
- lib/td/version.rb
|
167
168
|
- ChangeLog
|
168
169
|
- README.rdoc
|