td 0.10.38 → 0.10.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ChangeLog +5 -0
- data/lib/td/command/acl.rb +1 -1
- data/lib/td/command/bulk_import.rb +134 -4
- data/lib/td/command/list.rb +2 -0
- data/lib/td/file_reader.rb +324 -0
- data/lib/td/version.rb +1 -1
- metadata +3 -2
data/ChangeLog
CHANGED
data/lib/td/command/acl.rb
CHANGED
@@ -35,7 +35,7 @@ module Command
|
|
35
35
|
|
36
36
|
client.grant_access_control(subject, action, scope, grant_option)
|
37
37
|
|
38
|
-
$stderr.puts "Access control [#{subject} #{action} #{scope}] is created #{grant_option
|
38
|
+
$stderr.puts "Access control [#{subject} #{action} #{scope}] is created #{grant_option ? 'with' : 'without'} grant option."
|
39
39
|
end
|
40
40
|
|
41
41
|
def acl_revoke(op)
|
@@ -85,17 +85,48 @@ module Command
|
|
85
85
|
end
|
86
86
|
|
87
87
|
def bulk_import_upload_part(op)
|
88
|
-
|
88
|
+
retry_limit = 10
|
89
|
+
retry_wait = 1
|
89
90
|
|
90
|
-
|
91
|
+
name, part_name, path = op.cmd_parse
|
91
92
|
|
92
|
-
File.open(path, "rb") {|
|
93
|
-
|
93
|
+
File.open(path, "rb") {|io|
|
94
|
+
bulk_import_upload_impl(name, part_name, io, io.size, retry_limit, retry_wait)
|
94
95
|
}
|
95
96
|
|
96
97
|
$stderr.puts "Part '#{part_name}' is uploaded."
|
97
98
|
end
|
98
99
|
|
100
|
+
# Uploads several files into the bulk import session named by the first
# positional argument; each remaining argument is a file to upload.
#
# The part name of a file is the --prefix value followed by the leading
# dot-separated components of its basename: the first component plus
# --use-suffix COUNT further ones (COUNT defaults to 0).
#
# op - command option parser; must respond to #on and #cmd_parse.
def bulk_import_upload_parts(op)
  retry_limit = 10
  retry_wait = 1
  suffix_count = 0
  part_prefix = ""

  op.on('-P', '--prefix NAME', 'add prefix to parts name') do |prefix|
    part_prefix = prefix
  end
  op.on('-s', '--use-suffix COUNT', 'use COUNT number of . (dots) in the source file name to the parts name', Integer) do |count|
    suffix_count = count
  end

  name, *files = op.cmd_parse

  files.each do |ifname|
    # keep components 0..suffix_count of the basename
    kept = File.basename(ifname).split('.')[0..suffix_count]
    part_name = part_prefix + kept.join('.')

    File.open(ifname, "rb") do |io|
      size = io.size
      $stderr.puts "Uploading '#{ifname}' -> '#{part_name}'... (#{size} bytes)"

      bulk_import_upload_impl(name, part_name, io, size, retry_limit, retry_wait)
    end
  end

  $stderr.puts "done."
end
|
129
|
+
|
99
130
|
def bulk_import_delete_part(op)
|
100
131
|
name, part_name = op.cmd_parse
|
101
132
|
|
@@ -203,6 +234,105 @@ module Command
|
|
203
234
|
$stderr.puts "Bulk import session '#{name}' is unfrozen."
|
204
235
|
end
|
205
236
|
|
237
|
+
|
238
|
+
# 16 MiB, in bytes. bulk_import_prepare_part divides this by 1024 to obtain
# the default value (in KB) of its --split-size option.
PART_SPLIT_SIZE = 16*1024*1024
|
239
|
+
|
240
|
+
# Converts source files into gzip-compressed MessagePack part files.
#
# Each input file is parsed with TreasureData's FileReader (whose format
# options are registered on +op+) and its records are written to
# "<outdir>/<basename>_<index>.msgpack.gz". A new output file is started
# once the current one has grown past the split size on disk.
#
# op - command option parser; must respond to #on and #cmd_parse.
#
# Exits with status 1 when -o/--output is not given.
def bulk_import_prepare_part(op)
  outdir = nil
  split_size_kb = PART_SPLIT_SIZE / 1024  # kb

  require 'td/file_reader'
  reader = FileReader.new
  reader.init_optparse(op)

  op.on('-s', '--split-size SIZE_IN_KB', "size of each parts (default: #{split_size_kb})", Integer) do |kb|
    split_size_kb = kb
  end
  op.on('-o', '--output DIR', 'output directory') do |dir|
    outdir = dir
  end

  *files = op.cmd_parse

  unless outdir
    $stderr.puts "-o, --output DIR option is required."
    exit 1
  end

  split_size = split_size_kb * 1024

  require 'fileutils'
  FileUtils.mkdir_p(outdir)

  require 'json'
  require 'msgpack'
  require 'zlib'

  # Error sink handed to the reader: report the offending data as JSON when
  # it can be serialized, otherwise report only the reason.
  error = Proc.new do |reason, data|
    begin
      $stderr.puts "#{reason}: #{data.to_json}"
    rescue
      $stderr.puts "#{reason}"
    end
  end

  files.each do |ifname|
    $stderr.puts "Processing #{ifname}..."
    record_num = 0

    basename = File.basename(ifname).split('.').first
    File.open(ifname) do |io|
      of_index = 0
      out = nil
      zout = nil
      begin
        reader.parse(io, error) do |record|
          # Lazily open the next part file when the first record for it arrives.
          unless zout
            ofname = "#{basename}_#{of_index}.msgpack.gz"
            $stderr.puts " Preparing part \"#{basename}_#{of_index}\"..."
            out = File.open("#{outdir}/#{ofname}", 'wb')
            zout = Zlib::GzipWriter.new(out)

            t = record['time']
            $stderr.puts " sample: #{Time.at(t).utc} #{record.to_json}"
          end

          zout.write(record.to_msgpack)
          record_num += 1

          # Roll over to a new part once the file on disk exceeds the limit.
          if out.size > split_size
            zout.close
            of_index += 1
            out = nil
            zout = nil
          end
        end
      ensure
        # Finish the part that was still open, even if parsing failed.
        if zout
          zout.close
          zout = nil
        end
      end
      $stderr.puts " #{ifname}: #{record_num} entries."
    end
  end
end
|
320
|
+
|
321
|
+
private
|
322
|
+
# Uploads one part to a bulk import session, retrying on failure.
#
# name        - bulk import session name
# part_name   - name of the part within the session
# io          - stream holding the part's bytes
# size        - number of bytes to upload
# retry_limit - how many times a failed upload is retried
# retry_wait  - seconds to sleep before each retry
#
# A fresh client is obtained for every attempt. When the retries are
# exhausted the last error is re-raised to the caller.
def bulk_import_upload_impl(name, part_name, io, size, retry_limit, retry_wait)
  # Remember where the payload starts: a failed attempt may already have
  # consumed part of the stream, and a retry has to send it from the start.
  start_pos = begin
    io.pos if io.respond_to?(:pos) && io.respond_to?(:seek)
  rescue SystemCallError, IOError
    nil  # not seekable; retry from the current position as before
  end

  begin
    client = get_client
    client.bulk_import_upload_part(name, part_name, io, size)
  rescue
    if retry_limit > 0
      retry_limit -= 1
      $stderr.puts "#{$!}; retrying '#{part_name}'..."
      sleep retry_wait
      io.seek(start_pos) if start_pos
      retry
    end
    raise
  end
end
|
206
336
|
end
|
207
337
|
end
|
208
338
|
|
data/lib/td/command/list.rb
CHANGED
@@ -228,7 +228,9 @@ module List
|
|
228
228
|
add_list 'bulk_import:list', %w[], 'List bulk import sessions', 'bulk_import:list'
|
229
229
|
add_list 'bulk_import:show', %w[name], 'Show list of uploaded parts', 'bulk_import:show'
|
230
230
|
# Help entry for bulk_import:create (description typo "to the the table" fixed).
add_list 'bulk_import:create', %w[name db table], 'Create a new bulk import session to the table', 'bulk_import:create logs_201201 example_db event_logs'
|
231
|
+
add_list 'bulk_import:prepare_part', %w[files_], 'Convert files into part file format', 'bulk_import:prepare_part logs/*.csv --format csv --columns time,uid,price,count --time-column "time" -o parts/'
|
231
232
|
add_list 'bulk_import:upload_part', %w[name id path.msgpack.gz], 'Upload or re-upload a file into a bulk import session', 'bulk_import:upload_part logs_201201 01h data-201201-01.msgpack.gz'
|
233
|
+
# Help entry for bulk_import:upload_parts. The example includes the session
# name, which is the command's first positional argument.
add_list 'bulk_import:upload_parts', %w[name files_], 'Upload or re-upload files into a bulk import session', 'bulk_import:upload_parts logs_201201 parts/* --prefix logs_'
|
232
234
|
add_list 'bulk_import:delete_part', %w[name id], 'Delete a uploaded file from a bulk import session', 'bulk_import:delete_part logs_201201 01h'
|
233
235
|
add_list 'bulk_import:perform', %w[name], 'Start to validate and convert uploaded files', 'bulk_import:perform logs_201201'
|
234
236
|
add_list 'bulk_import:error_records', %w[name], 'Show records which did not pass validations', 'bulk_import:error_records logs_201201'
|
@@ -0,0 +1,324 @@
|
|
1
|
+
|
2
|
+
module TreasureData
  # Turns an input stream (delimited text, line-delimited JSON or
  # MessagePack) into a sequence of record hashes carrying an integer
  # 'time' field.
  #
  # The work is done by a chain of small objects that each expose #next and
  # wrap the previous stage. The chain is assembled by #compose_factory from
  # the selected format and options; #parse drives it until EOFError.
  #
  # The +error+ argument passed around is a callable taking (reason, data);
  # stages invoke it for input they skip.
  class FileReader

    # Source stage for MessagePack input.
    class MessagePackParsingReader
      def initialize(io, error, opts)
        require 'msgpack'
        @io = io
        @error = error
        @u = MessagePack::Unpacker.new(@io)
      end

      # Returns the next object produced by the unpacker.
      def next
        @u.next
      end
    end

    # Source stage for text input: hands out one line at a time.
    class LineReader
      # opts[:encoding], when given, is applied to +io+ as its encoding.
      def initialize(io, error, opts)
        if encoding = opts[:encoding]
          # IO and StringIO provide #set_encoding; they have no
          # #external_encoding= writer, so assigning to it raised
          # NoMethodError whenever --encoding was used.
          io.set_encoding(encoding)
        end
        #@delimiter = opts[:line_delimiter_expr] || /\r?\n/
        @io = io
        @error = error
      end

      # Returns the next line without its line terminator.
      # Raises EOFError (from IO#readline) at end of input.
      def next_row
        @io.readline($/).chomp
      end
    end

    # Splits each row on opts[:delimiter_expr] and maps every field that
    # matches opts[:null_expr] to nil.
    class DelimiterParser
      def initialize(reader, error, opts)
        @reader = reader
        @delimiter_expr = opts[:delimiter_expr]
        @null_expr = opts[:null_expr]
        # TODO
        #@escape_char = opts[:escape_char]
        #@quote_char = opts[:quote_char]
      end

      # Returns the next row as an array of strings and nils.
      def next
        row = @reader.next_row
        array = row.split(@delimiter_expr)
        array.map! {|x|
          @null_expr =~ x ? nil : x
        }
      end
    end

    # Parses each row as one JSON document.
    class JSONParser
      def initialize(reader, error, opts)
        # Load the dependency here, like the other stages do for theirs,
        # instead of relying on the caller having required it.
        require 'json'
        @reader = reader
        @error = error
      end

      # Returns the next successfully parsed document. Rows that fail to
      # parse are reported through the error callback and skipped.
      def next
        while true
          line = @reader.next_row
          begin
            return JSON.parse(line)
          rescue
            @error.call("invalid json format: #{$!}", line)
            next
          end
        end
      end
    end

    #class ApacheParser
    #  REGEXP = /^([^ ]*) [^ ]* ([^ ]*) \[([^\]]*)\] "(\S+)(?: +([^ ]*) +\S*)?" ([^ ]*) ([^ ]*)(?: "([^\"]*)" "([^\"]*)")?$/
    #
    #  def initialize(reader, error, opts)
    #    @reader = reader
    #  end
    #
    #  def next
    #    while true
    #      m = REGEXP.match(@reader.next_row)
    #      if m
    #        h = {
    #          'host' => m[1],
    #          'user' => m[2],
    #          'time' => m[3],
    #          'method' => m[4],
    #          'path' => m[5],
    #          'code' => m[6],
    #          'size' => m[7].to_i,
    #          'referer' => m[8],
    #          'agent' => m[9],
    #        }
    #        return h
    #      end
    #    end
    #  end
    #end

    # Converts fields that are the canonical decimal form of an integer
    # into Integer values; every other field is passed through unchanged.
    class AutoTypeConvertParserFilter
      def initialize(parser, error)
        @parser = parser
      end

      def next
        array = @parser.next
        array.map! {|s|
          # nil.to_i == 0 != nil.to_s
          i = s.to_i
          i.to_s == s ? i : s
        }
      end
    end

    # Pairs each row array with the configured column names.
    class HashBuilder
      def initialize(parser, error, columns)
        @parser = parser
        @columns = columns
      end

      # Returns a Hash keyed by column name. Columns without a matching
      # value map to nil; values beyond the last column are dropped.
      def next
        array = @parser.next
        # FIXME error handling
        Hash[@columns.zip(array)]
      end
    end

    # Final stage: derives the integer 'time' field of each record from
    # the column named by opts[:time_column].
    class TimeParserFilter
      # Raises RuntimeError when opts[:time_column] is not set.
      def initialize(parser, error, opts)
        require 'time'
        @parser = parser
        @error = error
        @time_column = opts[:time_column]
        unless @time_column
          raise '-t, --time-column NAME option is required'
        end
        @time_format = opts[:time_format]
      end

      # Returns the next record with row['time'] set. Records whose time
      # column is missing or cannot be parsed are reported through the
      # error callback and skipped.
      def next
        while true
          row = @parser.next
          tval = row[@time_column]

          unless tval
            @error.call("time column '#{@time_column}' is missing", row)
            next
          end

          begin
            if tf = @time_format
              row['time'] = parse_time(tval, tf).to_i
            elsif tval.is_a?(Integer)
              row['time'] = tval
            else
              row['time'] = Time.parse(tval).to_i
            end
            return row

          rescue
            @error.call("invalid time format '#{tval}': #{$!}", row)
            next
          end
        end
      end

      # Use Time.strptime where the runtime provides it, otherwise go
      # through DateTime.
      if Time.respond_to?(:strptime)
        def parse_time(value, format)
          Time.strptime(value, format)
        end
      else
        def parse_time(value, format)
          Time.parse(DateTime.strptime(value, format).to_s)
        end
      end
    end

    def initialize
      @format = "text"
      @default_opts = {
        :delimiter_expr => /\t|,/,
        #:line_delimiter_expr => /\r?\n/,
        :null_expr => /\A(?:\\N|\-|)\z/,
        #:quote_char => "\"",
      }
      @opts = {}
      @parser_class = nil
    end

    attr_reader :default_opts, :opts
    attr_accessor :parser_class

    # Registers the reader's command line options on +op+ (an object with
    # an OptionParser-style #on). Parsed values are stored in #opts.
    def init_optparse(op)
      op.on('-f', '--format NAME', "source file format") {|s|
        set_format_template(s)
      }
      # NOTE(review): '-h' is commonly the help switch; confirm the
      # surrounding command parser does not also claim it.
      op.on('-h', '--columns NAME,NAME,...', 'column names') {|s|
        @opts[:column_names] = s.split(',')
      }
      op.on('-H', '--column-header', 'first line includes column names', TrueClass) {|b|
        @opts[:column_header] = b
      }
      op.on('-d', '--delimiter REGEX', "delimiter between columns (default: #{@default_opts[:delimiter_expr].inspect[1..-2]})") {|s|
        @opts[:delimiter_expr] = Regexp.new(s)
      }
      #op.on('-D', '--line-delimiter REGEX', "delimiter between rows (default: #{@default_opts[:line_delimiter_expr].inspect[1..-2]})") {|s|
      #  @opts[:line_delimiter_expr] = Regexp.new(s)
      #}
      op.on('-N', '--null REGEX', "null expression (default: #{@default_opts[:null_expr].inspect[1..-2]})") {|s|
        @opts[:null_expr] = Regexp.new(s)
      }
      # TODO
      #op.on('-E', '--escape CHAR', "escape character (default: no escape character)") {|s|
      #  @opts[:escape_char] = s
      #}
      #op.on('-Q', '--quote CHAR', "quote character (default: #{@default_opts[:quote_char]}") {|s|
      #  @opts[:quote_char] = s
      #}
      op.on('-S', '--all-string', 'disable automatic type conversion', TrueClass) {|b|
        @opts[:all_string] = b
      }
      op.on('-t', '--time-column NAME', 'name of the time column (default: auto detect)') {|s|
        @opts[:time_column] = s
      }
      op.on('-T', '--time-format FORMAT', 'strftime(3) format of the time column') {|s|
        @opts[:time_format] = s
      }
      op.on('-e', '--encoding NAME', "text encoding") {|s|
        @opts[:encoding] = s
      }
      # The value is stored; nothing in this class reads :compress.
      op.on('-C', '--compress NAME', 'compression format name [plain, gzip] (default: auto detect)') {|s|
        @opts[:compress] = s
      }
    end

    # Selects the input format by name ('csv', 'tsv', 'msgpack' or 'json').
    # 'csv' and 'tsv' also fix the column delimiter.
    # Raises RuntimeError for any other name.
    def set_format_template(name)
      case name
      when 'csv'
        @format = 'text'
        @opts[:delimiter_expr] = /,/
      when 'tsv'
        @format = 'text'
        @opts[:delimiter_expr] = /\t/
      #when 'apache'
      #  @format = 'apache'
      #  @opts[:column_names] = ['host', 'user', 'time', 'method', 'path', 'code', 'size', 'referer', 'agent']
      #  @opts[:null_expr] = /\A(?:\-|)\z/
      #  @opts[:time_column] = 'time'
      #  @opts[:time_format] = '%d/%b/%Y:%H:%M:%S %z'
      when 'msgpack'
        @format = 'msgpack'
      when 'json'
        @format = 'json'
      else
        raise "Unknown format: #{name}"
      end
    end

    # Returns a Proc taking (io, error) that builds the parser chain for
    # the current format; the Proc's result is the chain's last stage.
    def compose_factory
      opts = @default_opts.merge(@opts)
      case @format
      when 'text'
        Proc.new {|io,error|
          reader = LineReader.new(io, error, opts)
          parser = DelimiterParser.new(reader, error, opts)
          # The header row is consumed before type conversion is added.
          column_names = resolve_column_names(parser, opts)
          unless column_names
            raise "--column-header or --columns option is required"
          end
          unless opts[:all_string]
            parser = AutoTypeConvertParserFilter.new(parser, error)
          end
          parser = HashBuilder.new(parser, error, column_names)
          TimeParserFilter.new(parser, error, opts)
        }

      #when 'apache'

      when 'json'
        Proc.new {|io,error|
          reader = LineReader.new(io, error, opts)
          parser = JSONParser.new(reader, error, opts)
          column_names = resolve_column_names(parser, opts)
          if column_names
            parser = HashBuilder.new(parser, error, column_names)
          end
          TimeParserFilter.new(parser, error, opts)
        }

      when 'msgpack'
        Proc.new {|io,error|
          parser = MessagePackParsingReader.new(io, error, opts)
          column_names = resolve_column_names(parser, opts)
          if column_names
            parser = HashBuilder.new(parser, error, column_names)
          end
          TimeParserFilter.new(parser, error, opts)
        }
      end
    end

    # Parses +io+ and calls the block once per record, stopping at the
    # first EOFError raised by the parser chain (or at a falsy record).
    def parse(io, error, &block)
      factory = compose_factory
      parser = factory.call(io, error)
      begin
        while record = parser.next
          block.call(record)
        end
      rescue EOFError
      end
    end

    private

    # Column names come from the first record when :column_header is set,
    # otherwise from :column_names; nil when neither option is given.
    def resolve_column_names(parser, opts)
      if opts[:column_header]
        parser.next
      elsif opts[:column_names]
        opts[:column_names]
      end
    end

  end
end
|
data/lib/td/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: td
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.10.
|
4
|
+
version: 0.10.39
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08-
|
12
|
+
date: 2012-08-27 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: msgpack
|
@@ -163,6 +163,7 @@ files:
|
|
163
163
|
- lib/td/compat_gzip_reader.rb
|
164
164
|
- lib/td/config.rb
|
165
165
|
- lib/td/distribution.rb
|
166
|
+
- lib/td/file_reader.rb
|
166
167
|
- lib/td/version.rb
|
167
168
|
- ChangeLog
|
168
169
|
- README.rdoc
|