td 0.10.38 → 0.10.39

data/ChangeLog CHANGED
@@ -1,4 +1,9 @@
 
+== 2012-08-07 version 0.10.39
+
+* Added bulk_import:prepare_parts and bulk_import:upload_parts subcommands
+
+
 == 2012-08-06 version 0.10.38
 
 * *:list and *:show: subcommands show organization name if it's set
@@ -35,7 +35,7 @@ module Command
 
     client.grant_access_control(subject, action, scope, grant_option)
 
-    $stderr.puts "Access control [#{subject} #{action} #{scope}] is created #{grant_option? ? 'with' : 'without'} grant option."
+    $stderr.puts "Access control [#{subject} #{action} #{scope}] is created #{grant_option ? 'with' : 'without'} grant option."
   end
 
   def acl_revoke(op)
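
A note on the one-character fix above: grant_option is a local variable, so the stray '?' in the old interpolation turned it into a call to an undefined grant_option? method and raised NoMethodError at runtime. A minimal standalone sketch (the value here is made up):

    grant_option = true   # hypothetical value; in the command it comes from the parsed options
    puts "created #{grant_option ? 'with' : 'without'} grant option"
    # => created with grant option
    # The old form #{grant_option? ? 'with' : 'without'} raises
    # NoMethodError: undefined method `grant_option?'
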
@@ -85,17 +85,48 @@ module Command
   end
 
   def bulk_import_upload_part(op)
-    name, part_name, path = op.cmd_parse
+    retry_limit = 10
+    retry_wait = 1
 
-    client = get_client
+    name, part_name, path = op.cmd_parse
 
-    File.open(path, "rb") {|is|
-      client.bulk_import_upload_part(name, part_name, is, is.size)
+    File.open(path, "rb") {|io|
+      bulk_import_upload_impl(name, part_name, io, io.size, retry_limit, retry_wait)
     }
 
     $stderr.puts "Part '#{part_name}' is uploaded."
   end
 
+  def bulk_import_upload_parts(op)
+    retry_limit = 10
+    retry_wait = 1
+    suffix_count = 0
+    part_prefix = ""
+
+    op.on('-P', '--prefix NAME', 'add prefix to parts name') {|s|
+      part_prefix = s
+    }
+    op.on('-s', '--use-suffix COUNT', 'use COUNT number of . (dots) in the source file name to the parts name', Integer) {|i|
+      suffix_count = i
+    }
+
+    name, *files = op.cmd_parse
+
+    files.each {|ifname|
+      basename = File.basename(ifname)
+      part_name = part_prefix + basename.split('.')[0..suffix_count].join('.')
+
+      File.open(ifname, "rb") {|io|
+        size = io.size
+        $stderr.puts "Uploading '#{ifname}' -> '#{part_name}'... (#{size} bytes)"
+
+        bulk_import_upload_impl(name, part_name, io, size, retry_limit, retry_wait)
+      }
+    }
+
+    $stderr.puts "done."
+  end
+
   def bulk_import_delete_part(op)
     name, part_name = op.cmd_parse
 
@@ -203,6 +234,105 @@ module Command
     $stderr.puts "Bulk import session '#{name}' is unfrozen."
   end
 
+
+  PART_SPLIT_SIZE = 16*1024*1024
+
+  def bulk_import_prepare_part(op)
+    outdir = nil
+    split_size_kb = PART_SPLIT_SIZE / 1024 # kb
+
+    require 'td/file_reader'
+    reader = FileReader.new
+    reader.init_optparse(op)
+
+    op.on('-s', '--split-size SIZE_IN_KB', "size of each parts (default: #{split_size_kb})", Integer) {|i|
+      split_size_kb = i
+    }
+    op.on('-o', '--output DIR', 'output directory') {|s|
+      outdir = s
+    }
+
+    *files = op.cmd_parse
+
+    unless outdir
+      $stderr.puts "-o, --output DIR option is required."
+      exit 1
+    end
+
+    split_size = split_size_kb * 1024
+
+    require 'fileutils'
+    FileUtils.mkdir_p(outdir)
+
+    require 'json'
+    require 'msgpack'
+    require 'zlib'
+
+    error = Proc.new {|reason,data|
+      begin
+        $stderr.puts "#{reason}: #{data.to_json}"
+      rescue
+        $stderr.puts "#{reason}"
+      end
+    }
+
+    files.each {|ifname|
+      $stderr.puts "Processing #{ifname}..."
+      record_num = 0
+
+      basename = File.basename(ifname).split('.').first
+      File.open(ifname) {|io|
+        of_index = 0
+        out = nil
+        zout = nil
+        begin
+          reader.parse(io, error) {|record|
+            if zout == nil
+              ofname = "#{basename}_#{of_index}.msgpack.gz"
+              $stderr.puts "  Preparing part \"#{basename}_#{of_index}\"..."
+              out = File.open("#{outdir}/#{ofname}", 'wb')
+              zout = Zlib::GzipWriter.new(out)
+
+              t = record['time']
+              $stderr.puts "  sample: #{Time.at(t).utc} #{record.to_json}"
+            end
+
+            zout.write(record.to_msgpack)
+            record_num += 1
+
+            if out.size > split_size
+              zout.close
+              of_index += 1
+              out = nil
+              zout = nil
+            end
+          }
+        ensure
+          if zout
+            zout.close
+            zout = nil
+          end
+        end
+        $stderr.puts "  #{ifname}: #{record_num} entries."
+      }
+    }
+  end
+
+  private
+  def bulk_import_upload_impl(name, part_name, io, size, retry_limit, retry_wait)
+    begin
+      client = get_client
+      client.bulk_import_upload_part(name, part_name, io, size)
+    rescue
+      if retry_limit > 0
+        retry_limit -= 1
+        $stderr.puts "#{$!}; retrying '#{part_name}'..."
+        sleep retry_wait
+        retry
+      end
+      raise
+    end
+  end
+
 end
 end
 
@@ -228,7 +228,9 @@ module List
   add_list 'bulk_import:list', %w[], 'List bulk import sessions', 'bulk_import:list'
   add_list 'bulk_import:show', %w[name], 'Show list of uploaded parts', 'bulk_import:show'
   add_list 'bulk_import:create', %w[name db table], 'Create a new bulk import session to the the table', 'bulk_import:create logs_201201 example_db event_logs'
+  add_list 'bulk_import:prepare_part', %w[files_], 'Convert files into part file format', 'bulk_import:prepare_part logs/*.csv --format csv --columns time,uid,price,count --time-column "time" -o parts/'
   add_list 'bulk_import:upload_part', %w[name id path.msgpack.gz], 'Upload or re-upload a file into a bulk import session', 'bulk_import:upload_part logs_201201 01h data-201201-01.msgpack.gz'
+  add_list 'bulk_import:upload_parts', %w[name files_], 'Upload or re-upload files into a bulk import session', 'bulk_import:upload_parts parts/* --prefix logs_'
   add_list 'bulk_import:delete_part', %w[name id], 'Delete a uploaded file from a bulk import session', 'bulk_import:delete_part logs_201201 01h'
   add_list 'bulk_import:perform', %w[name], 'Start to validate and convert uploaded files', 'bulk_import:perform logs_201201'
   add_list 'bulk_import:error_records', %w[name], 'Show records which did not pass validations', 'bulk_import:error_records logs_201201'
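
A note on the new bulk_import:upload_parts entry above: the --prefix and --use-suffix options map each source file name to a part name via the expression part_prefix + basename.split('.')[0..suffix_count].join('.') shown earlier in this diff. A small standalone sketch (file names and prefix are made-up examples):

    part_prefix  = "logs_"   # value a user might pass via --prefix
    suffix_count = 0         # --use-suffix default: keep only the portion before the first dot

    ["data-201201-01.msgpack.gz", "data-201201-02.msgpack.gz"].each do |ifname|
      basename  = File.basename(ifname)
      part_name = part_prefix + basename.split('.')[0..suffix_count].join('.')
      puts "#{ifname} -> #{part_name}"   # e.g. data-201201-01.msgpack.gz -> logs_data-201201-01
    end
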
data/lib/td/file_reader.rb ADDED
@@ -0,0 +1,324 @@
+
+module TreasureData
+class FileReader
+
+  class MessagePackParsingReader
+    def initialize(io, error, opts)
+      require 'msgpack'
+      @io = io
+      @error = error
+      @u = MessagePack::Unpacker.new(@io)
+    end
+
+    def next
+      @u.next
+    end
+  end
+
+  class LineReader
+    def initialize(io, error, opts)
+      if encoding = opts[:encoding]
+        io.external_encoding = encoding
+      end
+      #@delimiter = opts[:line_delimiter_expr] || /\r?\n/
+      @io = io
+      @error = error
+    end
+
+    def next_row
+      @io.readline($/).chomp
+    end
+  end
+
+  class DelimiterParser
+    def initialize(reader, error, opts)
+      @reader = reader
+      @delimiter_expr = opts[:delimiter_expr]
+      @null_expr = opts[:null_expr]
+      # TODO
+      #@escape_char = opts[:escape_char]
+      #@quote_char = opts[:quote_char]
+    end
+
+    def next
+      row = @reader.next_row
+      array = row.split(@delimiter_expr)
+      array.map! {|x|
+        @null_expr =~ x ? nil : x
+      }
+    end
+  end
+
+  class JSONParser
+    def initialize(reader, error, opts)
+      @reader = reader
+      @error = error
+    end
+
+    def next
+      while true
+        line = @reader.next_row
+        begin
+          return JSON.parse(line)
+        rescue
+          @error.call("invalid json format: #{$!}", line)
+          next
+        end
+      end
+    end
+  end
+
+  #class ApacheParser
+  #  REGEXP = /^([^ ]*) [^ ]* ([^ ]*) \[([^\]]*)\] "(\S+)(?: +([^ ]*) +\S*)?" ([^ ]*) ([^ ]*)(?: "([^\"]*)" "([^\"]*)")?$/
+  #
+  #  def initialize(reader, error, opts)
+  #    @reader = reader
+  #  end
+  #
+  #  def next
+  #    while true
+  #      m = REGEXP.match(@reader.next_row)
+  #      if m
+  #        h = {
+  #          'host' => m[1],
+  #          'user' => m[2],
+  #          'time' => m[3],
+  #          'method' => m[4],
+  #          'path' => m[5],
+  #          'code' => m[6],
+  #          'size' => m[7].to_i,
+  #          'referer' => m[8],
+  #          'agent' => m[9],
+  #        }
+  #        return h
+  #      end
+  #    end
+  #  end
+  #end
+
+  class AutoTypeConvertParserFilter
+    def initialize(parser, error)
+      @parser = parser
+    end
+
+    def next
+      array = @parser.next
+      array.map! {|s|
+        # nil.to_i == 0 != nil.to_s
+        i = s.to_i
+        i.to_s == s ? i : s
+      }
+    end
+  end
+
+  class HashBuilder
+    def initialize(parser, error, columns)
+      @parser = parser
+      @columns = columns
+    end
+
+    def next
+      array = @parser.next
+      # FIXME error handling
+      Hash[@columns.zip(array)]
+    end
+  end
+
+  class TimeParserFilter
+    def initialize(parser, error, opts)
+      require 'time'
+      @parser = parser
+      @error = error
+      @time_column = opts[:time_column]
+      unless @time_column
+        raise '-t, --time-column NAME option is required'
+      end
+      @time_format = opts[:time_format]
+    end
+
+    def next
+      while true
+        row = @parser.next
+        tval = row[@time_column]
+
+        unless tval
+          @error.call("time column '#{@time_column}' is missing", row)
+          next
+        end
+
+        begin
+          if tf = @time_format
+            row['time'] = parse_time(tval, tf).to_i
+          elsif tval.is_a?(Integer)
+            row['time'] = tval
+          else
+            row['time'] = Time.parse(tval).to_i
+          end
+          return row
+
+        rescue
+          @error.call("invalid time format '#{tval}': #{$!}", row)
+          next
+        end
+      end
+    end
+
+    if Time.respond_to?(:strptime)
+      def parse_time(value, format)
+        Time.strptime(value, format)
+      end
+    else
+      def parse_time(value, format)
+        Time.parse(DateTime.strptime(value, format).to_s)
+      end
+    end
+  end
+
+  def initialize
+    @format = "text"
+    @default_opts = {
+      :delimiter_expr => /\t|,/,
+      #:line_delimiter_expr => /\r?\n/,
+      :null_expr => /\A(?:\\N|\-|)\z/,
+      #:quote_char => "\"",
+    }
+    @opts = {}
+    @parser_class = nil
+  end
+
+  attr_reader :default_opts, :opts
+  attr_accessor :parser_class
+
+  def init_optparse(op)
+    op.on('-f', '--format NAME', "source file format") {|s|
+      set_format_template(s)
+    }
+    op.on('-h', '--columns NAME,NAME,...', 'column names') {|s|
+      @opts[:column_names] = s.split(',')
+    }
+    op.on('-H', '--column-header', 'first line includes column names', TrueClass) {|b|
+      @opts[:column_header] = b
+    }
+    op.on('-d', '--delimiter REGEX', "delimiter between columns (default: #{@default_opts[:delimiter_expr].inspect[1..-2]}") {|s|
+      @opts[:delimiter_expr] = Regexp.new(s)
+    }
+    #op.on('-D', '--line-delimiter REGEX', "delimiter between rows (default: #{@default_opts[:line_delimiter_expr].inspect[1..-2]})") {|s|
+    #  @opts[:line_delimiter_expr] = Regexp.new(s)
+    #}
+    op.on('-N', '--null REGEX', "null expression (default: #{@default_opts[:null_expr].inspect[1..-2]}") {|s|
+      @opts[:null_expr] = Regexp.new(s)
+    }
+    # TODO
+    #op.on('-E', '--escape CHAR', "escape character (default: no escape character)") {|s|
+    #  @opts[:escape_char] = s
+    #}
+    #op.on('-Q', '--quote CHAR', "quote character (default: #{@default_opts[:quote_char]}") {|s|
+    #  @opts[:quote_char] = s
+    #}
+    op.on('-S', '--all-string', 'disable automatic type conversion', TrueClass) {|b|
+      @opts[:all_string] = b
+    }
+    op.on('-t', '--time-column NAME', 'name of the time column (default: auto detect)') {|s|
+      @opts[:time_column] = s
+    }
+    op.on('-T', '--time-format FORMAT', 'strftime(3) format of the time column') {|s|
+      @opts[:time_format] = s
+    }
+    op.on('-e', '--encoding NAME', "text encoding") {|s|
+      @opts[:encoding] = s
+    }
+    op.on('-C', '--compress NAME', 'compression format name [plain, gzip] (default: auto detect)') {|s|
+      @opts[:compress] = s
+    }
+  end
+
+  def set_format_template(name)
+    case name
+    when 'csv'
+      @format = 'text'
+      @opts[:delimiter_expr] = /,/
+    when 'tsv'
+      @format = 'text'
+      @opts[:delimiter_expr] = /\t/
+    #when 'apache'
+    #  @format = 'apache'
+    #  @opts[:column_names] = ['host', 'user', 'time', 'method', 'path', 'code', 'size', 'referer', 'agent']
+    #  @opts[:null_expr] = /\A(?:\-|)\z/
+    #  @opts[:time_column] = 'time'
+    #  @opts[:time_format] = '%d/%b/%Y:%H:%M:%S %z'
+    when 'msgpack'
+      @format = 'msgpack'
+    when 'json'
+      @format = 'json'
+    else
+      raise "Unknown format: #{name}"
+    end
+  end
+
+  def compose_factory
+    opts = @default_opts.merge(@opts)
+    case @format
+    when 'text'
+      Proc.new {|io,error|
+        reader = LineReader.new(io, error, opts)
+        parser = DelimiterParser.new(reader, error, opts)
+        if opts[:column_header]
+          column_names = parser.next
+        elsif opts[:column_names]
+          column_names = opts[:column_names]
+        else
+          raise "--column-header or --columns option is required"
+        end
+        unless opts[:all_string]
+          parser = AutoTypeConvertParserFilter.new(parser, error)
+        end
+        parser = HashBuilder.new(parser, error, column_names)
+        parser = TimeParserFilter.new(parser, error, opts)
+      }
+
+    #when 'apache'
+
+    when 'json'
+      Proc.new {|io,error|
+        reader = LineReader.new(io, error, opts)
+        parser = JSONParser.new(reader, error, opts)
+        if opts[:column_header]
+          column_names = parser.next
+        elsif opts[:column_names]
+          column_names = opts[:column_names]
+        end
+        if column_names
+          parser = HashBuilder.new(parser, error, column_names)
+        end
+        parser = TimeParserFilter.new(parser, error, opts)
+      }
+
+    when 'msgpack'
+      Proc.new {|io,error|
+        parser = MessagePackParsingReader.new(io, error, opts)
+        if opts[:column_header]
+          column_names = parser.next
+        elsif opts[:column_names]
+          column_names = opts[:column_names]
+        end
+        if column_names
+          parser = HashBuilder.new(parser, error, column_names)
+        end
+        parser = TimeParserFilter.new(parser, error, opts)
+      }
+    end
+  end
+
+  def parse(io, error, &block)
+    factory = compose_factory
+    parser = factory.call(io, error)
+    begin
+      while record = parser.next
+        block.call(record)
+      end
+    rescue EOFError
+    end
+  end
+
+end
+end
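
For readers skimming the new file: a minimal sketch of how FileReader is meant to be driven, mirroring what bulk_import:prepare_part does above. It assumes the gem's lib directory is on the load path; the option values and sample rows below are made up:

    require 'optparse'
    require 'stringio'
    require 'td/file_reader'

    reader = TreasureData::FileReader.new
    op = OptionParser.new
    reader.init_optparse(op)   # registers -f/--format, -h/--columns, -t/--time-column, ...
    op.parse!(%w[--format csv --columns time,uid,price --time-column time])

    # error callback: FileReader reports bad rows here instead of aborting
    error = Proc.new {|reason, data| $stderr.puts "#{reason}: #{data.inspect}" }

    io = StringIO.new("1343980800,1,100\n1343980801,2,240\n")
    reader.parse(io, error) {|record|
      p record   # => {"time"=>1343980800, "uid"=>1, "price"=>100}
    }
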
data/lib/td/version.rb CHANGED
@@ -1,5 +1,5 @@
 module TreasureData
 
-VERSION = '0.10.38'
+VERSION = '0.10.39'
 
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: td
 version: !ruby/object:Gem::Version
-  version: 0.10.38
+  version: 0.10.39
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-08-20 00:00:00.000000000 Z
+date: 2012-08-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: msgpack
@@ -163,6 +163,7 @@ files:
 - lib/td/compat_gzip_reader.rb
 - lib/td/config.rb
 - lib/td/distribution.rb
+- lib/td/file_reader.rb
 - lib/td/version.rb
 - ChangeLog
 - README.rdoc