td 0.10.38 → 0.10.39

This diff reflects the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
data/ChangeLog CHANGED
@@ -1,4 +1,9 @@
 
+ == 2012-08-07 version 0.10.39
+
+ * Added bulk_import:prepare_parts and bulk_import:upload_parts subcommands
+
+
  == 2012-08-06 version 0.10.38
 
  * *:list and *:show: subcommands show organization name if it's set
@@ -35,7 +35,7 @@ module Command
 
      client.grant_access_control(subject, action, scope, grant_option)
 
-     $stderr.puts "Access control [#{subject} #{action} #{scope}] is created #{grant_option? ? 'with' : 'without'} grant option."
+     $stderr.puts "Access control [#{subject} #{action} #{scope}] is created #{grant_option ? 'with' : 'without'} grant option."
    end
 
    def acl_revoke(op)
@@ -85,17 +85,48 @@ module Command
    end
 
    def bulk_import_upload_part(op)
-     name, part_name, path = op.cmd_parse
+     retry_limit = 10
+     retry_wait = 1
 
-     client = get_client
+     name, part_name, path = op.cmd_parse
 
-     File.open(path, "rb") {|is|
-       client.bulk_import_upload_part(name, part_name, is, is.size)
+     File.open(path, "rb") {|io|
+       bulk_import_upload_impl(name, part_name, io, io.size, retry_limit, retry_wait)
      }
 
      $stderr.puts "Part '#{part_name}' is uploaded."
    end
 
+   def bulk_import_upload_parts(op)
+     retry_limit = 10
+     retry_wait = 1
+     suffix_count = 0
+     part_prefix = ""
+
+     op.on('-P', '--prefix NAME', 'add prefix to parts name') {|s|
+       part_prefix = s
+     }
+     op.on('-s', '--use-suffix COUNT', 'use COUNT number of . (dots) in the source file name to the parts name', Integer) {|i|
+       suffix_count = i
+     }
+
+     name, *files = op.cmd_parse
+
+     files.each {|ifname|
+       basename = File.basename(ifname)
+       part_name = part_prefix + basename.split('.')[0..suffix_count].join('.')
+
+       File.open(ifname, "rb") {|io|
+         size = io.size
+         $stderr.puts "Uploading '#{ifname}' -> '#{part_name}'... (#{size} bytes)"
+
+         bulk_import_upload_impl(name, part_name, io, size, retry_limit, retry_wait)
+       }
+     }
+
+     $stderr.puts "done."
+   end
+
    def bulk_import_delete_part(op)
      name, part_name = op.cmd_parse
 
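As a rough illustration of the part-name rule in bulk_import_upload_parts above (the input file name here is hypothetical): with --prefix logs_ and --use-suffix 1, the part name keeps the first two dot-separated pieces of the file name, so:

    part_prefix  = "logs_"
    suffix_count = 1
    basename  = File.basename("data/access.2012_08_07.csv")   # => "access.2012_08_07.csv"
    part_name = part_prefix + basename.split('.')[0..suffix_count].join('.')
    # => "logs_access.2012_08_07"

With the default --use-suffix of 0, only the portion before the first dot is kept.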
@@ -203,6 +234,105 @@ module Command
      $stderr.puts "Bulk import session '#{name}' is unfrozen."
    end
 
+
+   PART_SPLIT_SIZE = 16*1024*1024
+
+   def bulk_import_prepare_part(op)
+     outdir = nil
+     split_size_kb = PART_SPLIT_SIZE / 1024 # kb
+
+     require 'td/file_reader'
+     reader = FileReader.new
+     reader.init_optparse(op)
+
+     op.on('-s', '--split-size SIZE_IN_KB', "size of each parts (default: #{split_size_kb})", Integer) {|i|
+       split_size_kb = i
+     }
+     op.on('-o', '--output DIR', 'output directory') {|s|
+       outdir = s
+     }
+
+     *files = op.cmd_parse
+
+     unless outdir
+       $stderr.puts "-o, --output DIR option is required."
+       exit 1
+     end
+
+     split_size = split_size_kb * 1024
+
+     require 'fileutils'
+     FileUtils.mkdir_p(outdir)
+
+     require 'json'
+     require 'msgpack'
+     require 'zlib'
+
+     error = Proc.new {|reason,data|
+       begin
+         $stderr.puts "#{reason}: #{data.to_json}"
+       rescue
+         $stderr.puts "#{reason}"
+       end
+     }
+
+     files.each {|ifname|
+       $stderr.puts "Processing #{ifname}..."
+       record_num = 0
+
+       basename = File.basename(ifname).split('.').first
+       File.open(ifname) {|io|
+         of_index = 0
+         out = nil
+         zout = nil
+         begin
+           reader.parse(io, error) {|record|
+             if zout == nil
+               ofname = "#{basename}_#{of_index}.msgpack.gz"
+               $stderr.puts " Preparing part \"#{basename}_#{of_index}\"..."
+               out = File.open("#{outdir}/#{ofname}", 'wb')
+               zout = Zlib::GzipWriter.new(out)
+
+               t = record['time']
+               $stderr.puts " sample: #{Time.at(t).utc} #{record.to_json}"
+             end
+
+             zout.write(record.to_msgpack)
+             record_num += 1
+
+             if out.size > split_size
+               zout.close
+               of_index += 1
+               out = nil
+               zout = nil
+             end
+           }
+         ensure
+           if zout
+             zout.close
+             zout = nil
+           end
+         end
+         $stderr.puts " #{ifname}: #{record_num} entries."
+       }
+     }
+   end
+
+   private
+   def bulk_import_upload_impl(name, part_name, io, size, retry_limit, retry_wait)
+     begin
+       client = get_client
+       client.bulk_import_upload_part(name, part_name, io, size)
+     rescue
+       if retry_limit > 0
+         retry_limit -= 1
+         $stderr.puts "#{$!}; retrying '#{part_name}'..."
+         sleep retry_wait
+         retry
+       end
+       raise
+     end
+   end
  end
  end
 
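For reference, bulk_import_prepare_part above names each output part after the input file up to its first dot and opens a new part whenever the compressed output grows past --split-size (16384 KB by default). A hypothetical input logs/data-201201-01.csv written with -o parts/ would therefore produce files along the lines of:

    parts/data-201201-01_0.msgpack.gz
    parts/data-201201-01_1.msgpack.gz
    ...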
@@ -228,7 +228,9 @@ module List
    add_list 'bulk_import:list', %w[], 'List bulk import sessions', 'bulk_import:list'
    add_list 'bulk_import:show', %w[name], 'Show list of uploaded parts', 'bulk_import:show'
    add_list 'bulk_import:create', %w[name db table], 'Create a new bulk import session to the the table', 'bulk_import:create logs_201201 example_db event_logs'
+   add_list 'bulk_import:prepare_part', %w[files_], 'Convert files into part file format', 'bulk_import:prepare_part logs/*.csv --format csv --columns time,uid,price,count --time-column "time" -o parts/'
    add_list 'bulk_import:upload_part', %w[name id path.msgpack.gz], 'Upload or re-upload a file into a bulk import session', 'bulk_import:upload_part logs_201201 01h data-201201-01.msgpack.gz'
+   add_list 'bulk_import:upload_parts', %w[name files_], 'Upload or re-upload files into a bulk import session', 'bulk_import:upload_parts parts/* --prefix logs_'
    add_list 'bulk_import:delete_part', %w[name id], 'Delete a uploaded file from a bulk import session', 'bulk_import:delete_part logs_201201 01h'
    add_list 'bulk_import:perform', %w[name], 'Start to validate and convert uploaded files', 'bulk_import:perform logs_201201'
    add_list 'bulk_import:error_records', %w[name], 'Show records which did not pass validations', 'bulk_import:error_records logs_201201'
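Taken together, the usage strings registered above suggest a workflow along these lines (the session name, paths, and column list are illustrative only):

    $ td bulk_import:prepare_part logs/*.csv --format csv --columns time,uid,price,count --time-column "time" -o parts/
    $ td bulk_import:upload_parts logs_201201 parts/* --prefix logs_
    $ td bulk_import:perform logs_201201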
data/lib/td/file_reader.rb ADDED
@@ -0,0 +1,324 @@
+
+ module TreasureData
+   class FileReader
+
+     class MessagePackParsingReader
+       def initialize(io, error, opts)
+         require 'msgpack'
+         @io = io
+         @error = error
+         @u = MessagePack::Unpacker.new(@io)
+       end
+
+       def next
+         @u.next
+       end
+     end
+
+     class LineReader
+       def initialize(io, error, opts)
+         if encoding = opts[:encoding]
+           io.external_encoding = encoding
+         end
+         #@delimiter = opts[:line_delimiter_expr] || /\r?\n/
+         @io = io
+         @error = error
+       end
+
+       def next_row
+         @io.readline($/).chomp
+       end
+     end
+
+     class DelimiterParser
+       def initialize(reader, error, opts)
+         @reader = reader
+         @delimiter_expr = opts[:delimiter_expr]
+         @null_expr = opts[:null_expr]
+         # TODO
+         #@escape_char = opts[:escape_char]
+         #@quote_char = opts[:quote_char]
+       end
+
+       def next
+         row = @reader.next_row
+         array = row.split(@delimiter_expr)
+         array.map! {|x|
+           @null_expr =~ x ? nil : x
+         }
+       end
+     end
+
+     class JSONParser
+       def initialize(reader, error, opts)
+         @reader = reader
+         @error = error
+       end
+
+       def next
+         while true
+           line = @reader.next_row
+           begin
+             return JSON.parse(line)
+           rescue
+             @error.call("invalid json format: #{$!}", line)
+             next
+           end
+         end
+       end
+     end
+
+     #class ApacheParser
+     #  REGEXP = /^([^ ]*) [^ ]* ([^ ]*) \[([^\]]*)\] "(\S+)(?: +([^ ]*) +\S*)?" ([^ ]*) ([^ ]*)(?: "([^\"]*)" "([^\"]*)")?$/
+     #
+     #  def initialize(reader, error, opts)
+     #    @reader = reader
+     #  end
+     #
+     #  def next
+     #    while true
+     #      m = REGEXP.match(@reader.next_row)
+     #      if m
+     #        h = {
+     #          'host' => m[1],
+     #          'user' => m[2],
+     #          'time' => m[3],
+     #          'method' => m[4],
+     #          'path' => m[5],
+     #          'code' => m[6],
+     #          'size' => m[7].to_i,
+     #          'referer' => m[8],
+     #          'agent' => m[9],
+     #        }
+     #        return h
+     #      end
+     #    end
+     #  end
+     #end
+
+     class AutoTypeConvertParserFilter
+       def initialize(parser, error)
+         @parser = parser
+       end
+
+       def next
+         array = @parser.next
+         array.map! {|s|
+           # nil.to_i == 0 != nil.to_s
+           i = s.to_i
+           i.to_s == s ? i : s
+         }
+       end
+     end
+
+     class HashBuilder
+       def initialize(parser, error, columns)
+         @parser = parser
+         @columns = columns
+       end
+
+       def next
+         array = @parser.next
+         # FIXME error handling
+         Hash[@columns.zip(array)]
+       end
+     end
+
+     class TimeParserFilter
+       def initialize(parser, error, opts)
+         require 'time'
+         @parser = parser
+         @error = error
+         @time_column = opts[:time_column]
+         unless @time_column
+           raise '-t, --time-column NAME option is required'
+         end
+         @time_format = opts[:time_format]
+       end
+
+       def next
+         while true
+           row = @parser.next
+           tval = row[@time_column]
+
+           unless tval
+             @error.call("time column '#{@time_column}' is missing", row)
+             next
+           end
+
+           begin
+             if tf = @time_format
+               row['time'] = parse_time(tval, tf).to_i
+             elsif tval.is_a?(Integer)
+               row['time'] = tval
+             else
+               row['time'] = Time.parse(tval).to_i
+             end
+             return row
+
+           rescue
+             @error.call("invalid time format '#{tval}': #{$!}", row)
+             next
+           end
+         end
+       end
+
+       if Time.respond_to?(:strptime)
+         def parse_time(value, format)
+           Time.strptime(value, format)
+         end
+       else
+         def parse_time(value, format)
+           Time.parse(DateTime.strptime(value, format).to_s)
+         end
+       end
+     end
+
+     def initialize
+       @format = "text"
+       @default_opts = {
+         :delimiter_expr => /\t|,/,
+         #:line_delimiter_expr => /\r?\n/,
+         :null_expr => /\A(?:\\N|\-|)\z/,
+         #:quote_char => "\"",
+       }
+       @opts = {}
+       @parser_class = nil
+     end
+
+     attr_reader :default_opts, :opts
+     attr_accessor :parser_class
+
+     def init_optparse(op)
+       op.on('-f', '--format NAME', "source file format") {|s|
+         set_format_template(s)
+       }
+       op.on('-h', '--columns NAME,NAME,...', 'column names') {|s|
+         @opts[:column_names] = s.split(',')
+       }
+       op.on('-H', '--column-header', 'first line includes column names', TrueClass) {|b|
+         @opts[:column_header] = b
+       }
+       op.on('-d', '--delimiter REGEX', "delimiter between columns (default: #{@default_opts[:delimiter_expr].inspect[1..-2]}") {|s|
+         @opts[:delimiter_expr] = Regexp.new(s)
+       }
+       #op.on('-D', '--line-delimiter REGEX', "delimiter between rows (default: #{@default_opts[:line_delimiter_expr].inspect[1..-2]})") {|s|
+       #  @opts[:line_delimiter_expr] = Regexp.new(s)
+       #}
+       op.on('-N', '--null REGEX', "null expression (default: #{@default_opts[:null_expr].inspect[1..-2]}") {|s|
+         @opts[:null_expr] = Regexp.new(s)
+       }
+       # TODO
+       #op.on('-E', '--escape CHAR', "escape character (default: no escape character)") {|s|
+       #  @opts[:escape_char] = s
+       #}
+       #op.on('-Q', '--quote CHAR', "quote character (default: #{@default_opts[:quote_char]}") {|s|
+       #  @opts[:quote_char] = s
+       #}
+       op.on('-S', '--all-string', 'disable automatic type conversion', TrueClass) {|b|
+         @opts[:all_string] = b
+       }
+       op.on('-t', '--time-column NAME', 'name of the time column (default: auto detect)') {|s|
+         @opts[:time_column] = s
+       }
+       op.on('-T', '--time-format FORMAT', 'strftime(3) format of the time column') {|s|
+         @opts[:time_format] = s
+       }
+       op.on('-e', '--encoding NAME', "text encoding") {|s|
+         @opts[:encoding] = s
+       }
+       op.on('-C', '--compress NAME', 'compression format name [plain, gzip] (default: auto detect)') {|s|
+         @opts[:compress] = s
+       }
+     end
+
+     def set_format_template(name)
+       case name
+       when 'csv'
+         @format = 'text'
+         @opts[:delimiter_expr] = /,/
+       when 'tsv'
+         @format = 'text'
+         @opts[:delimiter_expr] = /\t/
+       #when 'apache'
+       #  @format = 'apache'
+       #  @opts[:column_names] = ['host', 'user', 'time', 'method', 'path', 'code', 'size', 'referer', 'agent']
+       #  @opts[:null_expr] = /\A(?:\-|)\z/
+       #  @opts[:time_column] = 'time'
+       #  @opts[:time_format] = '%d/%b/%Y:%H:%M:%S %z'
+       when 'msgpack'
+         @format = 'msgpack'
+       when 'json'
+         @format = 'json'
+       else
+         raise "Unknown format: #{name}"
+       end
+     end
+
+     def compose_factory
+       opts = @default_opts.merge(@opts)
+       case @format
+       when 'text'
+         Proc.new {|io,error|
+           reader = LineReader.new(io, error, opts)
+           parser = DelimiterParser.new(reader, error, opts)
+           if opts[:column_header]
+             column_names = parser.next
+           elsif opts[:column_names]
+             column_names = opts[:column_names]
+           else
+             raise "--column-header or --columns option is required"
+           end
+           unless opts[:all_string]
+             parser = AutoTypeConvertParserFilter.new(parser, error)
+           end
+           parser = HashBuilder.new(parser, error, column_names)
+           parser = TimeParserFilter.new(parser, error, opts)
+         }
+
+       #when 'apache'
+
+       when 'json'
+         Proc.new {|io,error|
+           reader = LineReader.new(io, error, opts)
+           parser = JSONParser.new(reader, error, opts)
+           if opts[:column_header]
+             column_names = parser.next
+           elsif opts[:column_names]
+             column_names = opts[:column_names]
+           end
+           if column_names
+             parser = HashBuilder.new(parser, error, column_names)
+           end
+           parser = TimeParserFilter.new(parser, error, opts)
+         }
+
+       when 'msgpack'
+         Proc.new {|io,error|
+           parser = MessagePackParsingReader.new(io, error, opts)
+           if opts[:column_header]
+             column_names = parser.next
+           elsif opts[:column_names]
+             column_names = opts[:column_names]
+           end
+           if column_names
+             parser = HashBuilder.new(parser, error, column_names)
+           end
+           parser = TimeParserFilter.new(parser, error, opts)
+         }
+       end
+     end
+
+     def parse(io, error, &block)
+       factory = compose_factory
+       parser = factory.call(io, error)
+       begin
+         while record = parser.next
+           block.call(record)
+         end
+       rescue EOFError
+       end
+     end
+
+   end
+ end
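A minimal sketch of driving the new FileReader directly from Ruby, outside the CLI (the file path and column names are hypothetical; bulk_import:prepare_part wires up the same pieces through init_optparse):

    require 'td/file_reader'

    reader = TreasureData::FileReader.new
    reader.set_format_template('csv')
    reader.opts[:column_names] = ['time', 'uid', 'price', 'count']
    reader.opts[:time_column]  = 'time'

    # the error callback receives a reason string and the offending row or line
    error = Proc.new {|reason, data| $stderr.puts "#{reason}: #{data.inspect}" }

    File.open('logs/data-201201-01.csv') {|io|
      reader.parse(io, error) {|record|
        p record   # a Hash whose 'time' key holds a unix timestamp
      }
    }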
data/lib/td/version.rb CHANGED
@@ -1,5 +1,5 @@
  module TreasureData
 
- VERSION = '0.10.38'
+ VERSION = '0.10.39'
 
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: td
  version: !ruby/object:Gem::Version
-   version: 0.10.38
+   version: 0.10.39
  prerelease:
  platform: ruby
  authors:
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2012-08-20 00:00:00.000000000 Z
+ date: 2012-08-27 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: msgpack
@@ -163,6 +163,7 @@ files:
  - lib/td/compat_gzip_reader.rb
  - lib/td/config.rb
  - lib/td/distribution.rb
+ - lib/td/file_reader.rb
  - lib/td/version.rb
  - ChangeLog
  - README.rdoc