td 0.10.84 → 0.10.85

Sign up to get free protection for your applications and to get access to all the features.
data/ChangeLog CHANGED
@@ -1,3 +1,11 @@
1
+ == 2013-08-23 version 0.10.85
2
+
3
+ * Fix 'undefined method job_priority_id_of' error
4
+ * Add table:expire subcommand
5
+ * Update bulk-import-java to 0.2.1
6
+ * Add import command to support new bulk import
7
+
8
+
1
9
  == 2013-07-25 version 0.10.84
2
10
 
3
11
  * bulk_import now checks first argument is correct session name
@@ -19,10 +19,13 @@ if [ -n "$chrev" ];then
19
19
  fi
20
20
 
21
21
  revname="$(git show --pretty=format:'%H %ad' | head -n 1)"
22
- vername="0.2.0-SNAPSHOT"
22
+ vername="0.2.1"
23
23
 
24
24
  mvn package -Dmaven.test.skip=true || exit 1
25
+ echo "copy td-bulk-import-${vername}.jar"
25
26
  cp target/td-bulk-import-${vername}.jar ../../java/td-bulk-import-${vername}.jar
27
+ echo "copy logging.properties"
28
+ cp src/test/resources/java/logging.properties ../../java/logging.properties
26
29
 
27
30
  if [ -n "$chrev" ];then
28
31
  git checkout master
@@ -0,0 +1,53 @@
1
+ ############################################################
2
+ # Treasure Data BulkImport Logging Configuration File
3
+ #
4
+ # You can use a different file by specifying a filename
5
+ # with the java.util.logging.config.file system property.
6
+ # For example java -Djava.util.logging.config.file=myfile
7
+ ############################################################
8
+
9
+ ############################################################
10
+ # Global properties
11
+ ############################################################
12
+
13
+ # "handlers" specifies a comma separated list of log Handler
14
+ # classes. These handlers will be installed during VM startup.
15
+ # Note that these classes must be on the system classpath.
16
+ # By default we only configure a FileHandler, which will only
17
+ # show messages at the INFO and above levels.
18
+ handlers= java.util.logging.FileHandler
19
+
20
+ # To also add the ConsoleHandler, use the following line instead.
21
+ #handlers= java.util.logging.FileHandler, java.util.logging.ConsoleHandler
22
+
23
+ # Default global logging level.
24
+ # This specifies which kinds of events are logged across
25
+ # all loggers. For any given facility this global level
26
+ # can be overridden by a facility-specific level.
27
+ # Note that the ConsoleHandler also has a separate level
28
+ # setting to limit messages printed to the console.
29
+ .level= INFO
30
+
31
+ ############################################################
32
+ # Handler specific properties.
33
+ # Describes specific configuration info for Handlers.
34
+ ############################################################
35
+
36
+ java.util.logging.FileHandler.level = INFO
37
+ java.util.logging.FileHandler.pattern=td-bulk-import.log
38
+ java.util.logging.FileHandler.limit = 50000
39
+ java.util.logging.FileHandler.count = 1
40
+ java.util.logging.FileHandler.formatter = java.util.logging.SimpleFormatter
41
+
42
+ # Limit the messages that are printed on the console to INFO and above.
43
+ java.util.logging.ConsoleHandler.level = INFO
44
+ java.util.logging.ConsoleHandler.formatter = java.util.logging.SimpleFormatter
45
+
46
+ ############################################################
47
+ # Facility specific properties.
48
+ # Provides extra control for each logger.
49
+ ############################################################
50
+
51
+ # For example, set the com.xyz.foo logger to only log SEVERE
52
+ # messages:
53
+ com.xyz.foo.level = SEVERE
@@ -1 +1 @@
1
- 13ef573e6221c827f793ed68fea7f7e4978e7957 Fri Jun 28 16:14:15 2013 +0900
1
+ 9fab1d2cc1d026d0a355945c52ea103bb8b05e1d Wed Aug 21 15:28:09 2013 +0900
@@ -2,321 +2,213 @@
2
2
  module TreasureData
3
3
  module Command
4
4
 
5
- IMPORT_TEMPLATES = {
6
- 'apache' => [
7
- /^([^ ]*) [^ ]* ([^ ]*) \[([^\]]*)\] "(\S+)(?: +([^ ]*) +\S*)?" ([^ ]*) ([^ ]*)(?: "([^\"]*)" "([^\"]*)")?$/,
8
- ['host', 'user', 'time', 'method', 'path', 'code', 'size', 'referer', 'agent'],
9
- "%d/%b/%Y:%H:%M:%S %z"],
10
- 'syslog' => [
11
- /^([^ ]* [^ ]* [^ ]*) ([^ ]*) ([a-zA-Z0-9_\/\.\-]*)(?:\[([0-9]+)\])?[^\:]*\: *(.*)$/,
12
- ['time', 'host', 'ident', 'pid', 'message'],
13
- "%b %d %H:%M:%S"],
14
- }
15
-
16
- # TODO import-item
17
- # TODO tail
18
-
19
- def table_import(op)
20
- op.banner << "\nsupported formats:\n"
21
- op.banner << " apache\n"
22
- op.banner << " syslog\n"
23
- op.banner << " msgpack\n"
24
- op.banner << " json\n"
25
-
26
- format = 'apache'
27
- time_key = 'time'
28
- auto_create = false
29
-
30
- op.on('--format FORMAT', "file format (default: #{format})") {|s|
31
- format = s
32
- }
33
-
34
- op.on('--apache', "same as --format apache; apache common log format") {
35
- format = 'apache'
36
- }
37
-
38
- op.on('--syslog', "same as --format syslog; syslog") {
39
- format = 'syslog'
40
- }
41
-
42
- op.on('--msgpack', "same as --format msgpack; msgpack stream format") {
43
- format = 'msgpack'
44
- }
45
-
46
- op.on('--json', "same as --format json; LF-separated json format") {
47
- format = 'json'
48
- }
49
-
50
- op.on('-t', '--time-key COL_NAME', "time key name for json and msgpack format (e.g. 'created_at')") {|s|
51
- time_key = s
52
- }
53
-
54
- op.on('--auto-create-table', "Create table and database if doesn't exist", TrueClass) { |b|
55
- auto_create = b
56
- }
57
-
58
- db_name, table_name, *paths = op.cmd_parse
59
-
60
- client = get_client
61
-
62
- if auto_create
63
- # Merge with db_create and table_create after refactoring
64
- API.validate_database_name(db_name)
65
- begin
66
- client.create_database(db_name)
67
- $stderr.puts "Database '#{db_name}' is created."
68
- rescue AlreadyExistsError
69
- end
5
+ BASE_PATH = File.expand_path('../../..', File.dirname(__FILE__))
70
6
 
71
- API.validate_table_name(table_name)
72
- begin
73
- client.create_log_table(db_name, table_name)
74
- $stderr.puts "Table '#{db_name}.#{table_name}' is created."
75
- rescue AlreadyExistsError
76
- end
77
- end
7
+ JAVA_COMMAND = "java"
8
+ JAVA_COMMAND_CHECK = "#{JAVA_COMMAND} -version"
9
+ JAVA_MAIN_CLASS = "com.treasure_data.bulk_import.BulkImportMain"
10
+ JAVA_HEAP_MAX_SIZE = "-Xmx1024m" # TODO
78
11
 
79
- case format
80
- when 'json', 'msgpack'
81
- #unless time_key
82
- # $stderr.puts "-t, --time-key COL_NAME (e.g. '-t created_at') parameter is required for #{format} format"
83
- # exit 1
84
- #end
85
- if format == 'json'
86
- require 'json'
87
- require 'time'
88
- parser = JsonParser.new(time_key)
89
- else
90
- parser = MessagePackParser.new(time_key)
91
- end
12
+ APP_OPTION_PREPARE = "prepare"
13
+ APP_OPTION_UPLOAD = "upload"
92
14
 
93
- else
94
- regexp, names, time_format = IMPORT_TEMPLATES[format]
95
- if !regexp || !names || !time_format
96
- $stderr.puts "Unknown format '#{format}'"
97
- exit 1
98
- end
99
- parser = TextParser.new(names, regexp, time_format)
100
- end
15
+ def import_list(op)
16
+ require 'td/command/bulk_import'
17
+ bulk_import_list(op)
18
+ end
101
19
 
102
- get_table(client, db_name, table_name)
20
+ def import_show(op)
21
+ require 'td/command/bulk_import'
22
+ bulk_import_show(op)
23
+ end
103
24
 
104
- require 'zlib'
25
+ def import_create(op)
26
+ require 'td/command/bulk_import'
27
+ bulk_import_create(op)
28
+ end
105
29
 
106
- files = paths.map {|path|
107
- if path == '-'
108
- $stdin
109
- elsif path =~ /\.gz$/
110
- require 'td/compat_gzip_reader'
111
- Zlib::GzipReader.open(path)
112
- else
113
- File.open(path)
114
- end
115
- }
30
+ def import_prepare(op)
31
+ import_generic(APP_OPTION_PREPARE)
32
+ end
116
33
 
117
- require 'msgpack'
118
- require 'tempfile'
119
- #require 'thread'
34
+ def import_upload(op)
35
+ import_generic(APP_OPTION_UPLOAD)
36
+ end
120
37
 
121
- files.zip(paths).each {|file,path|
122
- import_log_file(file, path, client, db_name, table_name, parser)
123
- }
38
+ def import_perform(op)
39
+ require 'td/command/bulk_import'
40
+ bulk_import_perform(op)
41
+ end
42
+
43
+ def import_error_records(op)
44
+ require 'td/command/bulk_import'
45
+ bulk_import_error_records(op)
46
+ end
47
+
48
+ def import_commit(op)
49
+ require 'td/command/bulk_import'
50
+ bulk_import_commit(op)
51
+ end
52
+
53
+ def import_delete(op)
54
+ require 'td/command/bulk_import'
55
+ bulk_import_delete(op)
56
+ end
57
+
58
+ def import_freeze(op)
59
+ require 'td/command/bulk_import'
60
+ bulk_import_freeze(op)
61
+ end
124
62
 
125
- puts "done."
63
# 'import:unfreeze' subcommand: thin alias for 'bulk_import:unfreeze'.
#
# The bulk_import command file is loaded lazily so that it is only required
# when one of the import:* aliases is actually invoked.
#
# op:: the option parser object handed to every subcommand; it is passed
#      through unchanged to bulk_import_unfreeze.
def import_unfreeze(op)
  require 'td/command/bulk_import'
  # The delegate follows the same bulk_import_<name> naming as every sibling
  # wrapper (bulk_import_freeze, bulk_import_commit, ...).
  bulk_import_unfreeze(op)
end
127
67
 
128
68
  private
129
- def import_log_file(file, path, client, db_name, table_name, parser)
130
- puts "importing #{path}..."
131
-
132
- out = Tempfile.new('td-import')
133
- out.binmode if out.respond_to?(:binmode)
134
-
135
- writer = Zlib::GzipWriter.new(out)
136
-
137
- n = 0
138
- x = 0
139
- has_bignum = false
140
- parser.call(file, path) {|record|
141
- entry = begin
142
- record.to_msgpack
143
- rescue RangeError
144
- has_bignum = true
145
- TreasureData::API.normalized_msgpack(record)
146
- end
147
- writer.write entry
148
-
149
- n += 1
150
- x += 1
151
- if n % 10000 == 0
152
- puts " imported #{n} entries from #{path}..."
153
-
154
- elsif out.pos > 1024*1024 # TODO size
155
- puts " imported #{n} entries from #{path}..."
156
- begin
157
- writer.finish
158
- size = out.pos
159
- out.pos = 0
160
-
161
- puts " uploading #{size} bytes..."
162
- client.import(db_name, table_name, "msgpack.gz", out, size)
163
-
164
- out.truncate(0)
165
- out.pos = 0
166
- x = 0
167
- writer = Zlib::GzipWriter.new(out)
168
- rescue
169
- $stderr.puts " #{$!}"
170
- return 1 # TODO error
171
- end
172
- end
173
- }
69
+ def import_generic(subcmd)
70
+ # has java runtime
71
+ check_java
72
+
73
+ # show help
74
+ show_help = ARGV.size == 0 || (ARGV.size == 1 || ARGV[0] =~ /^import:/)
174
75
 
175
- if x != 0
176
- writer.finish
177
- size = out.pos
178
- out.pos = 0
76
+ # configure jvm options
77
+ jvm_opts = [ JAVA_HEAP_MAX_SIZE ]
179
78
 
180
- puts " uploading #{size} bytes..."
181
- # TODO upload on background thread
182
- client.import(db_name, table_name, "msgpack.gz", out, size)
79
+ # configure java options
80
+ java_opts = [ "-cp \"#{find_td_bulk_import_jar()}\"" ]
81
+
82
+ # configure system properties
83
+ sysprops = set_sysprops()
84
+
85
+ # configure java command-line arguments
86
+ java_args = []
87
+ java_args << JAVA_MAIN_CLASS
88
+ java_args << subcmd
89
+ if show_help
90
+ java_args << "--help"
91
+ else
92
+ java_args << ARGV
183
93
  end
184
94
 
185
- puts " imported #{n} entries from #{path}."
186
- $stderr.puts normalized_message if has_bignum
187
- ensure
188
- out.close rescue nil
189
- writer.close rescue nil
95
+ # TODO consider parameters including spaces; don't use join(' ')
96
+ cmd = "#{JAVA_COMMAND} #{jvm_opts.join(' ')} #{java_opts.join(' ')} #{sysprops.join(' ')} #{java_args.join(' ')}"
97
+ exec cmd
190
98
  end
191
99
 
192
- require 'date' # DateTime#strptime
193
- require 'time' # Time#strptime, Time#parse
100
+ private
101
+ def check_java
102
+ pid = do_fork(JAVA_COMMAND_CHECK)
103
+ pid, stat = Process.waitpid2(pid)
194
104
 
195
- class TextParser
196
- def initialize(names, regexp, time_format)
197
- @names = names
198
- @regexp = regexp
199
- @time_format = time_format
105
+ if stat.exitstatus != 0
106
+ $stderr.puts "Java is not installed. 'td import' command requires Java (version 1.6 or later). If Java is not installed yet, please use 'bulk_import' commands instead of this command."
107
+ exit 1
200
108
  end
109
+ end
201
110
 
202
- def call(file, path, &block)
203
- i = 0
204
- file.each_line {|line|
205
- i += 1
206
- begin
207
- line.rstrip!
208
- m = @regexp.match(line)
209
- unless m
210
- raise "invalid log format at #{path}:#{i}"
211
- end
212
-
213
- record = {}
214
-
215
- cap = m.captures
216
- @names.each_with_index {|name,cap_i|
217
- if value = cap[cap_i]
218
- if name == "time"
219
- value = parse_time(value).to_i
220
- end
221
- record[name] = value
222
- end
223
- }
224
-
225
- block.call(record)
226
-
227
- rescue
228
- $stderr.puts " skipped: #{$!}: #{line.dump}"
229
- end
230
- }
111
# Runs +cmd+ in a forked child process and returns the child's pid.
#
# The caller is responsible for waiting on the returned pid. The child
# replaces itself with +cmd+; should that fail, the child terminates
# immediately with exit status 127.
def do_fork(cmd)
  fork {
    begin
      exec(*cmd)
    ensure
      # Only reached when exec did not replace the child process image.
      exit!(127)
    end
  }
end
232
120
 
233
- if Time.respond_to?(:strptime)
234
- def parse_time(value)
235
- Time.strptime(value, @time_format)
236
- end
237
- else
238
- def parse_time(value)
239
- Time.parse(DateTime.strptime(value, @time_format).to_s)
240
- end
121
+ private
122
# Locates the td-bulk-import jar under "#{BASE_PATH}/java".
#
# The java directory is searched recursively for *.jar files and the first
# one (in Dir.glob order) whose file name starts with "td-bulk-import" is
# returned. When no such jar exists an error is printed to stderr and the
# process terminates with exit status 1, so callers and shell scripts can
# tell the failure apart from a successful run.
def find_td_bulk_import_jar
  jar = Dir.glob("#{BASE_PATH}/java/**/*.jar").find { |path|
    File.basename(path) =~ /^td-bulk-import/
  }
  if jar.nil?
    $stderr.puts "td-bulk-import.jar is not found."
    exit 1
  end
  jar
end
243
132
 
244
- class JsonParser
245
- def initialize(time_key)
246
- require 'json'
247
- @time_key = time_key
133
+ private
134
+ def set_sysprops
135
+ sysprops = []
136
+
137
+ # set apiserver
138
+ set_sysprops_endpoint(sysprops)
139
+
140
+ # set http_proxy
141
+ set_sysprops_http_proxy(sysprops)
142
+
143
+ # set configuration file for logging
144
+ conf_file = find_logging_conf_file
145
+ if conf_file
146
+ sysprops << "-Djava.util.logging.config.file=#{conf_file}"
248
147
  end
249
148
 
250
- def call(file, path, &block)
251
- i = 0
252
- file.each_line {|line|
253
- i += 1
254
- begin
255
- record = JSON.parse(line)
256
-
257
- unless record.is_a?(Hash)
258
- raise "record must be a Hash"
259
- end
260
-
261
- time = record[@time_key]
262
- unless time
263
- raise "record doesn't have '#{@time_key}' column"
264
- end
265
-
266
- case time
267
- when Integer
268
- # do nothing
269
- else
270
- time = Time.parse(time.to_s).to_i
271
- end
272
- record['time'] = time
273
-
274
- block.call(record)
275
-
276
- rescue
277
- $stderr.puts " skipped: #{$!}: #{line.dump}"
149
+ # set API key
150
+ sysprops << "-Dtd.api.key=#{TreasureData::Config.apikey}"
151
+
152
+ sysprops
153
+ end
154
+
155
+ private
156
+ def set_sysprops_endpoint(sysprops)
157
+ endpoint = ENV['TD_API_SERVER']
158
+ if endpoint
159
+ require 'uri'
160
+
161
+ uri = URI.parse(endpoint)
162
+
163
+ case uri.scheme
164
+ when 'http', 'https'
165
+ host = uri.host
166
+ port = uri.port
167
+ ssl = uri.scheme == 'https'
168
+
169
+ port = 80 if port == 443 and ssl
170
+ else
171
+ if uri.port
172
+ # invalid URI
173
+ raise "Invalid endpoint: #{endpoint}"
278
174
  end
279
- }
175
+
176
+ # generic URI
177
+ host, port = endpoint.split(':', 2)
178
+ port = port.to_i
179
+ # TODO support ssl
180
+ port = 80 if port == 0
181
+ ssl = false
182
+ end
183
+
184
+ sysprops << "-Dtd.api.server.host=#{host}"
185
+ sysprops << "-Dtd.api.server.port=#{port}"
280
186
  end
281
187
  end
282
188
 
283
- class MessagePackParser
284
- def initialize(time_key)
285
- require 'msgpack'
286
- @time_key = time_key
287
- end
189
+ private
190
+ def set_sysprops_http_proxy(sysprops)
191
+ http_proxy = ENV['HTTP_PROXY']
192
+ if http_proxy
193
+ if http_proxy =~ /\Ahttp:\/\/(.*)\z/
194
+ http_proxy = $~[1]
195
+ end
196
+ proxy_host, proxy_port = http_proxy.split(':', 2)
197
+ proxy_port = (proxy_port ? proxy_port.to_i : 80)
288
198
 
289
- def call(file, path, &block)
290
- i = 0
291
- MessagePack::Unpacker.new(file).each {|record|
292
- i += 1
293
- begin
294
- unless record.is_a?(Hash)
295
- raise "record must be a Hash"
296
- end
297
-
298
- time = record[@time_key]
299
- unless time
300
- raise "record doesn't have '#{@time_key}' column"
301
- end
302
-
303
- case time
304
- when Integer
305
- # do nothing
306
- else
307
- time = Time.parse(time.to_s).to_i
308
- end
309
- record['time'] = time
310
-
311
- block.call(record)
312
-
313
- rescue
314
- $stderr.puts " skipped: #{$!}: #{record.to_json}"
315
- end
316
- }
317
- rescue EOFError
199
+ sysprops << "-Dhttp.proxyHost=#{proxy_host}"
200
+ sysprops << "-Dhttp.proxyPort=#{proxy_port}"
318
201
  end
319
202
  end
203
+
204
+ private
205
# Locates the java.util.logging configuration file under "#{BASE_PATH}/java".
#
# The java directory is searched recursively for *.properties files and the
# first one (in Dir.glob order) whose file name starts with the literal text
# "logging.properties" is returned.
#
# Returns the path as a String, or nil when no such file exists.
def find_logging_conf_file
  Dir.glob("#{BASE_PATH}/java/**/*.properties").find { |path|
    # The dot is escaped so that names such as "loggingXproperties" are not
    # accepted by accident.
    File.basename(path) =~ /\Alogging\.properties/
  }
end
212
+
320
213
  end
321
214
  end
322
-
@@ -228,6 +228,7 @@ module List
228
228
  add_list 'table:swap', %w[db table1 table2], 'Swap names of two tables', 'table:swap example_db table1 table2'
229
229
  add_list 'table:tail', %w[db table], 'Get recently imported logs', 'table:tail example_db table1', 'table:tail example_db table1 -t "2011-01-02 03:04:05" -n 30'
230
230
  add_list 'table:partial_delete', %w[db table], 'Delete logs from the table within the specified time range', 'table:partial_delete example_db table1 --from 1341000000 --to 1341003600'
231
+ add_list 'table:expire', %w[db table expire_days], 'Expire data in table after specified number of days', 'table:expire example_db table1 30'
231
232
 
232
233
  add_list 'bulk_import:list', %w[], 'List bulk import sessions', 'bulk_import:list'
233
234
  add_list 'bulk_import:show', %w[name], 'Show list of uploaded parts', 'bulk_import:show'
@@ -246,6 +247,18 @@ module List
246
247
  add_list 'bulk_import:freeze', %w[name], 'Reject succeeding uploadings to a bulk import session', 'bulk_import:freeze logs_201201'
247
248
  add_list 'bulk_import:unfreeze', %w[name], 'Unfreeze a frozen bulk import session', 'bulk_import:unfreeze logs_201201'
248
249
 
250
# import:* subcommands. Each add_list call takes the command name, its
# positional arguments, a one-line description and a usage example.
# Every example supplies the arguments its own argument list declares.
add_list 'import:list', %w[], 'List bulk import sessions', 'import:list'
add_list 'import:show', %w[name], 'Show list of uploaded parts', 'import:show logs_201201'
add_list 'import:create', %w[name db table], 'Create a new bulk import session to the table', 'import:create logs_201201 example_db event_logs'
add_list 'import:prepare', %w[files_], 'Convert files into part file format', 'import:prepare logs/*.csv --format csv --columns time,uid,price,count --time-column "time" -o parts/'
add_list 'import:upload', %w[name files_], 'Upload or re-upload files into a bulk import session', 'import:upload logs_201201 parts/* --parallel 4'
add_list 'import:perform', %w[name], 'Start to validate and convert uploaded files', 'import:perform logs_201201'
add_list 'import:error_records', %w[name], 'Show records which did not pass validations', 'import:error_records logs_201201'
add_list 'import:commit', %w[name], 'Start to commit a performed bulk import session', 'import:commit logs_201201'
add_list 'import:delete', %w[name], 'Delete a bulk import session', 'import:delete logs_201201'
add_list 'import:freeze', %w[name], 'Reject succeeding uploadings to a bulk import session', 'import:freeze logs_201201'
add_list 'import:unfreeze', %w[name], 'Unfreeze a frozen bulk import session', 'import:unfreeze logs_201201'
261
+
249
262
  add_list 'result:list', %w[], 'Show list of result URLs', 'result:list', 'results'
250
263
  add_list 'result:show', %w[name], 'Describe information of a result URL', 'result mydb'
251
264
  add_list 'result:create', %w[name URL], 'Create a result URL', 'result:create mydb mysql://my-server/mydb'
@@ -413,7 +426,7 @@ module List
413
426
  add_guess 'show-job', 'job:show'
414
427
  add_guess 'show-jobs', 'job:list'
415
428
  add_guess 'server-status', 'server:status'
416
- add_alias 'import', 'table:import'
429
+ add_alias 'import', 'import:upload'
417
430
 
418
431
  finishup
419
432
  end
@@ -25,6 +25,8 @@ module Command
25
25
  end
26
26
 
27
27
  def sched_create(op)
28
+ require 'td/command/job' # job_priority_id_of
29
+
28
30
  org = nil
29
31
  db_name = nil
30
32
  timezone = nil
@@ -131,6 +133,8 @@ module Command
131
133
  end
132
134
 
133
135
  def sched_update(op)
136
+ require 'td/command/job' # job_priority_id_of
137
+
134
138
  cron = nil
135
139
  sql = nil
136
140
  db_name = nil
@@ -357,7 +357,338 @@ module Command
357
357
  end
358
358
  end
359
359
 
360
- require 'td/command/import' # table:import
360
# 'table:expire' subcommand: sets the data expiration period of a table.
#
# op:: option parser object; op.cmd_parse yields the database name, the
#      table name and the number of days after which data expires.
#
# The day count is converted with to_i, so a missing or non-numeric value
# becomes 0 and is rejected. A rejected value prints an error to stderr and
# terminates with exit status 1, like the other argument errors in this
# command set, so scripts can detect the failure.
def table_expire(op)
  db_name, table_name, expire_days = op.cmd_parse

  expire_days = expire_days.to_i
  if expire_days <= 0
    $stderr.puts "Table expiration days must be greater than 0."
    exit 1
  end

  client = get_client
  client.update_expire(db_name, table_name, expire_days)

  $stderr.puts "Table set to expire data older than #{expire_days} days."
end
374
+
375
+
376
+ IMPORT_TEMPLATES = {
377
+ 'apache' => [
378
+ /^([^ ]*) [^ ]* ([^ ]*) \[([^\]]*)\] "(\S+)(?: +([^ ]*) +\S*)?" ([^ ]*) ([^ ]*)(?: "([^\"]*)" "([^\"]*)")?$/,
379
+ ['host', 'user', 'time', 'method', 'path', 'code', 'size', 'referer', 'agent'],
380
+ "%d/%b/%Y:%H:%M:%S %z"],
381
+ 'syslog' => [
382
+ /^([^ ]* [^ ]* [^ ]*) ([^ ]*) ([a-zA-Z0-9_\/\.\-]*)(?:\[([0-9]+)\])?[^\:]*\: *(.*)$/,
383
+ ['time', 'host', 'ident', 'pid', 'message'],
384
+ "%b %d %H:%M:%S"],
385
+ }
386
+
387
+ # TODO import-item
388
+ # TODO tail
389
+
390
+ def table_import(op)
391
+ op.banner << "\nsupported formats:\n"
392
+ op.banner << " apache\n"
393
+ op.banner << " syslog\n"
394
+ op.banner << " msgpack\n"
395
+ op.banner << " json\n"
396
+
397
+ format = 'apache'
398
+ time_key = 'time'
399
+ auto_create = false
400
+
401
+ op.on('--format FORMAT', "file format (default: #{format})") {|s|
402
+ format = s
403
+ }
404
+
405
+ op.on('--apache', "same as --format apache; apache common log format") {
406
+ format = 'apache'
407
+ }
408
+
409
+ op.on('--syslog', "same as --format syslog; syslog") {
410
+ format = 'syslog'
411
+ }
412
+
413
+ op.on('--msgpack', "same as --format msgpack; msgpack stream format") {
414
+ format = 'msgpack'
415
+ }
416
+
417
+ op.on('--json', "same as --format json; LF-separated json format") {
418
+ format = 'json'
419
+ }
420
+
421
+ op.on('-t', '--time-key COL_NAME', "time key name for json and msgpack format (e.g. 'created_at')") {|s|
422
+ time_key = s
423
+ }
424
+
425
+ op.on('--auto-create-table', "Create table and database if doesn't exist", TrueClass) { |b|
426
+ auto_create = b
427
+ }
428
+
429
+ db_name, table_name, *paths = op.cmd_parse
430
+
431
+ client = get_client
432
+
433
+ if auto_create
434
+ # Merge with db_create and table_create after refactoring
435
+ API.validate_database_name(db_name)
436
+ begin
437
+ client.create_database(db_name)
438
+ $stderr.puts "Database '#{db_name}' is created."
439
+ rescue AlreadyExistsError
440
+ end
441
+
442
+ API.validate_table_name(table_name)
443
+ begin
444
+ client.create_log_table(db_name, table_name)
445
+ $stderr.puts "Table '#{db_name}.#{table_name}' is created."
446
+ rescue AlreadyExistsError
447
+ end
448
+ end
449
+
450
+ case format
451
+ when 'json', 'msgpack'
452
+ #unless time_key
453
+ # $stderr.puts "-t, --time-key COL_NAME (e.g. '-t created_at') parameter is required for #{format} format"
454
+ # exit 1
455
+ #end
456
+ if format == 'json'
457
+ require 'json'
458
+ require 'time'
459
+ parser = JsonParser.new(time_key)
460
+ else
461
+ parser = MessagePackParser.new(time_key)
462
+ end
463
+
464
+ else
465
+ regexp, names, time_format = IMPORT_TEMPLATES[format]
466
+ if !regexp || !names || !time_format
467
+ $stderr.puts "Unknown format '#{format}'"
468
+ exit 1
469
+ end
470
+ parser = TextParser.new(names, regexp, time_format)
471
+ end
472
+
473
+ get_table(client, db_name, table_name)
474
+
475
+ require 'zlib'
476
+
477
+ files = paths.map {|path|
478
+ if path == '-'
479
+ $stdin
480
+ elsif path =~ /\.gz$/
481
+ require 'td/compat_gzip_reader'
482
+ Zlib::GzipReader.open(path)
483
+ else
484
+ File.open(path)
485
+ end
486
+ }
487
+
488
+ require 'msgpack'
489
+ require 'tempfile'
490
+ #require 'thread'
491
+
492
+ files.zip(paths).each {|file,path|
493
+ import_log_file(file, path, client, db_name, table_name, parser)
494
+ }
495
+
496
+ puts "done."
497
+ end
498
+
499
+ private
500
+ def import_log_file(file, path, client, db_name, table_name, parser)
501
+ puts "importing #{path}..."
502
+
503
+ out = Tempfile.new('td-import')
504
+ out.binmode if out.respond_to?(:binmode)
505
+
506
+ writer = Zlib::GzipWriter.new(out)
507
+
508
+ n = 0
509
+ x = 0
510
+ has_bignum = false
511
+ parser.call(file, path) {|record|
512
+ entry = begin
513
+ record.to_msgpack
514
+ rescue RangeError
515
+ has_bignum = true
516
+ TreasureData::API.normalized_msgpack(record)
517
+ end
518
+ writer.write entry
519
+
520
+ n += 1
521
+ x += 1
522
+ if n % 10000 == 0
523
+ puts " imported #{n} entries from #{path}..."
524
+
525
+ elsif out.pos > 1024*1024 # TODO size
526
+ puts " imported #{n} entries from #{path}..."
527
+ begin
528
+ writer.finish
529
+ size = out.pos
530
+ out.pos = 0
531
+
532
+ puts " uploading #{size} bytes..."
533
+ client.import(db_name, table_name, "msgpack.gz", out, size)
534
+
535
+ out.truncate(0)
536
+ out.pos = 0
537
+ x = 0
538
+ writer = Zlib::GzipWriter.new(out)
539
+ rescue
540
+ $stderr.puts " #{$!}"
541
+ return 1 # TODO error
542
+ end
543
+ end
544
+ }
545
+
546
+ if x != 0
547
+ writer.finish
548
+ size = out.pos
549
+ out.pos = 0
550
+
551
+ puts " uploading #{size} bytes..."
552
+ # TODO upload on background thread
553
+ client.import(db_name, table_name, "msgpack.gz", out, size)
554
+ end
555
+
556
+ puts " imported #{n} entries from #{path}."
557
+ $stderr.puts normalized_message if has_bignum
558
+ ensure
559
+ out.close rescue nil
560
+ writer.close rescue nil
561
+ end
562
+
563
+ require 'date' # DateTime#strptime
564
+ require 'time' # Time#strptime, Time#parse
565
+
566
+ class TextParser
567
+ def initialize(names, regexp, time_format)
568
+ @names = names
569
+ @regexp = regexp
570
+ @time_format = time_format
571
+ end
572
+
573
+ def call(file, path, &block)
574
+ i = 0
575
+ file.each_line {|line|
576
+ i += 1
577
+ begin
578
+ line.rstrip!
579
+ m = @regexp.match(line)
580
+ unless m
581
+ raise "invalid log format at #{path}:#{i}"
582
+ end
583
+
584
+ record = {}
585
+
586
+ cap = m.captures
587
+ @names.each_with_index {|name,cap_i|
588
+ if value = cap[cap_i]
589
+ if name == "time"
590
+ value = parse_time(value).to_i
591
+ end
592
+ record[name] = value
593
+ end
594
+ }
595
+
596
+ block.call(record)
597
+
598
+ rescue
599
+ $stderr.puts " skipped: #{$!}: #{line.dump}"
600
+ end
601
+ }
602
+ end
603
+
604
+ if Time.respond_to?(:strptime)
605
+ def parse_time(value)
606
+ Time.strptime(value, @time_format)
607
+ end
608
+ else
609
+ def parse_time(value)
610
+ Time.parse(DateTime.strptime(value, @time_format).to_s)
611
+ end
612
+ end
613
+ end
614
+
615
+ class JsonParser
616
+ def initialize(time_key)
617
+ require 'json'
618
+ @time_key = time_key
619
+ end
620
+
621
+ def call(file, path, &block)
622
+ i = 0
623
+ file.each_line {|line|
624
+ i += 1
625
+ begin
626
+ record = JSON.parse(line)
627
+
628
+ unless record.is_a?(Hash)
629
+ raise "record must be a Hash"
630
+ end
631
+
632
+ time = record[@time_key]
633
+ unless time
634
+ raise "record doesn't have '#{@time_key}' column"
635
+ end
636
+
637
+ case time
638
+ when Integer
639
+ # do nothing
640
+ else
641
+ time = Time.parse(time.to_s).to_i
642
+ end
643
+ record['time'] = time
644
+
645
+ block.call(record)
646
+
647
+ rescue
648
+ $stderr.puts " skipped: #{$!}: #{line.dump}"
649
+ end
650
+ }
651
+ end
652
+ end
653
+
654
+ class MessagePackParser
655
+ def initialize(time_key)
656
+ require 'msgpack'
657
+ @time_key = time_key
658
+ end
659
+
660
+ def call(file, path, &block)
661
+ i = 0
662
+ MessagePack::Unpacker.new(file).each {|record|
663
+ i += 1
664
+ begin
665
+ unless record.is_a?(Hash)
666
+ raise "record must be a Hash"
667
+ end
668
+
669
+ time = record[@time_key]
670
+ unless time
671
+ raise "record doesn't have '#{@time_key}' column"
672
+ end
673
+
674
+ case time
675
+ when Integer
676
+ # do nothing
677
+ else
678
+ time = Time.parse(time.to_s).to_i
679
+ end
680
+ record['time'] = time
681
+
682
+ block.call(record)
683
+
684
+ rescue
685
+ $stderr.puts " skipped: #{$!}: #{record.to_json}"
686
+ end
687
+ }
688
+ rescue EOFError
689
+ end
690
+ end
691
+
361
692
  require 'td/command/export' # table:export
362
693
  require 'td/command/job' # wait_job
363
694
  end
data/lib/td/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module TreasureData
2
2
 
3
- VERSION = '0.10.84'
3
+ VERSION = '0.10.85'
4
4
 
5
5
  end
metadata CHANGED
@@ -1,32 +1,34 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: td
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.84
4
+ version: 0.10.85
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Treasure Data, Inc.
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-07-25 00:00:00.000000000 Z
12
+ date: 2013-08-23 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: msgpack
15
16
  requirement: !ruby/object:Gem::Requirement
17
+ none: false
16
18
  requirements:
17
- - - '>='
19
+ - - ! '>='
18
20
  - !ruby/object:Gem::Version
19
21
  version: 0.4.4
20
- - - '!='
22
+ - - ! '!='
21
23
  - !ruby/object:Gem::Version
22
24
  version: 0.5.0
23
- - - '!='
25
+ - - ! '!='
24
26
  - !ruby/object:Gem::Version
25
27
  version: 0.5.1
26
- - - '!='
28
+ - - ! '!='
27
29
  - !ruby/object:Gem::Version
28
30
  version: 0.5.2
29
- - - '!='
31
+ - - ! '!='
30
32
  - !ruby/object:Gem::Version
31
33
  version: 0.5.3
32
34
  - - <
@@ -35,20 +37,21 @@ dependencies:
35
37
  type: :runtime
36
38
  prerelease: false
37
39
  version_requirements: !ruby/object:Gem::Requirement
40
+ none: false
38
41
  requirements:
39
- - - '>='
42
+ - - ! '>='
40
43
  - !ruby/object:Gem::Version
41
44
  version: 0.4.4
42
- - - '!='
45
+ - - ! '!='
43
46
  - !ruby/object:Gem::Version
44
47
  version: 0.5.0
45
- - - '!='
48
+ - - ! '!='
46
49
  - !ruby/object:Gem::Version
47
50
  version: 0.5.1
48
- - - '!='
51
+ - - ! '!='
49
52
  - !ruby/object:Gem::Version
50
53
  version: 0.5.2
51
- - - '!='
54
+ - - ! '!='
52
55
  - !ruby/object:Gem::Version
53
56
  version: 0.5.3
54
57
  - - <
@@ -57,6 +60,7 @@ dependencies:
57
60
  - !ruby/object:Gem::Dependency
58
61
  name: yajl-ruby
59
62
  requirement: !ruby/object:Gem::Requirement
63
+ none: false
60
64
  requirements:
61
65
  - - ~>
62
66
  - !ruby/object:Gem::Version
@@ -64,6 +68,7 @@ dependencies:
64
68
  type: :runtime
65
69
  prerelease: false
66
70
  version_requirements: !ruby/object:Gem::Requirement
71
+ none: false
67
72
  requirements:
68
73
  - - ~>
69
74
  - !ruby/object:Gem::Version
@@ -71,20 +76,23 @@ dependencies:
71
76
  - !ruby/object:Gem::Dependency
72
77
  name: hirb
73
78
  requirement: !ruby/object:Gem::Requirement
79
+ none: false
74
80
  requirements:
75
- - - '>='
81
+ - - ! '>='
76
82
  - !ruby/object:Gem::Version
77
83
  version: 0.4.5
78
84
  type: :runtime
79
85
  prerelease: false
80
86
  version_requirements: !ruby/object:Gem::Requirement
87
+ none: false
81
88
  requirements:
82
- - - '>='
89
+ - - ! '>='
83
90
  - !ruby/object:Gem::Version
84
91
  version: 0.4.5
85
92
  - !ruby/object:Gem::Dependency
86
93
  name: parallel
87
94
  requirement: !ruby/object:Gem::Requirement
95
+ none: false
88
96
  requirements:
89
97
  - - ~>
90
98
  - !ruby/object:Gem::Version
@@ -92,6 +100,7 @@ dependencies:
92
100
  type: :runtime
93
101
  prerelease: false
94
102
  version_requirements: !ruby/object:Gem::Requirement
103
+ none: false
95
104
  requirements:
96
105
  - - ~>
97
106
  - !ruby/object:Gem::Version
@@ -99,6 +108,7 @@ dependencies:
99
108
  - !ruby/object:Gem::Dependency
100
109
  name: td-client
101
110
  requirement: !ruby/object:Gem::Requirement
111
+ none: false
102
112
  requirements:
103
113
  - - ~>
104
114
  - !ruby/object:Gem::Version
@@ -106,6 +116,7 @@ dependencies:
106
116
  type: :runtime
107
117
  prerelease: false
108
118
  version_requirements: !ruby/object:Gem::Requirement
119
+ none: false
109
120
  requirements:
110
121
  - - ~>
111
122
  - !ruby/object:Gem::Version
@@ -113,6 +124,7 @@ dependencies:
113
124
  - !ruby/object:Gem::Dependency
114
125
  name: td-logger
115
126
  requirement: !ruby/object:Gem::Requirement
127
+ none: false
116
128
  requirements:
117
129
  - - ~>
118
130
  - !ruby/object:Gem::Version
@@ -120,6 +132,7 @@ dependencies:
120
132
  type: :runtime
121
133
  prerelease: false
122
134
  version_requirements: !ruby/object:Gem::Requirement
135
+ none: false
123
136
  requirements:
124
137
  - - ~>
125
138
  - !ruby/object:Gem::Version
@@ -127,6 +140,7 @@ dependencies:
127
140
  - !ruby/object:Gem::Dependency
128
141
  name: rubyzip
129
142
  requirement: !ruby/object:Gem::Requirement
143
+ none: false
130
144
  requirements:
131
145
  - - ~>
132
146
  - !ruby/object:Gem::Version
@@ -134,6 +148,7 @@ dependencies:
134
148
  type: :runtime
135
149
  prerelease: false
136
150
  version_requirements: !ruby/object:Gem::Requirement
151
+ none: false
137
152
  requirements:
138
153
  - - ~>
139
154
  - !ruby/object:Gem::Version
@@ -141,6 +156,7 @@ dependencies:
141
156
  - !ruby/object:Gem::Dependency
142
157
  name: rake
143
158
  requirement: !ruby/object:Gem::Requirement
159
+ none: false
144
160
  requirements:
145
161
  - - ~>
146
162
  - !ruby/object:Gem::Version
@@ -148,6 +164,7 @@ dependencies:
148
164
  type: :development
149
165
  prerelease: false
150
166
  version_requirements: !ruby/object:Gem::Requirement
167
+ none: false
151
168
  requirements:
152
169
  - - ~>
153
170
  - !ruby/object:Gem::Version
@@ -155,6 +172,7 @@ dependencies:
155
172
  - !ruby/object:Gem::Dependency
156
173
  name: rspec
157
174
  requirement: !ruby/object:Gem::Requirement
175
+ none: false
158
176
  requirements:
159
177
  - - ~>
160
178
  - !ruby/object:Gem::Version
@@ -162,6 +180,7 @@ dependencies:
162
180
  type: :development
163
181
  prerelease: false
164
182
  version_requirements: !ruby/object:Gem::Requirement
183
+ none: false
165
184
  requirements:
166
185
  - - ~>
167
186
  - !ruby/object:Gem::Version
@@ -169,6 +188,7 @@ dependencies:
169
188
  - !ruby/object:Gem::Dependency
170
189
  name: simplecov
171
190
  requirement: !ruby/object:Gem::Requirement
191
+ none: false
172
192
  requirements:
173
193
  - - ~>
174
194
  - !ruby/object:Gem::Version
@@ -176,6 +196,7 @@ dependencies:
176
196
  type: :development
177
197
  prerelease: false
178
198
  version_requirements: !ruby/object:Gem::Requirement
199
+ none: false
179
200
  requirements:
180
201
  - - ~>
181
202
  - !ruby/object:Gem::Version
@@ -209,7 +230,8 @@ files:
209
230
  - dist/resources/pkg/postinstall
210
231
  - dist/resources/pkg/ruby-2.0.0-p0.pkg
211
232
  - dist/resources/pkg/td
212
- - java/td-bulk-import-0.2.0-SNAPSHOT.jar
233
+ - java/logging.properties
234
+ - java/td-bulk-import-0.2.1.jar
213
235
  - java/td-bulk-import-java.version
214
236
  - lib/td.rb
215
237
  - lib/td/command/account.rb
@@ -259,26 +281,33 @@ files:
259
281
  - td.gemspec
260
282
  homepage: http://treasure-data.com/
261
283
  licenses: []
262
- metadata: {}
263
284
  post_install_message:
264
285
  rdoc_options: []
265
286
  require_paths:
266
287
  - lib
267
288
  required_ruby_version: !ruby/object:Gem::Requirement
289
+ none: false
268
290
  requirements:
269
- - - '>='
291
+ - - ! '>='
270
292
  - !ruby/object:Gem::Version
271
293
  version: '0'
294
+ segments:
295
+ - 0
296
+ hash: -2194512597141251482
272
297
  required_rubygems_version: !ruby/object:Gem::Requirement
298
+ none: false
273
299
  requirements:
274
- - - '>='
300
+ - - ! '>='
275
301
  - !ruby/object:Gem::Version
276
302
  version: '0'
303
+ segments:
304
+ - 0
305
+ hash: -2194512597141251482
277
306
  requirements: []
278
307
  rubyforge_project:
279
- rubygems_version: 2.0.2
308
+ rubygems_version: 1.8.23
280
309
  signing_key:
281
- specification_version: 4
310
+ specification_version: 3
282
311
  summary: CLI to manage data on Treasure Data, the Hadoop-based cloud data warehousing
283
312
  test_files:
284
313
  - spec/file_reader/filter_spec.rb
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: c7c60a874b7228266a8436132aae4d0e2438b3e5
4
- data.tar.gz: 9caacd3a0d3065d8fd4dca7bc7bae1cd29b9bbd5
5
- SHA512:
6
- metadata.gz: baa729bcbfb73760f2c93cc70e6fefccc69b1cae9a41c62e7c869372bf4124975649e0abffb5db15b79156bfb59e59ab52875634bf179da86fab129979c7cdab
7
- data.tar.gz: 10328c283f1b8a9a3e4d2b9ad23187da08b0b0391ae237fdd5fbe8ad4c610784f0d8a2f2ca8d00c758f5afac757426965545e5811b406df943769a826b9a27d7