td 0.10.84 → 0.10.85

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ChangeLog CHANGED
@@ -1,3 +1,11 @@
1
+ == 2013-08-23 version 0.10.85
2
+
3
+ * Fix 'undefined method job_priority_id_of' error
4
+ * Add table:expire_days subcommand
5
+ * Update bulk-import-java to 0.2.1
6
+ * Add import command to support new bulk import
7
+
8
+
1
9
  == 2013-07-25 version 0.10.84
2
10
 
3
11
  * bulk_import now checks first argument is correct session name
@@ -19,10 +19,13 @@ if [ -n "$chrev" ];then
19
19
  fi
20
20
 
21
21
  revname="$(git show --pretty=format:'%H %ad' | head -n 1)"
22
- vername="0.2.0-SNAPSHOT"
22
+ vername="0.2.1"
23
23
 
24
24
  mvn package -Dmaven.test.skip=true || exit 1
25
+ echo "copy td-bulk-import-${vername}.jar"
25
26
  cp target/td-bulk-import-${vername}.jar ../../java/td-bulk-import-${vername}.jar
27
+ echo "copy logging.properties"
28
+ cp src/test/resources/java/logging.properties ../../java/logging.properties
26
29
 
27
30
  if [ -n "$chrev" ];then
28
31
  git checkout master
@@ -0,0 +1,53 @@
1
+ ############################################################
2
+ # Treasure Data BulkImport Logging Configuration File
3
+ #
4
+ # You can use a different file by specifying a filename
5
+ # with the java.util.logging.config.file system property.
6
+ # For example java -Djava.util.logging.config.file=myfile
7
+ ############################################################
8
+
9
+ ############################################################
10
+ # Global properties
11
+ ############################################################
12
+
13
+ # "handlers" specifies a comma separated list of log Handler
14
+ # classes. These handlers will be installed during VM startup.
15
+ # Note that these classes must be on the system classpath.
16
+ # By default we only configure a FileHandler, which will only
17
+ # show messages at the INFO and above levels.
18
+ handlers= java.util.logging.FileHandler
19
+
20
+ # To also add the ConsoleHandler, use the following line instead.
21
+ #handlers= java.util.logging.FileHandler, java.util.logging.ConsoleHandler
22
+
23
+ # Default global logging level.
24
+ # This specifies which kinds of events are logged across
25
+ # all loggers. For any given facility this global level
26
+ # can be overridden by a facility specific level
27
+ # Note that the ConsoleHandler also has a separate level
28
+ # setting to limit messages printed to the console.
29
+ .level= INFO
30
+
31
+ ############################################################
32
+ # Handler specific properties.
33
+ # Describes specific configuration info for Handlers.
34
+ ############################################################
35
+
36
+ java.util.logging.FileHandler.level = INFO
37
+ java.util.logging.FileHandler.pattern=td-bulk-import.log
38
+ java.util.logging.FileHandler.limit = 50000
39
+ java.util.logging.FileHandler.count = 1
40
+ java.util.logging.FileHandler.formatter = java.util.logging.SimpleFormatter
41
+
42
+ # Limit the messages that are printed on the console to INFO and above.
43
+ java.util.logging.ConsoleHandler.level = INFO
44
+ java.util.logging.ConsoleHandler.formatter = java.util.logging.SimpleFormatter
45
+
46
+ ############################################################
47
+ # Facility specific properties.
48
+ # Provides extra control for each logger.
49
+ ############################################################
50
+
51
+ # For example, set the com.xyz.foo logger to only log SEVERE
52
+ # messages:
53
+ com.xyz.foo.level = SEVERE
@@ -1 +1 @@
1
- 13ef573e6221c827f793ed68fea7f7e4978e7957 Fri Jun 28 16:14:15 2013 +0900
1
+ 9fab1d2cc1d026d0a355945c52ea103bb8b05e1d Wed Aug 21 15:28:09 2013 +0900
@@ -2,321 +2,213 @@
2
2
  module TreasureData
3
3
  module Command
4
4
 
5
- IMPORT_TEMPLATES = {
6
- 'apache' => [
7
- /^([^ ]*) [^ ]* ([^ ]*) \[([^\]]*)\] "(\S+)(?: +([^ ]*) +\S*)?" ([^ ]*) ([^ ]*)(?: "([^\"]*)" "([^\"]*)")?$/,
8
- ['host', 'user', 'time', 'method', 'path', 'code', 'size', 'referer', 'agent'],
9
- "%d/%b/%Y:%H:%M:%S %z"],
10
- 'syslog' => [
11
- /^([^ ]* [^ ]* [^ ]*) ([^ ]*) ([a-zA-Z0-9_\/\.\-]*)(?:\[([0-9]+)\])?[^\:]*\: *(.*)$/,
12
- ['time', 'host', 'ident', 'pid', 'message'],
13
- "%b %d %H:%M:%S"],
14
- }
15
-
16
- # TODO import-item
17
- # TODO tail
18
-
19
- def table_import(op)
20
- op.banner << "\nsupported formats:\n"
21
- op.banner << " apache\n"
22
- op.banner << " syslog\n"
23
- op.banner << " msgpack\n"
24
- op.banner << " json\n"
25
-
26
- format = 'apache'
27
- time_key = 'time'
28
- auto_create = false
29
-
30
- op.on('--format FORMAT', "file format (default: #{format})") {|s|
31
- format = s
32
- }
33
-
34
- op.on('--apache', "same as --format apache; apache common log format") {
35
- format = 'apache'
36
- }
37
-
38
- op.on('--syslog', "same as --format syslog; syslog") {
39
- format = 'syslog'
40
- }
41
-
42
- op.on('--msgpack', "same as --format msgpack; msgpack stream format") {
43
- format = 'msgpack'
44
- }
45
-
46
- op.on('--json', "same as --format json; LF-separated json format") {
47
- format = 'json'
48
- }
49
-
50
- op.on('-t', '--time-key COL_NAME', "time key name for json and msgpack format (e.g. 'created_at')") {|s|
51
- time_key = s
52
- }
53
-
54
- op.on('--auto-create-table', "Create table and database if doesn't exist", TrueClass) { |b|
55
- auto_create = b
56
- }
57
-
58
- db_name, table_name, *paths = op.cmd_parse
59
-
60
- client = get_client
61
-
62
- if auto_create
63
- # Merge with db_create and table_create after refactoring
64
- API.validate_database_name(db_name)
65
- begin
66
- client.create_database(db_name)
67
- $stderr.puts "Database '#{db_name}' is created."
68
- rescue AlreadyExistsError
69
- end
5
+ BASE_PATH = File.expand_path('../../..', File.dirname(__FILE__))
70
6
 
71
- API.validate_table_name(table_name)
72
- begin
73
- client.create_log_table(db_name, table_name)
74
- $stderr.puts "Table '#{db_name}.#{table_name}' is created."
75
- rescue AlreadyExistsError
76
- end
77
- end
7
+ JAVA_COMMAND = "java"
8
+ JAVA_COMMAND_CHECK = "#{JAVA_COMMAND} -version"
9
+ JAVA_MAIN_CLASS = "com.treasure_data.bulk_import.BulkImportMain"
10
+ JAVA_HEAP_MAX_SIZE = "-Xmx1024m" # TODO
78
11
 
79
- case format
80
- when 'json', 'msgpack'
81
- #unless time_key
82
- # $stderr.puts "-t, --time-key COL_NAME (e.g. '-t created_at') parameter is required for #{format} format"
83
- # exit 1
84
- #end
85
- if format == 'json'
86
- require 'json'
87
- require 'time'
88
- parser = JsonParser.new(time_key)
89
- else
90
- parser = MessagePackParser.new(time_key)
91
- end
12
+ APP_OPTION_PREPARE = "prepare"
13
+ APP_OPTION_UPLOAD = "upload"
92
14
 
93
- else
94
- regexp, names, time_format = IMPORT_TEMPLATES[format]
95
- if !regexp || !names || !time_format
96
- $stderr.puts "Unknown format '#{format}'"
97
- exit 1
98
- end
99
- parser = TextParser.new(names, regexp, time_format)
100
- end
15
+ def import_list(op)
16
+ require 'td/command/bulk_import'
17
+ bulk_import_list(op)
18
+ end
101
19
 
102
- get_table(client, db_name, table_name)
20
+ def import_show(op)
21
+ require 'td/command/bulk_import'
22
+ bulk_import_show(op)
23
+ end
103
24
 
104
- require 'zlib'
25
+ def import_create(op)
26
+ require 'td/command/bulk_import'
27
+ bulk_import_create(op)
28
+ end
105
29
 
106
- files = paths.map {|path|
107
- if path == '-'
108
- $stdin
109
- elsif path =~ /\.gz$/
110
- require 'td/compat_gzip_reader'
111
- Zlib::GzipReader.open(path)
112
- else
113
- File.open(path)
114
- end
115
- }
30
+ def import_prepare(op)
31
+ import_generic(APP_OPTION_PREPARE)
32
+ end
116
33
 
117
- require 'msgpack'
118
- require 'tempfile'
119
- #require 'thread'
34
+ def import_upload(op)
35
+ import_generic(APP_OPTION_UPLOAD)
36
+ end
120
37
 
121
- files.zip(paths).each {|file,path|
122
- import_log_file(file, path, client, db_name, table_name, parser)
123
- }
38
+ def import_perform(op)
39
+ require 'td/command/bulk_import'
40
+ bulk_import_perform(op)
41
+ end
42
+
43
+ def import_error_records(op)
44
+ require 'td/command/bulk_import'
45
+ bulk_import_error_records(op)
46
+ end
47
+
48
+ def import_commit(op)
49
+ require 'td/command/bulk_import'
50
+ bulk_import_commit(op)
51
+ end
52
+
53
+ def import_delete(op)
54
+ require 'td/command/bulk_import'
55
+ bulk_import_delete(op)
56
+ end
57
+
58
+ def import_freeze(op)
59
+ require 'td/command/bulk_import'
60
+ bulk_import_freeze(op)
61
+ end
124
62
 
125
- puts "done."
63
+ def import_unfreeze(op)
64
+ require 'td/command/bulk_import'
65
+ bulk_import_unfreeze(op)
126
66
  end
127
67
 
128
68
  private
129
- def import_log_file(file, path, client, db_name, table_name, parser)
130
- puts "importing #{path}..."
131
-
132
- out = Tempfile.new('td-import')
133
- out.binmode if out.respond_to?(:binmode)
134
-
135
- writer = Zlib::GzipWriter.new(out)
136
-
137
- n = 0
138
- x = 0
139
- has_bignum = false
140
- parser.call(file, path) {|record|
141
- entry = begin
142
- record.to_msgpack
143
- rescue RangeError
144
- has_bignum = true
145
- TreasureData::API.normalized_msgpack(record)
146
- end
147
- writer.write entry
148
-
149
- n += 1
150
- x += 1
151
- if n % 10000 == 0
152
- puts " imported #{n} entries from #{path}..."
153
-
154
- elsif out.pos > 1024*1024 # TODO size
155
- puts " imported #{n} entries from #{path}..."
156
- begin
157
- writer.finish
158
- size = out.pos
159
- out.pos = 0
160
-
161
- puts " uploading #{size} bytes..."
162
- client.import(db_name, table_name, "msgpack.gz", out, size)
163
-
164
- out.truncate(0)
165
- out.pos = 0
166
- x = 0
167
- writer = Zlib::GzipWriter.new(out)
168
- rescue
169
- $stderr.puts " #{$!}"
170
- return 1 # TODO error
171
- end
172
- end
173
- }
69
+ def import_generic(subcmd)
70
+ # has java runtime
71
+ check_java
72
+
73
+ # show help
74
+ show_help = ARGV.size == 0 || (ARGV.size == 1 || ARGV[0] =~ /^import:/)
174
75
 
175
- if x != 0
176
- writer.finish
177
- size = out.pos
178
- out.pos = 0
76
+ # configure jvm options
77
+ jvm_opts = [ JAVA_HEAP_MAX_SIZE ]
179
78
 
180
- puts " uploading #{size} bytes..."
181
- # TODO upload on background thread
182
- client.import(db_name, table_name, "msgpack.gz", out, size)
79
+ # configure java options
80
+ java_opts = [ "-cp \"#{find_td_bulk_import_jar()}\"" ]
81
+
82
+ # configure system properties
83
+ sysprops = set_sysprops()
84
+
85
+ # configure java command-line arguments
86
+ java_args = []
87
+ java_args << JAVA_MAIN_CLASS
88
+ java_args << subcmd
89
+ if show_help
90
+ java_args << "--help"
91
+ else
92
+ java_args << ARGV
183
93
  end
184
94
 
185
- puts " imported #{n} entries from #{path}."
186
- $stderr.puts normalized_message if has_bignum
187
- ensure
188
- out.close rescue nil
189
- writer.close rescue nil
95
+ # TODO consider parameters including spaces; don't use join(' ')
96
+ cmd = "#{JAVA_COMMAND} #{jvm_opts.join(' ')} #{java_opts.join(' ')} #{sysprops.join(' ')} #{java_args.join(' ')}"
97
+ exec cmd
190
98
  end
191
99
 
192
- require 'date' # DateTime#strptime
193
- require 'time' # Time#strptime, Time#parse
100
+ private
101
+ def check_java
102
+ pid = do_fork(JAVA_COMMAND_CHECK)
103
+ pid, stat = Process.waitpid2(pid)
194
104
 
195
- class TextParser
196
- def initialize(names, regexp, time_format)
197
- @names = names
198
- @regexp = regexp
199
- @time_format = time_format
105
+ if stat.exitstatus != 0
106
+ $stderr.puts "Java is not installed. 'td import' command requires Java (version 1.6 or later). If Java is not installed yet, please use 'bulk_import' commands instead of this command."
107
+ exit 1
200
108
  end
109
+ end
201
110
 
202
- def call(file, path, &block)
203
- i = 0
204
- file.each_line {|line|
205
- i += 1
206
- begin
207
- line.rstrip!
208
- m = @regexp.match(line)
209
- unless m
210
- raise "invalid log format at #{path}:#{i}"
211
- end
212
-
213
- record = {}
214
-
215
- cap = m.captures
216
- @names.each_with_index {|name,cap_i|
217
- if value = cap[cap_i]
218
- if name == "time"
219
- value = parse_time(value).to_i
220
- end
221
- record[name] = value
222
- end
223
- }
224
-
225
- block.call(record)
226
-
227
- rescue
228
- $stderr.puts " skipped: #{$!}: #{line.dump}"
229
- end
230
- }
111
+ def do_fork(cmd)
112
+ Process.fork do
113
+ begin
114
+ Process.exec(*cmd)
115
+ ensure
116
+ exit! 127
117
+ end
231
118
  end
119
+ end
232
120
 
233
- if Time.respond_to?(:strptime)
234
- def parse_time(value)
235
- Time.strptime(value, @time_format)
236
- end
237
- else
238
- def parse_time(value)
239
- Time.parse(DateTime.strptime(value, @time_format).to_s)
240
- end
121
+ private
122
+ def find_td_bulk_import_jar
123
+ libjars = Dir.glob("#{BASE_PATH}/java/**/*.jar")
124
+ found = libjars.find { |path| File.basename(path) =~ /^td-bulk-import/ }
125
+ if found.nil?
126
+ $stderr.puts "td-bulk-import.jar is not found."
127
+ exit
241
128
  end
129
+ td_bulk_import_jar = libjars.delete(found)
130
+ td_bulk_import_jar
242
131
  end
243
132
 
244
- class JsonParser
245
- def initialize(time_key)
246
- require 'json'
247
- @time_key = time_key
133
+ private
134
+ def set_sysprops
135
+ sysprops = []
136
+
137
+ # set apiserver
138
+ set_sysprops_endpoint(sysprops)
139
+
140
+ # set http_proxy
141
+ set_sysprops_http_proxy(sysprops)
142
+
143
+ # set configuration file for logging
144
+ conf_file = find_logging_conf_file
145
+ if conf_file
146
+ sysprops << "-Djava.util.logging.config.file=#{conf_file}"
248
147
  end
249
148
 
250
- def call(file, path, &block)
251
- i = 0
252
- file.each_line {|line|
253
- i += 1
254
- begin
255
- record = JSON.parse(line)
256
-
257
- unless record.is_a?(Hash)
258
- raise "record must be a Hash"
259
- end
260
-
261
- time = record[@time_key]
262
- unless time
263
- raise "record doesn't have '#{@time_key}' column"
264
- end
265
-
266
- case time
267
- when Integer
268
- # do nothing
269
- else
270
- time = Time.parse(time.to_s).to_i
271
- end
272
- record['time'] = time
273
-
274
- block.call(record)
275
-
276
- rescue
277
- $stderr.puts " skipped: #{$!}: #{line.dump}"
149
+ # set API key
150
+ sysprops << "-Dtd.api.key=#{TreasureData::Config.apikey}"
151
+
152
+ sysprops
153
+ end
154
+
155
+ private
156
+ def set_sysprops_endpoint(sysprops)
157
+ endpoint = ENV['TD_API_SERVER']
158
+ if endpoint
159
+ require 'uri'
160
+
161
+ uri = URI.parse(endpoint)
162
+
163
+ case uri.scheme
164
+ when 'http', 'https'
165
+ host = uri.host
166
+ port = uri.port
167
+ ssl = uri.scheme == 'https'
168
+
169
+ port = 80 if port == 443 and ssl
170
+ else
171
+ if uri.port
172
+ # invalid URI
173
+ raise "Invalid endpoint: #{endpoint}"
278
174
  end
279
- }
175
+
176
+ # generic URI
177
+ host, port = endpoint.split(':', 2)
178
+ port = port.to_i
179
+ # TODO support ssl
180
+ port = 80 if port == 0
181
+ ssl = false
182
+ end
183
+
184
+ sysprops << "-Dtd.api.server.host=#{host}"
185
+ sysprops << "-Dtd.api.server.port=#{port}"
280
186
  end
281
187
  end
282
188
 
283
- class MessagePackParser
284
- def initialize(time_key)
285
- require 'msgpack'
286
- @time_key = time_key
287
- end
189
+ private
190
+ def set_sysprops_http_proxy(sysprops)
191
+ http_proxy = ENV['HTTP_PROXY']
192
+ if http_proxy
193
+ if http_proxy =~ /\Ahttp:\/\/(.*)\z/
194
+ http_proxy = $~[1]
195
+ end
196
+ proxy_host, proxy_port = http_proxy.split(':', 2)
197
+ proxy_port = (proxy_port ? proxy_port.to_i : 80)
288
198
 
289
- def call(file, path, &block)
290
- i = 0
291
- MessagePack::Unpacker.new(file).each {|record|
292
- i += 1
293
- begin
294
- unless record.is_a?(Hash)
295
- raise "record must be a Hash"
296
- end
297
-
298
- time = record[@time_key]
299
- unless time
300
- raise "record doesn't have '#{@time_key}' column"
301
- end
302
-
303
- case time
304
- when Integer
305
- # do nothing
306
- else
307
- time = Time.parse(time.to_s).to_i
308
- end
309
- record['time'] = time
310
-
311
- block.call(record)
312
-
313
- rescue
314
- $stderr.puts " skipped: #{$!}: #{record.to_json}"
315
- end
316
- }
317
- rescue EOFError
199
+ sysprops << "-Dhttp.proxyHost=#{proxy_host}"
200
+ sysprops << "-Dhttp.proxyPort=#{proxy_port}"
318
201
  end
319
202
  end
203
+
204
+ private
205
+ def find_logging_conf_file
206
+ libjars = Dir.glob("#{BASE_PATH}/java/**/*.properties")
207
+ found = libjars.find { |path| File.basename(path) =~ /^logging.properties/ }
208
+ return nil if found.nil?
209
+ logging_conf_file = libjars.delete(found)
210
+ logging_conf_file
211
+ end
212
+
320
213
  end
321
214
  end
322
-
@@ -228,6 +228,7 @@ module List
228
228
  add_list 'table:swap', %w[db table1 table2], 'Swap names of two tables', 'table:swap example_db table1 table2'
229
229
  add_list 'table:tail', %w[db table], 'Get recently imported logs', 'table:tail example_db table1', 'table:tail example_db table1 -t "2011-01-02 03:04:05" -n 30'
230
230
  add_list 'table:partial_delete', %w[db table], 'Delete logs from the table within the specified time range', 'table:partial_delete example_db table1 --from 1341000000 --to 1341003600'
231
+ add_list 'table:expire', %w[db table expire_days], 'Expire data in table after specified number of days', 'table:expire example_db table1 30'
231
232
 
232
233
  add_list 'bulk_import:list', %w[], 'List bulk import sessions', 'bulk_import:list'
233
234
  add_list 'bulk_import:show', %w[name], 'Show list of uploaded parts', 'bulk_import:show'
@@ -246,6 +247,18 @@ module List
246
247
  add_list 'bulk_import:freeze', %w[name], 'Reject succeeding uploadings to a bulk import session', 'bulk_import:freeze logs_201201'
247
248
  add_list 'bulk_import:unfreeze', %w[name], 'Unfreeze a frozen bulk import session', 'bulk_import:unfreeze logs_201201'
248
249
 
250
+ add_list 'import:list', %w[], 'List bulk import sessions', 'import:list'
251
+ add_list 'import:show', %w[name], 'Show list of uploaded parts', 'import:show'
252
+ add_list 'import:create', %w[name db table], 'Create a new bulk import session to the the table', 'import:create logs_201201 example_db event_logs'
253
+ add_list 'import:prepare', %w[files_], 'Convert files into part file format', 'import:prepare logs/*.csv --format csv --columns time,uid,price,count --time-column "time" -o parts/'
254
+ add_list 'import:upload', %w[name files_], 'Upload or re-upload files into a bulk import session', 'import:upload parts/* --parallel 4'
255
+ add_list 'import:perform', %w[name], 'Start to validate and convert uploaded files', 'import:perform logs_201201'
256
+ add_list 'import:error_records', %w[name], 'Show records which did not pass validations', 'import:error_records logs_201201'
257
+ add_list 'import:commit', %w[name], 'Start to commit a performed bulk import session', 'import:commit logs_201201'
258
+ add_list 'import:delete', %w[name], 'Delete a bulk import session', 'import:delete logs_201201'
259
+ add_list 'import:freeze', %w[name], 'Reject succeeding uploadings to a bulk import session', 'import:freeze logs_201201'
260
+ add_list 'import:unfreeze', %w[name], 'Unfreeze a frozen bulk import session', 'import:unfreeze logs_201201'
261
+
249
262
  add_list 'result:list', %w[], 'Show list of result URLs', 'result:list', 'results'
250
263
  add_list 'result:show', %w[name], 'Describe information of a result URL', 'result mydb'
251
264
  add_list 'result:create', %w[name URL], 'Create a result URL', 'result:create mydb mysql://my-server/mydb'
@@ -413,7 +426,7 @@ module List
413
426
  add_guess 'show-job', 'job:show'
414
427
  add_guess 'show-jobs', 'job:list'
415
428
  add_guess 'server-status', 'server:status'
416
- add_alias 'import', 'table:import'
429
+ add_alias 'import', 'import:upload'
417
430
 
418
431
  finishup
419
432
  end
@@ -25,6 +25,8 @@ module Command
25
25
  end
26
26
 
27
27
  def sched_create(op)
28
+ require 'td/command/job' # job_priority_id_of
29
+
28
30
  org = nil
29
31
  db_name = nil
30
32
  timezone = nil
@@ -131,6 +133,8 @@ module Command
131
133
  end
132
134
 
133
135
  def sched_update(op)
136
+ require 'td/command/job' # job_priority_id_of
137
+
134
138
  cron = nil
135
139
  sql = nil
136
140
  db_name = nil
@@ -357,7 +357,338 @@ module Command
357
357
  end
358
358
  end
359
359
 
360
- require 'td/command/import' # table:import
360
+ def table_expire(op)
361
+ db_name, table_name, expire_days = op.cmd_parse
362
+
363
+ expire_days = expire_days.to_i
364
+ if expire_days <= 0
365
+ $stderr.puts "Table expiration days must be greater than 0."
366
+ return
367
+ end
368
+
369
+ client = get_client
370
+ client.update_expire(db_name, table_name, expire_days)
371
+
372
+ $stderr.puts "Table set to expire data older than #{expire_days} days."
373
+ end
374
+
375
+
376
+ IMPORT_TEMPLATES = {
377
+ 'apache' => [
378
+ /^([^ ]*) [^ ]* ([^ ]*) \[([^\]]*)\] "(\S+)(?: +([^ ]*) +\S*)?" ([^ ]*) ([^ ]*)(?: "([^\"]*)" "([^\"]*)")?$/,
379
+ ['host', 'user', 'time', 'method', 'path', 'code', 'size', 'referer', 'agent'],
380
+ "%d/%b/%Y:%H:%M:%S %z"],
381
+ 'syslog' => [
382
+ /^([^ ]* [^ ]* [^ ]*) ([^ ]*) ([a-zA-Z0-9_\/\.\-]*)(?:\[([0-9]+)\])?[^\:]*\: *(.*)$/,
383
+ ['time', 'host', 'ident', 'pid', 'message'],
384
+ "%b %d %H:%M:%S"],
385
+ }
386
+
387
+ # TODO import-item
388
+ # TODO tail
389
+
390
+ def table_import(op)
391
+ op.banner << "\nsupported formats:\n"
392
+ op.banner << " apache\n"
393
+ op.banner << " syslog\n"
394
+ op.banner << " msgpack\n"
395
+ op.banner << " json\n"
396
+
397
+ format = 'apache'
398
+ time_key = 'time'
399
+ auto_create = false
400
+
401
+ op.on('--format FORMAT', "file format (default: #{format})") {|s|
402
+ format = s
403
+ }
404
+
405
+ op.on('--apache', "same as --format apache; apache common log format") {
406
+ format = 'apache'
407
+ }
408
+
409
+ op.on('--syslog', "same as --format syslog; syslog") {
410
+ format = 'syslog'
411
+ }
412
+
413
+ op.on('--msgpack', "same as --format msgpack; msgpack stream format") {
414
+ format = 'msgpack'
415
+ }
416
+
417
+ op.on('--json', "same as --format json; LF-separated json format") {
418
+ format = 'json'
419
+ }
420
+
421
+ op.on('-t', '--time-key COL_NAME', "time key name for json and msgpack format (e.g. 'created_at')") {|s|
422
+ time_key = s
423
+ }
424
+
425
+ op.on('--auto-create-table', "Create table and database if doesn't exist", TrueClass) { |b|
426
+ auto_create = b
427
+ }
428
+
429
+ db_name, table_name, *paths = op.cmd_parse
430
+
431
+ client = get_client
432
+
433
+ if auto_create
434
+ # Merge with db_create and table_create after refactoring
435
+ API.validate_database_name(db_name)
436
+ begin
437
+ client.create_database(db_name)
438
+ $stderr.puts "Database '#{db_name}' is created."
439
+ rescue AlreadyExistsError
440
+ end
441
+
442
+ API.validate_table_name(table_name)
443
+ begin
444
+ client.create_log_table(db_name, table_name)
445
+ $stderr.puts "Table '#{db_name}.#{table_name}' is created."
446
+ rescue AlreadyExistsError
447
+ end
448
+ end
449
+
450
+ case format
451
+ when 'json', 'msgpack'
452
+ #unless time_key
453
+ # $stderr.puts "-t, --time-key COL_NAME (e.g. '-t created_at') parameter is required for #{format} format"
454
+ # exit 1
455
+ #end
456
+ if format == 'json'
457
+ require 'json'
458
+ require 'time'
459
+ parser = JsonParser.new(time_key)
460
+ else
461
+ parser = MessagePackParser.new(time_key)
462
+ end
463
+
464
+ else
465
+ regexp, names, time_format = IMPORT_TEMPLATES[format]
466
+ if !regexp || !names || !time_format
467
+ $stderr.puts "Unknown format '#{format}'"
468
+ exit 1
469
+ end
470
+ parser = TextParser.new(names, regexp, time_format)
471
+ end
472
+
473
+ get_table(client, db_name, table_name)
474
+
475
+ require 'zlib'
476
+
477
+ files = paths.map {|path|
478
+ if path == '-'
479
+ $stdin
480
+ elsif path =~ /\.gz$/
481
+ require 'td/compat_gzip_reader'
482
+ Zlib::GzipReader.open(path)
483
+ else
484
+ File.open(path)
485
+ end
486
+ }
487
+
488
+ require 'msgpack'
489
+ require 'tempfile'
490
+ #require 'thread'
491
+
492
+ files.zip(paths).each {|file,path|
493
+ import_log_file(file, path, client, db_name, table_name, parser)
494
+ }
495
+
496
+ puts "done."
497
+ end
498
+
499
+ private
500
+ def import_log_file(file, path, client, db_name, table_name, parser)
501
+ puts "importing #{path}..."
502
+
503
+ out = Tempfile.new('td-import')
504
+ out.binmode if out.respond_to?(:binmode)
505
+
506
+ writer = Zlib::GzipWriter.new(out)
507
+
508
+ n = 0
509
+ x = 0
510
+ has_bignum = false
511
+ parser.call(file, path) {|record|
512
+ entry = begin
513
+ record.to_msgpack
514
+ rescue RangeError
515
+ has_bignum = true
516
+ TreasureData::API.normalized_msgpack(record)
517
+ end
518
+ writer.write entry
519
+
520
+ n += 1
521
+ x += 1
522
+ if n % 10000 == 0
523
+ puts " imported #{n} entries from #{path}..."
524
+
525
+ elsif out.pos > 1024*1024 # TODO size
526
+ puts " imported #{n} entries from #{path}..."
527
+ begin
528
+ writer.finish
529
+ size = out.pos
530
+ out.pos = 0
531
+
532
+ puts " uploading #{size} bytes..."
533
+ client.import(db_name, table_name, "msgpack.gz", out, size)
534
+
535
+ out.truncate(0)
536
+ out.pos = 0
537
+ x = 0
538
+ writer = Zlib::GzipWriter.new(out)
539
+ rescue
540
+ $stderr.puts " #{$!}"
541
+ return 1 # TODO error
542
+ end
543
+ end
544
+ }
545
+
546
+ if x != 0
547
+ writer.finish
548
+ size = out.pos
549
+ out.pos = 0
550
+
551
+ puts " uploading #{size} bytes..."
552
+ # TODO upload on background thread
553
+ client.import(db_name, table_name, "msgpack.gz", out, size)
554
+ end
555
+
556
+ puts " imported #{n} entries from #{path}."
557
+ $stderr.puts normalized_message if has_bignum
558
+ ensure
559
+ out.close rescue nil
560
+ writer.close rescue nil
561
+ end
562
+
563
+ require 'date' # DateTime#strptime
564
+ require 'time' # Time#strptime, Time#parse
565
+
566
+ class TextParser
567
+ def initialize(names, regexp, time_format)
568
+ @names = names
569
+ @regexp = regexp
570
+ @time_format = time_format
571
+ end
572
+
573
+ def call(file, path, &block)
574
+ i = 0
575
+ file.each_line {|line|
576
+ i += 1
577
+ begin
578
+ line.rstrip!
579
+ m = @regexp.match(line)
580
+ unless m
581
+ raise "invalid log format at #{path}:#{i}"
582
+ end
583
+
584
+ record = {}
585
+
586
+ cap = m.captures
587
+ @names.each_with_index {|name,cap_i|
588
+ if value = cap[cap_i]
589
+ if name == "time"
590
+ value = parse_time(value).to_i
591
+ end
592
+ record[name] = value
593
+ end
594
+ }
595
+
596
+ block.call(record)
597
+
598
+ rescue
599
+ $stderr.puts " skipped: #{$!}: #{line.dump}"
600
+ end
601
+ }
602
+ end
603
+
604
+ if Time.respond_to?(:strptime)
605
+ def parse_time(value)
606
+ Time.strptime(value, @time_format)
607
+ end
608
+ else
609
+ def parse_time(value)
610
+ Time.parse(DateTime.strptime(value, @time_format).to_s)
611
+ end
612
+ end
613
+ end
614
+
615
+ class JsonParser
616
+ def initialize(time_key)
617
+ require 'json'
618
+ @time_key = time_key
619
+ end
620
+
621
+ def call(file, path, &block)
622
+ i = 0
623
+ file.each_line {|line|
624
+ i += 1
625
+ begin
626
+ record = JSON.parse(line)
627
+
628
+ unless record.is_a?(Hash)
629
+ raise "record must be a Hash"
630
+ end
631
+
632
+ time = record[@time_key]
633
+ unless time
634
+ raise "record doesn't have '#{@time_key}' column"
635
+ end
636
+
637
+ case time
638
+ when Integer
639
+ # do nothing
640
+ else
641
+ time = Time.parse(time.to_s).to_i
642
+ end
643
+ record['time'] = time
644
+
645
+ block.call(record)
646
+
647
+ rescue
648
+ $stderr.puts " skipped: #{$!}: #{line.dump}"
649
+ end
650
+ }
651
+ end
652
+ end
653
+
654
+ class MessagePackParser
655
+ def initialize(time_key)
656
+ require 'msgpack'
657
+ @time_key = time_key
658
+ end
659
+
660
+ def call(file, path, &block)
661
+ i = 0
662
+ MessagePack::Unpacker.new(file).each {|record|
663
+ i += 1
664
+ begin
665
+ unless record.is_a?(Hash)
666
+ raise "record must be a Hash"
667
+ end
668
+
669
+ time = record[@time_key]
670
+ unless time
671
+ raise "record doesn't have '#{@time_key}' column"
672
+ end
673
+
674
+ case time
675
+ when Integer
676
+ # do nothing
677
+ else
678
+ time = Time.parse(time.to_s).to_i
679
+ end
680
+ record['time'] = time
681
+
682
+ block.call(record)
683
+
684
+ rescue
685
+ $stderr.puts " skipped: #{$!}: #{record.to_json}"
686
+ end
687
+ }
688
+ rescue EOFError
689
+ end
690
+ end
691
+
361
692
  require 'td/command/export' # table:export
362
693
  require 'td/command/job' # wait_job
363
694
  end
data/lib/td/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  module TreasureData
2
2
 
3
- VERSION = '0.10.84'
3
+ VERSION = '0.10.85'
4
4
 
5
5
  end
metadata CHANGED
@@ -1,32 +1,34 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: td
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.84
4
+ version: 0.10.85
5
+ prerelease:
5
6
  platform: ruby
6
7
  authors:
7
8
  - Treasure Data, Inc.
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2013-07-25 00:00:00.000000000 Z
12
+ date: 2013-08-23 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  name: msgpack
15
16
  requirement: !ruby/object:Gem::Requirement
17
+ none: false
16
18
  requirements:
17
- - - '>='
19
+ - - ! '>='
18
20
  - !ruby/object:Gem::Version
19
21
  version: 0.4.4
20
- - - '!='
22
+ - - ! '!='
21
23
  - !ruby/object:Gem::Version
22
24
  version: 0.5.0
23
- - - '!='
25
+ - - ! '!='
24
26
  - !ruby/object:Gem::Version
25
27
  version: 0.5.1
26
- - - '!='
28
+ - - ! '!='
27
29
  - !ruby/object:Gem::Version
28
30
  version: 0.5.2
29
- - - '!='
31
+ - - ! '!='
30
32
  - !ruby/object:Gem::Version
31
33
  version: 0.5.3
32
34
  - - <
@@ -35,20 +37,21 @@ dependencies:
35
37
  type: :runtime
36
38
  prerelease: false
37
39
  version_requirements: !ruby/object:Gem::Requirement
40
+ none: false
38
41
  requirements:
39
- - - '>='
42
+ - - ! '>='
40
43
  - !ruby/object:Gem::Version
41
44
  version: 0.4.4
42
- - - '!='
45
+ - - ! '!='
43
46
  - !ruby/object:Gem::Version
44
47
  version: 0.5.0
45
- - - '!='
48
+ - - ! '!='
46
49
  - !ruby/object:Gem::Version
47
50
  version: 0.5.1
48
- - - '!='
51
+ - - ! '!='
49
52
  - !ruby/object:Gem::Version
50
53
  version: 0.5.2
51
- - - '!='
54
+ - - ! '!='
52
55
  - !ruby/object:Gem::Version
53
56
  version: 0.5.3
54
57
  - - <
@@ -57,6 +60,7 @@ dependencies:
57
60
  - !ruby/object:Gem::Dependency
58
61
  name: yajl-ruby
59
62
  requirement: !ruby/object:Gem::Requirement
63
+ none: false
60
64
  requirements:
61
65
  - - ~>
62
66
  - !ruby/object:Gem::Version
@@ -64,6 +68,7 @@ dependencies:
64
68
  type: :runtime
65
69
  prerelease: false
66
70
  version_requirements: !ruby/object:Gem::Requirement
71
+ none: false
67
72
  requirements:
68
73
  - - ~>
69
74
  - !ruby/object:Gem::Version
@@ -71,20 +76,23 @@ dependencies:
71
76
  - !ruby/object:Gem::Dependency
72
77
  name: hirb
73
78
  requirement: !ruby/object:Gem::Requirement
79
+ none: false
74
80
  requirements:
75
- - - '>='
81
+ - - ! '>='
76
82
  - !ruby/object:Gem::Version
77
83
  version: 0.4.5
78
84
  type: :runtime
79
85
  prerelease: false
80
86
  version_requirements: !ruby/object:Gem::Requirement
87
+ none: false
81
88
  requirements:
82
- - - '>='
89
+ - - ! '>='
83
90
  - !ruby/object:Gem::Version
84
91
  version: 0.4.5
85
92
  - !ruby/object:Gem::Dependency
86
93
  name: parallel
87
94
  requirement: !ruby/object:Gem::Requirement
95
+ none: false
88
96
  requirements:
89
97
  - - ~>
90
98
  - !ruby/object:Gem::Version
@@ -92,6 +100,7 @@ dependencies:
92
100
  type: :runtime
93
101
  prerelease: false
94
102
  version_requirements: !ruby/object:Gem::Requirement
103
+ none: false
95
104
  requirements:
96
105
  - - ~>
97
106
  - !ruby/object:Gem::Version
@@ -99,6 +108,7 @@ dependencies:
99
108
  - !ruby/object:Gem::Dependency
100
109
  name: td-client
101
110
  requirement: !ruby/object:Gem::Requirement
111
+ none: false
102
112
  requirements:
103
113
  - - ~>
104
114
  - !ruby/object:Gem::Version
@@ -106,6 +116,7 @@ dependencies:
106
116
  type: :runtime
107
117
  prerelease: false
108
118
  version_requirements: !ruby/object:Gem::Requirement
119
+ none: false
109
120
  requirements:
110
121
  - - ~>
111
122
  - !ruby/object:Gem::Version
@@ -113,6 +124,7 @@ dependencies:
113
124
  - !ruby/object:Gem::Dependency
114
125
  name: td-logger
115
126
  requirement: !ruby/object:Gem::Requirement
127
+ none: false
116
128
  requirements:
117
129
  - - ~>
118
130
  - !ruby/object:Gem::Version
@@ -120,6 +132,7 @@ dependencies:
120
132
  type: :runtime
121
133
  prerelease: false
122
134
  version_requirements: !ruby/object:Gem::Requirement
135
+ none: false
123
136
  requirements:
124
137
  - - ~>
125
138
  - !ruby/object:Gem::Version
@@ -127,6 +140,7 @@ dependencies:
127
140
  - !ruby/object:Gem::Dependency
128
141
  name: rubyzip
129
142
  requirement: !ruby/object:Gem::Requirement
143
+ none: false
130
144
  requirements:
131
145
  - - ~>
132
146
  - !ruby/object:Gem::Version
@@ -134,6 +148,7 @@ dependencies:
134
148
  type: :runtime
135
149
  prerelease: false
136
150
  version_requirements: !ruby/object:Gem::Requirement
151
+ none: false
137
152
  requirements:
138
153
  - - ~>
139
154
  - !ruby/object:Gem::Version
@@ -141,6 +156,7 @@ dependencies:
141
156
  - !ruby/object:Gem::Dependency
142
157
  name: rake
143
158
  requirement: !ruby/object:Gem::Requirement
159
+ none: false
144
160
  requirements:
145
161
  - - ~>
146
162
  - !ruby/object:Gem::Version
@@ -148,6 +164,7 @@ dependencies:
148
164
  type: :development
149
165
  prerelease: false
150
166
  version_requirements: !ruby/object:Gem::Requirement
167
+ none: false
151
168
  requirements:
152
169
  - - ~>
153
170
  - !ruby/object:Gem::Version
@@ -155,6 +172,7 @@ dependencies:
155
172
  - !ruby/object:Gem::Dependency
156
173
  name: rspec
157
174
  requirement: !ruby/object:Gem::Requirement
175
+ none: false
158
176
  requirements:
159
177
  - - ~>
160
178
  - !ruby/object:Gem::Version
@@ -162,6 +180,7 @@ dependencies:
162
180
  type: :development
163
181
  prerelease: false
164
182
  version_requirements: !ruby/object:Gem::Requirement
183
+ none: false
165
184
  requirements:
166
185
  - - ~>
167
186
  - !ruby/object:Gem::Version
@@ -169,6 +188,7 @@ dependencies:
169
188
  - !ruby/object:Gem::Dependency
170
189
  name: simplecov
171
190
  requirement: !ruby/object:Gem::Requirement
191
+ none: false
172
192
  requirements:
173
193
  - - ~>
174
194
  - !ruby/object:Gem::Version
@@ -176,6 +196,7 @@ dependencies:
176
196
  type: :development
177
197
  prerelease: false
178
198
  version_requirements: !ruby/object:Gem::Requirement
199
+ none: false
179
200
  requirements:
180
201
  - - ~>
181
202
  - !ruby/object:Gem::Version
@@ -209,7 +230,8 @@ files:
209
230
  - dist/resources/pkg/postinstall
210
231
  - dist/resources/pkg/ruby-2.0.0-p0.pkg
211
232
  - dist/resources/pkg/td
212
- - java/td-bulk-import-0.2.0-SNAPSHOT.jar
233
+ - java/logging.properties
234
+ - java/td-bulk-import-0.2.1.jar
213
235
  - java/td-bulk-import-java.version
214
236
  - lib/td.rb
215
237
  - lib/td/command/account.rb
@@ -259,26 +281,33 @@ files:
259
281
  - td.gemspec
260
282
  homepage: http://treasure-data.com/
261
283
  licenses: []
262
- metadata: {}
263
284
  post_install_message:
264
285
  rdoc_options: []
265
286
  require_paths:
266
287
  - lib
267
288
  required_ruby_version: !ruby/object:Gem::Requirement
289
+ none: false
268
290
  requirements:
269
- - - '>='
291
+ - - ! '>='
270
292
  - !ruby/object:Gem::Version
271
293
  version: '0'
294
+ segments:
295
+ - 0
296
+ hash: -2194512597141251482
272
297
  required_rubygems_version: !ruby/object:Gem::Requirement
298
+ none: false
273
299
  requirements:
274
- - - '>='
300
+ - - ! '>='
275
301
  - !ruby/object:Gem::Version
276
302
  version: '0'
303
+ segments:
304
+ - 0
305
+ hash: -2194512597141251482
277
306
  requirements: []
278
307
  rubyforge_project:
279
- rubygems_version: 2.0.2
308
+ rubygems_version: 1.8.23
280
309
  signing_key:
281
- specification_version: 4
310
+ specification_version: 3
282
311
  summary: CLI to manage data on Treasure Data, the Hadoop-based cloud data warehousing
283
312
  test_files:
284
313
  - spec/file_reader/filter_spec.rb
checksums.yaml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- SHA1:
3
- metadata.gz: c7c60a874b7228266a8436132aae4d0e2438b3e5
4
- data.tar.gz: 9caacd3a0d3065d8fd4dca7bc7bae1cd29b9bbd5
5
- SHA512:
6
- metadata.gz: baa729bcbfb73760f2c93cc70e6fefccc69b1cae9a41c62e7c869372bf4124975649e0abffb5db15b79156bfb59e59ab52875634bf179da86fab129979c7cdab
7
- data.tar.gz: 10328c283f1b8a9a3e4d2b9ad23187da08b0b0391ae237fdd5fbe8ad4c610784f0d8a2f2ca8d00c758f5afac757426965545e5811b406df943769a826b9a27d7