embulk-input-td 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/gradlew.bat ADDED
@@ -0,0 +1,90 @@
1
+ @if "%DEBUG%" == "" @echo off
2
+ @rem ##########################################################################
3
+ @rem
4
+ @rem Gradle startup script for Windows
5
+ @rem
6
+ @rem ##########################################################################
7
+
8
+ @rem Set local scope for the variables with windows NT shell
9
+ if "%OS%"=="Windows_NT" setlocal
10
+
11
+ @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
12
+ set DEFAULT_JVM_OPTS=
13
+
14
+ set DIRNAME=%~dp0
15
+ if "%DIRNAME%" == "" set DIRNAME=.
16
+ set APP_BASE_NAME=%~n0
17
+ set APP_HOME=%DIRNAME%
18
+
19
+ @rem Find java.exe
20
+ if defined JAVA_HOME goto findJavaFromJavaHome
21
+
22
+ set JAVA_EXE=java.exe
23
+ %JAVA_EXE% -version >NUL 2>&1
24
+ if "%ERRORLEVEL%" == "0" goto init
25
+
26
+ echo.
27
+ echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28
+ echo.
29
+ echo Please set the JAVA_HOME variable in your environment to match the
30
+ echo location of your Java installation.
31
+
32
+ goto fail
33
+
34
+ :findJavaFromJavaHome
35
+ set JAVA_HOME=%JAVA_HOME:"=%
36
+ set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37
+
38
+ if exist "%JAVA_EXE%" goto init
39
+
40
+ echo.
41
+ echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42
+ echo.
43
+ echo Please set the JAVA_HOME variable in your environment to match the
44
+ echo location of your Java installation.
45
+
46
+ goto fail
47
+
48
+ :init
49
+ @rem Get command-line arguments, handling Windows variants
50
+
51
+ if not "%OS%" == "Windows_NT" goto win9xME_args
52
+ if "%@eval[2+2]" == "4" goto 4NT_args
53
+
54
+ :win9xME_args
55
+ @rem Slurp the command line arguments.
56
+ set CMD_LINE_ARGS=
57
+ set _SKIP=2
58
+
59
+ :win9xME_args_slurp
60
+ if "x%~1" == "x" goto execute
61
+
62
+ set CMD_LINE_ARGS=%*
63
+ goto execute
64
+
65
+ :4NT_args
66
+ @rem Get arguments from the 4NT Shell from JP Software
67
+ set CMD_LINE_ARGS=%$
68
+
69
+ :execute
70
+ @rem Setup the command line
71
+
72
+ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73
+
74
+ @rem Execute Gradle
75
+ "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76
+
77
+ :end
78
+ @rem End local scope for the variables with windows NT shell
79
+ if "%ERRORLEVEL%"=="0" goto mainEnd
80
+
81
+ :fail
82
+ rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83
+ rem the _cmd.exe /c_ return code!
84
+ if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
85
+ exit /b 1
86
+
87
+ :mainEnd
88
+ if "%OS%"=="Windows_NT" endlocal
89
+
90
+ :omega
@@ -0,0 +1,3 @@
1
# Register the Java class org.embulk.input.td.TdInputPlugin as the Embulk
# input plugin named "td". The plugin's jars are looked up in the
# "classpath" directory resolved relative to this file.
plugin_classpath = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_input(
  "td",
  "org.embulk.input.td.TdInputPlugin",
  plugin_classpath
)
@@ -0,0 +1,446 @@
1
+ package org.embulk.input.td;
2
+
3
+ import java.io.IOException;
4
+ import java.io.InputStream;
5
+ import java.util.List;
6
+ import java.util.Properties;
7
+ import java.util.zip.GZIPInputStream;
8
+
9
+ import com.fasterxml.jackson.databind.JsonNode;
10
+ import com.fasterxml.jackson.databind.ObjectMapper;
11
+ import com.fasterxml.jackson.databind.node.ArrayNode;
12
+ import com.google.common.base.Function;
13
+ import com.google.common.base.Optional;
14
+ import com.google.common.base.Throwables;
15
+ import com.google.inject.Inject;
16
+ import com.treasuredata.client.ProxyConfig;
17
+ import com.treasuredata.client.TDClient;
18
+ import com.treasuredata.client.TDClientBuilder;
19
+ import com.treasuredata.client.model.TDJob;
20
+ import com.treasuredata.client.model.TDJobRequest;
21
+ import com.treasuredata.client.model.TDJobSummary;
22
+ import org.embulk.config.ConfigException;
23
+ import org.embulk.config.ConfigInject;
24
+ import org.embulk.config.TaskReport;
25
+ import org.embulk.config.Config;
26
+ import org.embulk.config.ConfigDefault;
27
+ import org.embulk.config.ConfigDiff;
28
+ import org.embulk.config.ConfigSource;
29
+ import org.embulk.config.Task;
30
+ import org.embulk.config.TaskSource;
31
+ import org.embulk.input.td.writer.BooleanValueWriter;
32
+ import org.embulk.input.td.writer.DoubleValueWriter;
33
+ import org.embulk.input.td.writer.LongValueWriter;
34
+ import org.embulk.input.td.writer.StringValueWriter;
35
+ import org.embulk.input.td.writer.ValueWriter;
36
+ import org.embulk.spi.BufferAllocator;
37
+ import org.embulk.spi.Column;
38
+ import org.embulk.spi.DataException;
39
+ import org.embulk.spi.Exec;
40
+ import org.embulk.spi.InputPlugin;
41
+ import org.embulk.spi.PageBuilder;
42
+ import org.embulk.spi.PageOutput;
43
+ import org.embulk.spi.Schema;
44
+ import org.embulk.spi.type.Type;
45
+ import org.msgpack.core.MessagePack;
46
+ import org.msgpack.core.MessageUnpacker;
47
+ import org.msgpack.value.ArrayValue;
48
+ import org.msgpack.value.Value;
49
+ import org.slf4j.Logger;
50
+
51
+ import static com.google.common.base.Optional.fromNullable;
52
+ import static com.treasuredata.client.model.TDResultFormat.MESSAGE_PACK_GZ;
53
+ import static java.lang.Integer.parseInt;
54
+ import static java.util.Locale.ENGLISH;
55
+ import static org.embulk.spi.Exec.getLogger;
56
+ import static org.embulk.spi.Exec.newConfigDiff;
57
+ import static org.embulk.spi.Exec.newTaskReport;
58
+ import static org.embulk.spi.type.Types.BOOLEAN;
59
+ import static org.embulk.spi.type.Types.DOUBLE;
60
+ import static org.embulk.spi.type.Types.JSON;
61
+ import static org.embulk.spi.type.Types.LONG;
62
+ import static org.embulk.spi.type.Types.STRING;
63
+ import static org.embulk.spi.type.Types.TIMESTAMP;
64
+
65
+ public class TdInputPlugin
66
+ implements InputPlugin
67
+ {
68
+ public interface PluginTask
69
+ extends Task
70
+ {
71
+ @Config("apikey")
72
+ public String getApiKey();
73
+
74
+ @Config("endpoint")
75
+ @ConfigDefault("\"api.treasuredata.com\"")
76
+ public String getEndpoint();
77
+
78
+ @Config("use_ssl")
79
+ @ConfigDefault("true")
80
+ public boolean getUseSsl();
81
+
82
+ @Config("http_proxy")
83
+ @ConfigDefault("null")
84
+ public Optional<HttpProxyTask> getHttpProxy();
85
+
86
+ // TODO timeout
87
+ // TODO query, database
88
+
89
+ @Config("query")
90
+ @ConfigDefault("null")
91
+ public Optional<String> getQuery();
92
+
93
+ @Config("database")
94
+ @ConfigDefault("null")
95
+ public Optional<String> getDatabase();
96
+
97
+ @Config("job_id")
98
+ @ConfigDefault("null")
99
+ public Optional<String> getJobId();
100
+
101
+ @Config("stop_on_invalid_record")
102
+ @ConfigDefault("false")
103
+ public boolean getStopOnInvalidRecord();
104
+
105
+ // TODO column_options
106
+
107
+ @ConfigInject
108
+ BufferAllocator getBufferAllocator();
109
+ }
110
+
111
+ public interface HttpProxyTask
112
+ extends Task
113
+ {
114
+ @Config("host")
115
+ public String getHost();
116
+
117
+ @Config("port")
118
+ public int getPort();
119
+
120
+ @Config("use_ssl")
121
+ @ConfigDefault("false")
122
+ public boolean getUseSsl();
123
+
124
+ @Config("user")
125
+ @ConfigDefault("null")
126
+ public Optional<String> getUser();
127
+
128
+ @Config("password")
129
+ @ConfigDefault("null")
130
+ public Optional<String> getPassword();
131
+ }
132
+
133
+ private final Logger log;
134
+
135
+ @Inject
136
+ public TdInputPlugin()
137
+ {
138
+ this.log = getLogger(this.getClass());
139
+ }
140
+
141
+ @Override
142
+ public ConfigDiff transaction(ConfigSource config, InputPlugin.Control control)
143
+ {
144
+ PluginTask task = config.loadConfig(PluginTask.class);
145
+ try (TDClient client = newTDClient(task)) {
146
+ TDJob job = getTDJob(task, client);
147
+
148
+ Optional<String> jobResultSchema = job.getResultSchema();
149
+ if (!jobResultSchema.isPresent()) {
150
+ throw new ConfigException(String.format("Not found result schema of job %s", job.getJobId()));
151
+ }
152
+
153
+ Schema inputSchema = convertSchema(job.getType(), toJsonNode(jobResultSchema.get()));
154
+ newValueWriters(inputSchema); // validate if value writers can be created according to the input schema
155
+
156
+ TaskSource taskSource = task.dump().set("job_id", job.getJobId()); // overwrite job_id
157
+ return resume(taskSource, inputSchema, 1, control);
158
+ }
159
+ }
160
+
161
+ private TDClient newTDClient(PluginTask task)
162
+ {
163
+ TDClientBuilder builder = TDClient.newBuilder();
164
+ builder.setApiKey(task.getApiKey());
165
+ builder.setEndpoint(task.getEndpoint());
166
+ builder.setUseSSL(task.getUseSsl());
167
+
168
+ Optional<ProxyConfig>proxyConfig = newProxyConfig(task.getHttpProxy());
169
+ if (proxyConfig.isPresent()) {
170
+ builder.setProxy(proxyConfig.get());
171
+ }
172
+
173
+ return builder.build();
174
+ }
175
+
176
+ private Optional<ProxyConfig> newProxyConfig(Optional<HttpProxyTask> task)
177
+ {
178
+ // This plugin searches http proxy settings and configures them to TDClient. The order of proxy setting searching is:
179
+ // 1. System properties
180
+ // 2. http_proxy config option provided by this plugin
181
+
182
+ Properties props = System.getProperties();
183
+ if (props.containsKey("http.proxyHost") || props.containsKey("https.proxyHost")) {
184
+ boolean useSsl = props.containsKey("https.proxyHost");
185
+ String proto = !useSsl ? "http" : "https";
186
+ String host = props.getProperty(proto + ".proxyHost");
187
+ int port = parseInt(props.getProperty(proto + ".proxyPort", !useSsl ? "80" : "443"));
188
+ Optional<String> user = fromNullable(props.getProperty(proto + ".proxyUser"));
189
+ Optional<String> password = fromNullable(props.getProperty(proto + ".proxyPassword"));
190
+ return Optional.of(new ProxyConfig(host, port, useSsl, user, password));
191
+ }
192
+ else if (task.isPresent()) {
193
+ HttpProxyTask proxyTask = task.get();
194
+ return Optional.of(new ProxyConfig(proxyTask.getHost(), proxyTask.getPort(), proxyTask.getUseSsl(),
195
+ proxyTask.getUser(), proxyTask.getPassword()));
196
+ }
197
+ else {
198
+ return Optional.absent();
199
+ }
200
+ }
201
+
202
+ private TDJob getTDJob(PluginTask task, TDClient client)
203
+ {
204
+ String jobId;
205
+ if (!task.getJobId().isPresent()) {
206
+ if (!task.getQuery().isPresent() || !task.getDatabase().isPresent()) {
207
+ throw new ConfigException("Must specify both of 'query' and 'database' options if 'job_id' option is not used.");
208
+ }
209
+ jobId = submitJob(task, client);
210
+ }
211
+ else {
212
+ jobId = task.getJobId().get();
213
+ }
214
+
215
+ waitJobCompletion(task, client, jobId);
216
+ return client.jobInfo(jobId);
217
+ }
218
+
219
+ private String submitJob(PluginTask task, TDClient client)
220
+ {
221
+ String query = task.getQuery().get();
222
+ String database = task.getDatabase().get();
223
+
224
+ log.info(String.format(ENGLISH, "Submit a query for database '%s': %s", database, query));
225
+ String jobId = client.submit(TDJobRequest.newPrestoQuery(database, query));
226
+ log.info(String.format(ENGLISH, "Job %s is queued.", jobId));
227
+ return jobId;
228
+ }
229
+
230
+ private void waitJobCompletion(PluginTask task, TDClient client, String jobId)
231
+ {
232
+ TDJobSummary js;
233
+ long waitTime = 5 * 1000; // 5 secs
234
+
235
+ // wait for job finish
236
+ log.info(String.format(ENGLISH, "Confirm that job %s finished", jobId));
237
+ while (true) {
238
+ js = client.jobStatus(jobId);
239
+ if (js.getStatus().isFinished()) {
240
+ break;
241
+ }
242
+
243
+ log.debug("Wait for job finished");
244
+ try {
245
+ Thread.sleep(waitTime);
246
+ }
247
+ catch (InterruptedException ignored) {
248
+ }
249
+ }
250
+
251
+ // confirm if the job status is 'success'
252
+ if (js.getStatus() != TDJob.Status.SUCCESS) {
253
+ throw new ConfigException(String.format(ENGLISH, "Cannot download job result because the job was '%s'.", js.getStatus()));
254
+ }
255
+ }
256
+
257
+ private static JsonNode toJsonNode(String schema)
258
+ {
259
+ try {
260
+ return new ObjectMapper().readTree(schema);
261
+ }
262
+ catch (IOException e) {
263
+ throw new ConfigException(String.format(ENGLISH, "Failed to parse job result schema as JSON: %s", schema));
264
+ }
265
+ }
266
+
267
+ private Schema convertSchema(TDJob.Type jobType, JsonNode from)
268
+ {
269
+ Schema.Builder schema = new Schema.Builder();
270
+ ArrayNode a = (ArrayNode) from;
271
+ for (int i = 0; i < a.size(); i++) {
272
+ ArrayNode column = (ArrayNode)a.get(i);
273
+ String name = column.get(0).asText();
274
+ Type type = convertColumnType(jobType, column.get(1).asText());
275
+ schema.add(name, type);
276
+ }
277
+ return schema.build();
278
+ }
279
+
280
+ private Type convertColumnType(TDJob.Type jobType, String from)
281
+ {
282
+ switch (jobType) {
283
+ case PRESTO:
284
+ return convertPrestoColumnType(from);
285
+ case HIVE:
286
+ default:
287
+ throw new ConfigException(String.format(ENGLISH, "Unsupported job type '%s'. Supported types are [presto].", jobType)); // TODO hive
288
+ }
289
+ }
290
+
291
+ private Type convertPrestoColumnType(String from)
292
+ {
293
+ String t = from.toUpperCase(ENGLISH);
294
+ if (t.equals("BOOLEAN")) {
295
+ return BOOLEAN;
296
+ }
297
+ else if (t.equals("BIGINT")) {
298
+ return LONG;
299
+ }
300
+ else if (t.equals("DOUBLE") || t.equals("DECIMAL") || t.startsWith("DECIMAL")) {
301
+ return DOUBLE;
302
+ }
303
+ else if (t.equals("VARCHAR") || t.startsWith("VARCHAR")) {
304
+ return STRING;
305
+ }
306
+ else {
307
+ throw new ConfigException(String.format(ENGLISH, "Unsupported presto type '%s'", from)); // TODO other types
308
+ }
309
+ }
310
+
311
+ @Override
312
+ public ConfigDiff resume(TaskSource taskSource,
313
+ Schema schema, int taskCount,
314
+ InputPlugin.Control control)
315
+ {
316
+ control.run(taskSource, schema, taskCount);
317
+ return newConfigDiff();
318
+ }
319
+
320
+ @Override
321
+ public void cleanup(TaskSource taskSource,
322
+ Schema schema, int taskCount,
323
+ List<TaskReport> successTaskReports)
324
+ {
325
+ // do nothing
326
+ }
327
+
328
+ @Override
329
+ public TaskReport run(TaskSource taskSource,
330
+ final Schema schema, int taskIndex,
331
+ PageOutput output)
332
+ {
333
+ final PluginTask task = taskSource.loadTask(PluginTask.class);
334
+ final BufferAllocator allocator = task.getBufferAllocator();
335
+ final ValueWriter[] writers = newValueWriters(schema);
336
+ final String jobId = taskSource.get(String.class, "job_id");
337
+ final boolean stopOnInvalidRecord = task.getStopOnInvalidRecord();
338
+
339
+ try (final PageBuilder pageBuilder = new PageBuilder(allocator, schema, output);
340
+ final TDClient client = newTDClient(task)) {
341
+ client.jobResult(jobId, MESSAGE_PACK_GZ, new Function<InputStream, Void>() {
342
+ @Override
343
+ public Void apply(InputStream input)
344
+ {
345
+ try (MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(new GZIPInputStream(input))) {
346
+ while (unpacker.hasNext()) {
347
+ try {
348
+ Value v;
349
+ try {
350
+ v = unpacker.unpackValue();
351
+ }
352
+ catch (IOException e) {
353
+ throw new InvalidRecordException("Cannot unpack value", e);
354
+ }
355
+
356
+ if (!v.isArrayValue()) {
357
+ throw new InvalidRecordException(String.format(ENGLISH, "Must be array value: ", v.toString()));
358
+ }
359
+
360
+ ArrayValue record = v.asArrayValue();
361
+ if (record.size() != schema.size()) {
362
+ throw new InvalidRecordException(String.format(ENGLISH, "The size (%d) of the record is invalid", record.size()));
363
+ }
364
+
365
+ // write records to the page
366
+ for (int i = 0; i < writers.length; i++) {
367
+ writers[i].write(record.get(i), pageBuilder);
368
+ }
369
+
370
+ pageBuilder.addRecord();
371
+ }
372
+ catch (InvalidRecordException e) {
373
+ if (stopOnInvalidRecord) {
374
+ throw new DataException(String.format(ENGLISH, "Invalid record (%s)", e.getMessage()), e);
375
+ }
376
+ log.warn(String.format(ENGLISH, "Skipped record (%s)", e.getMessage()));
377
+ }
378
+ }
379
+ }
380
+ catch (IOException e) {
381
+ throw Throwables.propagate(e);
382
+ }
383
+
384
+ return null;
385
+ }
386
+ });
387
+
388
+ pageBuilder.finish();
389
+ }
390
+
391
+ return newTaskReport();
392
+ }
393
+
394
+ private ValueWriter[] newValueWriters(Schema schema)
395
+ {
396
+ ValueWriter[] writers = new ValueWriter[schema.size()];
397
+ for (int i = 0; i < schema.size(); i++) {
398
+ writers[i] = newValueWriter(schema.getColumn(i));
399
+ }
400
+ return writers;
401
+ }
402
+
403
+ private ValueWriter newValueWriter(Column column)
404
+ {
405
+ Type type = column.getType();
406
+ if (type.equals(BOOLEAN)) {
407
+ return new BooleanValueWriter(column);
408
+ }
409
+ else if (type.equals(DOUBLE)) {
410
+ return new DoubleValueWriter(column);
411
+ }
412
+ else if (type.equals(JSON)) {
413
+ throw new ConfigException(String.format(ENGLISH, "Unsupported column type (%s:%s)", column.getName(), type)); // TODO
414
+ }
415
+ else if (type.equals(LONG)) {
416
+ return new LongValueWriter(column);
417
+ }
418
+ else if (type.equals(STRING)) {
419
+ return new StringValueWriter(column);
420
+ }
421
+ else if (type.equals(TIMESTAMP)) {
422
+ throw new ConfigException(String.format(ENGLISH, "Unsupported column type (%s:%s)", column.getName(), type)); // TODO
423
+ }
424
+ else {
425
+ throw new ConfigException(String.format(ENGLISH, "Unsupported column type (%s:%s)", column.getName(), type)); // TODO
426
+ }
427
+ }
428
+
429
+ @Override
430
+ public ConfigDiff guess(ConfigSource config)
431
+ {
432
+ return newConfigDiff(); // do nothing
433
+ }
434
+
435
+ static class InvalidRecordException
436
+ extends RuntimeException
437
+ {
438
+ InvalidRecordException(String cause) {
439
+ super(cause);
440
+ }
441
+
442
+ InvalidRecordException(String cause, Throwable t) {
443
+ super(cause, t);
444
+ }
445
+ }
446
+ }