embulk-output-td 0.1.2 → 0.1.3

This diff shows the changes between publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 4fee7360589ff074152102355e21cd14a4fd6f15
- data.tar.gz: f839023cacdc9b64382aa5d7b3fcdce51c997a86
+ metadata.gz: 5610036db652164098fcc3f828b8fb3f95542329
+ data.tar.gz: a297a28131574b748d20c49097774b6bdb607552
  SHA512:
- metadata.gz: e841eb70257ada5e8c012595968536236f606aebe110fe2b32edf203607d569c5436d001504d0c5d98da490b58580893ea8b0fe58314fc8ae51c984cc0c7f0b8
- data.tar.gz: 52af81b3e7c4d27ec844b5f74abe02a5dc2df4339945bd2da37580f5a184c5417ac1d2306fb22b62d164273df90b12e637e368eed2564d30d676443e2eea7b69
+ metadata.gz: 18bd27aedf3f1585c2c1fbbdc881f2e50157bfe61eb5d820363dd7896dc9c63f2c7fc2abae1ff1ef57c3b338b4672ee86bb1b57c097f073f0a1489a67800b3c4
+ data.tar.gz: 2d25527c5fb487970c90386c343c3dab2e4a0ad630698c9c772af247949f5affc5cae09d7edcd6ecdd43e21dc92d5c7a99055c4e9d5fc42d86b5643808867c0b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
+ ## 0.1.4 - 2015-XX-XX
+
+ ## 0.1.3 - 2015-08-05
+
+ * [maintenance] Upgrade Embulk v0.6.19
+ * [new feature] Add column_options [#11](https://github.com/treasure-data/embulk-output-td/pull/11)
+
  ## 0.1.2 - 2015-07-14

  ## 0.1.1 - 2015-07-14
data/README.md CHANGED
@@ -23,6 +23,11 @@ TODO: Write short description here
  - **tmpdir**: temporal directory
  - **upload_concurrency**: upload concurrency (int, default=2). max concurrency is 8.
  - **file_split_size**: split size (long, default=16384 (16MB)).
+ - **default_timezone**: default timezone (string, default='UTC')
+ - **default_timestamp_format**: default timestamp format (string, default=`%Y-%m-%d %H:%M:%S.%6N`)
+ - **column_options**: advanced: a key-value pairs where key is a column name and value is options for the column.
+ - **timezone**: If input column type (embulk type) is timestamp, this plugin needs to format the timestamp value into a SQL string. In this cases, this timezone option is used to control the timezone. (string, value of default_timezone option is used by default)
+ - **format**: If input column type (embulk type) is timestamp, this plugin needs to format the timestamp value into a string. This timestamp_format option is used to control the format of the timestamp. (string, value of default_timestamp_format option is used by default)

  ## Example
  Here is sample configuration for TD output plugin.
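For reference, here is a minimal sketch of how the options added in 0.1.3 might be combined in an Embulk `out:` section. The `database`/`table` keys, the column name `created_at`, and all values are placeholders chosen for illustration, not taken from this package:

```yaml
out:
  type: td
  apikey: XXXXXXXX                 # Treasure Data API key (placeholder)
  database: my_db                  # placeholder database name
  table: my_table                  # placeholder table name
  default_timezone: 'UTC'
  default_timestamp_format: '%Y-%m-%d %H:%M:%S.%3N'
  column_options:
    created_at: {timezone: 'Asia/Tokyo', format: '%Y-%m-%d %H:%M:%S'}
```

Per the check added to `transaction()` further below, every key under `column_options` must name a column of the input schema; an unknown column name fails via `schema.lookupColumn`, which throws `SchemaConfigException`.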
data/build.gradle CHANGED
@@ -13,19 +13,22 @@ configurations {
  provided
  }

- version = "0.1.2"
+ version = "0.1.3"

  compileJava.options.encoding = 'UTF-8' // source encoding
  sourceCompatibility = 1.7
  targetCompatibility = 1.7

  dependencies {
- compile "org.embulk:embulk-core:0.6.10"
- provided "org.embulk:embulk-core:0.6.10"
- compile "org.eclipse.jetty:jetty-client:9.2.2.v20140723"
- compile "org.msgpack:msgpack:0.6.11"
+ compile "org.embulk:embulk-core:0.6.19"
+ provided "org.embulk:embulk-core:0.6.19"
+ compile "org.embulk:embulk-standards:0.6.19"
+ provided "org.embulk:embulk-standards:0.6.19"
+ compile "org.eclipse.jetty:jetty-client:9.2.2.v20140723"
+ compile "org.msgpack:msgpack:0.6.11"

  testCompile "junit:junit:4.+"
+ testCompile "org.bigtesting:fixd:1.0.0"
  }

  task classpath(type: Copy, dependsOn: ["jar"]) {
@@ -1,7 +1,7 @@

  Gem::Specification.new do |spec|
  spec.name = "embulk-output-td"
- spec.version = "0.1.2"
+ spec.version = "0.1.3"
  spec.authors = ["Muga Nishizawa"]
  spec.summary = %[TreasureData output plugin for Embulk]
  spec.description = %[TreasureData output plugin is an Embulk plugin that loads records to TreasureData read by any input plugins. Search the input plugins by 'embulk-output' keyword.]
@@ -1,3 +1,3 @@
  Embulk::JavaPlugin.register_output(
- "td", "org.embulk.output.TdOutputPlugin",
+ "td", "org.embulk.output.td.TdOutputPlugin",
  File.expand_path('../../../../classpath', __FILE__))
@@ -1,4 +1,4 @@
- package org.embulk.output;
+ package org.embulk.output.td;

  import java.io.Closeable;
  import java.io.IOException;
@@ -1,4 +1,4 @@
- package org.embulk.output;
+ package org.embulk.output.td;

  import org.embulk.spi.Exec;
  import org.msgpack.MessagePack;
@@ -1,4 +1,4 @@
- package org.embulk.output;
+ package org.embulk.output.td;

  import com.google.common.base.Optional;
  import com.google.common.base.Stopwatch;
@@ -6,7 +6,6 @@ import com.google.common.base.Throwables;
  import com.treasuredata.api.TdApiClient;
  import org.embulk.config.CommitReport;
  import org.embulk.config.ConfigException;
- import org.embulk.output.TdOutputPlugin.PluginTask;
  import org.embulk.spi.Column;
  import org.embulk.spi.ColumnVisitor;
  import org.embulk.spi.Exec;
@@ -21,6 +20,7 @@ import org.embulk.spi.type.LongType;
  import org.embulk.spi.type.StringType;
  import org.embulk.spi.type.TimestampType;
  import org.embulk.spi.type.Type;
+ import org.embulk.spi.util.Timestamps;
  import org.joda.time.DateTimeZone;
  import org.jruby.embed.ScriptingContainer;
  import org.msgpack.MessagePack;
@@ -35,7 +35,6 @@ import java.util.concurrent.Callable;
  import java.util.concurrent.TimeUnit;

  import static com.google.common.base.Preconditions.checkNotNull;
- import org.embulk.output.TdOutputPlugin.UnixTimestampUnit;

  public class RecordWriter
  implements TransactionalPageOutput
@@ -57,7 +56,7 @@ public class RecordWriter
  private final int uploadConcurrency;
  private final long fileSplitSize; // unit: kb

- public RecordWriter(PluginTask task, int taskIndex, TdApiClient client, FieldWriterSet fieldWriters)
+ public RecordWriter(TdOutputPlugin.PluginTask task, int taskIndex, TdApiClient client, FieldWriterSet fieldWriters)
  {
  this.log = Exec.getLogger(getClass());
  this.client = checkNotNull(client);
@@ -72,7 +71,7 @@ public class RecordWriter
  this.fileSplitSize = task.getFileSplitSize() * 1024;
  }

- public static void validateSchema(Logger log, PluginTask task, Schema schema)
+ public static void validateSchema(Logger log, TdOutputPlugin.PluginTask task, Schema schema)
  {
  new FieldWriterSet(log, task, schema);
  }
@@ -263,7 +262,7 @@ public class RecordWriter
  private final int fieldCount;
  private final FieldWriter[] fieldWriters;

- public FieldWriterSet(Logger log, PluginTask task, Schema schema)
+ public FieldWriterSet(Logger log, TdOutputPlugin.PluginTask task, Schema schema)
  {
  Optional<String> userDefinedPrimaryKeySourceColumnName = task.getTimeColumn();
  boolean hasPkWriter = false;
@@ -272,6 +271,7 @@ public class RecordWriter

  int fc = 0;
  fieldWriters = new FieldWriter[schema.size()];
+ TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());

  for (int i = 0; i < schema.size(); i++) {
  String columnName = schema.getColumnName(i);
@@ -311,13 +311,13 @@ public class RecordWriter
  case PRIMARY_KEY:
  log.info("Using {}:{} column as the data partitioning key", columnName, columnType);
  if (columnType instanceof LongType) {
- if (task.getUnixTimestampUnit() != UnixTimestampUnit.SEC) {
+ if (task.getUnixTimestampUnit() != TdOutputPlugin.UnixTimestampUnit.SEC) {
  log.warn("time column is converted from {} to seconds", task.getUnixTimestampUnit());
  }
  writer = new UnixTimestampLongFieldWriter(columnName, task.getUnixTimestampUnit().getFractionUnit());
  hasPkWriter = true;
  } else if (columnType instanceof TimestampType) {
- writer = new TimestampStringFieldWriter(task.getJRuby(), columnName);
+ writer = new TimestampStringFieldWriter(timestampFormatters[i], columnName);
  hasPkWriter = true;
  } else {
  throw new ConfigException(String.format("Type of '%s' column must be long or timestamp but got %s",
@@ -335,7 +335,7 @@ public class RecordWriter
  } else if (columnType instanceof StringType) {
  writer = new StringFieldWriter(columnName);
  } else if (columnType instanceof TimestampType) {
- writer = new TimestampStringFieldWriter(task.getJRuby(), columnName);
+ writer = new TimestampStringFieldWriter(timestampFormatters[i], columnName);
  if (firstTimestampColumnIndex < 0) {
  firstTimestampColumnIndex = i;
  }
@@ -381,7 +381,7 @@ public class RecordWriter
  } else if (columnType instanceof TimestampType) {
  log.info("Duplicating {}:{} column to 'time' column as seconds for the data partitioning",
  columnName, columnType);
- writer = new TimestampFieldLongDuplicator(task.getJRuby(), columnName, "time");
+ writer = new TimestampFieldLongDuplicator(timestampFormatters[duplicatePrimaryKeySourceIndex], columnName, "time");
  } else {
  throw new ConfigException(String.format("Type of '%s' column must be long or timestamp but got %s",
  columnName, columnType));
@@ -541,20 +541,19 @@ public class RecordWriter
  static class TimestampStringFieldWriter
  extends FieldWriter
  {
- // to format timestamp values to string by "%Y-%m-%d %H:%M:%S.%3N"
- private final TimestampFormatter defaultFormatter;
+ private final TimestampFormatter formatter;

- public TimestampStringFieldWriter(ScriptingContainer jruby, String keyName)
+ public TimestampStringFieldWriter(TimestampFormatter formatter, String keyName)
  {
  super(keyName);
- this.defaultFormatter = new TimestampFormatter(jruby, "%Y-%m-%d %H:%M:%S.%3N", DateTimeZone.UTC);
+ this.formatter = formatter;
  }

  @Override
  public void writeValue(MsgpackGZFileBuilder builder, PageReader reader, Column column)
  throws IOException
  {
- builder.writeString(defaultFormatter.format(reader.getTimestamp(column)));
+ builder.writeString(formatter.format(reader.getTimestamp(column)));
  }
  }

@@ -599,9 +598,9 @@ public class RecordWriter
  {
  private final TimestampLongFieldWriter timeFieldWriter;

- public TimestampFieldLongDuplicator(ScriptingContainer jruby, String keyName, String longDuplicateKeyName)
+ public TimestampFieldLongDuplicator(TimestampFormatter formatter, String keyName, String longDuplicateKeyName)
  {
- super(jruby, keyName);
+ super(formatter, keyName);
  timeFieldWriter = new TimestampLongFieldWriter(longDuplicateKeyName);
  }

@@ -1,7 +1,8 @@
- package org.embulk.output;
+ package org.embulk.output.td;

  import java.io.IOException;
  import java.util.List;
+ import java.util.Map;
  import javax.validation.constraints.Min;
  import javax.validation.constraints.Max;

@@ -14,36 +15,34 @@ import com.treasuredata.api.TdApiClientConfig;
  import com.treasuredata.api.TdApiClientConfig.HttpProxyConfig;
  import com.treasuredata.api.TdApiConflictException;
  import com.treasuredata.api.TdApiNotFoundException;
- import com.treasuredata.api.TdApiException;
  import com.treasuredata.api.model.TDBulkImportSession;
  import com.treasuredata.api.model.TDBulkImportSession.ImportStatus;
- import com.treasuredata.api.model.TDDatabase;
  import com.treasuredata.api.model.TDTable;
  import org.embulk.config.CommitReport;
  import org.embulk.config.Config;
  import org.embulk.config.ConfigDefault;
  import org.embulk.config.ConfigDiff;
- import org.embulk.config.ConfigInject;
  import org.embulk.config.ConfigSource;
  import org.embulk.config.ConfigException;
  import org.embulk.config.Task;
  import org.embulk.config.TaskSource;
- import org.embulk.output.RecordWriter.FieldWriterSet;
+ import org.embulk.output.td.RecordWriter.FieldWriterSet;
  import org.embulk.spi.Exec;
  import org.embulk.spi.ExecSession;
  import org.embulk.spi.OutputPlugin;
  import org.embulk.spi.Schema;
  import org.embulk.spi.TransactionalPageOutput;
  import org.embulk.spi.time.Timestamp;
+ import org.embulk.spi.time.TimestampFormatter;
+ import org.joda.time.DateTimeZone;
  import org.joda.time.format.DateTimeFormat;
- import org.jruby.embed.ScriptingContainer;
  import org.slf4j.Logger;

  public class TdOutputPlugin
  implements OutputPlugin
  {
  public interface PluginTask
- extends Task
+ extends Task, TimestampFormatter.Task
  {
  @Config("apikey")
  public String getApiKey();
@@ -62,6 +61,8 @@ public class TdOutputPlugin

  // TODO connect_timeout, read_timeout, send_timeout

+ // TODO mode[append, replace]
+
  @Config("auto_create_table")
  @ConfigDefault("true")
  public boolean getAutoCreateTable();
@@ -98,8 +99,25 @@ public class TdOutputPlugin
  @ConfigDefault("16384") // default 16MB (unit: kb)
  public long getFileSplitSize();

- @ConfigInject
- public ScriptingContainer getJRuby();
+ @Override
+ @Config("default_timestamp_format")
+ // SQL timestamp with milliseconds is, by defualt, used because Hive and Presto use
+ // those format. As timestamp type, Presto
+ // * cannot parse SQL timestamp with timezone like '2015-02-03 04:05:06.789 UTC'
+ // * cannot parse SQL timestamp with nanoseconds like '2015-02-03 04:05:06.789012345'
+ // * cannot parse SQL timestamp with microseconds like '2015-02-03 04:05:06.789012'
+ // * can parse SQL timestamp with milliseconds like '2015-02-03 04:05:06.789'
+ // On the other hand, Hive
+ // * cannot parse SQL timestamp with timezone like '2015-02-03 04:05:06.789 UTC'
+ // * can parse SQL timestamp with nanoseconds like '2015-02-03 04:05:06.789012345'
+ // * can parse SQL timestamp with microseconds like '2015-02-03 04:05:06.789012'
+ // * can parse SQL timestamp with milliseconds like '2015-02-03 04:05:06.789'
+ @ConfigDefault("\"%Y-%m-%d %H:%M:%S.%3N\"")
+ public String getDefaultTimestampFormat();
+
+ @Config("column_options")
+ @ConfigDefault("{}")
+ public Map<String, TimestampColumnOption> getColumnOptions();

  public boolean getDoUpload();
  public void setDoUpload(boolean doUpload);
@@ -108,6 +126,10 @@ public class TdOutputPlugin
  public void setSessionName(String session);
  }

+ public interface TimestampColumnOption
+ extends Task, TimestampFormatter.TimestampColumnOption
+ {}
+
  public interface HttpProxyTask
  extends Task
  {
@@ -175,6 +197,13 @@ public class TdOutputPlugin
  {
  final PluginTask task = config.loadConfig(PluginTask.class);

+ // TODO mode check
+
+ // check column_options is valid or not
+ for (String columnName : task.getColumnOptions().keySet()) {
+ schema.lookupColumn(columnName); // throws SchemaConfigException
+ }
+
  // generate session name
  task.setSessionName(buildBulkImportSessionName(task, Exec.session()));

@@ -0,0 +1,79 @@
+ package com.treasuredata.api;
+
+ import com.treasuredata.api.model.TDDatabase;
+ import org.bigtesting.fixd.ServerFixture;
+ import org.bigtesting.fixd.core.Method;
+ import org.junit.After;
+ import org.junit.Before;
+ import org.junit.Test;
+
+ import java.util.List;
+
+ import static org.junit.Assert.assertEquals;
+ import static org.junit.Assert.fail;
+
+ public class TestTdApiClient
+ {
+ private ServerFixture server;
+ private TdApiClient client;
+ private TdApiClientConfig clientConfig;
+ private String apikey = "apikey";
+
+ @Before
+ public void startServer()
+ throws Exception
+ {
+ server = new ServerFixture(9490);
+ server.start();
+ }
+
+ @After
+ public void stopServer()
+ throws Exception
+ {
+ server.stop();
+ }
+
+ @Before
+ public void startTdApiClient()
+ throws Exception
+ {
+ clientConfig = new TdApiClientConfig("localhost:9490", false);
+ client = new TdApiClient(apikey, clientConfig);
+ client.start();
+ }
+
+ @After
+ public void stopTdApiClient()
+ throws Exception
+ {
+ client.close();;
+ }
+
+ private static final String DATABASE_LIST_JSON =
+ "{" +
+ "\"databases\":[" +
+ "{\"name\":\"test1\"}," +
+ "{\"name\":\"test2\"}" +
+ "]" +
+ "}";
+
+ @Test
+ public void getDatabases() throws Exception
+ {
+ server.handle(Method.GET, "/v3/database/list").with(200, "text/json", DATABASE_LIST_JSON);
+ List<TDDatabase> dbs = client.getDatabases();
+ assertEquals(2, dbs.size());
+ assertEquals("test1", dbs.get(0).getName());
+ assertEquals("test2", dbs.get(1).getName());
+ }
+
+ @Test(expected = TdApiNotFoundException.class)
+ public void notFoundDatabases()
+ throws Exception
+ {
+ server.handle(Method.GET, "/v3/database/list").with(404, "text/json", "{\"message\":\"not found\"}");
+ client.getDatabases();
+ fail();
+ }
+ }
@@ -1,4 +1,4 @@
- package org.embulk.output;
+ package org.embulk.output.td;

  public class TestTdOutputPlugin
  {
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-td
  version: !ruby/object:Gem::Version
- version: 0.1.2
+ version: 0.1.3
  platform: ruby
  authors:
  - Muga Nishizawa
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-07-14 00:00:00.000000000 Z
+ date: 2015-08-05 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -80,12 +80,13 @@ files:
  - src/main/java/com/treasuredata/api/model/TDTablePermission.java
  - src/main/java/com/treasuredata/api/model/TDTableSchema.java
  - src/main/java/com/treasuredata/api/model/TDTableType.java
- - src/main/java/org/embulk/output/FinalizableExecutorService.java
- - src/main/java/org/embulk/output/MsgpackGZFileBuilder.java
- - src/main/java/org/embulk/output/RecordWriter.java
- - src/main/java/org/embulk/output/TdOutputPlugin.java
- - src/test/java/org/embulk/output/TestTdOutputPlugin.java
- - classpath/embulk-output-td-0.1.2.jar
+ - src/main/java/org/embulk/output/td/FinalizableExecutorService.java
+ - src/main/java/org/embulk/output/td/MsgpackGZFileBuilder.java
+ - src/main/java/org/embulk/output/td/RecordWriter.java
+ - src/main/java/org/embulk/output/td/TdOutputPlugin.java
+ - src/test/java/com/treasuredata/api/TestTdApiClient.java
+ - src/test/java/org/embulk/output/td/TestTdOutputPlugin.java
+ - classpath/embulk-output-td-0.1.3.jar
  - classpath/javassist-3.18.1-GA.jar
  - classpath/jetty-client-9.2.2.v20140723.jar
  - classpath/jetty-http-9.2.2.v20140723.jar