embulk-filter-protobuf 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
# Register the Java class ProtobufFilterPlugin as the Embulk filter named
# "protobuf". The last argument is a path resolved relative to this file
# (presumably the directory holding the plugin's jars -- confirm against the
# gem layout).
classpath_dir = File.expand_path('../../../../classpath', __FILE__)

Embulk::JavaPlugin.register_filter(
  "protobuf",
  "org.embulk.filter.protobuf.ProtobufFilterPlugin",
  classpath_dir
)
@@ -0,0 +1,264 @@
1
+ package org.embulk.filter.protobuf;
2
+
3
+ import com.google.common.io.BaseEncoding;
4
+ import com.google.protobuf.InvalidProtocolBufferException;
5
+ import com.google.protobuf.Message;
6
+ import com.google.protobuf.MessageOrBuilder;
7
+ import com.google.protobuf.util.JsonFormat;
8
+
9
+ import org.embulk.filter.protobuf.ProtobufFilterPlugin.ColumnTask;
10
+ import org.embulk.filter.protobuf.ProtobufFilterPlugin.PluginTask;
11
+
12
+ import org.embulk.plugin.PluginClassLoader;
13
+ import org.embulk.spi.Column;
14
+ import org.embulk.spi.ColumnVisitor;
15
+ import org.embulk.spi.DataException;
16
+ import org.embulk.spi.PageBuilder;
17
+ import org.embulk.spi.PageReader;
18
+
19
+ import java.lang.reflect.InvocationTargetException;
20
+ import java.lang.reflect.Method;
21
+ import java.net.URLClassLoader;
22
+ import java.nio.file.Path;
23
+ import java.nio.file.Paths;
24
+ import java.util.HashMap;
25
+ import java.util.List;
26
+ import java.util.Map;
27
+
28
+ public class ColumnVisitorImpl implements ColumnVisitor
29
+ {
30
+ private final PluginTask pluginTask;
31
+ private final PageReader pageReader;
32
+ private final PageBuilder pageBuilder;
33
+ private final Map<String, ColumnTask> columnTaskMap;
34
+
35
+ ColumnVisitorImpl(PluginTask task, PageReader reader, PageBuilder builder)
36
+ {
37
+ this.pluginTask = task;
38
+ this.pageReader = reader;
39
+ this.pageBuilder = builder;
40
+ this.columnTaskMap = getColumnMap(task.getColumns());
41
+ addProtobufJarToClasspath();
42
+ }
43
+
44
+ private Map<String, ColumnTask> getColumnMap(
45
+ List<ColumnTask> columnTasks)
46
+ {
47
+ Map<String, ColumnTask> m = new HashMap<>();
48
+ for (ColumnTask columnTask : columnTasks) {
49
+ m.put(columnTask.getName(), columnTask);
50
+ }
51
+ return m;
52
+ }
53
+
54
+ private void addProtobufJarToClasspath()
55
+ {
56
+ Path protobufJarPath = Paths.get(pluginTask.getProtobufJarPath());
57
+ // FIXME:
58
+ // getClass().getClassLoader() returns sun.misc.Launcher$AppClassLoader
59
+ // and it cannot be cast to PluginClassLoader in gradle test.
60
+ try {
61
+ PluginClassLoader loader = (PluginClassLoader) getClass().getClassLoader();
62
+ loader.addPath(protobufJarPath);
63
+ }
64
+ catch (ClassCastException e) {
65
+ }
66
+ }
67
+
68
+ private ColumnTask getColumnTask(Column column)
69
+ {
70
+ String colName = column.getName();
71
+ return columnTaskMap.get(colName);
72
+ }
73
+
74
+ private byte[] decodeMessage(String messageAsString)
75
+ {
76
+ byte[] decoded = null;
77
+ String encoding = pluginTask.getEncoding();
78
+ if (encoding.equals("Base64")) {
79
+ decoded = BaseEncoding.base64().decode(messageAsString);
80
+ }
81
+ return decoded;
82
+ }
83
+
84
+ private String encodeMessage(byte[] messageAsBytes)
85
+ {
86
+ String encoded = null;
87
+ String encoding = pluginTask.getEncoding();
88
+ if (encoding.equals("Base64")) {
89
+ encoded = BaseEncoding.base64().encode(messageAsBytes);
90
+ }
91
+ return encoded;
92
+ }
93
+
94
+ private String convertMessageBytesToJson(
95
+ byte[] messageAsBytes, String messageName)
96
+ {
97
+ URLClassLoader loader = (URLClassLoader) getClass().getClassLoader();
98
+ // Get a message object
99
+ Object message = null;
100
+ try {
101
+ Class<?> messageClass = loader.loadClass(messageName);
102
+ Method parseFrom = messageClass.getMethod(
103
+ "parseFrom", byte[].class);
104
+ message = parseFrom.invoke(
105
+ (Object) messageClass, (Object) messageAsBytes);
106
+ }
107
+ catch (ClassNotFoundException e) {
108
+ throw new DataException(e);
109
+ }
110
+ catch (NoSuchMethodException e) {
111
+ throw new DataException(e);
112
+ }
113
+ catch (IllegalAccessException e) {
114
+ throw new DataException(e);
115
+ }
116
+ catch (InvocationTargetException e) {
117
+ throw new DataException(e);
118
+ }
119
+ // Convert message object to json string
120
+ String messageAsString = null;
121
+ try {
122
+ messageAsString = JsonFormat.printer()
123
+ .omittingInsignificantWhitespace()
124
+ .print((MessageOrBuilder) message);
125
+ }
126
+ catch (InvalidProtocolBufferException e) {
127
+ throw new DataException(e);
128
+ }
129
+ return messageAsString;
130
+ }
131
+
132
+ private byte[] convertJsonToMessageBytes(
133
+ String messageAsJson, String messageName)
134
+ {
135
+ URLClassLoader loader = (URLClassLoader) getClass().getClassLoader();
136
+ // Get a message builder object
137
+ Message.Builder builder = null;
138
+ try {
139
+ Class<?> messageClass = loader.loadClass(messageName);
140
+ Method newBuilder = messageClass.getMethod("newBuilder");
141
+ builder = (Message.Builder) newBuilder.invoke(
142
+ (Object) messageClass);
143
+ }
144
+ catch (ClassNotFoundException e) {
145
+ throw new DataException(e);
146
+ }
147
+ catch (NoSuchMethodException e) {
148
+ throw new DataException(e);
149
+ }
150
+ catch (IllegalAccessException e) {
151
+ throw new DataException(e);
152
+ }
153
+ catch (InvocationTargetException e) {
154
+ throw new DataException(e);
155
+ }
156
+ // Convert message json to binary
157
+ byte[] messageAsBytes = null;
158
+ try {
159
+ JsonFormat.parser().merge(messageAsJson, builder);
160
+ messageAsBytes = builder.build().toByteArray();
161
+ }
162
+ catch (InvalidProtocolBufferException e) {
163
+ throw new DataException(e);
164
+ }
165
+ return messageAsBytes;
166
+ }
167
+
168
+ private String executeTask(ColumnTask colTask, Column column)
169
+ {
170
+ String messageName = colTask.getMessage();
171
+ // serialize
172
+ if (pluginTask.getDoSerialize().get()) {
173
+ String messageAsJson = pageReader.getString(column);
174
+ byte[] messageAsBytes = convertJsonToMessageBytes(
175
+ messageAsJson, messageName);
176
+ return encodeMessage(messageAsBytes);
177
+ }
178
+ // deserialize
179
+ else {
180
+ String messageAsString = pageReader.getString(column);
181
+ byte[] messageAsBytes = decodeMessage(messageAsString);
182
+ return convertMessageBytesToJson(messageAsBytes, messageName);
183
+ }
184
+ }
185
+
186
+ @Override
187
+ public void booleanColumn(Column outputColumn)
188
+ {
189
+ if (pageReader.isNull(outputColumn)) {
190
+ pageBuilder.setNull(outputColumn);
191
+ }
192
+ else {
193
+ pageBuilder.setBoolean(
194
+ outputColumn, pageReader.getBoolean(outputColumn));
195
+ }
196
+ }
197
+
198
+ @Override
199
+ public void longColumn(Column outputColumn)
200
+ {
201
+ if (pageReader.isNull(outputColumn)) {
202
+ pageBuilder.setNull(outputColumn);
203
+ }
204
+ else {
205
+ pageBuilder.setLong(
206
+ outputColumn, pageReader.getLong(outputColumn));
207
+ }
208
+ }
209
+
210
+ @Override
211
+ public void doubleColumn(Column outputColumn)
212
+ {
213
+ if (pageReader.isNull(outputColumn)) {
214
+ pageBuilder.setNull(outputColumn);
215
+ }
216
+ else {
217
+ pageBuilder.setDouble(
218
+ outputColumn, pageReader.getDouble(outputColumn));
219
+ }
220
+ }
221
+
222
+ @Override
223
+ public void stringColumn(Column outputColumn)
224
+ {
225
+ if (pageReader.isNull(outputColumn)) {
226
+ pageBuilder.setNull(outputColumn);
227
+ }
228
+ else {
229
+ ColumnTask task = getColumnTask(outputColumn);
230
+ if (task == null) {
231
+ pageBuilder.setString(
232
+ outputColumn, pageReader.getString(outputColumn));
233
+ }
234
+ else {
235
+ pageBuilder.setString(
236
+ outputColumn, executeTask(task, outputColumn));
237
+ }
238
+ }
239
+ }
240
+
241
+ @Override
242
+ public void timestampColumn(Column outputColumn)
243
+ {
244
+ if (pageReader.isNull(outputColumn)) {
245
+ pageBuilder.setNull(outputColumn);
246
+ }
247
+ else {
248
+ pageBuilder.setTimestamp(
249
+ outputColumn, pageReader.getTimestamp(outputColumn));
250
+ }
251
+ }
252
+
253
+ @Override
254
+ public void jsonColumn(Column outputColumn)
255
+ {
256
+ if (pageReader.isNull(outputColumn)) {
257
+ pageBuilder.setNull(outputColumn);
258
+ }
259
+ else {
260
+ pageBuilder.setJson(
261
+ outputColumn, pageReader.getJson(outputColumn));
262
+ }
263
+ }
264
+ }
@@ -0,0 +1,155 @@
1
+ package org.embulk.filter.protobuf;
2
+
3
+ import com.google.common.base.Optional;
4
+
5
+ import org.embulk.config.Config;
6
+ import org.embulk.config.ConfigDefault;
7
+ import org.embulk.config.ConfigException;
8
+ import org.embulk.config.ConfigSource;
9
+ import org.embulk.config.Task;
10
+ import org.embulk.config.TaskSource;
11
+ import org.embulk.spi.Column;
12
+ import org.embulk.spi.Exec;
13
+ import org.embulk.spi.FilterPlugin;
14
+ import org.embulk.spi.Page;
15
+ import org.embulk.spi.PageBuilder;
16
+ import org.embulk.spi.PageOutput;
17
+ import org.embulk.spi.PageReader;
18
+ import org.embulk.spi.Schema;
19
+ import org.embulk.spi.type.Types;
20
+
21
+ import java.nio.file.Path;
22
+ import java.nio.file.Paths;
23
+ import java.util.Arrays;
24
+ import java.util.List;
25
+
26
+ public class ProtobufFilterPlugin implements FilterPlugin
27
+ {
28
+ public interface PluginTask extends Task
29
+ {
30
+ @Config("serialize")
31
+ @ConfigDefault("false")
32
+ public Optional<Boolean> getDoSerialize();
33
+
34
+ @Config("deserialize")
35
+ @ConfigDefault("false")
36
+ public Optional<Boolean> getDoDeserialize();
37
+
38
+ @Config("encoding")
39
+ public String getEncoding();
40
+
41
+ @Config("protobuf_jar_path")
42
+ public String getProtobufJarPath();
43
+
44
+ @Config("columns")
45
+ public List<ColumnTask> getColumns();
46
+ }
47
+
48
+ public interface ColumnTask extends Task
49
+ {
50
+ @Config("name")
51
+ public String getName();
52
+
53
+ @Config("message")
54
+ public String getMessage();
55
+ }
56
+
57
+ public void validate(PluginTask pluginTask, Schema inputSchema)
58
+ {
59
+ // validate 'serialize' and 'deserialize' in PluginTask
60
+ boolean doSerialize = pluginTask.getDoSerialize().get();
61
+ boolean doDeserialize = pluginTask.getDoDeserialize().get();
62
+ boolean bothTrue = doSerialize && doDeserialize;
63
+ boolean bothFalse = !doSerialize && !doDeserialize;
64
+ if (bothTrue || bothFalse) {
65
+ String errMsg = "Specify either 'serialize: true' or 'deserialize: true'.";
66
+ throw new ConfigException(errMsg);
67
+ }
68
+ // validate 'encoding' in PluginTask
69
+ String[] allowedEncordings = {"Base64"};
70
+ String encoding = pluginTask.getEncoding();
71
+ if (!Arrays.asList(allowedEncordings).contains(encoding)) {
72
+ String errMsg = "Specify 'encoding: Base64'.";
73
+ throw new ConfigException(errMsg);
74
+ }
75
+ // validate 'protobuf_jar_path' in PluginTask
76
+ Path protobufJarPath = Paths.get(pluginTask.getProtobufJarPath());
77
+ if (!protobufJarPath.toFile().exists()) {
78
+ String errMsg = "The jar file does not exist.";
79
+ throw new ConfigException(errMsg);
80
+ }
81
+ // validate 'name' in ColumnTask
82
+ for (ColumnTask colTask : pluginTask.getColumns()) {
83
+ // throws exception when the column does not exist
84
+ Column column = inputSchema.lookupColumn(colTask.getName());
85
+ // TODO: accept both STRING and JSON type when 'serilialize': true
86
+ if (!Types.STRING.equals(column.getType())) {
87
+ String errMsg = "Type of input columns must be string.";
88
+ throw new ConfigException(errMsg);
89
+ }
90
+ }
91
+ }
92
+
93
+ @Override
94
+ public void transaction(ConfigSource config, Schema inputSchema,
95
+ FilterPlugin.Control control)
96
+ {
97
+ PluginTask task = config.loadConfig(PluginTask.class);
98
+ validate(task, inputSchema);
99
+ Schema outputSchema = inputSchema;
100
+ control.run(task.dump(), outputSchema);
101
+ }
102
+
103
+ @Override
104
+ public PageOutput open(TaskSource taskSource, Schema inputSchema,
105
+ Schema outputSchema, PageOutput output)
106
+ {
107
+ PluginTask task = taskSource.loadTask(PluginTask.class);
108
+ PageBuilder pageBuilder = new PageBuilder(
109
+ Exec.getBufferAllocator(), outputSchema, output);
110
+ PageReader pageReader = new PageReader(inputSchema);
111
+ ColumnVisitorImpl visitor = new ColumnVisitorImpl(
112
+ task, pageReader, pageBuilder);
113
+
114
+ return new PageOutputImpl(
115
+ pageReader, pageBuilder, outputSchema, visitor);
116
+ }
117
+
118
+ public static class PageOutputImpl implements PageOutput
119
+ {
120
+ private PageReader pageReader;
121
+ private PageBuilder pageBuilder;
122
+ private Schema outputSchema;
123
+ private ColumnVisitorImpl visitor;
124
+
125
+ PageOutputImpl(PageReader pageReader, PageBuilder pageBuilder, Schema outputSchema, ColumnVisitorImpl visitor)
126
+ {
127
+ this.pageReader = pageReader;
128
+ this.pageBuilder = pageBuilder;
129
+ this.outputSchema = outputSchema;
130
+ this.visitor = visitor;
131
+ }
132
+
133
+ @Override
134
+ public void add(Page page)
135
+ {
136
+ pageReader.setPage(page);
137
+ while (pageReader.nextRecord()) {
138
+ outputSchema.visitColumns(visitor);
139
+ pageBuilder.addRecord();
140
+ }
141
+ }
142
+
143
+ @Override
144
+ public void finish()
145
+ {
146
+ pageBuilder.finish();
147
+ }
148
+
149
+ @Override
150
+ public void close()
151
+ {
152
+ pageBuilder.close();
153
+ }
154
+ };
155
+ }
@@ -0,0 +1,117 @@
1
+ package org.embulk.filter.protobuf;
2
+
3
+ import org.embulk.EmbulkTestRuntime;
4
+ import org.embulk.filter.protobuf.ProtobufFilterPlugin.PageOutputImpl;
5
+ import org.embulk.filter.protobuf.ProtobufFilterPlugin.PluginTask;
6
+ import org.embulk.spi.Page;
7
+ import org.embulk.spi.PageBuilder;
8
+ import org.embulk.spi.PageOutput;
9
+ import org.embulk.spi.PageReader;
10
+ import org.embulk.spi.PageTestUtils;
11
+ import org.embulk.spi.Schema;
12
+ import org.embulk.spi.TestPageBuilderReader.MockPageOutput;
13
+ import org.embulk.spi.type.Types;
14
+ import org.embulk.spi.util.Pages;
15
+
16
+ import org.junit.Before;
17
+ import org.junit.Rule;
18
+ import org.junit.Test;
19
+ import static org.embulk.filter.protobuf.TestProtobufFilterPlugin.taskFromYamlString;
20
+ import static org.junit.Assert.assertEquals;
21
+
22
+ import java.io.File;
23
+ import java.util.List;
24
+
25
+ public class TestColumnVisitorImpl
26
+ {
27
+ @Rule
28
+ public EmbulkTestRuntime runtime;
29
+
30
+ private String protobufJarPath;
31
+
32
+ public TestColumnVisitorImpl()
33
+ {
34
+ this.runtime = new EmbulkTestRuntime();
35
+
36
+ String pluginBasePath = new File(".").getAbsoluteFile().getParent();
37
+ this.protobufJarPath = String.format(
38
+ "%s/example/AddressBookProtosProto3Syntax.jar",
39
+ pluginBasePath);
40
+ }
41
+
42
+ private List<Object[]> filter(
43
+ PluginTask task, Schema inputSchema, Object... objects)
44
+ {
45
+ MockPageOutput output = new MockPageOutput();
46
+ Schema outputSchema = inputSchema;
47
+ PageBuilder pageBuilder = new PageBuilder(
48
+ runtime.getBufferAllocator(), outputSchema, output);
49
+ PageReader pageReader = new PageReader(inputSchema);
50
+ ColumnVisitorImpl visitor = new ColumnVisitorImpl(
51
+ task, pageReader, pageBuilder);
52
+
53
+ List<Page> pages = PageTestUtils.buildPage(
54
+ runtime.getBufferAllocator(), inputSchema, objects);
55
+ PageOutput mockPageOutput = new PageOutputImpl(
56
+ pageReader, pageBuilder, outputSchema, visitor);
57
+ for (Page page : pages) {
58
+ mockPageOutput.add(page);
59
+ }
60
+ mockPageOutput.finish();
61
+ mockPageOutput.close();
62
+ return Pages.toObjects(outputSchema, output.pages);
63
+ }
64
+
65
+ @Test
66
+ public void testExecuteTask_serialize()
67
+ {
68
+ PluginTask task = taskFromYamlString(
69
+ "type: protobuf",
70
+ "serialize: true",
71
+ "encoding: Base64",
72
+ "protobuf_jar_path: " + protobufJarPath,
73
+ "columns:",
74
+ " - {name: to serialize, message: com.example.tutorial.AddressBookProtos$Person}"
75
+ );
76
+ Schema inputSchema = Schema.builder()
77
+ .add("to serialize", Types.STRING)
78
+ .build();
79
+ List<Object[]> records = filter(task, inputSchema,
80
+ // generated from proto2-syntax .proto
81
+ "{\"name\":\"John Doe\",\"id\":1234,\"email\":\"jdoe@example.com\",\"phone\":[{\"number\":\"111-0000\",\"type\":\"MOBILE\"},{\"number\":\"555-4321\",\"type\":\"HOME\"}]}",
82
+ // generated from proto3-syntax .proto
83
+ "{\"name\":\"John Doe\",\"id\":1234,\"email\":\"jdoe@example.com\",\"phone\":[{\"number\":\"111-0000\"},{\"number\":\"555-4321\",\"type\":\"HOME\"}]}"
84
+ );
85
+ assertEquals(2, records.size());
86
+ String expected = "CghKb2huIERvZRDSCRoQamRvZUBleGFtcGxlLmNvbSIKCggxMTEtMDAwMCIMCgg1NTUtNDMyMRAB";
87
+ assertEquals(expected, records.get(0)[0]);
88
+ assertEquals(expected, records.get(1)[0]);
89
+ }
90
+
91
+ @Test
92
+ public void testExecuteTask_deserialize()
93
+ {
94
+ PluginTask task = taskFromYamlString(
95
+ "type: protobuf",
96
+ "deserialize: true",
97
+ "encoding: Base64",
98
+ "protobuf_jar_path: " + protobufJarPath,
99
+ "columns:",
100
+ " - {name: to deserialize, message: com.example.tutorial.AddressBookProtos$Person}"
101
+ );
102
+ Schema inputSchema = Schema.builder()
103
+ .add("to deserialize", Types.STRING)
104
+ .build();
105
+ List<Object[]> records = filter(
106
+ task, inputSchema,
107
+ // generated from proto2-syntax .proto
108
+ "CghKb2huIERvZRDSCRoQamRvZUBleGFtcGxlLmNvbSIMCggxMTEtMDAwMBAAIgwKCDU1NS00MzIxEAE=",
109
+ // generated from proto3-syntax .proto
110
+ "CghKb2huIERvZRDSCRoQamRvZUBleGFtcGxlLmNvbSIKCggxMTEtMDAwMCIMCgg1NTUtNDMyMRAB"
111
+ );
112
+ assertEquals(2, records.size());
113
+ String expected = "{\"name\":\"John Doe\",\"id\":1234,\"email\":\"jdoe@example.com\",\"phone\":[{\"number\":\"111-0000\"},{\"number\":\"555-4321\",\"type\":\"HOME\"}]}";
114
+ assertEquals(expected, records.get(0)[0]);
115
+ assertEquals(expected, records.get(1)[0]);
116
+ }
117
+ }