embulk-executor-mapreduce 0.2.4 → 0.2.5
This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- checksums.yaml +4 -4
- data/classpath/{embulk-executor-mapreduce-0.2.4.jar → embulk-executor-mapreduce-0.2.5.jar} +0 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +9 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +4 -0
- data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +42 -1
- data/src/test/java/org/embulk/executor/mapreduce/TestMapReduceExecutor.java +30 -14
- data/src/test/resources/config/embulk_mapred_config.yml +6 -3
- data/src/test/resources/config/embulk_mapred_partitioning_config.yml +6 -3
- data/src/test/resources/fixtures/csv/sample1.csv +3 -3
- data/src/test/resources/fixtures/csv/sample2.csv +3 -4
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4461eebeecc53f99b9b9683d7553a585a87e1a1f
+  data.tar.gz: a019cd9224918ae2721482a9cf92c9c8148a05a6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 91e107ce10160fc097930b139f07b59dcb80b1201dde0723cc302fd1e142a283ad8a817d7518ac7684f0b066ed55537c20ae0af5446230eb0f63026d9bf7e21d
+  data.tar.gz: bc045316fedf83de62e34bbf9304152680d90e46ec6fa885fc054aca50d9a967b2d8f8eb3fa86e7c685badba7e3f15834f985fee923c5bf54c9f5197f43a68fb
Binary file
|
data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java
CHANGED
@@ -71,6 +71,15 @@ public class BufferedPagePartitioner
             }
         }
 
+        public void jsonColumn(Column column)
+        {
+            if (source.isNull(column)) {
+                destination.setNull(column);
+            } else {
+                destination.setJson(column, source.getJson(column));
+            }
+        }
+
         public void timestampColumn(Column column)
         {
             if (source.isNull(column)) {
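
The new jsonColumn handler mirrors the existing per-type callbacks: copy the source value unless it is null. For readers unfamiliar with Embulk's column dispatch, here is a minimal sketch of the pattern, as a hypothetical CopyVisitor rather than the plugin's actual class, assuming org.embulk.spi.ColumnVisitor as it looks after the json type gained its own callback, with a PageReader source and PageBuilder destination as in the hunk above:

import org.embulk.spi.Column;
import org.embulk.spi.ColumnVisitor;
import org.embulk.spi.PageBuilder;
import org.embulk.spi.PageReader;

// Hypothetical visitor illustrating the dispatch pattern the hunk above
// extends: Embulk invokes exactly one callback per column, chosen by the
// column's declared type, so supporting json requires the new jsonColumn.
class CopyVisitor implements ColumnVisitor
{
    private final PageReader source;
    private final PageBuilder destination;

    CopyVisitor(PageReader source, PageBuilder destination)
    {
        this.source = source;
        this.destination = destination;
    }

    @Override
    public void jsonColumn(Column column)
    {
        if (source.isNull(column)) {
            destination.setNull(column);
        } else {
            // json values travel as msgpack Values, not Strings
            destination.setJson(column, source.getJson(column));
        }
    }

    // The remaining callbacks follow the same null-check-then-copy shape.
    @Override public void booleanColumn(Column c) { if (source.isNull(c)) { destination.setNull(c); } else { destination.setBoolean(c, source.getBoolean(c)); } }
    @Override public void longColumn(Column c) { if (source.isNull(c)) { destination.setNull(c); } else { destination.setLong(c, source.getLong(c)); } }
    @Override public void doubleColumn(Column c) { if (source.isNull(c)) { destination.setNull(c); } else { destination.setDouble(c, source.getDouble(c)); } }
    @Override public void stringColumn(Column c) { if (source.isNull(c)) { destination.setNull(c); } else { destination.setString(c, source.getString(c)); } }
    @Override public void timestampColumn(Column c) { if (source.isNull(c)) { destination.setNull(c); } else { destination.setTimestamp(c, source.getTimestamp(c)); } }
}

// Typical driving loop: reader.getSchema().visitColumns(new CopyVisitor(reader, builder));
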
data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java
CHANGED
@@ -254,6 +254,7 @@ public class EmbulkPartitioningMapReduce
             this.output = output;
         }
 
+        @Override
         public ConfigDiff transaction(ConfigSource config,
                 Schema schema, int taskCount,
                 OutputPlugin.Control control)
@@ -262,6 +263,7 @@ public class EmbulkPartitioningMapReduce
             throw new RuntimeException("");
         }
 
+        @Override
         public ConfigDiff resume(TaskSource taskSource,
                 Schema schema, int taskCount,
                 OutputPlugin.Control control)
@@ -270,6 +272,7 @@ public class EmbulkPartitioningMapReduce
             throw new RuntimeException("");
         }
 
+        @Override
        public void cleanup(TaskSource taskSource,
                Schema schema, int taskCount,
                List<TaskReport> successTaskReports)
@@ -278,6 +281,7 @@ public class EmbulkPartitioningMapReduce
             throw new RuntimeException("");
         }
 
+        @Override
         public TransactionalPageOutput open(TaskSource taskSource, final Schema schema, int taskIndex)
         {
             return new TransactionalPageOutput() {
data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java
CHANGED
@@ -7,6 +7,13 @@ import java.util.List;
 import java.util.ArrayList;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.io.DataOutputOutputStream;
+import org.msgpack.value.Value;
+import org.msgpack.value.ImmutableValue;
+import org.msgpack.core.MessagePack;
+import org.msgpack.core.MessageBufferPacker;
+import org.msgpack.core.MessageUnpacker;
+import org.msgpack.core.buffer.MessageBuffer;
 import org.embulk.spi.Buffer;
 import org.embulk.spi.Page;
 import static java.nio.charset.StandardCharsets.UTF_8;
@@ -40,6 +47,22 @@ public class PageWritable
         for (String s : stringReferences) {
             out.writeUTF(s);
         }
+
+        List<ImmutableValue> valueReferences = page.getValueReferences();
+        WritableUtils.writeVInt(out, valueReferences.size());
+        for (Value value : valueReferences) {
+            MessageBufferPacker packer = MessagePack.newDefaultBufferPacker();  // TODO reuse allocated buffer
+            value.writeTo(packer);
+            List<MessageBuffer> buffers = packer.toBufferList();
+            int size = 0;
+            for (MessageBuffer b : buffers) {
+                size += b.size();
+            }
+            WritableUtils.writeVInt(out, size);
+            for (MessageBuffer b : buffers) {
+                out.write(b.array(), b.arrayOffset(), b.size());
+            }
+        }
     }
 
     @Override
@@ -51,13 +74,31 @@ public class PageWritable
         Buffer buffer = Buffer.wrap(bytes);
 
         int stringCount = WritableUtils.readVInt(in);
-        List<String> strings = new ArrayList
+        List<String> strings = new ArrayList<>(stringCount);
         for (int i=0; i < stringCount; i++) {
             strings.add(in.readUTF());
         }
 
+        int valueCount = WritableUtils.readVInt(in);
+        List<ImmutableValue> values = new ArrayList<>(valueCount);
+        byte[] b = new byte[32 * 1024];
+        for (int i=0; i < valueCount; i++) {
+            int size = WritableUtils.readVInt(in);
+            if (b.length < size) {
+                int ns = b.length;
+                while (ns < size) {
+                    ns *= 2;
+                }
+                b = new byte[ns];
+            }
+            in.readFully(b, 0, size);
+            MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(b, 0, size);
+            values.add(unpacker.unpackValue());
+        }
+
         Page newPage = Page.wrap(buffer);
         newPage.setStringReferences(strings);
+        newPage.setValueReferences(values);
         if (page != null) {
             page.release();
         }
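
Taken together, the two hunks above serialize each Page's msgpack value references as a varint count followed by one length-prefixed msgpack blob per value, and readFields reverses this, growing a scratch buffer by doubling until it fits the largest blob. Below is a minimal self-contained round-trip of the same pack/unpack scheme, assuming msgpack-java 0.8.x on the classpath (the ValueRoundTrip class name and the sample map are illustrative, not part of the plugin):

import java.io.IOException;
import org.msgpack.core.MessageBufferPacker;
import org.msgpack.core.MessagePack;
import org.msgpack.core.MessageUnpacker;
import org.msgpack.value.ImmutableValue;
import org.msgpack.value.ValueFactory;

// Pack a Value to bytes, record the length, unpack it back, exactly as
// write()/readFields() do per value reference (minus the Hadoop framing).
public class ValueRoundTrip
{
    public static void main(String[] args) throws IOException
    {
        ImmutableValue original = ValueFactory.newMap(
                ValueFactory.newString("k0"), ValueFactory.newString("v0"));

        MessageBufferPacker packer = MessagePack.newDefaultBufferPacker();
        original.writeTo(packer);
        byte[] bytes = packer.toByteArray();  // bytes.length is what writeVInt records

        MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(bytes, 0, bytes.length);
        ImmutableValue restored = unpacker.unpackValue();

        System.out.println(original.equals(restored));  // prints: true
    }
}
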
data/src/test/java/org/embulk/executor/mapreduce/TestMapReduceExecutor.java
CHANGED
@@ -25,16 +25,19 @@ import org.slf4j.impl.Log4jLoggerFactory;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
+import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 import java.util.Random;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
@@ -62,36 +65,42 @@ public class TestMapReduceExecutor
         bootstrap.setSystemConfig(systemConfig);
         bootstrap.overrideModules(getModuleOverrides(systemConfig));
         embulk = bootstrap.initialize();
+
+        new File("tmp").mkdirs();
     }
 
     @Test
     public void testEmbulkMapper()
             throws Exception
     {
+        new File("tmp/embulk_mapred_output.000.00.csv").delete();
+        new File("tmp/embulk_mapred_output.001.00.csv").delete();
         ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_config.yml");
         embulk.run(config);
         assertFileContent(
                 Lists.newArrayList(
-                        "fixtures/csv/sample1.csv",
-                        "fixtures/csv/sample1.csv"),
+                        "src/test/resources/fixtures/csv/sample1.csv",
+                        "src/test/resources/fixtures/csv/sample1.csv"),
                 Lists.newArrayList(
-                        "
-                        "
+                        "tmp/embulk_mapred_output.000.00.csv",
+                        "tmp/embulk_mapred_output.001.00.csv"));
     }
 
     @Test
     public void testEmbulkPartitioningMapperReducer()
             throws Exception
     {
+        new File("tmp/embulk_mapred_partitioning_output.000.00.csv").delete();
+        new File("tmp/embulk_mapred_partitioning_output.001.00.csv").delete();
         ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_partitioning_config.yml");
         embulk.run(config);
         assertFileContent(
                 Lists.newArrayList(
-                        "fixtures/csv/sample1.csv",
-                        "fixtures/csv/sample1.csv"),
+                        "src/test/resources/fixtures/csv/sample1.csv",
+                        "src/test/resources/fixtures/csv/sample1.csv"),
                 Lists.newArrayList(
-                        "
-                        "
+                        "tmp/embulk_mapred_partitioning_output.000.00.csv",
+                        "tmp/embulk_mapred_partitioning_output.001.00.csv"));
     }
 
     @Test
@@ -104,7 +113,8 @@ public class TestMapReduceExecutor
             fail();
         }
         catch (Throwable t) {
-            assertTrue(t instanceof
+            assertTrue(t instanceof PartialExecutionException);
+            assertTrue(t.getCause() instanceof ConfigException);
         }
     }
 
@@ -118,7 +128,8 @@ public class TestMapReduceExecutor
             fail();
         }
         catch (Throwable t) {
-            assertTrue(t instanceof
+            assertTrue(t instanceof PartialExecutionException);
+            assertTrue(t.getCause() instanceof ConfigException);
         }
     }
 
@@ -132,7 +143,8 @@ public class TestMapReduceExecutor
             fail();
         }
         catch (Throwable t) {
-            assertTrue(t instanceof
+            assertTrue(t instanceof PartialExecutionException);
+            assertTrue(t.getCause() instanceof ConfigException);
        }
     }
 
@@ -146,7 +158,9 @@ public class TestMapReduceExecutor
             fail();
         }
         catch (Throwable t) {
-            assertTrue(t
+            assertTrue(t instanceof PartialExecutionException);
+            assertTrue(t.getCause() instanceof RuntimeException);
+            assertTrue(t.getCause().getCause() instanceof FileNotFoundException);
         }
     }
 
@@ -273,6 +287,7 @@ public class TestMapReduceExecutor
     }
 
     private static void assertFileContent(List<String> inputFiles, List<String> outputFiles)
+            throws IOException
     {
         List<List<String>> inputRecords = getRecords(inputFiles);
         Collections.sort(inputRecords, new RecordComparator());
@@ -294,6 +309,7 @@ public class TestMapReduceExecutor
     }
 
     private static List<List<String>> getRecords(List<String> files)
+            throws IOException
     {
         List<List<String>> records = new ArrayList<>();
 
@@ -327,8 +343,8 @@ public class TestMapReduceExecutor
     }
 
     private static BufferedReader newReader(String filePath)
+            throws IOException
     {
-
-        return new BufferedReader(new InputStreamReader(in));
+        return Files.newBufferedReader(new File(filePath).toPath(), UTF_8);
     }
 }
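
The newReader rewrite replaces the manual stream chain with java.nio.file.Files, which both declares IOException and pins the charset to UTF-8 instead of the platform default. A standalone sketch of the same call (the file path here is hypothetical):

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import static java.nio.charset.StandardCharsets.UTF_8;

// Open a UTF-8 reader via java.nio, as the updated newReader() does.
public class NewReaderSketch
{
    public static void main(String[] args) throws IOException
    {
        try (BufferedReader r = Files.newBufferedReader(new File("tmp/example.csv").toPath(), UTF_8)) {
            System.out.println(r.readLine());
        }
    }
}
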
data/src/test/resources/config/embulk_mapred_config.yml
CHANGED
@@ -19,8 +19,8 @@ in:
     newline: CRLF
     type: csv
     delimiter: ','
-    quote: ''
-    escape: ''
+    quote: '"'
+    escape: '"'
     skip_header_lines: 1
     columns:
     - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S"}
@@ -34,13 +34,16 @@ in:
     - {name: size, type: long}
     - {name: d, type: double}
     - {name: flag, type: boolean}
+    - {name: v_json, type: json}
 out:
   type: file
-  path_prefix: '
+  path_prefix: 'tmp/embulk_mapred_output.'
   file_ext: 'csv'
   formatter:
     charset: UTF-8
     newline: CRLF
+    quote: '"'
+    escape: '"'
     type: csv
     column_options:
       timestamp: {format: '%Y-%m-%d %H:%M:%S'}
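
Setting quote and escape both to '"' makes the parser and formatter follow the RFC 4180 convention of doubling quotes inside quoted fields, which the new v_json column relies on: a JSON cell such as "{""k0"":""v0""}" decodes to {"k0":"v0"}. An illustrative unquoting sketch (not Embulk's actual CSV parser; the class and method names are hypothetical):

// Minimal RFC 4180-style unquoting with quote and escape both '"'.
public class CsvUnquote
{
    static String unquote(String field)
    {
        if (field.length() >= 2 && field.startsWith("\"") && field.endsWith("\"")) {
            // strip the enclosing quotes, then collapse doubled quotes
            return field.substring(1, field.length() - 1).replace("\"\"", "\"");
        }
        return field;
    }

    public static void main(String[] args)
    {
        System.out.println(unquote("\"{\"\"k0\"\":\"\"v0\"\"}\""));  // prints: {"k0":"v0"}
    }
}
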
data/src/test/resources/config/embulk_mapred_partitioning_config.yml
CHANGED
@@ -25,8 +25,8 @@ in:
     newline: CRLF
     type: csv
     delimiter: ','
-    quote: ''
-    escape: ''
+    quote: '"'
+    escape: '"'
     skip_header_lines: 1
     columns:
     - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S"}
@@ -40,13 +40,16 @@ in:
     - {name: size, type: long}
     - {name: d, type: double}
     - {name: flag, type: boolean}
+    - {name: v_json, type: json}
 out:
   type: file
-  path_prefix: '
+  path_prefix: 'tmp/embulk_mapred_partitioning_output.'
   file_ext: 'csv'
   formatter:
     charset: UTF-8
     newline: CRLF
+    quote: '"'
+    escape: '"'
     type: csv
     column_options:
       timestamp: {format: '%Y-%m-%d %H:%M:%S'}
data/src/test/resources/fixtures/csv/sample1.csv
CHANGED
@@ -1,3 +1,3 @@
-timestamp,host,path,method,referer,code,agent,user,size,d,flag
-2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true
-2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false
+timestamp,host,path,method,referer,code,agent,user,size,d,flag,v_json
+2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true,"{""k0"":""v0"",""k1"":""v1""}"
+2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false,"[1,2,""3""]"
data/src/test/resources/fixtures/csv/sample2.csv
CHANGED
@@ -1,4 +1,3 @@
-timestamp,host,path,method,referer,code,agent,user,size,d,flag
-2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true
-2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false
-
+timestamp,host,path,method,referer,code,agent,user,size,d,flag,v_json
+2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true,"{""k0"":""v0"",""k1"":""v1""}"
+2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false,"[1,2,""3""]"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-executor-mapreduce
 version: !ruby/object:Gem::Version
-  version: 0.2.4
+  version: 0.2.5
 platform: ruby
 authors:
 - Sadayuki Furuhashi
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-02-09 00:00:00.000000000 Z
 dependencies: []
 description: Executes tasks on Hadoop.
 email:
@@ -84,7 +84,7 @@ files:
 - classpath/curator-client-2.6.0.jar
 - classpath/curator-framework-2.6.0.jar
 - classpath/curator-recipes-2.6.0.jar
-- classpath/embulk-executor-mapreduce-0.2.4.jar
+- classpath/embulk-executor-mapreduce-0.2.5.jar
 - classpath/gson-2.2.4.jar
 - classpath/hadoop-annotations-2.6.0.jar
 - classpath/hadoop-auth-2.6.0.jar