embulk-executor-mapreduce 0.2.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/classpath/{embulk-executor-mapreduce-0.2.4.jar → embulk-executor-mapreduce-0.2.5.jar} +0 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +9 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +4 -0
- data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +42 -1
- data/src/test/java/org/embulk/executor/mapreduce/TestMapReduceExecutor.java +30 -14
- data/src/test/resources/config/embulk_mapred_config.yml +6 -3
- data/src/test/resources/config/embulk_mapred_partitioning_config.yml +6 -3
- data/src/test/resources/fixtures/csv/sample1.csv +3 -3
- data/src/test/resources/fixtures/csv/sample2.csv +3 -4
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4461eebeecc53f99b9b9683d7553a585a87e1a1f
|
4
|
+
data.tar.gz: a019cd9224918ae2721482a9cf92c9c8148a05a6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 91e107ce10160fc097930b139f07b59dcb80b1201dde0723cc302fd1e142a283ad8a817d7518ac7684f0b066ed55537c20ae0af5446230eb0f63026d9bf7e21d
|
7
|
+
data.tar.gz: bc045316fedf83de62e34bbf9304152680d90e46ec6fa885fc054aca50d9a967b2d8f8eb3fa86e7c685badba7e3f15834f985fee923c5bf54c9f5197f43a68fb
|
Binary file
|
@@ -71,6 +71,15 @@ public class BufferedPagePartitioner
|
|
71
71
|
}
|
72
72
|
}
|
73
73
|
|
74
|
+
public void jsonColumn(Column column)
|
75
|
+
{
|
76
|
+
if (source.isNull(column)) {
|
77
|
+
destination.setNull(column);
|
78
|
+
} else {
|
79
|
+
destination.setJson(column, source.getJson(column));
|
80
|
+
}
|
81
|
+
}
|
82
|
+
|
74
83
|
public void timestampColumn(Column column)
|
75
84
|
{
|
76
85
|
if (source.isNull(column)) {
|
@@ -254,6 +254,7 @@ public class EmbulkPartitioningMapReduce
|
|
254
254
|
this.output = output;
|
255
255
|
}
|
256
256
|
|
257
|
+
@Override
|
257
258
|
public ConfigDiff transaction(ConfigSource config,
|
258
259
|
Schema schema, int taskCount,
|
259
260
|
OutputPlugin.Control control)
|
@@ -262,6 +263,7 @@ public class EmbulkPartitioningMapReduce
|
|
262
263
|
throw new RuntimeException("");
|
263
264
|
}
|
264
265
|
|
266
|
+
@Override
|
265
267
|
public ConfigDiff resume(TaskSource taskSource,
|
266
268
|
Schema schema, int taskCount,
|
267
269
|
OutputPlugin.Control control)
|
@@ -270,6 +272,7 @@ public class EmbulkPartitioningMapReduce
|
|
270
272
|
throw new RuntimeException("");
|
271
273
|
}
|
272
274
|
|
275
|
+
@Override
|
273
276
|
public void cleanup(TaskSource taskSource,
|
274
277
|
Schema schema, int taskCount,
|
275
278
|
List<TaskReport> successTaskReports)
|
@@ -278,6 +281,7 @@ public class EmbulkPartitioningMapReduce
|
|
278
281
|
throw new RuntimeException("");
|
279
282
|
}
|
280
283
|
|
284
|
+
@Override
|
281
285
|
public TransactionalPageOutput open(TaskSource taskSource, final Schema schema, int taskIndex)
|
282
286
|
{
|
283
287
|
return new TransactionalPageOutput() {
|
@@ -7,6 +7,13 @@ import java.util.List;
|
|
7
7
|
import java.util.ArrayList;
|
8
8
|
import org.apache.hadoop.io.Writable;
|
9
9
|
import org.apache.hadoop.io.WritableUtils;
|
10
|
+
import org.apache.hadoop.io.DataOutputOutputStream;
|
11
|
+
import org.msgpack.value.Value;
|
12
|
+
import org.msgpack.value.ImmutableValue;
|
13
|
+
import org.msgpack.core.MessagePack;
|
14
|
+
import org.msgpack.core.MessageBufferPacker;
|
15
|
+
import org.msgpack.core.MessageUnpacker;
|
16
|
+
import org.msgpack.core.buffer.MessageBuffer;
|
10
17
|
import org.embulk.spi.Buffer;
|
11
18
|
import org.embulk.spi.Page;
|
12
19
|
import static java.nio.charset.StandardCharsets.UTF_8;
|
@@ -40,6 +47,22 @@ public class PageWritable
|
|
40
47
|
for (String s : stringReferences) {
|
41
48
|
out.writeUTF(s);
|
42
49
|
}
|
50
|
+
|
51
|
+
List<ImmutableValue> valueReferences = page.getValueReferences();
|
52
|
+
WritableUtils.writeVInt(out, valueReferences.size());
|
53
|
+
for (Value value : valueReferences) {
|
54
|
+
MessageBufferPacker packer = MessagePack.newDefaultBufferPacker(); // TODO reuse allocated buffer
|
55
|
+
value.writeTo(packer);
|
56
|
+
List<MessageBuffer> buffers = packer.toBufferList();
|
57
|
+
int size = 0;
|
58
|
+
for (MessageBuffer b : buffers) {
|
59
|
+
size += b.size();
|
60
|
+
}
|
61
|
+
WritableUtils.writeVInt(out, size);
|
62
|
+
for (MessageBuffer b : buffers) {
|
63
|
+
out.write(b.array(), b.arrayOffset(), b.size());
|
64
|
+
}
|
65
|
+
}
|
43
66
|
}
|
44
67
|
|
45
68
|
@Override
|
@@ -51,13 +74,31 @@ public class PageWritable
|
|
51
74
|
Buffer buffer = Buffer.wrap(bytes);
|
52
75
|
|
53
76
|
int stringCount = WritableUtils.readVInt(in);
|
54
|
-
List<String> strings = new ArrayList
|
77
|
+
List<String> strings = new ArrayList<>(stringCount);
|
55
78
|
for (int i=0; i < stringCount; i++) {
|
56
79
|
strings.add(in.readUTF());
|
57
80
|
}
|
58
81
|
|
82
|
+
int valueCount = WritableUtils.readVInt(in);
|
83
|
+
List<ImmutableValue> values = new ArrayList<>(valueCount);
|
84
|
+
byte[] b = new byte[32 * 1024];
|
85
|
+
for (int i=0; i < valueCount; i++) {
|
86
|
+
int size = WritableUtils.readVInt(in);
|
87
|
+
if (b.length < size) {
|
88
|
+
int ns = b.length;
|
89
|
+
while (ns < size) {
|
90
|
+
ns *= 2;
|
91
|
+
}
|
92
|
+
b = new byte[ns];
|
93
|
+
}
|
94
|
+
in.readFully(b, 0, size);
|
95
|
+
MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(b, 0, size);
|
96
|
+
values.add(unpacker.unpackValue());
|
97
|
+
}
|
98
|
+
|
59
99
|
Page newPage = Page.wrap(buffer);
|
60
100
|
newPage.setStringReferences(strings);
|
101
|
+
newPage.setValueReferences(values);
|
61
102
|
if (page != null) {
|
62
103
|
page.release();
|
63
104
|
}
|
@@ -25,16 +25,19 @@ import org.slf4j.impl.Log4jLoggerFactory;
|
|
25
25
|
|
26
26
|
import java.io.BufferedInputStream;
|
27
27
|
import java.io.BufferedReader;
|
28
|
+
import java.io.File;
|
28
29
|
import java.io.FileNotFoundException;
|
29
30
|
import java.io.IOException;
|
30
31
|
import java.io.InputStream;
|
31
32
|
import java.io.InputStreamReader;
|
33
|
+
import java.nio.file.Files;
|
32
34
|
import java.util.ArrayList;
|
33
35
|
import java.util.Collections;
|
34
36
|
import java.util.Comparator;
|
35
37
|
import java.util.List;
|
36
38
|
import java.util.Random;
|
37
39
|
|
40
|
+
import static java.nio.charset.StandardCharsets.UTF_8;
|
38
41
|
import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
|
39
42
|
import static org.junit.Assert.assertEquals;
|
40
43
|
import static org.junit.Assert.assertTrue;
|
@@ -62,36 +65,42 @@ public class TestMapReduceExecutor
|
|
62
65
|
bootstrap.setSystemConfig(systemConfig);
|
63
66
|
bootstrap.overrideModules(getModuleOverrides(systemConfig));
|
64
67
|
embulk = bootstrap.initialize();
|
68
|
+
|
69
|
+
new File("tmp").mkdirs();
|
65
70
|
}
|
66
71
|
|
67
72
|
@Test
|
68
73
|
public void testEmbulkMapper()
|
69
74
|
throws Exception
|
70
75
|
{
|
76
|
+
new File("tmp/embulk_mapred_output.000.00.csv").delete();
|
77
|
+
new File("tmp/embulk_mapred_output.001.00.csv").delete();
|
71
78
|
ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_config.yml");
|
72
79
|
embulk.run(config);
|
73
80
|
assertFileContent(
|
74
81
|
Lists.newArrayList(
|
75
|
-
"fixtures/csv/sample1.csv",
|
76
|
-
"fixtures/csv/sample1.csv"),
|
82
|
+
"src/test/resources/fixtures/csv/sample1.csv",
|
83
|
+
"src/test/resources/fixtures/csv/sample1.csv"),
|
77
84
|
Lists.newArrayList(
|
78
|
-
"
|
79
|
-
"
|
85
|
+
"tmp/embulk_mapred_output.000.00.csv",
|
86
|
+
"tmp/embulk_mapred_output.001.00.csv"));
|
80
87
|
}
|
81
88
|
|
82
89
|
@Test
|
83
90
|
public void testEmbulkPartitioningMapperReducer()
|
84
91
|
throws Exception
|
85
92
|
{
|
93
|
+
new File("tmp/embulk_mapred_partitioning_output.000.00.csv").delete();
|
94
|
+
new File("tmp/embulk_mapred_partitioning_output.001.00.csv").delete();
|
86
95
|
ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_partitioning_config.yml");
|
87
96
|
embulk.run(config);
|
88
97
|
assertFileContent(
|
89
98
|
Lists.newArrayList(
|
90
|
-
"fixtures/csv/sample1.csv",
|
91
|
-
"fixtures/csv/sample1.csv"),
|
99
|
+
"src/test/resources/fixtures/csv/sample1.csv",
|
100
|
+
"src/test/resources/fixtures/csv/sample1.csv"),
|
92
101
|
Lists.newArrayList(
|
93
|
-
"
|
94
|
-
"
|
102
|
+
"tmp/embulk_mapred_partitioning_output.000.00.csv",
|
103
|
+
"tmp/embulk_mapred_partitioning_output.001.00.csv"));
|
95
104
|
}
|
96
105
|
|
97
106
|
@Test
|
@@ -104,7 +113,8 @@ public class TestMapReduceExecutor
|
|
104
113
|
fail();
|
105
114
|
}
|
106
115
|
catch (Throwable t) {
|
107
|
-
assertTrue(t instanceof
|
116
|
+
assertTrue(t instanceof PartialExecutionException);
|
117
|
+
assertTrue(t.getCause() instanceof ConfigException);
|
108
118
|
}
|
109
119
|
}
|
110
120
|
|
@@ -118,7 +128,8 @@ public class TestMapReduceExecutor
|
|
118
128
|
fail();
|
119
129
|
}
|
120
130
|
catch (Throwable t) {
|
121
|
-
assertTrue(t instanceof
|
131
|
+
assertTrue(t instanceof PartialExecutionException);
|
132
|
+
assertTrue(t.getCause() instanceof ConfigException);
|
122
133
|
}
|
123
134
|
}
|
124
135
|
|
@@ -132,7 +143,8 @@ public class TestMapReduceExecutor
|
|
132
143
|
fail();
|
133
144
|
}
|
134
145
|
catch (Throwable t) {
|
135
|
-
assertTrue(t instanceof
|
146
|
+
assertTrue(t instanceof PartialExecutionException);
|
147
|
+
assertTrue(t.getCause() instanceof ConfigException);
|
136
148
|
}
|
137
149
|
}
|
138
150
|
|
@@ -146,7 +158,9 @@ public class TestMapReduceExecutor
|
|
146
158
|
fail();
|
147
159
|
}
|
148
160
|
catch (Throwable t) {
|
149
|
-
assertTrue(t
|
161
|
+
assertTrue(t instanceof PartialExecutionException);
|
162
|
+
assertTrue(t.getCause() instanceof RuntimeException);
|
163
|
+
assertTrue(t.getCause().getCause() instanceof FileNotFoundException);
|
150
164
|
}
|
151
165
|
}
|
152
166
|
|
@@ -273,6 +287,7 @@ public class TestMapReduceExecutor
|
|
273
287
|
}
|
274
288
|
|
275
289
|
private static void assertFileContent(List<String> inputFiles, List<String> outputFiles)
|
290
|
+
throws IOException
|
276
291
|
{
|
277
292
|
List<List<String>> inputRecords = getRecords(inputFiles);
|
278
293
|
Collections.sort(inputRecords, new RecordComparator());
|
@@ -294,6 +309,7 @@ public class TestMapReduceExecutor
|
|
294
309
|
}
|
295
310
|
|
296
311
|
private static List<List<String>> getRecords(List<String> files)
|
312
|
+
throws IOException
|
297
313
|
{
|
298
314
|
List<List<String>> records = new ArrayList<>();
|
299
315
|
|
@@ -327,8 +343,8 @@ public class TestMapReduceExecutor
|
|
327
343
|
}
|
328
344
|
|
329
345
|
private static BufferedReader newReader(String filePath)
|
346
|
+
throws IOException
|
330
347
|
{
|
331
|
-
|
332
|
-
return new BufferedReader(new InputStreamReader(in));
|
348
|
+
return Files.newBufferedReader(new File(filePath).toPath(), UTF_8);
|
333
349
|
}
|
334
350
|
}
|
@@ -19,8 +19,8 @@ in:
|
|
19
19
|
newline: CRLF
|
20
20
|
type: csv
|
21
21
|
delimiter: ','
|
22
|
-
quote: ''
|
23
|
-
escape: ''
|
22
|
+
quote: '"'
|
23
|
+
escape: '"'
|
24
24
|
skip_header_lines: 1
|
25
25
|
columns:
|
26
26
|
- {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S"}
|
@@ -34,13 +34,16 @@ in:
|
|
34
34
|
- {name: size, type: long}
|
35
35
|
- {name: d, type: double}
|
36
36
|
- {name: flag, type: boolean}
|
37
|
+
- {name: v_json, type: json}
|
37
38
|
out:
|
38
39
|
type: file
|
39
|
-
path_prefix: '
|
40
|
+
path_prefix: 'tmp/embulk_mapred_output.'
|
40
41
|
file_ext: 'csv'
|
41
42
|
formatter:
|
42
43
|
charset: UTF-8
|
43
44
|
newline: CRLF
|
45
|
+
quote: '"'
|
46
|
+
escape: '"'
|
44
47
|
type: csv
|
45
48
|
column_options:
|
46
49
|
timestamp: {format: '%Y-%m-%d %H:%M:%S'}
|
@@ -25,8 +25,8 @@ in:
|
|
25
25
|
newline: CRLF
|
26
26
|
type: csv
|
27
27
|
delimiter: ','
|
28
|
-
quote: ''
|
29
|
-
escape: ''
|
28
|
+
quote: '"'
|
29
|
+
escape: '"'
|
30
30
|
skip_header_lines: 1
|
31
31
|
columns:
|
32
32
|
- {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S"}
|
@@ -40,13 +40,16 @@ in:
|
|
40
40
|
- {name: size, type: long}
|
41
41
|
- {name: d, type: double}
|
42
42
|
- {name: flag, type: boolean}
|
43
|
+
- {name: v_json, type: json}
|
43
44
|
out:
|
44
45
|
type: file
|
45
|
-
path_prefix: '
|
46
|
+
path_prefix: 'tmp/embulk_mapred_partitioning_output.'
|
46
47
|
file_ext: 'csv'
|
47
48
|
formatter:
|
48
49
|
charset: UTF-8
|
49
50
|
newline: CRLF
|
51
|
+
quote: '"'
|
52
|
+
escape: '"'
|
50
53
|
type: csv
|
51
54
|
column_options:
|
52
55
|
timestamp: {format: '%Y-%m-%d %H:%M:%S'}
|
@@ -1,3 +1,3 @@
|
|
1
|
-
timestamp,host,path,method,referer,code,agent,user,size,d,flag
|
2
|
-
2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true
|
3
|
-
2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false
|
1
|
+
timestamp,host,path,method,referer,code,agent,user,size,d,flag,v_json
|
2
|
+
2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true,"{""k0"":""v0"",""k1"":""v1""}"
|
3
|
+
2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false,"[1,2,""3""]"
|
@@ -1,4 +1,3 @@
|
|
1
|
-
timestamp,host,path,method,referer,code,agent,user,size,d,flag
|
2
|
-
2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true
|
3
|
-
2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false
|
4
|
-
|
1
|
+
timestamp,host,path,method,referer,code,agent,user,size,d,flag,v_json
|
2
|
+
2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true,"{""k0"":""v0"",""k1"":""v1""}"
|
3
|
+
2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false,"[1,2,""3""]"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-executor-mapreduce
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sadayuki Furuhashi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2016-02-09 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: Executes tasks on Hadoop.
|
14
14
|
email:
|
@@ -84,7 +84,7 @@ files:
|
|
84
84
|
- classpath/curator-client-2.6.0.jar
|
85
85
|
- classpath/curator-framework-2.6.0.jar
|
86
86
|
- classpath/curator-recipes-2.6.0.jar
|
87
|
-
- classpath/embulk-executor-mapreduce-0.2.4.jar
|
87
|
+
- classpath/embulk-executor-mapreduce-0.2.5.jar
|
88
88
|
- classpath/gson-2.2.4.jar
|
89
89
|
- classpath/hadoop-annotations-2.6.0.jar
|
90
90
|
- classpath/hadoop-auth-2.6.0.jar
|