embulk-executor-mapreduce 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b2abda7db750f6c161ab8867474fdccfa67eb265
4
- data.tar.gz: 8cfc89242d0a57368b5803db9b55e6494b6916e6
3
+ metadata.gz: 4461eebeecc53f99b9b9683d7553a585a87e1a1f
4
+ data.tar.gz: a019cd9224918ae2721482a9cf92c9c8148a05a6
5
5
  SHA512:
6
- metadata.gz: 0e8b2f14207ec85d1ba60b531cc5876c90689a2b259f76a5edb57d8b50d025bd5e7151307be0381d82e813b5f263d5326e39cbccaa56238d8aa3f1171e8a21fd
7
- data.tar.gz: 645417db32cc29813fee20f2175a2858485596159fb80c31e77c1f45d1c715f40b239f7417ef0caae927f75428b8f93175eb477281d620fe38edae694e76f66a
6
+ metadata.gz: 91e107ce10160fc097930b139f07b59dcb80b1201dde0723cc302fd1e142a283ad8a817d7518ac7684f0b066ed55537c20ae0af5446230eb0f63026d9bf7e21d
7
+ data.tar.gz: bc045316fedf83de62e34bbf9304152680d90e46ec6fa885fc054aca50d9a967b2d8f8eb3fa86e7c685badba7e3f15834f985fee923c5bf54c9f5197f43a68fb
@@ -71,6 +71,15 @@ public class BufferedPagePartitioner
71
71
  }
72
72
  }
73
73
 
74
+ public void jsonColumn(Column column)
75
+ {
76
+ if (source.isNull(column)) {
77
+ destination.setNull(column);
78
+ } else {
79
+ destination.setJson(column, source.getJson(column));
80
+ }
81
+ }
82
+
74
83
  public void timestampColumn(Column column)
75
84
  {
76
85
  if (source.isNull(column)) {
@@ -254,6 +254,7 @@ public class EmbulkPartitioningMapReduce
254
254
  this.output = output;
255
255
  }
256
256
 
257
+ @Override
257
258
  public ConfigDiff transaction(ConfigSource config,
258
259
  Schema schema, int taskCount,
259
260
  OutputPlugin.Control control)
@@ -262,6 +263,7 @@ public class EmbulkPartitioningMapReduce
262
263
  throw new RuntimeException("");
263
264
  }
264
265
 
266
+ @Override
265
267
  public ConfigDiff resume(TaskSource taskSource,
266
268
  Schema schema, int taskCount,
267
269
  OutputPlugin.Control control)
@@ -270,6 +272,7 @@ public class EmbulkPartitioningMapReduce
270
272
  throw new RuntimeException("");
271
273
  }
272
274
 
275
+ @Override
273
276
  public void cleanup(TaskSource taskSource,
274
277
  Schema schema, int taskCount,
275
278
  List<TaskReport> successTaskReports)
@@ -278,6 +281,7 @@ public class EmbulkPartitioningMapReduce
278
281
  throw new RuntimeException("");
279
282
  }
280
283
 
284
+ @Override
281
285
  public TransactionalPageOutput open(TaskSource taskSource, final Schema schema, int taskIndex)
282
286
  {
283
287
  return new TransactionalPageOutput() {
@@ -7,6 +7,13 @@ import java.util.List;
7
7
  import java.util.ArrayList;
8
8
  import org.apache.hadoop.io.Writable;
9
9
  import org.apache.hadoop.io.WritableUtils;
10
+ import org.apache.hadoop.io.DataOutputOutputStream;
11
+ import org.msgpack.value.Value;
12
+ import org.msgpack.value.ImmutableValue;
13
+ import org.msgpack.core.MessagePack;
14
+ import org.msgpack.core.MessageBufferPacker;
15
+ import org.msgpack.core.MessageUnpacker;
16
+ import org.msgpack.core.buffer.MessageBuffer;
10
17
  import org.embulk.spi.Buffer;
11
18
  import org.embulk.spi.Page;
12
19
  import static java.nio.charset.StandardCharsets.UTF_8;
@@ -40,6 +47,22 @@ public class PageWritable
40
47
  for (String s : stringReferences) {
41
48
  out.writeUTF(s);
42
49
  }
50
+
51
+ List<ImmutableValue> valueReferences = page.getValueReferences();
52
+ WritableUtils.writeVInt(out, valueReferences.size());
53
+ for (Value value : valueReferences) {
54
+ MessageBufferPacker packer = MessagePack.newDefaultBufferPacker(); // TODO reuse allocated buffer
55
+ value.writeTo(packer);
56
+ List<MessageBuffer> buffers = packer.toBufferList();
57
+ int size = 0;
58
+ for (MessageBuffer b : buffers) {
59
+ size += b.size();
60
+ }
61
+ WritableUtils.writeVInt(out, size);
62
+ for (MessageBuffer b : buffers) {
63
+ out.write(b.array(), b.arrayOffset(), b.size());
64
+ }
65
+ }
43
66
  }
44
67
 
45
68
  @Override
@@ -51,13 +74,31 @@ public class PageWritable
51
74
  Buffer buffer = Buffer.wrap(bytes);
52
75
 
53
76
  int stringCount = WritableUtils.readVInt(in);
54
- List<String> strings = new ArrayList<String>(stringCount);
77
+ List<String> strings = new ArrayList<>(stringCount);
55
78
  for (int i=0; i < stringCount; i++) {
56
79
  strings.add(in.readUTF());
57
80
  }
58
81
 
82
+ int valueCount = WritableUtils.readVInt(in);
83
+ List<ImmutableValue> values = new ArrayList<>(valueCount);
84
+ byte[] b = new byte[32 * 1024];
85
+ for (int i=0; i < valueCount; i++) {
86
+ int size = WritableUtils.readVInt(in);
87
+ if (b.length < size) {
88
+ int ns = b.length;
89
+ while (ns < size) {
90
+ ns *= 2;
91
+ }
92
+ b = new byte[ns];
93
+ }
94
+ in.readFully(b, 0, size);
95
+ MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(b, 0, size);
96
+ values.add(unpacker.unpackValue());
97
+ }
98
+
59
99
  Page newPage = Page.wrap(buffer);
60
100
  newPage.setStringReferences(strings);
101
+ newPage.setValueReferences(values);
61
102
  if (page != null) {
62
103
  page.release();
63
104
  }
@@ -25,16 +25,19 @@ import org.slf4j.impl.Log4jLoggerFactory;
25
25
 
26
26
  import java.io.BufferedInputStream;
27
27
  import java.io.BufferedReader;
28
+ import java.io.File;
28
29
  import java.io.FileNotFoundException;
29
30
  import java.io.IOException;
30
31
  import java.io.InputStream;
31
32
  import java.io.InputStreamReader;
33
+ import java.nio.file.Files;
32
34
  import java.util.ArrayList;
33
35
  import java.util.Collections;
34
36
  import java.util.Comparator;
35
37
  import java.util.List;
36
38
  import java.util.Random;
37
39
 
40
+ import static java.nio.charset.StandardCharsets.UTF_8;
38
41
  import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
39
42
  import static org.junit.Assert.assertEquals;
40
43
  import static org.junit.Assert.assertTrue;
@@ -62,36 +65,42 @@ public class TestMapReduceExecutor
62
65
  bootstrap.setSystemConfig(systemConfig);
63
66
  bootstrap.overrideModules(getModuleOverrides(systemConfig));
64
67
  embulk = bootstrap.initialize();
68
+
69
+ new File("tmp").mkdirs();
65
70
  }
66
71
 
67
72
  @Test
68
73
  public void testEmbulkMapper()
69
74
  throws Exception
70
75
  {
76
+ new File("tmp/embulk_mapred_output.000.00.csv").delete();
77
+ new File("tmp/embulk_mapred_output.001.00.csv").delete();
71
78
  ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_config.yml");
72
79
  embulk.run(config);
73
80
  assertFileContent(
74
81
  Lists.newArrayList(
75
- "fixtures/csv/sample1.csv",
76
- "fixtures/csv/sample1.csv"),
82
+ "src/test/resources/fixtures/csv/sample1.csv",
83
+ "src/test/resources/fixtures/csv/sample1.csv"),
77
84
  Lists.newArrayList(
78
- "fixtures/csv/embulk_mapred_output.000.00.csv",
79
- "fixtures/csv/embulk_mapred_output.001.00.csv"));
85
+ "tmp/embulk_mapred_output.000.00.csv",
86
+ "tmp/embulk_mapred_output.001.00.csv"));
80
87
  }
81
88
 
82
89
  @Test
83
90
  public void testEmbulkPartitioningMapperReducer()
84
91
  throws Exception
85
92
  {
93
+ new File("tmp/embulk_mapred_partitioning_output.000.00.csv").delete();
94
+ new File("tmp/embulk_mapred_partitioning_output.001.00.csv").delete();
86
95
  ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_partitioning_config.yml");
87
96
  embulk.run(config);
88
97
  assertFileContent(
89
98
  Lists.newArrayList(
90
- "fixtures/csv/sample1.csv",
91
- "fixtures/csv/sample1.csv"),
99
+ "src/test/resources/fixtures/csv/sample1.csv",
100
+ "src/test/resources/fixtures/csv/sample1.csv"),
92
101
  Lists.newArrayList(
93
- "fixtures/csv/embulk_mapred_partitioning_output.000.00.csv",
94
- "fixtures/csv/embulk_mapred_partitioning_output.001.00.csv"));
102
+ "tmp/embulk_mapred_partitioning_output.000.00.csv",
103
+ "tmp/embulk_mapred_partitioning_output.001.00.csv"));
95
104
  }
96
105
 
97
106
  @Test
@@ -104,7 +113,8 @@ public class TestMapReduceExecutor
104
113
  fail();
105
114
  }
106
115
  catch (Throwable t) {
107
- assertTrue(t instanceof ConfigException);
116
+ assertTrue(t instanceof PartialExecutionException);
117
+ assertTrue(t.getCause() instanceof ConfigException);
108
118
  }
109
119
  }
110
120
 
@@ -118,7 +128,8 @@ public class TestMapReduceExecutor
118
128
  fail();
119
129
  }
120
130
  catch (Throwable t) {
121
- assertTrue(t instanceof ConfigException);
131
+ assertTrue(t instanceof PartialExecutionException);
132
+ assertTrue(t.getCause() instanceof ConfigException);
122
133
  }
123
134
  }
124
135
 
@@ -132,7 +143,8 @@ public class TestMapReduceExecutor
132
143
  fail();
133
144
  }
134
145
  catch (Throwable t) {
135
- assertTrue(t instanceof ConfigException);
146
+ assertTrue(t instanceof PartialExecutionException);
147
+ assertTrue(t.getCause() instanceof ConfigException);
136
148
  }
137
149
  }
138
150
 
@@ -146,7 +158,9 @@ public class TestMapReduceExecutor
146
158
  fail();
147
159
  }
148
160
  catch (Throwable t) {
149
- assertTrue(t.getCause() instanceof FileNotFoundException);
161
+ assertTrue(t instanceof PartialExecutionException);
162
+ assertTrue(t.getCause() instanceof RuntimeException);
163
+ assertTrue(t.getCause().getCause() instanceof FileNotFoundException);
150
164
  }
151
165
  }
152
166
 
@@ -273,6 +287,7 @@ public class TestMapReduceExecutor
273
287
  }
274
288
 
275
289
  private static void assertFileContent(List<String> inputFiles, List<String> outputFiles)
290
+ throws IOException
276
291
  {
277
292
  List<List<String>> inputRecords = getRecords(inputFiles);
278
293
  Collections.sort(inputRecords, new RecordComparator());
@@ -294,6 +309,7 @@ public class TestMapReduceExecutor
294
309
  }
295
310
 
296
311
  private static List<List<String>> getRecords(List<String> files)
312
+ throws IOException
297
313
  {
298
314
  List<List<String>> records = new ArrayList<>();
299
315
 
@@ -327,8 +343,8 @@ public class TestMapReduceExecutor
327
343
  }
328
344
 
329
345
  private static BufferedReader newReader(String filePath)
346
+ throws IOException
330
347
  {
331
- InputStream in = new BufferedInputStream(TestMapReduceExecutor.class.getClassLoader().getResourceAsStream(filePath));
332
- return new BufferedReader(new InputStreamReader(in));
348
+ return Files.newBufferedReader(new File(filePath).toPath(), UTF_8);
333
349
  }
334
350
  }
@@ -19,8 +19,8 @@ in:
19
19
  newline: CRLF
20
20
  type: csv
21
21
  delimiter: ','
22
- quote: ''
23
- escape: ''
22
+ quote: '"'
23
+ escape: '"'
24
24
  skip_header_lines: 1
25
25
  columns:
26
26
  - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S"}
@@ -34,13 +34,16 @@ in:
34
34
  - {name: size, type: long}
35
35
  - {name: d, type: double}
36
36
  - {name: flag, type: boolean}
37
+ - {name: v_json, type: json}
37
38
  out:
38
39
  type: file
39
- path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_output.'
40
+ path_prefix: 'tmp/embulk_mapred_output.'
40
41
  file_ext: 'csv'
41
42
  formatter:
42
43
  charset: UTF-8
43
44
  newline: CRLF
45
+ quote: '"'
46
+ escape: '"'
44
47
  type: csv
45
48
  column_options:
46
49
  timestamp: {format: '%Y-%m-%d %H:%M:%S'}
@@ -25,8 +25,8 @@ in:
25
25
  newline: CRLF
26
26
  type: csv
27
27
  delimiter: ','
28
- quote: ''
29
- escape: ''
28
+ quote: '"'
29
+ escape: '"'
30
30
  skip_header_lines: 1
31
31
  columns:
32
32
  - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S"}
@@ -40,13 +40,16 @@ in:
40
40
  - {name: size, type: long}
41
41
  - {name: d, type: double}
42
42
  - {name: flag, type: boolean}
43
+ - {name: v_json, type: json}
43
44
  out:
44
45
  type: file
45
- path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_partitioning_output.'
46
+ path_prefix: 'tmp/embulk_mapred_partitioning_output.'
46
47
  file_ext: 'csv'
47
48
  formatter:
48
49
  charset: UTF-8
49
50
  newline: CRLF
51
+ quote: '"'
52
+ escape: '"'
50
53
  type: csv
51
54
  column_options:
52
55
  timestamp: {format: '%Y-%m-%d %H:%M:%S'}
@@ -1,3 +1,3 @@
1
- timestamp,host,path,method,referer,code,agent,user,size,d,flag
2
- 2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true
3
- 2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false
1
+ timestamp,host,path,method,referer,code,agent,user,size,d,flag,v_json
2
+ 2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true,"{""k0"":""v0"",""k1"":""v1""}"
3
+ 2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false,"[1,2,""3""]"
@@ -1,4 +1,3 @@
1
- timestamp,host,path,method,referer,code,agent,user,size,d,flag
2
- 2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true
3
- 2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false
4
-
1
+ timestamp,host,path,method,referer,code,agent,user,size,d,flag,v_json
2
+ 2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true,"{""k0"":""v0"",""k1"":""v1""}"
3
+ 2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false,"[1,2,""3""]"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-executor-mapreduce
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-21 00:00:00.000000000 Z
11
+ date: 2016-02-09 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Executes tasks on Hadoop.
14
14
  email:
@@ -84,7 +84,7 @@ files:
84
84
  - classpath/curator-client-2.6.0.jar
85
85
  - classpath/curator-framework-2.6.0.jar
86
86
  - classpath/curator-recipes-2.6.0.jar
87
- - classpath/embulk-executor-mapreduce-0.2.4.jar
87
+ - classpath/embulk-executor-mapreduce-0.2.5.jar
88
88
  - classpath/gson-2.2.4.jar
89
89
  - classpath/hadoop-annotations-2.6.0.jar
90
90
  - classpath/hadoop-auth-2.6.0.jar