embulk-executor-mapreduce 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b2abda7db750f6c161ab8867474fdccfa67eb265
4
- data.tar.gz: 8cfc89242d0a57368b5803db9b55e6494b6916e6
3
+ metadata.gz: 4461eebeecc53f99b9b9683d7553a585a87e1a1f
4
+ data.tar.gz: a019cd9224918ae2721482a9cf92c9c8148a05a6
5
5
  SHA512:
6
- metadata.gz: 0e8b2f14207ec85d1ba60b531cc5876c90689a2b259f76a5edb57d8b50d025bd5e7151307be0381d82e813b5f263d5326e39cbccaa56238d8aa3f1171e8a21fd
7
- data.tar.gz: 645417db32cc29813fee20f2175a2858485596159fb80c31e77c1f45d1c715f40b239f7417ef0caae927f75428b8f93175eb477281d620fe38edae694e76f66a
6
+ metadata.gz: 91e107ce10160fc097930b139f07b59dcb80b1201dde0723cc302fd1e142a283ad8a817d7518ac7684f0b066ed55537c20ae0af5446230eb0f63026d9bf7e21d
7
+ data.tar.gz: bc045316fedf83de62e34bbf9304152680d90e46ec6fa885fc054aca50d9a967b2d8f8eb3fa86e7c685badba7e3f15834f985fee923c5bf54c9f5197f43a68fb
@@ -71,6 +71,15 @@ public class BufferedPagePartitioner
71
71
  }
72
72
  }
73
73
 
74
+ public void jsonColumn(Column column)
75
+ {
76
+ if (source.isNull(column)) {
77
+ destination.setNull(column);
78
+ } else {
79
+ destination.setJson(column, source.getJson(column));
80
+ }
81
+ }
82
+
74
83
  public void timestampColumn(Column column)
75
84
  {
76
85
  if (source.isNull(column)) {
@@ -254,6 +254,7 @@ public class EmbulkPartitioningMapReduce
254
254
  this.output = output;
255
255
  }
256
256
 
257
+ @Override
257
258
  public ConfigDiff transaction(ConfigSource config,
258
259
  Schema schema, int taskCount,
259
260
  OutputPlugin.Control control)
@@ -262,6 +263,7 @@ public class EmbulkPartitioningMapReduce
262
263
  throw new RuntimeException("");
263
264
  }
264
265
 
266
+ @Override
265
267
  public ConfigDiff resume(TaskSource taskSource,
266
268
  Schema schema, int taskCount,
267
269
  OutputPlugin.Control control)
@@ -270,6 +272,7 @@ public class EmbulkPartitioningMapReduce
270
272
  throw new RuntimeException("");
271
273
  }
272
274
 
275
+ @Override
273
276
  public void cleanup(TaskSource taskSource,
274
277
  Schema schema, int taskCount,
275
278
  List<TaskReport> successTaskReports)
@@ -278,6 +281,7 @@ public class EmbulkPartitioningMapReduce
278
281
  throw new RuntimeException("");
279
282
  }
280
283
 
284
+ @Override
281
285
  public TransactionalPageOutput open(TaskSource taskSource, final Schema schema, int taskIndex)
282
286
  {
283
287
  return new TransactionalPageOutput() {
@@ -7,6 +7,13 @@ import java.util.List;
7
7
  import java.util.ArrayList;
8
8
  import org.apache.hadoop.io.Writable;
9
9
  import org.apache.hadoop.io.WritableUtils;
10
+ import org.apache.hadoop.io.DataOutputOutputStream;
11
+ import org.msgpack.value.Value;
12
+ import org.msgpack.value.ImmutableValue;
13
+ import org.msgpack.core.MessagePack;
14
+ import org.msgpack.core.MessageBufferPacker;
15
+ import org.msgpack.core.MessageUnpacker;
16
+ import org.msgpack.core.buffer.MessageBuffer;
10
17
  import org.embulk.spi.Buffer;
11
18
  import org.embulk.spi.Page;
12
19
  import static java.nio.charset.StandardCharsets.UTF_8;
@@ -40,6 +47,22 @@ public class PageWritable
40
47
  for (String s : stringReferences) {
41
48
  out.writeUTF(s);
42
49
  }
50
+
51
+ List<ImmutableValue> valueReferences = page.getValueReferences();
52
+ WritableUtils.writeVInt(out, valueReferences.size());
53
+ for (Value value : valueReferences) {
54
+ MessageBufferPacker packer = MessagePack.newDefaultBufferPacker(); // TODO reuse allocated buffer
55
+ value.writeTo(packer);
56
+ List<MessageBuffer> buffers = packer.toBufferList();
57
+ int size = 0;
58
+ for (MessageBuffer b : buffers) {
59
+ size += b.size();
60
+ }
61
+ WritableUtils.writeVInt(out, size);
62
+ for (MessageBuffer b : buffers) {
63
+ out.write(b.array(), b.arrayOffset(), b.size());
64
+ }
65
+ }
43
66
  }
44
67
 
45
68
  @Override
@@ -51,13 +74,31 @@ public class PageWritable
51
74
  Buffer buffer = Buffer.wrap(bytes);
52
75
 
53
76
  int stringCount = WritableUtils.readVInt(in);
54
- List<String> strings = new ArrayList<String>(stringCount);
77
+ List<String> strings = new ArrayList<>(stringCount);
55
78
  for (int i=0; i < stringCount; i++) {
56
79
  strings.add(in.readUTF());
57
80
  }
58
81
 
82
+ int valueCount = WritableUtils.readVInt(in);
83
+ List<ImmutableValue> values = new ArrayList<>(valueCount);
84
+ byte[] b = new byte[32 * 1024];
85
+ for (int i=0; i < valueCount; i++) {
86
+ int size = WritableUtils.readVInt(in);
87
+ if (b.length < size) {
88
+ int ns = b.length;
89
+ while (ns < size) {
90
+ ns *= 2;
91
+ }
92
+ b = new byte[ns];
93
+ }
94
+ in.readFully(b, 0, size);
95
+ MessageUnpacker unpacker = MessagePack.newDefaultUnpacker(b, 0, size);
96
+ values.add(unpacker.unpackValue());
97
+ }
98
+
59
99
  Page newPage = Page.wrap(buffer);
60
100
  newPage.setStringReferences(strings);
101
+ newPage.setValueReferences(values);
61
102
  if (page != null) {
62
103
  page.release();
63
104
  }
@@ -25,16 +25,19 @@ import org.slf4j.impl.Log4jLoggerFactory;
25
25
 
26
26
  import java.io.BufferedInputStream;
27
27
  import java.io.BufferedReader;
28
+ import java.io.File;
28
29
  import java.io.FileNotFoundException;
29
30
  import java.io.IOException;
30
31
  import java.io.InputStream;
31
32
  import java.io.InputStreamReader;
33
+ import java.nio.file.Files;
32
34
  import java.util.ArrayList;
33
35
  import java.util.Collections;
34
36
  import java.util.Comparator;
35
37
  import java.util.List;
36
38
  import java.util.Random;
37
39
 
40
+ import static java.nio.charset.StandardCharsets.UTF_8;
38
41
  import static org.embulk.plugin.InjectedPluginSource.registerPluginTo;
39
42
  import static org.junit.Assert.assertEquals;
40
43
  import static org.junit.Assert.assertTrue;
@@ -62,36 +65,42 @@ public class TestMapReduceExecutor
62
65
  bootstrap.setSystemConfig(systemConfig);
63
66
  bootstrap.overrideModules(getModuleOverrides(systemConfig));
64
67
  embulk = bootstrap.initialize();
68
+
69
+ new File("tmp").mkdirs();
65
70
  }
66
71
 
67
72
  @Test
68
73
  public void testEmbulkMapper()
69
74
  throws Exception
70
75
  {
76
+ new File("tmp/embulk_mapred_output.000.00.csv").delete();
77
+ new File("tmp/embulk_mapred_output.001.00.csv").delete();
71
78
  ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_config.yml");
72
79
  embulk.run(config);
73
80
  assertFileContent(
74
81
  Lists.newArrayList(
75
- "fixtures/csv/sample1.csv",
76
- "fixtures/csv/sample1.csv"),
82
+ "src/test/resources/fixtures/csv/sample1.csv",
83
+ "src/test/resources/fixtures/csv/sample1.csv"),
77
84
  Lists.newArrayList(
78
- "fixtures/csv/embulk_mapred_output.000.00.csv",
79
- "fixtures/csv/embulk_mapred_output.001.00.csv"));
85
+ "tmp/embulk_mapred_output.000.00.csv",
86
+ "tmp/embulk_mapred_output.001.00.csv"));
80
87
  }
81
88
 
82
89
  @Test
83
90
  public void testEmbulkPartitioningMapperReducer()
84
91
  throws Exception
85
92
  {
93
+ new File("tmp/embulk_mapred_partitioning_output.000.00.csv").delete();
94
+ new File("tmp/embulk_mapred_partitioning_output.001.00.csv").delete();
86
95
  ConfigSource config = loadConfigSource(embulk.newConfigLoader(), "config/embulk_mapred_partitioning_config.yml");
87
96
  embulk.run(config);
88
97
  assertFileContent(
89
98
  Lists.newArrayList(
90
- "fixtures/csv/sample1.csv",
91
- "fixtures/csv/sample1.csv"),
99
+ "src/test/resources/fixtures/csv/sample1.csv",
100
+ "src/test/resources/fixtures/csv/sample1.csv"),
92
101
  Lists.newArrayList(
93
- "fixtures/csv/embulk_mapred_partitioning_output.000.00.csv",
94
- "fixtures/csv/embulk_mapred_partitioning_output.001.00.csv"));
102
+ "tmp/embulk_mapred_partitioning_output.000.00.csv",
103
+ "tmp/embulk_mapred_partitioning_output.001.00.csv"));
95
104
  }
96
105
 
97
106
  @Test
@@ -104,7 +113,8 @@ public class TestMapReduceExecutor
104
113
  fail();
105
114
  }
106
115
  catch (Throwable t) {
107
- assertTrue(t instanceof ConfigException);
116
+ assertTrue(t instanceof PartialExecutionException);
117
+ assertTrue(t.getCause() instanceof ConfigException);
108
118
  }
109
119
  }
110
120
 
@@ -118,7 +128,8 @@ public class TestMapReduceExecutor
118
128
  fail();
119
129
  }
120
130
  catch (Throwable t) {
121
- assertTrue(t instanceof ConfigException);
131
+ assertTrue(t instanceof PartialExecutionException);
132
+ assertTrue(t.getCause() instanceof ConfigException);
122
133
  }
123
134
  }
124
135
 
@@ -132,7 +143,8 @@ public class TestMapReduceExecutor
132
143
  fail();
133
144
  }
134
145
  catch (Throwable t) {
135
- assertTrue(t instanceof ConfigException);
146
+ assertTrue(t instanceof PartialExecutionException);
147
+ assertTrue(t.getCause() instanceof ConfigException);
136
148
  }
137
149
  }
138
150
 
@@ -146,7 +158,9 @@ public class TestMapReduceExecutor
146
158
  fail();
147
159
  }
148
160
  catch (Throwable t) {
149
- assertTrue(t.getCause() instanceof FileNotFoundException);
161
+ assertTrue(t instanceof PartialExecutionException);
162
+ assertTrue(t.getCause() instanceof RuntimeException);
163
+ assertTrue(t.getCause().getCause() instanceof FileNotFoundException);
150
164
  }
151
165
  }
152
166
 
@@ -273,6 +287,7 @@ public class TestMapReduceExecutor
273
287
  }
274
288
 
275
289
  private static void assertFileContent(List<String> inputFiles, List<String> outputFiles)
290
+ throws IOException
276
291
  {
277
292
  List<List<String>> inputRecords = getRecords(inputFiles);
278
293
  Collections.sort(inputRecords, new RecordComparator());
@@ -294,6 +309,7 @@ public class TestMapReduceExecutor
294
309
  }
295
310
 
296
311
  private static List<List<String>> getRecords(List<String> files)
312
+ throws IOException
297
313
  {
298
314
  List<List<String>> records = new ArrayList<>();
299
315
 
@@ -327,8 +343,8 @@ public class TestMapReduceExecutor
327
343
  }
328
344
 
329
345
  private static BufferedReader newReader(String filePath)
346
+ throws IOException
330
347
  {
331
- InputStream in = new BufferedInputStream(TestMapReduceExecutor.class.getClassLoader().getResourceAsStream(filePath));
332
- return new BufferedReader(new InputStreamReader(in));
348
+ return Files.newBufferedReader(new File(filePath).toPath(), UTF_8);
333
349
  }
334
350
  }
@@ -19,8 +19,8 @@ in:
19
19
  newline: CRLF
20
20
  type: csv
21
21
  delimiter: ','
22
- quote: ''
23
- escape: ''
22
+ quote: '"'
23
+ escape: '"'
24
24
  skip_header_lines: 1
25
25
  columns:
26
26
  - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S"}
@@ -34,13 +34,16 @@ in:
34
34
  - {name: size, type: long}
35
35
  - {name: d, type: double}
36
36
  - {name: flag, type: boolean}
37
+ - {name: v_json, type: json}
37
38
  out:
38
39
  type: file
39
- path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_output.'
40
+ path_prefix: 'tmp/embulk_mapred_output.'
40
41
  file_ext: 'csv'
41
42
  formatter:
42
43
  charset: UTF-8
43
44
  newline: CRLF
45
+ quote: '"'
46
+ escape: '"'
44
47
  type: csv
45
48
  column_options:
46
49
  timestamp: {format: '%Y-%m-%d %H:%M:%S'}
@@ -25,8 +25,8 @@ in:
25
25
  newline: CRLF
26
26
  type: csv
27
27
  delimiter: ','
28
- quote: ''
29
- escape: ''
28
+ quote: '"'
29
+ escape: '"'
30
30
  skip_header_lines: 1
31
31
  columns:
32
32
  - {name: timestamp, type: timestamp, format: "%Y-%m-%d %H:%M:%S"}
@@ -40,13 +40,16 @@ in:
40
40
  - {name: size, type: long}
41
41
  - {name: d, type: double}
42
42
  - {name: flag, type: boolean}
43
+ - {name: v_json, type: json}
43
44
  out:
44
45
  type: file
45
- path_prefix: 'src/test/resources/fixtures/csv/embulk_mapred_partitioning_output.'
46
+ path_prefix: 'tmp/embulk_mapred_partitioning_output.'
46
47
  file_ext: 'csv'
47
48
  formatter:
48
49
  charset: UTF-8
49
50
  newline: CRLF
51
+ quote: '"'
52
+ escape: '"'
50
53
  type: csv
51
54
  column_options:
52
55
  timestamp: {format: '%Y-%m-%d %H:%M:%S'}
@@ -1,3 +1,3 @@
1
- timestamp,host,path,method,referer,code,agent,user,size,d,flag
2
- 2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true
3
- 2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false
1
+ timestamp,host,path,method,referer,code,agent,user,size,d,flag,v_json
2
+ 2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true,"{""k0"":""v0"",""k1"":""v1""}"
3
+ 2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false,"[1,2,""3""]"
@@ -1,4 +1,3 @@
1
- timestamp,host,path,method,referer,code,agent,user,size,d,flag
2
- 2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true
3
- 2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false
4
-
1
+ timestamp,host,path,method,referer,code,agent,user,size,d,flag,v_json
2
+ 2014-10-02 22:15:39,84.186.29.187,/category/electronics,GET,/category/music,200,Mozilla/5.0,-,136,1.1,true,"{""k0"":""v0"",""k1"":""v1""}"
3
+ 2014-10-02 22:15:01,140.36.216.47,/category/music?from=10,GET,-,200,Mozilla/5.0,-,70,1.2,false,"[1,2,""3""]"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-executor-mapreduce
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sadayuki Furuhashi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-12-21 00:00:00.000000000 Z
11
+ date: 2016-02-09 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: Executes tasks on Hadoop.
14
14
  email:
@@ -84,7 +84,7 @@ files:
84
84
  - classpath/curator-client-2.6.0.jar
85
85
  - classpath/curator-framework-2.6.0.jar
86
86
  - classpath/curator-recipes-2.6.0.jar
87
- - classpath/embulk-executor-mapreduce-0.2.4.jar
87
+ - classpath/embulk-executor-mapreduce-0.2.5.jar
88
88
  - classpath/gson-2.2.4.jar
89
89
  - classpath/hadoop-annotations-2.6.0.jar
90
90
  - classpath/hadoop-auth-2.6.0.jar