embulk-executor-mapreduce 0.1.0

Files changed (89)
  1. checksums.yaml +7 -0
  2. data/build.gradle +2 -0
  3. data/classpath/activation-1.1.jar +0 -0
  4. data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
  5. data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
  6. data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
  7. data/classpath/api-util-1.0.0-M20.jar +0 -0
  8. data/classpath/avro-1.7.4.jar +0 -0
  9. data/classpath/commons-beanutils-1.7.0.jar +0 -0
  10. data/classpath/commons-cli-1.2.jar +0 -0
  11. data/classpath/commons-codec-1.6.jar +0 -0
  12. data/classpath/commons-collections-3.2.1.jar +0 -0
  13. data/classpath/commons-compress-1.4.1.jar +0 -0
  14. data/classpath/commons-configuration-1.6.jar +0 -0
  15. data/classpath/commons-digester-1.8.jar +0 -0
  16. data/classpath/commons-httpclient-3.1.jar +0 -0
  17. data/classpath/commons-io-2.4.jar +0 -0
  18. data/classpath/commons-lang-2.6.jar +0 -0
  19. data/classpath/commons-logging-1.1.3.jar +0 -0
  20. data/classpath/commons-math3-3.1.1.jar +0 -0
  21. data/classpath/commons-net-3.1.jar +0 -0
  22. data/classpath/curator-client-2.6.0.jar +0 -0
  23. data/classpath/curator-framework-2.6.0.jar +0 -0
  24. data/classpath/curator-recipes-2.6.0.jar +0 -0
  25. data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
  26. data/classpath/gson-2.2.4.jar +0 -0
  27. data/classpath/hadoop-annotations-2.6.0.jar +0 -0
  28. data/classpath/hadoop-auth-2.6.0.jar +0 -0
  29. data/classpath/hadoop-client-2.6.0.jar +0 -0
  30. data/classpath/hadoop-common-2.6.0.jar +0 -0
  31. data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
  32. data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
  33. data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
  34. data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
  35. data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
  36. data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
  37. data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
  38. data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
  39. data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
  40. data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
  41. data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
  42. data/classpath/htrace-core-3.0.4.jar +0 -0
  43. data/classpath/httpclient-4.2.5.jar +0 -0
  44. data/classpath/httpcore-4.2.4.jar +0 -0
  45. data/classpath/jackson-core-asl-1.9.13.jar +0 -0
  46. data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
  47. data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
  48. data/classpath/jackson-xc-1.9.13.jar +0 -0
  49. data/classpath/jaxb-api-2.2.2.jar +0 -0
  50. data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
  51. data/classpath/jersey-client-1.9.jar +0 -0
  52. data/classpath/jersey-core-1.9.jar +0 -0
  53. data/classpath/jersey-guice-1.9.jar +0 -0
  54. data/classpath/jersey-json-1.9.jar +0 -0
  55. data/classpath/jersey-server-1.9.jar +0 -0
  56. data/classpath/jettison-1.1.jar +0 -0
  57. data/classpath/jetty-util-6.1.26.jar +0 -0
  58. data/classpath/jline-0.9.94.jar +0 -0
  59. data/classpath/jsr305-1.3.9.jar +0 -0
  60. data/classpath/leveldbjni-all-1.8.jar +0 -0
  61. data/classpath/netty-3.7.0.Final.jar +0 -0
  62. data/classpath/paranamer-2.3.jar +0 -0
  63. data/classpath/protobuf-java-2.5.0.jar +0 -0
  64. data/classpath/servlet-api-2.5.jar +0 -0
  65. data/classpath/snappy-java-1.0.4.1.jar +0 -0
  66. data/classpath/stax-api-1.0-2.jar +0 -0
  67. data/classpath/xmlenc-0.52.jar +0 -0
  68. data/classpath/xz-1.0.jar +0 -0
  69. data/classpath/zookeeper-3.4.6.jar +0 -0
  70. data/lib/embulk/executor/mapreduce.rb +3 -0
  71. data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
  72. data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
  73. data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
  74. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
  75. data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
  76. data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
  77. data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
  78. data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
  79. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
  80. data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
  81. data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
  82. data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
  83. data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
  84. data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
  85. data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
  86. data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
  87. data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
  88. data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
  89. metadata +131 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 44794b114e21e1c5ced89169aa74b7d86a882d4d
+   data.tar.gz: 44ce570ea8b23c6d0c430598057022a1c36e73b2
+ SHA512:
+   metadata.gz: cc9a2d3da2e8b89c0fb630652209897efcbf1f4ef7aebc71f82c6f3913bb6cb812bd2b583a84d7cac98b6dfd50d8380c92beac0a8406b0d69497400c3b40ee6d
+   data.tar.gz: 1f81b7fabaae57386f33c924bfd797e9fbe5f684e2a14cf87ccf68c2bc231a22b7aa5844f1ffeac9bee287fa082b34aaa45dbe6711445273ddf5635f0013a766
data/build.gradle ADDED
@@ -0,0 +1,2 @@
+ dependencies {
+ }
data/classpath/*.jar ADDED (binary files, contents not shown)
data/lib/embulk/executor/mapreduce.rb ADDED
@@ -0,0 +1,3 @@
+ Embulk::JavaPlugin.register_executor(
+   :mapreduce, "org.embulk.executor.mapreduce.MapReduceExecutor",
+   File.expand_path('../../../../classpath', __FILE__))
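For context, a minimal Embulk configuration that selects this executor once the gem is installed might look like the following. Only the executor name, mapreduce, comes from the registration above; the in/out sections are placeholder assumptions:

# Hypothetical config.yml; exec.type matches the :mapreduce registration.
exec:
  type: mapreduce
in:
  type: file
  path_prefix: /tmp/embulk-input/sample_
  parser:
    type: csv
out:
  type: stdout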
data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java ADDED
@@ -0,0 +1,154 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.util.Scanner;
+ import java.io.InputStream;
+ import java.io.OutputStream;
+ import java.io.ByteArrayOutputStream;
+ import java.io.PrintStream;
+ import java.io.IOException;
+ import java.io.EOFException;
+ import java.io.UnsupportedEncodingException;
+ import java.nio.charset.StandardCharsets;
+ import com.google.common.base.Optional;
+ import com.fasterxml.jackson.annotation.JsonCreator;
+ import com.fasterxml.jackson.annotation.JsonProperty;
+ import com.fasterxml.jackson.annotation.JsonIgnore;
+ import com.fasterxml.jackson.annotation.JsonValue;
+ import org.apache.hadoop.mapreduce.TaskAttemptID;
+ import org.embulk.config.ModelManager;
+ import org.embulk.config.CommitReport;
+
+ public class AttemptState
+ {
+     private final TaskAttemptID attemptId;
+     private final Optional<Integer> inputTaskIndex;
+     private final Optional<Integer> outputTaskIndex;
+     private Optional<String> exception;
+     private Optional<CommitReport> inputCommitReport;
+     private Optional<CommitReport> outputCommitReport;
+
+     public AttemptState(TaskAttemptID attemptId, Optional<Integer> inputTaskIndex, Optional<Integer> outputTaskIndex)
+     {
+         this.attemptId = attemptId;
+         this.inputTaskIndex = inputTaskIndex;
+         this.outputTaskIndex = outputTaskIndex;
+     }
+
+     @JsonCreator
+     AttemptState(
+             @JsonProperty("attempt") String attemptId,
+             @JsonProperty("inputTaskIndex") Optional<Integer> inputTaskIndex,
+             @JsonProperty("outputTaskIndex") Optional<Integer> outputTaskIndex,
+             @JsonProperty("exception") Optional<String> exception,
+             @JsonProperty("inputCommitReport") Optional<CommitReport> inputCommitReport,
+             @JsonProperty("outputCommitReport") Optional<CommitReport> outputCommitReport)
+     {
+         this(TaskAttemptID.forName(attemptId),
+                 inputTaskIndex, outputTaskIndex, exception,
+                 inputCommitReport, outputCommitReport);
+     }
+
+     public AttemptState(
+             TaskAttemptID attemptId,
+             Optional<Integer> inputTaskIndex,
+             Optional<Integer> outputTaskIndex,
+             Optional<String> exception,
+             Optional<CommitReport> inputCommitReport,
+             Optional<CommitReport> outputCommitReport)
+     {
+         this.attemptId = attemptId;
+         this.inputTaskIndex = inputTaskIndex;
+         this.outputTaskIndex = outputTaskIndex;
+         this.exception = exception;
+         this.inputCommitReport = inputCommitReport;
+         this.outputCommitReport = outputCommitReport;
+     }
+
+     @JsonIgnore
+     public TaskAttemptID getAttemptId()
+     {
+         return attemptId;
+     }
+
+     @JsonProperty("attempt")
+     public String getAttemptIdString()
+     {
+         return attemptId.toString();
+     }
+
+     @JsonProperty("inputTaskIndex")
+     public Optional<Integer> getInputTaskIndex()
+     {
+         return inputTaskIndex;
+     }
+
+     @JsonProperty("outputTaskIndex")
+     public Optional<Integer> getOutputTaskIndex()
+     {
+         return outputTaskIndex;
+     }
+
+     @JsonIgnore
+     public void setException(Throwable exception)
+     {
+         ByteArrayOutputStream os = new ByteArrayOutputStream();
+         try (PrintStream ps = new PrintStream(os, false, "UTF-8")) {
+             exception.printStackTrace(ps);
+         } catch (UnsupportedEncodingException ex) {
+             throw new RuntimeException(ex);
+         }
+         setException(new String(os.toByteArray(), StandardCharsets.UTF_8));
+     }
+
+     @JsonIgnore
+     public void setException(String exception)
+     {
+         this.exception = Optional.of(exception);
+     }
+
+     @JsonProperty("exception")
+     public Optional<String> getException()
+     {
+         return exception;
+     }
+
+     @JsonProperty("inputCommitReport")
+     public Optional<CommitReport> getInputCommitReport()
+     {
+         return inputCommitReport;
+     }
+
+     @JsonProperty("outputCommitReport")
+     public Optional<CommitReport> getOutputCommitReport()
+     {
+         return outputCommitReport;
+     }
+
+     @JsonIgnore
+     public void setInputCommitReport(CommitReport inputCommitReport)
+     {
+         this.inputCommitReport = Optional.of(inputCommitReport);
+     }
+
+     @JsonIgnore
+     public void setOutputCommitReport(CommitReport outputCommitReport)
+     {
+         this.outputCommitReport = Optional.of(outputCommitReport);
+     }
+
+     public void writeTo(OutputStream out, ModelManager modelManager) throws IOException
+     {
+         String s = modelManager.writeObject(this);
+         out.write(s.getBytes(StandardCharsets.UTF_8));
+     }
+
+     public static AttemptState readFrom(InputStream in, ModelManager modelManager) throws IOException
+     {
+         Scanner s = new Scanner(in, "UTF-8").useDelimiter("\\A"); // TODO
+         if (s.hasNext()) {
+             return modelManager.readObject(AttemptState.class, s.next());
+         } else {
+             throw new EOFException("attempt state file does not contain a JSON document");
+         }
+     }
+ }
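AttemptState identifies an attempt by the string form of Hadoop's TaskAttemptID (the "attempt" JSON property), relying on forName()/toString() to round-trip it. A minimal sketch of that round trip, using the canonical attempt-ID format from the Hadoop javadoc; the class name is illustrative:

import org.apache.hadoop.mapreduce.TaskAttemptID;

public class AttemptIdRoundTrip
{
    public static void main(String[] args)
    {
        // forName() parses the canonical string form and toString() reproduces
        // it, which is what the @JsonCreator constructor and the "attempt"
        // getter above rely on.
        TaskAttemptID id = TaskAttemptID.forName("attempt_200707121733_0003_m_000005_0");
        System.out.println(id);             // attempt_200707121733_0003_m_000005_0
        System.out.println(id.getTaskID()); // task_200707121733_0003_m_000005
    }
}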
data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java ADDED
@@ -0,0 +1,74 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.io.IOException;
+ import java.io.DataOutput;
+ import java.io.DataInput;
+ import org.apache.hadoop.io.WritableComparable;
+ import org.apache.hadoop.io.WritableUtils;
+ import org.apache.hadoop.io.WritableComparator;
+ import org.embulk.spi.Buffer;
+
+ public class BufferWritable
+         implements WritableComparable<BufferWritable>
+ {
+     private Buffer buffer;
+
+     public BufferWritable() { }
+
+     public void set(Buffer buffer)
+     {
+         this.buffer = buffer;
+     }
+
+     public Buffer get()
+     {
+         return buffer;
+     }
+
+     @Override
+     public void write(DataOutput out) throws IOException
+     {
+         WritableUtils.writeVInt(out, buffer.limit());
+         out.write(buffer.array(), buffer.offset(), buffer.limit());
+     }
+
+     @Override
+     public void readFields(DataInput in) throws IOException
+     {
+         int size = WritableUtils.readVInt(in);
+         byte[] bytes = new byte[size]; // TODO use a buffer allocator?
+         in.readFully(bytes, 0, size);
+         Buffer newBuffer = Buffer.wrap(bytes);
+         if (buffer != null) {
+             buffer.release();
+         }
+         buffer = newBuffer;
+     }
+
+     @Override
+     public int compareTo(BufferWritable o)
+     {
+         return WritableComparator.compareBytes(
+                 buffer.array(), buffer.offset(), buffer.limit(),
+                 o.buffer.array(), o.buffer.offset(), o.buffer.limit());
+     }
+
+     @Override
+     public boolean equals(Object other)
+     {
+         if (!(other instanceof BufferWritable)) {
+             return false;
+         }
+         BufferWritable o = (BufferWritable) other;
+         if (buffer == null) {
+             return o.buffer == null;
+         }
+         return buffer.equals(o.buffer);
+     }
+
+     @Override
+     public int hashCode()
+     {
+         return buffer.hashCode();
+     }
+ }
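The wire format above is a vint length header followed by the buffer's raw bytes. A minimal round-trip sketch through Hadoop's DataOutput/DataInput; the class name and payload are illustrative:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.embulk.spi.Buffer;
import org.embulk.executor.mapreduce.BufferWritable;

public class BufferWritableRoundTrip
{
    public static void main(String[] args) throws IOException
    {
        // Serialize: write() emits a vint length header followed by the bytes.
        BufferWritable out = new BufferWritable();
        out.set(Buffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        out.write(new DataOutputStream(sink));

        // Deserialize into a fresh instance; the contents compare equal.
        BufferWritable in = new BufferWritable();
        in.readFields(new DataInputStream(new ByteArrayInputStream(sink.toByteArray())));
        System.out.println(out.compareTo(in) == 0); // true
    }
}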
data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java ADDED
@@ -0,0 +1,158 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.util.Map;
+ import java.util.HashMap;
+ import java.util.Iterator;
+ import org.embulk.spi.Page;
+ import org.embulk.spi.PageOutput;
+ import org.embulk.spi.PageBuilder;
+ import org.embulk.spi.PageReader;
+ import org.embulk.spi.Schema;
+ import org.embulk.spi.Column;
+ import org.embulk.spi.ColumnVisitor;
+ import org.embulk.spi.BufferAllocator;
+
+ public class BufferedPagePartitioner
+ {
+     public static interface PartitionedPageOutput
+     {
+         public void add(PartitionKey key, Page value);
+
+         public void finish();
+
+         public void close();
+     }
+
+     private static class ForwardRecordColumnVisitor
+             implements ColumnVisitor
+     {
+         private final PageReader source;
+         private final PageBuilder destination;
+
+         public ForwardRecordColumnVisitor(PageReader source, PageBuilder destination)
+         {
+             this.source = source;
+             this.destination = destination;
+         }
+
+         public void booleanColumn(Column column)
+         {
+             if (source.isNull(column)) {
+                 destination.setNull(column);
+             } else {
+                 destination.setBoolean(column, source.getBoolean(column));
+             }
+         }
+
+         public void longColumn(Column column)
+         {
+             if (source.isNull(column)) {
+                 destination.setNull(column);
+             } else {
+                 destination.setLong(column, source.getLong(column));
+             }
+         }
+
+         public void doubleColumn(Column column)
+         {
+             if (source.isNull(column)) {
+                 destination.setNull(column);
+             } else {
+                 destination.setDouble(column, source.getDouble(column));
+             }
+         }
+
+         public void stringColumn(Column column)
+         {
+             if (source.isNull(column)) {
+                 destination.setNull(column);
+             } else {
+                 destination.setString(column, source.getString(column));
+             }
+         }
+
+         public void timestampColumn(Column column)
+         {
+             if (source.isNull(column)) {
+                 destination.setNull(column);
+             } else {
+                 destination.setTimestamp(column, source.getTimestamp(column));
+             }
+         }
+     }
+
+     private final BufferAllocator bufferAllocator;
+     private final Schema schema;
+     private final Partitioner partitioner;
+     private final int maxPageBufferCount;
+     private final PartitionedPageOutput output;
+
+     private final Map<PartitionKey, PageBuilder> hash = new HashMap<PartitionKey, PageBuilder>();
+
+     public BufferedPagePartitioner(BufferAllocator bufferAllocator, Schema schema,
+             Partitioner partitioner, int maxPageBufferCount, PartitionedPageOutput output)
+     {
+         this.bufferAllocator = bufferAllocator;
+         this.schema = schema;
+         this.partitioner = partitioner;
+         this.maxPageBufferCount = maxPageBufferCount;
+         this.output = output;
+     }
+
+     public void add(PageReader record)
+     {
+         PartitionKey searchKey = partitioner.updateKey(record);
+         PageBuilder builder = hash.get(searchKey);
+         if (builder == null) {
+             if (hash.size() >= maxPageBufferCount) {
+                 try (PageBuilder b = removeMostUnused(hash)) {
+                     b.finish();
+                 }
+             }
+             final PartitionKey key = searchKey.clone();
+             builder = new PageBuilder(bufferAllocator, schema, new PageOutput() {
+                 public void add(Page page)
+                 {
+                     output.add(key, page);
+                 }
+
+                 public void finish()
+                 { }
+
+                 public void close()
+                 { }
+             });
+             hash.put(key, builder);
+         }
+         builder.getSchema().visitColumns(new ForwardRecordColumnVisitor(record, builder));
+         builder.addRecord();
+     }
+
+     private PageBuilder removeMostUnused(Map<PartitionKey, PageBuilder> hash)
+     {
+         // TODO remove the largest buffer
+         Iterator<Map.Entry<PartitionKey, PageBuilder>> ite = hash.entrySet().iterator();
+         PageBuilder builder = ite.next().getValue();
+         ite.remove();
+         return builder;
+     }
+
+     public void finish()
+     {
+         for (PageBuilder builder : hash.values()) {
+             builder.finish();
+         }
+         output.finish();
+     }
+
+     public void close()
+     {
+         Iterator<Map.Entry<PartitionKey, PageBuilder>> ite = hash.entrySet().iterator();
+         while (ite.hasNext()) {
+             PageBuilder builder = ite.next().getValue();
+             builder.close();
+             ite.remove();
+         }
+         output.close();
+     }
+ }
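BufferedPagePartitioner keeps one PageBuilder per partition key and, once maxPageBufferCount builders exist, flushes an arbitrary one to make room (the TODO suggests evicting the largest instead). A minimal sink sketch for the PartitionedPageOutput interface; the counting behavior and the ownership assumption are illustrative, not from this gem:

import org.embulk.spi.Page;
import org.embulk.executor.mapreduce.PartitionKey;
import org.embulk.executor.mapreduce.BufferedPagePartitioner.PartitionedPageOutput;

// Hypothetical sink: counts flushed pages. Releasing the page here assumes
// the sink takes ownership of it once add() returns.
public class CountingPageOutput implements PartitionedPageOutput
{
    private long pages = 0;

    @Override
    public void add(PartitionKey key, Page value)
    {
        pages++;
        value.release();
    }

    @Override
    public void finish()
    {
        System.out.println("flushed " + pages + " pages");
    }

    @Override
    public void close()
    { }
}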
data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java ADDED
@@ -0,0 +1,37 @@
+ package org.embulk.executor.mapreduce;
+
+ import java.util.List;
+ import java.io.IOException;
+ import com.google.common.collect.ImmutableList;
+ import org.apache.hadoop.io.IntWritable;
+ import org.apache.hadoop.io.NullWritable;
+ import org.apache.hadoop.mapreduce.InputFormat;
+ import org.apache.hadoop.mapreduce.InputSplit;
+ import org.apache.hadoop.mapreduce.TaskAttemptContext;
+ import org.apache.hadoop.mapreduce.RecordReader;
+ import org.apache.hadoop.mapreduce.JobContext;
+
+ public class EmbulkInputFormat
+         extends InputFormat<IntWritable, NullWritable>
+ {
+     @Override
+     public List<InputSplit> getSplits(JobContext context)
+             throws IOException, InterruptedException
+     {
+         // TODO combining multiple tasks into one mapper is not implemented yet.
+         int taskCount = EmbulkMapReduce.getMapTaskCount(context.getConfiguration());
+         ImmutableList.Builder<InputSplit> builder = ImmutableList.builder();
+         for (int i = 0; i < taskCount; i++) {
+             builder.add(new EmbulkInputSplit(new int[] { i }));
+         }
+         return builder.build();
+     }
+
+     @Override
+     public RecordReader<IntWritable, NullWritable> createRecordReader(
+             InputSplit split, TaskAttemptContext context)
+             throws IOException, InterruptedException
+     {
+         return new EmbulkRecordReader((EmbulkInputSplit) split);
+     }
+ }
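Each EmbulkInputSplit carries a single Embulk task index, so Hadoop schedules one mapper per input task. For orientation, this is roughly how such an InputFormat attaches to a Hadoop job; this is a generic MapReduce wiring sketch, not code taken from MapReduceExecutor:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.embulk.executor.mapreduce.EmbulkInputFormat;

public class JobWiringSketch
{
    public static void main(String[] args) throws Exception
    {
        // getSplits() above yields one split per Embulk task, so this job
        // would run one mapper per input task (no combining yet, per the TODO).
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "embulk-mapreduce-sketch");
        job.setInputFormatClass(EmbulkInputFormat.class);
    }
}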