embulk-executor-mapreduce 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/build.gradle +2 -0
- data/classpath/activation-1.1.jar +0 -0
- data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
- data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
- data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
- data/classpath/api-util-1.0.0-M20.jar +0 -0
- data/classpath/avro-1.7.4.jar +0 -0
- data/classpath/commons-beanutils-1.7.0.jar +0 -0
- data/classpath/commons-cli-1.2.jar +0 -0
- data/classpath/commons-codec-1.6.jar +0 -0
- data/classpath/commons-collections-3.2.1.jar +0 -0
- data/classpath/commons-compress-1.4.1.jar +0 -0
- data/classpath/commons-configuration-1.6.jar +0 -0
- data/classpath/commons-digester-1.8.jar +0 -0
- data/classpath/commons-httpclient-3.1.jar +0 -0
- data/classpath/commons-io-2.4.jar +0 -0
- data/classpath/commons-lang-2.6.jar +0 -0
- data/classpath/commons-logging-1.1.3.jar +0 -0
- data/classpath/commons-math3-3.1.1.jar +0 -0
- data/classpath/commons-net-3.1.jar +0 -0
- data/classpath/curator-client-2.6.0.jar +0 -0
- data/classpath/curator-framework-2.6.0.jar +0 -0
- data/classpath/curator-recipes-2.6.0.jar +0 -0
- data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
- data/classpath/gson-2.2.4.jar +0 -0
- data/classpath/hadoop-annotations-2.6.0.jar +0 -0
- data/classpath/hadoop-auth-2.6.0.jar +0 -0
- data/classpath/hadoop-client-2.6.0.jar +0 -0
- data/classpath/hadoop-common-2.6.0.jar +0 -0
- data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
- data/classpath/htrace-core-3.0.4.jar +0 -0
- data/classpath/httpclient-4.2.5.jar +0 -0
- data/classpath/httpcore-4.2.4.jar +0 -0
- data/classpath/jackson-core-asl-1.9.13.jar +0 -0
- data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
- data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
- data/classpath/jackson-xc-1.9.13.jar +0 -0
- data/classpath/jaxb-api-2.2.2.jar +0 -0
- data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
- data/classpath/jersey-client-1.9.jar +0 -0
- data/classpath/jersey-core-1.9.jar +0 -0
- data/classpath/jersey-guice-1.9.jar +0 -0
- data/classpath/jersey-json-1.9.jar +0 -0
- data/classpath/jersey-server-1.9.jar +0 -0
- data/classpath/jettison-1.1.jar +0 -0
- data/classpath/jetty-util-6.1.26.jar +0 -0
- data/classpath/jline-0.9.94.jar +0 -0
- data/classpath/jsr305-1.3.9.jar +0 -0
- data/classpath/leveldbjni-all-1.8.jar +0 -0
- data/classpath/netty-3.7.0.Final.jar +0 -0
- data/classpath/paranamer-2.3.jar +0 -0
- data/classpath/protobuf-java-2.5.0.jar +0 -0
- data/classpath/servlet-api-2.5.jar +0 -0
- data/classpath/snappy-java-1.0.4.1.jar +0 -0
- data/classpath/stax-api-1.0-2.jar +0 -0
- data/classpath/xmlenc-0.52.jar +0 -0
- data/classpath/xz-1.0.jar +0 -0
- data/classpath/zookeeper-3.4.6.jar +0 -0
- data/lib/embulk/executor/mapreduce.rb +3 -0
- data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
- data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
- data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
- data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
- data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
- data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
- data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
- metadata +131 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 44794b114e21e1c5ced89169aa74b7d86a882d4d
  data.tar.gz: 44ce570ea8b23c6d0c430598057022a1c36e73b2
SHA512:
  metadata.gz: cc9a2d3da2e8b89c0fb630652209897efcbf1f4ef7aebc71f82c6f3913bb6cb812bd2b583a84d7cac98b6dfd50d8380c92beac0a8406b0d69497400c3b40ee6d
  data.tar.gz: 1f81b7fabaae57386f33c924bfd797e9fbe5f684e2a14cf87ccf68c2bc231a22b7aa5844f1ffeac9bee287fa082b34aaa45dbe6711445273ddf5635f0013a766
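The SHA1 and SHA512 digests above can be recomputed from the unpacked gem members to verify a download. A minimal sketch using the standard Java MessageDigest API; the VerifyChecksum class name and the default "metadata.gz" path are illustrative only, not part of the gem:

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.MessageDigest;

public class VerifyChecksum
{
    public static void main(String[] args) throws Exception
    {
        // Path to the extracted gem member; adjust to wherever the gem was unpacked.
        String path = args.length > 0 ? args[0] : "metadata.gz";

        MessageDigest sha1 = MessageDigest.getInstance("SHA-1");
        try (InputStream in = Files.newInputStream(Paths.get(path))) {
            byte[] buf = new byte[8192];
            int n;
            while ((n = in.read(buf)) != -1) {
                sha1.update(buf, 0, n);
            }
        }

        // Hex-encode the digest and compare it against the SHA1 entry in checksums.yaml.
        StringBuilder hex = new StringBuilder();
        for (byte b : sha1.digest()) {
            hex.append(String.format("%02x", b));
        }
        System.out.println(hex);
    }
}

Switching the algorithm name to "SHA-512" checks the second block of entries.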
data/build.gradle
ADDED
data/classpath/*.jar (67 files)
ADDED
Binary files; contents not shown
data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java
ADDED
@@ -0,0 +1,154 @@
package org.embulk.executor.mapreduce;

import java.util.Scanner;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.IOException;
import java.io.EOFException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import com.google.common.base.Optional;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonValue;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.embulk.config.ModelManager;
import org.embulk.config.CommitReport;

public class AttemptState
{
    private final TaskAttemptID attemptId;
    private final Optional<Integer> inputTaskIndex;
    private final Optional<Integer> outputTaskIndex;
    private Optional<String> exception;
    private Optional<CommitReport> inputCommitReport;
    private Optional<CommitReport> outputCommitReport;

    public AttemptState(TaskAttemptID attemptId, Optional<Integer> inputTaskIndex, Optional<Integer> outputTaskIndex)
    {
        this.attemptId = attemptId;
        this.inputTaskIndex = inputTaskIndex;
        this.outputTaskIndex = outputTaskIndex;
    }

    @JsonCreator
    AttemptState(
            @JsonProperty("attempt") String attemptId,
            @JsonProperty("inputTaskIndex") Optional<Integer> inputTaskIndex,
            @JsonProperty("outputTaskIndex") Optional<Integer> outputTaskIndex,
            @JsonProperty("exception") Optional<String> exception,
            @JsonProperty("inputCommitReport") Optional<CommitReport> inputCommitReport,
            @JsonProperty("outputCommitReport") Optional<CommitReport> outputCommitReport)
    {
        this(TaskAttemptID.forName(attemptId),
                inputTaskIndex, outputTaskIndex, exception,
                inputCommitReport, outputCommitReport);
    }

    public AttemptState(
            TaskAttemptID attemptId,
            Optional<Integer> inputTaskIndex,
            Optional<Integer> outputTaskIndex,
            Optional<String> exception,
            Optional<CommitReport> inputCommitReport,
            Optional<CommitReport> outputCommitReport)
    {
        this.attemptId = attemptId;
        this.inputTaskIndex = inputTaskIndex;
        this.outputTaskIndex = outputTaskIndex;
        this.exception = exception;
        this.inputCommitReport = inputCommitReport;
        this.outputCommitReport = outputCommitReport;
    }

    @JsonIgnore
    public TaskAttemptID getAttemptId()
    {
        return attemptId;
    }

    @JsonProperty("attempt")
    public String getAttemptIdString()
    {
        return attemptId.toString();
    }

    @JsonProperty("inputTaskIndex")
    public Optional<Integer> getInputTaskIndex()
    {
        return inputTaskIndex;
    }

    @JsonProperty("outputTaskIndex")
    public Optional<Integer> getOutputTaskIndex()
    {
        return outputTaskIndex;
    }

    @JsonIgnore
    public void setException(Throwable exception)
    {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        try (PrintStream ps = new PrintStream(os, false, "UTF-8")) {
            exception.printStackTrace(ps);
        } catch (UnsupportedEncodingException ex) {
            throw new RuntimeException(ex);
        }
        setException(new String(os.toByteArray(), StandardCharsets.UTF_8));
    }

    @JsonIgnore
    public void setException(String exception)
    {
        this.exception = Optional.of(exception);
    }

    @JsonProperty("exception")
    public Optional<String> getException()
    {
        return exception;
    }

    @JsonProperty("inputCommitReport")
    public Optional<CommitReport> getInputCommitReport()
    {
        return inputCommitReport;
    }

    @JsonProperty("outputCommitReport")
    public Optional<CommitReport> getOutputCommitReport()
    {
        return outputCommitReport;
    }

    @JsonIgnore
    public void setInputCommitReport(CommitReport inputCommitReport)
    {
        this.inputCommitReport = Optional.of(inputCommitReport);
    }

    @JsonIgnore
    public void setOutputCommitReport(CommitReport outputCommitReport)
    {
        this.outputCommitReport = Optional.of(outputCommitReport);
    }

    public void writeTo(OutputStream out, ModelManager modelManager) throws IOException
    {
        String s = modelManager.writeObject(this);
        out.write(s.getBytes(StandardCharsets.UTF_8));
    }

    public static AttemptState readFrom(InputStream in, ModelManager modelManager) throws IOException
    {
        Scanner s = new Scanner(in, "UTF-8").useDelimiter("\\A"); // TODO
        if (s.hasNext()) {
            return modelManager.readObject(AttemptState.class, s.next());
        } else {
            throw new EOFException("JSON is not included in the attempt state file.");
        }
    }
}
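AttemptState bundles a Hadoop task attempt ID with the Embulk input/output task indexes, an optional failure stack trace, and optional commit reports, and can be serialized to and from an attempt state file. A minimal usage sketch relying only on the methods shown above; the attempt ID string, the task index 5, and the AttemptStateExample class name are illustrative:

package org.embulk.executor.mapreduce;

import com.google.common.base.Optional;
import org.apache.hadoop.mapreduce.TaskAttemptID;

public class AttemptStateExample
{
    public static void main(String[] args)
    {
        // Hypothetical attempt ID in Hadoop's standard string form.
        TaskAttemptID attempt = TaskAttemptID.forName("attempt_200707121733_0003_m_000005_0");

        // Record that this attempt handled Embulk input task 5 and has no output task.
        AttemptState state = new AttemptState(attempt, Optional.of(5), Optional.<Integer>absent());

        // On failure, the stack trace is captured as a UTF-8 string so it can be
        // reported back to the driver.
        state.setException(new RuntimeException("simulated task failure"));

        System.out.println(state.getAttemptIdString());
        System.out.println(state.getException().isPresent());
    }
}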
data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java
ADDED
@@ -0,0 +1,74 @@
package org.embulk.executor.mapreduce;

import java.io.IOException;
import java.io.DataOutput;
import java.io.DataInput;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.WritableComparator;
import org.embulk.spi.Buffer;

public class BufferWritable
        implements WritableComparable<BufferWritable>
{
    private Buffer buffer;

    public BufferWritable() { }

    public void set(Buffer buffer)
    {
        this.buffer = buffer;
    }

    public Buffer get()
    {
        return buffer;
    }

    @Override
    public void write(DataOutput out) throws IOException
    {
        WritableUtils.writeVInt(out, buffer.limit());
        out.write(buffer.array(), buffer.offset(), buffer.limit());
    }

    @Override
    public void readFields(DataInput in) throws IOException
    {
        int size = WritableUtils.readVInt(in);
        byte[] bytes = new byte[size];  // TODO use a buffer allocator?
        in.readFully(bytes, 0, size);
        Buffer newBuffer = Buffer.wrap(bytes);
        if (buffer != null) {
            buffer.release();
        }
        buffer = newBuffer;
    }

    @Override
    public int compareTo(BufferWritable o)
    {
        return WritableComparator.compareBytes(
                buffer.array(), buffer.offset(), buffer.limit(),
                o.buffer.array(), o.buffer.offset(), o.buffer.limit());
    }

    @Override
    public boolean equals(Object other)
    {
        if (!(other instanceof BufferWritable)) {
            return false;
        }
        BufferWritable o = (BufferWritable) other;
        if (buffer == null) {
            return o.buffer == null;
        }
        return buffer.equals(o.buffer);
    }

    @Override
    public int hashCode()
    {
        return buffer.hashCode();
    }
}
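BufferWritable wraps an Embulk Buffer so it can be serialized through Hadoop's WritableComparable interface. A round-trip sketch using only the methods shown above; the BufferWritableRoundTrip class name and the "hello" payload are illustrative:

package org.embulk.executor.mapreduce;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.embulk.spi.Buffer;

public class BufferWritableRoundTrip
{
    public static void main(String[] args) throws IOException
    {
        // Wrap some bytes in an Embulk Buffer and in the Hadoop-writable adapter.
        BufferWritable writable = new BufferWritable();
        writable.set(Buffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));

        // Serialize through the Writable interface, as the MapReduce framework would.
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        writable.write(new DataOutputStream(sink));

        // Deserialize into a fresh instance and confirm the contents survived.
        BufferWritable restored = new BufferWritable();
        restored.readFields(new DataInputStream(new ByteArrayInputStream(sink.toByteArray())));
        System.out.println(restored.get().limit());  // 5
    }
}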
data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java
ADDED
@@ -0,0 +1,158 @@
package org.embulk.executor.mapreduce;

import java.util.Map;
import java.util.HashMap;
import java.util.Iterator;
import org.embulk.spi.Page;
import org.embulk.spi.PageOutput;
import org.embulk.spi.PageBuilder;
import org.embulk.spi.PageReader;
import org.embulk.spi.Schema;
import org.embulk.spi.Column;
import org.embulk.spi.ColumnVisitor;
import org.embulk.spi.BufferAllocator;

public class BufferedPagePartitioner
{
    public static interface PartitionedPageOutput
    {
        public void add(PartitionKey key, Page value);

        public void finish();

        public void close();
    }

    private static class ForwardRecordColumnVisitor
            implements ColumnVisitor
    {
        private final PageReader source;
        private final PageBuilder destination;

        public ForwardRecordColumnVisitor(PageReader source, PageBuilder destination)
        {
            this.source = source;
            this.destination = destination;
        }

        public void booleanColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setBoolean(column, source.getBoolean(column));
            }
        }

        public void longColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setLong(column, source.getLong(column));
            }
        }

        public void doubleColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setDouble(column, source.getDouble(column));
            }
        }

        public void stringColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setString(column, source.getString(column));
            }
        }

        public void timestampColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setTimestamp(column, source.getTimestamp(column));
            }
        }
    }

    private final BufferAllocator bufferAllocator;
    private final Schema schema;
    private final Partitioner partitioner;
    private final int maxPageBufferCount;
    private final PartitionedPageOutput output;

    private final Map<PartitionKey, PageBuilder> hash = new HashMap<PartitionKey, PageBuilder>();

    public BufferedPagePartitioner(BufferAllocator bufferAllocator, Schema schema,
            Partitioner partitioner, int maxPageBufferCount, PartitionedPageOutput output)
    {
        this.bufferAllocator = bufferAllocator;
        this.schema = schema;
        this.partitioner = partitioner;
        this.maxPageBufferCount = maxPageBufferCount;
        this.output = output;
    }

    public void add(PageReader record)
    {
        PartitionKey searchKey = partitioner.updateKey(record);
        PageBuilder builder = hash.get(searchKey);
        if (builder == null) {
            if (hash.size() >= maxPageBufferCount) {
                try (PageBuilder b = removeMostUnsed(hash)) {
                    b.finish();
                }
            }
            final PartitionKey key = searchKey.clone();
            builder = new PageBuilder(bufferAllocator, schema, new PageOutput() {
                public void add(Page page)
                {
                    output.add(key, page);
                }

                public void finish()
                { }

                public void close()
                { }
            });
            hash.put(key, builder);
        }
        builder.getSchema().visitColumns(new ForwardRecordColumnVisitor(record, builder));
        builder.addRecord();
    }

    private PageBuilder removeMostUnsed(Map<PartitionKey, PageBuilder> hash)
    {
        // TODO remove the largest buffer
        Iterator<Map.Entry<PartitionKey, PageBuilder>> ite = hash.entrySet().iterator();
        PageBuilder builder = ite.next().getValue();
        ite.remove();
        return builder;
    }

    public void finish()
    {
        for (PageBuilder builder : hash.values()) {
            builder.finish();
        }
        output.finish();
    }

    public void close()
    {
        Iterator<Map.Entry<PartitionKey, PageBuilder>> ite = hash.entrySet().iterator();
        while (ite.hasNext()) {
            PageBuilder builder = ite.next().getValue();
            builder.close();
            ite.remove();
        }
        output.close();
    }
}
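BufferedPagePartitioner keeps one PageBuilder per partition key, evicts a builder once maxPageBufferCount is reached, and hands finished pages to a PartitionedPageOutput. A minimal sink implementing only the interface shown above; the CountingPartitionedOutput class name is illustrative, and a real mapper would emit each page to the Hadoop context rather than just count it:

package org.embulk.executor.mapreduce;

import java.util.HashMap;
import java.util.Map;
import org.embulk.spi.Page;

// A trivial sink that just counts how many pages were emitted for each partition key.
public class CountingPartitionedOutput
        implements BufferedPagePartitioner.PartitionedPageOutput
{
    private final Map<PartitionKey, Integer> counts = new HashMap<PartitionKey, Integer>();

    @Override
    public void add(PartitionKey key, Page value)
    {
        Integer current = counts.get(key);
        counts.put(key, current == null ? 1 : current + 1);
        value.release();  // assuming the sink owns the page once delivered
    }

    @Override
    public void finish()
    {
        System.out.println("page counts per partition: " + counts);
    }

    @Override
    public void close()
    {
        counts.clear();
    }
}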
data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java
ADDED
@@ -0,0 +1,37 @@
package org.embulk.executor.mapreduce;

import java.util.List;
import java.io.IOException;
import com.google.common.collect.ImmutableList;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.JobContext;

public class EmbulkInputFormat
        extends InputFormat<IntWritable, NullWritable>
{
    @Override
    public List<InputSplit> getSplits(JobContext context)
            throws IOException, InterruptedException
    {
        // TODO combining multiple tasks to one mapper is not implemented yet.
        int taskCount = EmbulkMapReduce.getMapTaskCount(context.getConfiguration());
        ImmutableList.Builder<InputSplit> builder = ImmutableList.builder();
        for (int i = 0; i < taskCount; i++) {
            builder.add(new EmbulkInputSplit(new int[] { i }));
        }
        return builder.build();
    }

    @Override
    public RecordReader<IntWritable, NullWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException
    {
        return new EmbulkRecordReader((EmbulkInputSplit) split);
    }
}