embulk-executor-mapreduce 0.1.0
- checksums.yaml +7 -0
- data/build.gradle +2 -0
- data/classpath/activation-1.1.jar +0 -0
- data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
- data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
- data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
- data/classpath/api-util-1.0.0-M20.jar +0 -0
- data/classpath/avro-1.7.4.jar +0 -0
- data/classpath/commons-beanutils-1.7.0.jar +0 -0
- data/classpath/commons-cli-1.2.jar +0 -0
- data/classpath/commons-codec-1.6.jar +0 -0
- data/classpath/commons-collections-3.2.1.jar +0 -0
- data/classpath/commons-compress-1.4.1.jar +0 -0
- data/classpath/commons-configuration-1.6.jar +0 -0
- data/classpath/commons-digester-1.8.jar +0 -0
- data/classpath/commons-httpclient-3.1.jar +0 -0
- data/classpath/commons-io-2.4.jar +0 -0
- data/classpath/commons-lang-2.6.jar +0 -0
- data/classpath/commons-logging-1.1.3.jar +0 -0
- data/classpath/commons-math3-3.1.1.jar +0 -0
- data/classpath/commons-net-3.1.jar +0 -0
- data/classpath/curator-client-2.6.0.jar +0 -0
- data/classpath/curator-framework-2.6.0.jar +0 -0
- data/classpath/curator-recipes-2.6.0.jar +0 -0
- data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
- data/classpath/gson-2.2.4.jar +0 -0
- data/classpath/hadoop-annotations-2.6.0.jar +0 -0
- data/classpath/hadoop-auth-2.6.0.jar +0 -0
- data/classpath/hadoop-client-2.6.0.jar +0 -0
- data/classpath/hadoop-common-2.6.0.jar +0 -0
- data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
- data/classpath/htrace-core-3.0.4.jar +0 -0
- data/classpath/httpclient-4.2.5.jar +0 -0
- data/classpath/httpcore-4.2.4.jar +0 -0
- data/classpath/jackson-core-asl-1.9.13.jar +0 -0
- data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
- data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
- data/classpath/jackson-xc-1.9.13.jar +0 -0
- data/classpath/jaxb-api-2.2.2.jar +0 -0
- data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
- data/classpath/jersey-client-1.9.jar +0 -0
- data/classpath/jersey-core-1.9.jar +0 -0
- data/classpath/jersey-guice-1.9.jar +0 -0
- data/classpath/jersey-json-1.9.jar +0 -0
- data/classpath/jersey-server-1.9.jar +0 -0
- data/classpath/jettison-1.1.jar +0 -0
- data/classpath/jetty-util-6.1.26.jar +0 -0
- data/classpath/jline-0.9.94.jar +0 -0
- data/classpath/jsr305-1.3.9.jar +0 -0
- data/classpath/leveldbjni-all-1.8.jar +0 -0
- data/classpath/netty-3.7.0.Final.jar +0 -0
- data/classpath/paranamer-2.3.jar +0 -0
- data/classpath/protobuf-java-2.5.0.jar +0 -0
- data/classpath/servlet-api-2.5.jar +0 -0
- data/classpath/snappy-java-1.0.4.1.jar +0 -0
- data/classpath/stax-api-1.0-2.jar +0 -0
- data/classpath/xmlenc-0.52.jar +0 -0
- data/classpath/xz-1.0.jar +0 -0
- data/classpath/zookeeper-3.4.6.jar +0 -0
- data/lib/embulk/executor/mapreduce.rb +3 -0
- data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
- data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
- data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
- data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
- data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
- data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
- data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
- metadata +131 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 44794b114e21e1c5ced89169aa74b7d86a882d4d
  data.tar.gz: 44ce570ea8b23c6d0c430598057022a1c36e73b2
SHA512:
  metadata.gz: cc9a2d3da2e8b89c0fb630652209897efcbf1f4ef7aebc71f82c6f3913bb6cb812bd2b583a84d7cac98b6dfd50d8380c92beac0a8406b0d69497400c3b40ee6d
  data.tar.gz: 1f81b7fabaae57386f33c924bfd797e9fbe5f684e2a14cf87ccf68c2bc231a22b7aa5844f1ffeac9bee287fa082b34aaa45dbe6711445273ddf5635f0013a766
data/build.gradle
ADDED
data/classpath/*.jar (67 files)
ADDED
Binary files; contents not shown
data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java
ADDED
@@ -0,0 +1,154 @@
package org.embulk.executor.mapreduce;

import java.util.Scanner;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.IOException;
import java.io.EOFException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import com.google.common.base.Optional;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonValue;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.embulk.config.ModelManager;
import org.embulk.config.CommitReport;

public class AttemptState
{
    private final TaskAttemptID attemptId;
    private final Optional<Integer> inputTaskIndex;
    private final Optional<Integer> outputTaskIndex;
    private Optional<String> exception;
    private Optional<CommitReport> inputCommitReport;
    private Optional<CommitReport> outputCommitReport;

    public AttemptState(TaskAttemptID attemptId, Optional<Integer> inputTaskIndex, Optional<Integer> outputTaskIndex)
    {
        this.attemptId = attemptId;
        this.inputTaskIndex = inputTaskIndex;
        this.outputTaskIndex = outputTaskIndex;
    }

    @JsonCreator
    AttemptState(
            @JsonProperty("attempt") String attemptId,
            @JsonProperty("inputTaskIndex") Optional<Integer> inputTaskIndex,
            @JsonProperty("outputTaskIndex") Optional<Integer> outputTaskIndex,
            @JsonProperty("exception") Optional<String> exception,
            @JsonProperty("inputCommitReport") Optional<CommitReport> inputCommitReport,
            @JsonProperty("outputCommitReport") Optional<CommitReport> outputCommitReport)
    {
        this(TaskAttemptID.forName(attemptId),
                inputTaskIndex, outputTaskIndex, exception,
                inputCommitReport, outputCommitReport);
    }

    public AttemptState(
            TaskAttemptID attemptId,
            Optional<Integer> inputTaskIndex,
            Optional<Integer> outputTaskIndex,
            Optional<String> exception,
            Optional<CommitReport> inputCommitReport,
            Optional<CommitReport> outputCommitReport)
    {
        this.attemptId = attemptId;
        this.inputTaskIndex = inputTaskIndex;
        this.outputTaskIndex = outputTaskIndex;
        this.exception = exception;
        this.inputCommitReport = inputCommitReport;
        this.outputCommitReport = outputCommitReport;
    }

    @JsonIgnore
    public TaskAttemptID getAttemptId()
    {
        return attemptId;
    }

    @JsonProperty("attempt")
    public String getAttemptIdString()
    {
        return attemptId.toString();
    }

    @JsonProperty("inputTaskIndex")
    public Optional<Integer> getInputTaskIndex()
    {
        return inputTaskIndex;
    }

    @JsonProperty("outputTaskIndex")
    public Optional<Integer> getOutputTaskIndex()
    {
        return outputTaskIndex;
    }

    @JsonIgnore
    public void setException(Throwable exception)
    {
        ByteArrayOutputStream os = new ByteArrayOutputStream();
        try (PrintStream ps = new PrintStream(os, false, "UTF-8")) {
            exception.printStackTrace(ps);
        } catch (UnsupportedEncodingException ex) {
            throw new RuntimeException(ex);
        }
        setException(new String(os.toByteArray(), StandardCharsets.UTF_8));
    }

    @JsonIgnore
    public void setException(String exception)
    {
        this.exception = Optional.of(exception);
    }

    @JsonProperty("exception")
    public Optional<String> getException()
    {
        return exception;
    }

    @JsonProperty("inputCommitReport")
    public Optional<CommitReport> getInputCommitReport()
    {
        return inputCommitReport;
    }

    @JsonProperty("outputCommitReport")
    public Optional<CommitReport> getOutputCommitReport()
    {
        return outputCommitReport;
    }

    @JsonIgnore
    public void setInputCommitReport(CommitReport inputCommitReport)
    {
        this.inputCommitReport = Optional.of(inputCommitReport);
    }

    @JsonIgnore
    public void setOutputCommitReport(CommitReport outputCommitReport)
    {
        this.outputCommitReport = Optional.of(outputCommitReport);
    }

    public void writeTo(OutputStream out, ModelManager modelManager) throws IOException
    {
        String s = modelManager.writeObject(this);
        out.write(s.getBytes(StandardCharsets.UTF_8));
    }

    public static AttemptState readFrom(InputStream in, ModelManager modelManager) throws IOException
    {
        Scanner s = new Scanner(in, "UTF-8").useDelimiter("\\A"); // TODO
        if (s.hasNext()) {
            return modelManager.readObject(AttemptState.class, s.next());
        } else {
            throw new EOFException("JSON is not included in the attempt state file.");
        }
    }
}
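For orientation, here is a small hedged sketch (not part of the gem) of how a task failure would be recorded in an AttemptState. The attempt ID string is the standard example format from the Hadoop documentation, and the writeTo()/readFrom() round trip is omitted because it needs an Embulk ModelManager supplied by the running framework.

// Hedged usage sketch (not part of the gem): record a failure in an AttemptState.
import com.google.common.base.Optional;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.embulk.executor.mapreduce.AttemptState;

public class AttemptStateSketch
{
    public static void main(String[] args)
    {
        AttemptState state = new AttemptState(
                TaskAttemptID.forName("attempt_200707121733_0003_m_000005_0"),
                Optional.of(0),   // input task index handled by this attempt
                Optional.of(0));  // output task index handled by this attempt

        state.setException(new RuntimeException("simulated task failure"));

        // getException() now carries the full stack trace as a UTF-8 string
        System.out.println(state.getException().get());
    }
}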
data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java
ADDED
@@ -0,0 +1,74 @@
package org.embulk.executor.mapreduce;

import java.io.IOException;
import java.io.DataOutput;
import java.io.DataInput;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.WritableComparator;
import org.embulk.spi.Buffer;

public class BufferWritable
        implements WritableComparable<BufferWritable>
{
    private Buffer buffer;

    public BufferWritable() { }

    public void set(Buffer buffer)
    {
        this.buffer = buffer;
    }

    public Buffer get()
    {
        return buffer;
    }

    @Override
    public void write(DataOutput out) throws IOException
    {
        WritableUtils.writeVInt(out, buffer.limit());
        out.write(buffer.array(), buffer.offset(), buffer.limit());
    }

    @Override
    public void readFields(DataInput in) throws IOException
    {
        int size = WritableUtils.readVInt(in);
        byte[] bytes = new byte[size]; // TODO use a buffer allocator?
        in.readFully(bytes, 0, size);
        Buffer newBuffer = Buffer.wrap(bytes);
        if (buffer != null) {
            buffer.release();
        }
        buffer = newBuffer;
    }

    @Override
    public int compareTo(BufferWritable o)
    {
        return WritableComparator.compareBytes(
                buffer.array(), buffer.offset(), buffer.limit(),
                o.buffer.array(), o.buffer.offset(), o.buffer.limit());
    }

    @Override
    public boolean equals(Object other)
    {
        if (!(other instanceof BufferWritable)) {
            return false;
        }
        BufferWritable o = (BufferWritable) other;
        if (buffer == null) {
            return o.buffer == null;
        }
        return buffer.equals(o.buffer);
    }

    @Override
    public int hashCode()
    {
        return buffer.hashCode();
    }
}
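As a quick sanity check, the following hedged sketch (not part of the gem) round-trips a BufferWritable through plain JDK data streams. It assumes embulk-core and hadoop-common are on the classpath and uses Buffer.wrap(), the same call the class itself relies on.

// Hedged round-trip sketch (not part of the gem): serialize with write(),
// rebuild with readFields(), and confirm the bytes survive the trip.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.nio.charset.StandardCharsets;
import org.embulk.spi.Buffer;
import org.embulk.executor.mapreduce.BufferWritable;

public class BufferWritableRoundTrip
{
    public static void main(String[] args) throws Exception
    {
        BufferWritable original = new BufferWritable();
        original.set(Buffer.wrap("hello".getBytes(StandardCharsets.UTF_8)));

        // write() emits a vint length followed by the raw buffer bytes
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        original.write(new DataOutputStream(sink));

        // readFields() wraps the received bytes into a fresh Buffer
        BufferWritable copy = new BufferWritable();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(sink.toByteArray())));

        // compareTo() compares the underlying bytes, so a faithful copy yields 0
        System.out.println(original.compareTo(copy) == 0);
    }
}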
data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java
ADDED
@@ -0,0 +1,158 @@
package org.embulk.executor.mapreduce;

import java.util.Map;
import java.util.HashMap;
import java.util.Iterator;
import org.embulk.spi.Page;
import org.embulk.spi.PageOutput;
import org.embulk.spi.PageBuilder;
import org.embulk.spi.PageReader;
import org.embulk.spi.Schema;
import org.embulk.spi.Column;
import org.embulk.spi.ColumnVisitor;
import org.embulk.spi.BufferAllocator;

public class BufferedPagePartitioner
{
    public static interface PartitionedPageOutput
    {
        public void add(PartitionKey key, Page value);

        public void finish();

        public void close();
    }

    private static class ForwardRecordColumnVisitor
            implements ColumnVisitor
    {
        private final PageReader source;
        private final PageBuilder destination;

        public ForwardRecordColumnVisitor(PageReader source, PageBuilder destination)
        {
            this.source = source;
            this.destination = destination;
        }

        public void booleanColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setBoolean(column, source.getBoolean(column));
            }
        }

        public void longColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setLong(column, source.getLong(column));
            }
        }

        public void doubleColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setDouble(column, source.getDouble(column));
            }
        }

        public void stringColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setString(column, source.getString(column));
            }
        }

        public void timestampColumn(Column column)
        {
            if (source.isNull(column)) {
                destination.setNull(column);
            } else {
                destination.setTimestamp(column, source.getTimestamp(column));
            }
        }
    }

    private final BufferAllocator bufferAllocator;
    private final Schema schema;
    private final Partitioner partitioner;
    private final int maxPageBufferCount;
    private final PartitionedPageOutput output;

    private final Map<PartitionKey, PageBuilder> hash = new HashMap<PartitionKey, PageBuilder>();

    public BufferedPagePartitioner(BufferAllocator bufferAllocator, Schema schema,
            Partitioner partitioner, int maxPageBufferCount, PartitionedPageOutput output)
    {
        this.bufferAllocator = bufferAllocator;
        this.schema = schema;
        this.partitioner = partitioner;
        this.maxPageBufferCount = maxPageBufferCount;
        this.output = output;
    }

    public void add(PageReader record)
    {
        PartitionKey searchKey = partitioner.updateKey(record);
        PageBuilder builder = hash.get(searchKey);
        if (builder == null) {
            if (hash.size() >= maxPageBufferCount) {
                // evict one buffered PageBuilder to stay within maxPageBufferCount
                try (PageBuilder b = removeMostUnsed(hash)) {
                    b.finish();
                }
            }
            final PartitionKey key = searchKey.clone();
            builder = new PageBuilder(bufferAllocator, schema, new PageOutput() {
                public void add(Page page)
                {
                    output.add(key, page);
                }

                public void finish()
                { }

                public void close()
                { }
            });
            hash.put(key, builder);
        }
        builder.getSchema().visitColumns(new ForwardRecordColumnVisitor(record, builder));
        builder.addRecord();
    }

    private PageBuilder removeMostUnsed(Map<PartitionKey, PageBuilder> hash)
    {
        // TODO remove the largest buffer
        Iterator<Map.Entry<PartitionKey, PageBuilder>> ite = hash.entrySet().iterator();
        PageBuilder builder = ite.next().getValue();
        ite.remove();
        return builder;
    }

    public void finish()
    {
        for (PageBuilder builder : hash.values()) {
            builder.finish();
        }
        output.finish();
    }

    public void close()
    {
        Iterator<Map.Entry<PartitionKey, PageBuilder>> ite = hash.entrySet().iterator();
        while (ite.hasNext()) {
            PageBuilder builder = ite.next().getValue();
            builder.close();
            ite.remove();
        }
        output.close();
    }
}
data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java
ADDED
@@ -0,0 +1,37 @@
package org.embulk.executor.mapreduce;

import java.util.List;
import java.io.IOException;
import com.google.common.collect.ImmutableList;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.JobContext;

public class EmbulkInputFormat
        extends InputFormat<IntWritable, NullWritable>
{
    @Override
    public List<InputSplit> getSplits(JobContext context)
            throws IOException, InterruptedException
    {
        // TODO combining multiple tasks into one mapper is not implemented yet.
        int taskCount = EmbulkMapReduce.getMapTaskCount(context.getConfiguration());
        ImmutableList.Builder<InputSplit> builder = ImmutableList.builder();
        for (int i = 0; i < taskCount; i++) {
            builder.add(new EmbulkInputSplit(new int[] { i }));
        }
        return builder.build();
    }

    @Override
    public RecordReader<IntWritable, NullWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException
    {
        return new EmbulkRecordReader((EmbulkInputSplit) split);
    }
}