embulk-executor-mapreduce 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/build.gradle +2 -0
- data/classpath/activation-1.1.jar +0 -0
- data/classpath/apacheds-i18n-2.0.0-M15.jar +0 -0
- data/classpath/apacheds-kerberos-codec-2.0.0-M15.jar +0 -0
- data/classpath/api-asn1-api-1.0.0-M20.jar +0 -0
- data/classpath/api-util-1.0.0-M20.jar +0 -0
- data/classpath/avro-1.7.4.jar +0 -0
- data/classpath/commons-beanutils-1.7.0.jar +0 -0
- data/classpath/commons-cli-1.2.jar +0 -0
- data/classpath/commons-codec-1.6.jar +0 -0
- data/classpath/commons-collections-3.2.1.jar +0 -0
- data/classpath/commons-compress-1.4.1.jar +0 -0
- data/classpath/commons-configuration-1.6.jar +0 -0
- data/classpath/commons-digester-1.8.jar +0 -0
- data/classpath/commons-httpclient-3.1.jar +0 -0
- data/classpath/commons-io-2.4.jar +0 -0
- data/classpath/commons-lang-2.6.jar +0 -0
- data/classpath/commons-logging-1.1.3.jar +0 -0
- data/classpath/commons-math3-3.1.1.jar +0 -0
- data/classpath/commons-net-3.1.jar +0 -0
- data/classpath/curator-client-2.6.0.jar +0 -0
- data/classpath/curator-framework-2.6.0.jar +0 -0
- data/classpath/curator-recipes-2.6.0.jar +0 -0
- data/classpath/embulk-executor-mapreduce-0.1.0.jar +0 -0
- data/classpath/gson-2.2.4.jar +0 -0
- data/classpath/hadoop-annotations-2.6.0.jar +0 -0
- data/classpath/hadoop-auth-2.6.0.jar +0 -0
- data/classpath/hadoop-client-2.6.0.jar +0 -0
- data/classpath/hadoop-common-2.6.0.jar +0 -0
- data/classpath/hadoop-hdfs-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-app-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-common-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-core-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar +0 -0
- data/classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-api-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-client-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-common-2.6.0.jar +0 -0
- data/classpath/hadoop-yarn-server-nodemanager-2.6.0.jar +0 -0
- data/classpath/htrace-core-3.0.4.jar +0 -0
- data/classpath/httpclient-4.2.5.jar +0 -0
- data/classpath/httpcore-4.2.4.jar +0 -0
- data/classpath/jackson-core-asl-1.9.13.jar +0 -0
- data/classpath/jackson-jaxrs-1.9.13.jar +0 -0
- data/classpath/jackson-mapper-asl-1.9.13.jar +0 -0
- data/classpath/jackson-xc-1.9.13.jar +0 -0
- data/classpath/jaxb-api-2.2.2.jar +0 -0
- data/classpath/jaxb-impl-2.2.3-1.jar +0 -0
- data/classpath/jersey-client-1.9.jar +0 -0
- data/classpath/jersey-core-1.9.jar +0 -0
- data/classpath/jersey-guice-1.9.jar +0 -0
- data/classpath/jersey-json-1.9.jar +0 -0
- data/classpath/jersey-server-1.9.jar +0 -0
- data/classpath/jettison-1.1.jar +0 -0
- data/classpath/jetty-util-6.1.26.jar +0 -0
- data/classpath/jline-0.9.94.jar +0 -0
- data/classpath/jsr305-1.3.9.jar +0 -0
- data/classpath/leveldbjni-all-1.8.jar +0 -0
- data/classpath/netty-3.7.0.Final.jar +0 -0
- data/classpath/paranamer-2.3.jar +0 -0
- data/classpath/protobuf-java-2.5.0.jar +0 -0
- data/classpath/servlet-api-2.5.jar +0 -0
- data/classpath/snappy-java-1.0.4.1.jar +0 -0
- data/classpath/stax-api-1.0-2.jar +0 -0
- data/classpath/xmlenc-0.52.jar +0 -0
- data/classpath/xz-1.0.jar +0 -0
- data/classpath/zookeeper-3.4.6.jar +0 -0
- data/lib/embulk/executor/mapreduce.rb +3 -0
- data/src/main/java/org/embulk/executor/mapreduce/AttemptState.java +154 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferWritable.java +74 -0
- data/src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java +158 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java +37 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java +61 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java +359 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java +303 -0
- data/src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java +63 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java +391 -0
- data/src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java +60 -0
- data/src/main/java/org/embulk/executor/mapreduce/PageWritable.java +66 -0
- data/src/main/java/org/embulk/executor/mapreduce/PartitionKey.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioner.java +11 -0
- data/src/main/java/org/embulk/executor/mapreduce/Partitioning.java +12 -0
- data/src/main/java/org/embulk/executor/mapreduce/PluginArchive.java +189 -0
- data/src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java +10 -0
- data/src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java +19 -0
- data/src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java +291 -0
- metadata +131 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
package org.embulk.executor.mapreduce;
|
|
2
|
+
|
|
3
|
+
import java.util.List;
|
|
4
|
+
import java.util.ArrayList;
|
|
5
|
+
import java.io.File;
|
|
6
|
+
import java.io.InputStream;
|
|
7
|
+
import java.io.OutputStream;
|
|
8
|
+
import java.io.FileOutputStream;
|
|
9
|
+
import java.io.IOException;
|
|
10
|
+
import java.nio.file.Path;
|
|
11
|
+
import java.nio.file.Files;
|
|
12
|
+
import java.nio.file.DirectoryStream;
|
|
13
|
+
import java.nio.file.NoSuchFileException;
|
|
14
|
+
import java.nio.file.NotDirectoryException;
|
|
15
|
+
import java.util.zip.ZipEntry;
|
|
16
|
+
import java.util.zip.ZipOutputStream;
|
|
17
|
+
import java.util.zip.ZipInputStream;
|
|
18
|
+
import com.fasterxml.jackson.annotation.JsonCreator;
|
|
19
|
+
import com.fasterxml.jackson.annotation.JsonProperty;
|
|
20
|
+
import com.google.common.collect.ImmutableList;
|
|
21
|
+
import com.google.common.io.ByteStreams;
|
|
22
|
+
import org.jruby.embed.ScriptingContainer;
|
|
23
|
+
import org.jruby.embed.InvokeFailedException;
|
|
24
|
+
|
|
25
|
+
/**
 * Bundles locally installed Ruby gems into a single ZIP archive so they can be
 * shipped to remote MapReduce task processes and unpacked there with equivalent
 * load paths. The gem metadata (name + require paths) is carried separately as
 * JSON-serializable {@link GemSpec}s; the ZIP carries only the gem file trees.
 */
public class PluginArchive
{
    /**
     * JSON-serializable description of one gem: its name and the require paths
     * (directories, relative to the gem root) that must be added to $LOAD_PATH.
     */
    public static class GemSpec
    {
        private final String name;
        private final List<String> requirePaths;

        @JsonCreator
        public GemSpec(
                @JsonProperty("name") String name,
                @JsonProperty("requirePaths") List<String> requirePaths)
        {
            this.name = name;
            this.requirePaths = requirePaths;
        }

        @JsonProperty("name")
        public String getName()
        {
            return name;
        }

        @JsonProperty("requirePaths")
        public List<String> getRequirePaths()
        {
            return requirePaths;
        }
    }

    /** A GemSpec paired with the local directory where the gem's files live. */
    private static class LocalGem
            extends GemSpec
    {
        private final File localPath;

        public LocalGem(File localPath, String name, List<String> requirePaths)
        {
            super(name, requirePaths);
            this.localPath = localPath;
        }

        public File getLocalPath()
        {
            return localPath;
        }
    }

    /** Collects LocalGems (manually or from a running JRuby) to build an archive. */
    public static class Builder
    {
        private final ImmutableList.Builder<LocalGem> localGems = ImmutableList.builder();

        /**
         * Registers every gem currently loaded in the given JRuby runtime.
         * The scriptlet returns, per loaded gem, a flattened tuple of
         * [name, full_gem_path, require_path...]; the first two elements are
         * consumed positionally and the remainder are the require paths.
         * NOTE(review): assumes the returned lists are mutable (remove(0)) —
         * holds for JRuby's RubyArray conversion, but verify on JRuby upgrades.
         */
        @SuppressWarnings("unchecked")
        public Builder addLoadedRubyGems(ScriptingContainer jruby)
        {
            List<List<String>> tuples = (List<List<String>>) jruby.runScriptlet("Gem.loaded_specs.map {|k,v| [k, v.full_gem_path, v.require_paths].flatten }");
            for (List<String> tuple : tuples) {
                String name = tuple.remove(0);
                String fullGemPath = tuple.remove(0);
                List<String> requirePaths = ImmutableList.copyOf(tuple);
                addSpec(new File(fullGemPath), name, requirePaths);
            }
            return this;
        }

        /** Registers a single gem rooted at {@code localPath}. */
        public Builder addSpec(File localPath, String name, List<String> requirePaths)
        {
            localGems.add(new LocalGem(localPath, name, requirePaths));
            return this;
        }

        public PluginArchive build()
        {
            return new PluginArchive(localGems.build());
        }
    }

    private final List<LocalGem> localGems;

    private PluginArchive(List<LocalGem> localGems)
    {
        this.localGems = localGems;
    }

    /**
     * Appends each gem's require-path directories to the runtime's $LOAD_PATH
     * so that the unpacked gems become require-able. Mutates the list returned
     * by the scriptlet, then pushes the whole list back via setLoadPaths.
     */
    @SuppressWarnings("unchecked")
    public void restoreLoadPathsTo(ScriptingContainer jruby)
    {
        List<String> loadPaths = (List<String>) jruby.runScriptlet("$LOAD_PATH");
        for (LocalGem localGem : localGems) {
            Path localGemPath = localGem.getLocalPath().toPath();
            for (String requirePath : localGem.getRequirePaths()) {
                loadPaths.add(localGemPath.resolve(requirePath).toString());
            }
        }
        jruby.setLoadPaths(loadPaths);
    }

    /**
     * Writes all gems as one ZIP stream (each gem under a top-level directory
     * named after the gem) and returns the matching GemSpecs, in archive order.
     * NOTE: closing the ZipOutputStream in try-with-resources also closes
     * the caller-supplied {@code out}.
     *
     * @return specs describing the archived gems, for use with {@link #load}
     * @throws IOException if reading a gem file or writing the ZIP fails
     */
    public List<GemSpec> dump(OutputStream out)
            throws IOException
    {
        ImmutableList.Builder<GemSpec> builder = ImmutableList.builder();
        try (ZipOutputStream zip = new ZipOutputStream(out)) {
            for (LocalGem localGem : localGems) {
                zipDirectory(zip, localGem.getLocalPath().toPath(), localGem.getName() + "/");
                builder.add(new GemSpec(localGem.getName(), localGem.getRequirePaths()));
            }
        }
        return builder.build();
    }

    /**
     * Recursively adds every regular file under {@code directory} to the ZIP,
     * prefixing entry names with {@code name} (which must end with '/').
     * A missing or non-directory path is silently skipped — a gem may have no
     * files on disk, which is treated as an empty gem rather than an error.
     */
    private static void zipDirectory(ZipOutputStream zip, Path directory, String name)
            throws IOException
    {
        try (DirectoryStream<Path> dirStream = Files.newDirectoryStream(directory)) {
            for (Path path : dirStream) {
                if (Files.isDirectory(path)) {
                    zipDirectory(zip, path, name + path.getFileName() + "/");
                } else {
                    zip.putNextEntry(new ZipEntry(name + path.getFileName()));
                    try (InputStream in = Files.newInputStream(path)) {
                        ByteStreams.copy(in, zip);
                    }
                    zip.closeEntry();
                }
            }
        } catch (NoSuchFileException | NotDirectoryException ex) {
            // ignore: treat a missing/non-directory gem root as an empty gem
        }
    }

    /**
     * Counterpart of {@link #dump}: unpacks the ZIP stream into
     * {@code localDirectory} and rebuilds a PluginArchive whose LocalGems
     * point at the unpacked per-gem directories described by {@code gemSpecs}.
     *
     * @throws IOException if extraction fails
     */
    public static PluginArchive load(File localDirectory, List<GemSpec> gemSpecs,
            InputStream in) throws IOException
    {
        try (ZipInputStream zip = new ZipInputStream(in)) {
            unzipDirectory(zip, localDirectory.toPath());
        }

        ImmutableList.Builder<LocalGem> builder = ImmutableList.builder();
        for (GemSpec gemSpec : gemSpecs) {
            builder.add(new LocalGem(
                    new File(localDirectory, gemSpec.getName()),
                    gemSpec.getName(),
                    gemSpec.getRequirePaths()));
        }
        return new PluginArchive(builder.build());
    }

    /**
     * Extracts every ZIP entry beneath {@code directory}, creating parent
     * directories as needed. Entries whose names end with '/' are directories.
     */
    private static void unzipDirectory(ZipInputStream zip, Path directory)
            throws IOException
    {
        while (true) {
            ZipEntry entry = zip.getNextEntry();
            if (entry == null) {
                break;
            }
            Path path = directory.resolve(entry.getName());
            if (entry.getName().endsWith("/")) {
                Files.createDirectories(path);
            } else {
                Files.createDirectories(path.getParent());
                try (OutputStream out = Files.newOutputStream(path)) {
                    ByteStreams.copy(zip, out);
                }
            }
        }
    }
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
package org.embulk.executor.mapreduce;
|
|
2
|
+
|
|
3
|
+
/**
 * Scoped swap of the current thread's context class loader.
 * Constructing an instance installs the given loader; {@link #close()} puts
 * the previous loader back, making this safe for try-with-resources.
 */
public class SetContextClassLoader
        implements AutoCloseable
{
    // The loader that was active when this scope began; restored on close().
    private final ClassLoader original;

    /**
     * Remembers the calling thread's current context class loader and then
     * replaces it with {@code classLoader}.
     */
    public SetContextClassLoader(ClassLoader classLoader)
    {
        Thread current = Thread.currentThread();
        this.original = current.getContextClassLoader();
        current.setContextClassLoader(classLoader);
    }

    /** Restores the context class loader saved by the constructor. */
    @Override
    public void close()
    {
        Thread.currentThread().setContextClassLoader(original);
    }
}
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
package org.embulk.executor.mapreduce;
|
|
2
|
+
|
|
3
|
+
import org.joda.time.DateTimeZone;
|
|
4
|
+
import com.google.common.base.Optional;
|
|
5
|
+
import org.embulk.config.Config;
|
|
6
|
+
import org.embulk.config.ConfigDefault;
|
|
7
|
+
import org.embulk.config.ConfigSource;
|
|
8
|
+
import org.embulk.config.ConfigException;
|
|
9
|
+
import org.embulk.config.Task;
|
|
10
|
+
import org.embulk.config.TaskSource;
|
|
11
|
+
import org.embulk.spi.time.Timestamp;
|
|
12
|
+
import org.embulk.spi.type.TimestampType;
|
|
13
|
+
import org.embulk.spi.type.LongType;
|
|
14
|
+
import org.embulk.spi.Column;
|
|
15
|
+
import org.embulk.spi.PageReader;
|
|
16
|
+
import org.embulk.spi.Schema;
|
|
17
|
+
import org.embulk.spi.Buffer;
|
|
18
|
+
|
|
19
|
+
public class TimestampPartitioning
|
|
20
|
+
implements Partitioning
|
|
21
|
+
{
|
|
22
|
+
public interface PartitioningTask
|
|
23
|
+
extends Task
|
|
24
|
+
{
|
|
25
|
+
@Config("column")
|
|
26
|
+
public String getColumn();
|
|
27
|
+
|
|
28
|
+
@Config("unit")
|
|
29
|
+
public String getUnit();
|
|
30
|
+
|
|
31
|
+
@Config("timezone")
|
|
32
|
+
@ConfigDefault("\"UTC\"")
|
|
33
|
+
public DateTimeZone getTimeZone();
|
|
34
|
+
|
|
35
|
+
@Config("unix_timestamp_unit")
|
|
36
|
+
@ConfigDefault("\"sec\"")
|
|
37
|
+
public String getUnixTimestamp();
|
|
38
|
+
|
|
39
|
+
public Column getTargetColumn();
|
|
40
|
+
public void setTargetColumn(Column column);
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
private static enum Unit
|
|
44
|
+
{
|
|
45
|
+
HOUR(60*60),
|
|
46
|
+
DAY(24*60*60);
|
|
47
|
+
//WEEK
|
|
48
|
+
//MONTH,
|
|
49
|
+
//YEAR;
|
|
50
|
+
|
|
51
|
+
private final int unit;
|
|
52
|
+
|
|
53
|
+
private Unit(int unit)
|
|
54
|
+
{
|
|
55
|
+
this.unit = unit;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
public long utcPartition(long seconds)
|
|
59
|
+
{
|
|
60
|
+
return seconds / unit;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
public static Unit of(String s)
|
|
64
|
+
{
|
|
65
|
+
switch (s) {
|
|
66
|
+
case "hour": return HOUR;
|
|
67
|
+
case "day": return DAY;
|
|
68
|
+
//case "week": return WEEK;
|
|
69
|
+
//case "month": return MONTH;
|
|
70
|
+
//case "year": return YEAR;
|
|
71
|
+
default:
|
|
72
|
+
throw new ConfigException(
|
|
73
|
+
String.format("Unknown unit '%s'. Supported units are hour and day"));
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
private static enum UnixTimestampUnit
|
|
79
|
+
{
|
|
80
|
+
SEC(1),
|
|
81
|
+
MILLI(1000),
|
|
82
|
+
MICRO(1000000),
|
|
83
|
+
NANO(1000000000);
|
|
84
|
+
|
|
85
|
+
private final int unit;
|
|
86
|
+
|
|
87
|
+
private UnixTimestampUnit(int unit)
|
|
88
|
+
{
|
|
89
|
+
this.unit = unit;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
public long toSeconds(long v)
|
|
93
|
+
{
|
|
94
|
+
return v / unit;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
public static UnixTimestampUnit of(String s)
|
|
98
|
+
{
|
|
99
|
+
switch (s) {
|
|
100
|
+
case "sec": return SEC;
|
|
101
|
+
case "milli": return MILLI;
|
|
102
|
+
case "micro": return MICRO;
|
|
103
|
+
case "nano": return NANO;
|
|
104
|
+
default:
|
|
105
|
+
throw new ConfigException(
|
|
106
|
+
String.format("Unknown unix_timestamp_unit '%s'. Supported units are sec, milli, micro, and nano"));
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
@Override
|
|
112
|
+
public TaskSource configure(ConfigSource config, Schema schema, int outputTaskCount)
|
|
113
|
+
{
|
|
114
|
+
PartitioningTask task = config.loadConfig(PartitioningTask.class);
|
|
115
|
+
Column column = findColumnByName(schema, task.getColumn());
|
|
116
|
+
|
|
117
|
+
if (!task.getTimeZone().equals(DateTimeZone.UTC)) {
|
|
118
|
+
// TODO
|
|
119
|
+
throw new ConfigException("Timestamp partitioner supports only UTC time zone for now");
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// validate unit
|
|
123
|
+
Unit.of(task.getUnit());
|
|
124
|
+
|
|
125
|
+
// validate type
|
|
126
|
+
if (column.getType() instanceof TimestampType) {
|
|
127
|
+
// ok
|
|
128
|
+
} else if (column.getType() instanceof LongType) {
|
|
129
|
+
// validate unix_timestamp_unit
|
|
130
|
+
UnixTimestampUnit.of(task.getUnixTimestamp());
|
|
131
|
+
} else {
|
|
132
|
+
throw new ConfigException(
|
|
133
|
+
String.format("Partitioning column '%s' must be timestamp or long but got '%s'", column.getName(), column.getType()));
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
task.setTargetColumn(column);
|
|
137
|
+
|
|
138
|
+
return task.dump();
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
private static Column findColumnByName(Schema schema, String columnName)
|
|
142
|
+
{
|
|
143
|
+
for (Column column : schema.getColumns()) {
|
|
144
|
+
if (column.getName().equals(columnName)) {
|
|
145
|
+
return column;
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
throw new ConfigException(
|
|
149
|
+
String.format("Column '%s' is not found in schema", columnName));
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
@Override
|
|
153
|
+
public Partitioner newPartitioner(TaskSource taskSource)
|
|
154
|
+
{
|
|
155
|
+
PartitioningTask task = taskSource.loadTask(PartitioningTask.class);
|
|
156
|
+
|
|
157
|
+
Column column = task.getTargetColumn();
|
|
158
|
+
if (column.getType() instanceof TimestampType) {
|
|
159
|
+
return new TimestampPartitioner(column, Unit.of(task.getUnit()));
|
|
160
|
+
} else if (column.getType() instanceof LongType) {
|
|
161
|
+
return new LongUnixTimestampPartitioner(column, Unit.of(task.getUnit()), UnixTimestampUnit.of(task.getUnixTimestamp()));
|
|
162
|
+
} else {
|
|
163
|
+
throw new AssertionError();
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
private static class LongPartitionKey
|
|
168
|
+
implements PartitionKey
|
|
169
|
+
{
|
|
170
|
+
public static Buffer newKeyBuffer()
|
|
171
|
+
{
|
|
172
|
+
Buffer buffer = Buffer.allocate(8);
|
|
173
|
+
buffer.limit(8);
|
|
174
|
+
return buffer;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
private long value;
|
|
178
|
+
|
|
179
|
+
public LongPartitionKey()
|
|
180
|
+
{ }
|
|
181
|
+
|
|
182
|
+
private LongPartitionKey(long value)
|
|
183
|
+
{
|
|
184
|
+
this.value = value;
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
public void set(long value)
|
|
188
|
+
{
|
|
189
|
+
this.value = value;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
@Override
|
|
193
|
+
public void dump(Buffer buffer)
|
|
194
|
+
{
|
|
195
|
+
// TODO optimize
|
|
196
|
+
buffer.array()[0] = (byte) (((int) (value >>> 0)) & 0xff);
|
|
197
|
+
buffer.array()[1] = (byte) (((int) (value >>> 4)) & 0xff);
|
|
198
|
+
buffer.array()[2] = (byte) (((int) (value >>> 8)) & 0xff);
|
|
199
|
+
buffer.array()[3] = (byte) (((int) (value >>> 12)) & 0xff);
|
|
200
|
+
buffer.array()[4] = (byte) (((int) (value >>> 16)) & 0xff);
|
|
201
|
+
buffer.array()[5] = (byte) (((int) (value >>> 20)) & 0xff);
|
|
202
|
+
buffer.array()[6] = (byte) (((int) (value >>> 24)) & 0xff);
|
|
203
|
+
buffer.array()[7] = (byte) (((int) (value >>> 28)) & 0xff);
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
@Override
|
|
207
|
+
public LongPartitionKey clone()
|
|
208
|
+
{
|
|
209
|
+
return new LongPartitionKey(value);
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
@Override
|
|
213
|
+
public boolean equals(Object other)
|
|
214
|
+
{
|
|
215
|
+
if (!(other instanceof LongPartitionKey)) {
|
|
216
|
+
return false;
|
|
217
|
+
}
|
|
218
|
+
LongPartitionKey o = (LongPartitionKey) other;
|
|
219
|
+
return value == o.value;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
@Override
|
|
223
|
+
public int hashCode()
|
|
224
|
+
{
|
|
225
|
+
return (int) (value ^ (value >>> 32));
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
private static abstract class AbstractTimestampPartitioner
|
|
230
|
+
implements Partitioner
|
|
231
|
+
{
|
|
232
|
+
protected final Column column;
|
|
233
|
+
protected final Unit unit;
|
|
234
|
+
private final LongPartitionKey key;
|
|
235
|
+
|
|
236
|
+
public AbstractTimestampPartitioner(Column column, Unit unit)
|
|
237
|
+
{
|
|
238
|
+
this.column = column;
|
|
239
|
+
this.unit = unit;
|
|
240
|
+
this.key = new LongPartitionKey();
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
@Override
|
|
244
|
+
public Buffer newKeyBuffer()
|
|
245
|
+
{
|
|
246
|
+
return LongPartitionKey.newKeyBuffer();
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
protected LongPartitionKey updateKey(long v)
|
|
250
|
+
{
|
|
251
|
+
key.set(v);
|
|
252
|
+
return key;
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
private static class TimestampPartitioner
|
|
257
|
+
extends AbstractTimestampPartitioner
|
|
258
|
+
{
|
|
259
|
+
public TimestampPartitioner(Column column, Unit unit)
|
|
260
|
+
{
|
|
261
|
+
super(column, unit);
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
@Override
|
|
265
|
+
public PartitionKey updateKey(PageReader record)
|
|
266
|
+
{
|
|
267
|
+
Timestamp v = record.getTimestamp(column);
|
|
268
|
+
return super.updateKey(unit.utcPartition(v.getEpochSecond()));
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
private static class LongUnixTimestampPartitioner
|
|
273
|
+
extends AbstractTimestampPartitioner
|
|
274
|
+
{
|
|
275
|
+
private final UnixTimestampUnit unixTimestampUnit;
|
|
276
|
+
|
|
277
|
+
public LongUnixTimestampPartitioner(Column column, Unit unit,
|
|
278
|
+
UnixTimestampUnit unixTimestampUnit)
|
|
279
|
+
{
|
|
280
|
+
super(column, unit);
|
|
281
|
+
this.unixTimestampUnit = unixTimestampUnit;
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
@Override
|
|
285
|
+
public PartitionKey updateKey(PageReader record)
|
|
286
|
+
{
|
|
287
|
+
long v = record.getLong(column);
|
|
288
|
+
return super.updateKey(unit.utcPartition(unixTimestampUnit.toSeconds(v)));
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
}
|
metadata
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: embulk-executor-mapreduce
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Sadayuki Furuhashi
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2015-04-08 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: Executes tasks on Hadoop.
|
|
14
|
+
email:
|
|
15
|
+
- frsyuki@gmail.com
|
|
16
|
+
executables: []
|
|
17
|
+
extensions: []
|
|
18
|
+
extra_rdoc_files: []
|
|
19
|
+
files:
|
|
20
|
+
- build.gradle
|
|
21
|
+
- lib/embulk/executor/mapreduce.rb
|
|
22
|
+
- src/main/java/org/embulk/executor/mapreduce/AttemptState.java
|
|
23
|
+
- src/main/java/org/embulk/executor/mapreduce/BufferWritable.java
|
|
24
|
+
- src/main/java/org/embulk/executor/mapreduce/BufferedPagePartitioner.java
|
|
25
|
+
- src/main/java/org/embulk/executor/mapreduce/EmbulkInputFormat.java
|
|
26
|
+
- src/main/java/org/embulk/executor/mapreduce/EmbulkInputSplit.java
|
|
27
|
+
- src/main/java/org/embulk/executor/mapreduce/EmbulkMapReduce.java
|
|
28
|
+
- src/main/java/org/embulk/executor/mapreduce/EmbulkPartitioningMapReduce.java
|
|
29
|
+
- src/main/java/org/embulk/executor/mapreduce/EmbulkRecordReader.java
|
|
30
|
+
- src/main/java/org/embulk/executor/mapreduce/MapReduceExecutor.java
|
|
31
|
+
- src/main/java/org/embulk/executor/mapreduce/MapReduceExecutorTask.java
|
|
32
|
+
- src/main/java/org/embulk/executor/mapreduce/PageWritable.java
|
|
33
|
+
- src/main/java/org/embulk/executor/mapreduce/PartitionKey.java
|
|
34
|
+
- src/main/java/org/embulk/executor/mapreduce/Partitioner.java
|
|
35
|
+
- src/main/java/org/embulk/executor/mapreduce/Partitioning.java
|
|
36
|
+
- src/main/java/org/embulk/executor/mapreduce/PluginArchive.java
|
|
37
|
+
- src/main/java/org/embulk/executor/mapreduce/RemoteTaskFailedException.java
|
|
38
|
+
- src/main/java/org/embulk/executor/mapreduce/SetContextClassLoader.java
|
|
39
|
+
- src/main/java/org/embulk/executor/mapreduce/TimestampPartitioning.java
|
|
40
|
+
- classpath/activation-1.1.jar
|
|
41
|
+
- classpath/apacheds-i18n-2.0.0-M15.jar
|
|
42
|
+
- classpath/apacheds-kerberos-codec-2.0.0-M15.jar
|
|
43
|
+
- classpath/api-asn1-api-1.0.0-M20.jar
|
|
44
|
+
- classpath/api-util-1.0.0-M20.jar
|
|
45
|
+
- classpath/avro-1.7.4.jar
|
|
46
|
+
- classpath/commons-beanutils-1.7.0.jar
|
|
47
|
+
- classpath/commons-cli-1.2.jar
|
|
48
|
+
- classpath/commons-codec-1.6.jar
|
|
49
|
+
- classpath/commons-collections-3.2.1.jar
|
|
50
|
+
- classpath/commons-compress-1.4.1.jar
|
|
51
|
+
- classpath/commons-configuration-1.6.jar
|
|
52
|
+
- classpath/commons-digester-1.8.jar
|
|
53
|
+
- classpath/commons-httpclient-3.1.jar
|
|
54
|
+
- classpath/commons-io-2.4.jar
|
|
55
|
+
- classpath/commons-lang-2.6.jar
|
|
56
|
+
- classpath/commons-logging-1.1.3.jar
|
|
57
|
+
- classpath/commons-math3-3.1.1.jar
|
|
58
|
+
- classpath/commons-net-3.1.jar
|
|
59
|
+
- classpath/curator-client-2.6.0.jar
|
|
60
|
+
- classpath/curator-framework-2.6.0.jar
|
|
61
|
+
- classpath/curator-recipes-2.6.0.jar
|
|
62
|
+
- classpath/embulk-executor-mapreduce-0.1.0.jar
|
|
63
|
+
- classpath/gson-2.2.4.jar
|
|
64
|
+
- classpath/hadoop-annotations-2.6.0.jar
|
|
65
|
+
- classpath/hadoop-auth-2.6.0.jar
|
|
66
|
+
- classpath/hadoop-client-2.6.0.jar
|
|
67
|
+
- classpath/hadoop-common-2.6.0.jar
|
|
68
|
+
- classpath/hadoop-hdfs-2.6.0.jar
|
|
69
|
+
- classpath/hadoop-mapreduce-client-app-2.6.0.jar
|
|
70
|
+
- classpath/hadoop-mapreduce-client-common-2.6.0.jar
|
|
71
|
+
- classpath/hadoop-mapreduce-client-core-2.6.0.jar
|
|
72
|
+
- classpath/hadoop-mapreduce-client-jobclient-2.6.0.jar
|
|
73
|
+
- classpath/hadoop-mapreduce-client-shuffle-2.6.0.jar
|
|
74
|
+
- classpath/hadoop-yarn-api-2.6.0.jar
|
|
75
|
+
- classpath/hadoop-yarn-client-2.6.0.jar
|
|
76
|
+
- classpath/hadoop-yarn-common-2.6.0.jar
|
|
77
|
+
- classpath/hadoop-yarn-server-common-2.6.0.jar
|
|
78
|
+
- classpath/hadoop-yarn-server-nodemanager-2.6.0.jar
|
|
79
|
+
- classpath/htrace-core-3.0.4.jar
|
|
80
|
+
- classpath/httpclient-4.2.5.jar
|
|
81
|
+
- classpath/httpcore-4.2.4.jar
|
|
82
|
+
- classpath/jackson-core-asl-1.9.13.jar
|
|
83
|
+
- classpath/jackson-jaxrs-1.9.13.jar
|
|
84
|
+
- classpath/jackson-mapper-asl-1.9.13.jar
|
|
85
|
+
- classpath/jackson-xc-1.9.13.jar
|
|
86
|
+
- classpath/jaxb-api-2.2.2.jar
|
|
87
|
+
- classpath/jaxb-impl-2.2.3-1.jar
|
|
88
|
+
- classpath/jersey-client-1.9.jar
|
|
89
|
+
- classpath/jersey-core-1.9.jar
|
|
90
|
+
- classpath/jersey-guice-1.9.jar
|
|
91
|
+
- classpath/jersey-json-1.9.jar
|
|
92
|
+
- classpath/jersey-server-1.9.jar
|
|
93
|
+
- classpath/jettison-1.1.jar
|
|
94
|
+
- classpath/jetty-util-6.1.26.jar
|
|
95
|
+
- classpath/jline-0.9.94.jar
|
|
96
|
+
- classpath/jsr305-1.3.9.jar
|
|
97
|
+
- classpath/leveldbjni-all-1.8.jar
|
|
98
|
+
- classpath/netty-3.7.0.Final.jar
|
|
99
|
+
- classpath/paranamer-2.3.jar
|
|
100
|
+
- classpath/protobuf-java-2.5.0.jar
|
|
101
|
+
- classpath/servlet-api-2.5.jar
|
|
102
|
+
- classpath/snappy-java-1.0.4.1.jar
|
|
103
|
+
- classpath/stax-api-1.0-2.jar
|
|
104
|
+
- classpath/xmlenc-0.52.jar
|
|
105
|
+
- classpath/xz-1.0.jar
|
|
106
|
+
- classpath/zookeeper-3.4.6.jar
|
|
107
|
+
homepage: https://github.com/embulk/embulk-executor-mapreduce
|
|
108
|
+
licenses:
|
|
109
|
+
- Apache 2.0
|
|
110
|
+
metadata: {}
|
|
111
|
+
post_install_message:
|
|
112
|
+
rdoc_options: []
|
|
113
|
+
require_paths:
|
|
114
|
+
- lib
|
|
115
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
116
|
+
requirements:
|
|
117
|
+
- - '>='
|
|
118
|
+
- !ruby/object:Gem::Version
|
|
119
|
+
version: '0'
|
|
120
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
121
|
+
requirements:
|
|
122
|
+
- - '>='
|
|
123
|
+
- !ruby/object:Gem::Version
|
|
124
|
+
version: '0'
|
|
125
|
+
requirements: []
|
|
126
|
+
rubyforge_project:
|
|
127
|
+
rubygems_version: 2.1.9
|
|
128
|
+
signing_key:
|
|
129
|
+
specification_version: 4
|
|
130
|
+
summary: MapReduce executor plugin for Embulk
|
|
131
|
+
test_files: []
|