wukong-storm 0.1.1 → 0.2.0
- data/.gitignore +1 -0
- data/.rspec +1 -2
- data/Gemfile +1 -1
- data/README.md +174 -18
- data/bin/wu-bolt +4 -0
- data/lib/wukong-storm.rb +50 -10
- data/lib/wukong-storm/bolt_driver.rb +81 -0
- data/lib/wukong-storm/bolt_runner.rb +44 -0
- data/lib/wukong-storm/storm_invocation.rb +386 -0
- data/lib/wukong-storm/storm_runner.rb +123 -0
- data/lib/wukong-storm/version.rb +1 -1
- data/lib/wukong-storm/wukong-storm.jar +0 -0
- data/pom.xml +111 -0
- data/spec/spec_helper.rb +13 -1
- data/spec/wukong-storm/bolt_driver_spec.rb +46 -0
- data/spec/wukong-storm/storm_invocation_spec.rb +204 -0
- data/spec/wukong-storm/storm_runner_spec.rb +76 -0
- data/spec/{wu_storm_spec.rb → wukong-storm/wu-bolt_spec.rb} +14 -14
- data/spec/wukong-storm/wu-storm_spec.rb +17 -0
- data/spec/wukong-storm_spec.rb +5 -0
- data/src/main/java/com/infochimps/wukong/storm/Builder.java +53 -0
- data/src/main/java/com/infochimps/wukong/storm/DataflowBuilder.java +74 -0
- data/src/main/java/com/infochimps/wukong/storm/SpoutBuilder.java +237 -0
- data/src/main/java/com/infochimps/wukong/storm/StateBuilder.java +46 -0
- data/src/main/java/com/infochimps/wukong/storm/TopologyBuilder.java +130 -0
- data/src/main/java/com/infochimps/wukong/storm/TopologySubmitter.java +181 -0
- data/wukong-storm.gemspec +3 -2
- metadata +49 -11
- data/lib/wukong-storm/driver.rb +0 -58
- data/lib/wukong-storm/runner.rb +0 -40
data/spec/wukong-storm/storm_runner_spec.rb
@@ -0,0 +1,76 @@
+require 'spec_helper'
+
+describe Wukong::Storm::StormRunner do
+
+  before do
+    @producer = double("Kafka::Producer", push: true)
+    Kafka::Producer.stub(:new).and_return(@producer)
+  end
+
+  describe "validating a topology about to be launched" do
+    it "raises an error without a dataflow (or an explicit --bolt_command) to run" do
+      expect { storm_runner('--input=foo', '--output=bar') }.to raise_error(Wukong::Error, /processor.*dataflow.*run/i)
+    end
+
+    it "raises an error on a non-existing dataflow" do
+      expect { storm_runner('definitelyNotGonnaBeThere', '--input=foo', '--output=bar') }.to raise_error(Wukong::Error, /definitelyNotGonnaBeThere/)
+    end
+
+    context "reading and writing from Kafka" do
+      it "raises an error without an --input topic" do
+        expect { storm_runner('identity', '--output=bar') }.to raise_error(Wukong::Error, /input.*required/i)
+      end
+
+      it "raises an error without an --output topic" do
+        expect { storm_runner('identity', '--input=foo') }.to raise_error(Wukong::Error, /output.*required/i)
+      end
+
+      it "raises an error when --kafka_hosts is empty or missing" do
+        expect { storm_runner('identity', '--input=foo', '--output=bar', '--kafka_hosts=') }.to raise_error(Wukong::Error, /kafka.*host/i)
+      end
+    end
+
+    context "reading from S3 and writing to Kafka" do
+      it "raises an error without a path" do
+        expect { storm_runner('identity', '--input=s3://foo', '--output=baz', '--aws_key=key', '--aws_secret=secret') }.to raise_error(Wukong::Error, /s3.*path/i)
+      end
+      it "raises an error without an AWS access key" do
+        expect { storm_runner('identity', '--input=s3://foo/bar', '--output=baz', '--aws_secret=secret') }.to raise_error(Wukong::Error, /aws.*key/i)
+      end
+
+      it "raises an error without an AWS secret key" do
+        expect { storm_runner('identity', '--input=s3://foo/bar', '--output=baz', '--aws_key=key') }.to raise_error(Wukong::Error, /aws.*secret/i)
+      end
+
+      it "raises an error on an invalid AWS region" do
+        expect { storm_runner('identity', '--input=s3://foo/bar', '--output=baz', '--aws_key=key', '--aws_secret=secret', '--aws_region=us-east-7') }.to raise_error(Wukong::Error, /aws.*region/i)
+      end
+
+    end
+  end
+
+  describe "setting up for a topology about to be launched" do
+    context "when reading from Kafka" do
+      it "ensures the Kafka input topic exists" do
+        Kafka::Producer.should_receive(:new).with(host: 'localhost', port: 9092, topic: 'foo')
+        @producer.should_receive(:push).with([])
+        storm_runner('identity', '--input=foo', '--output=bar')
+      end
+    end
+  end
+
+  describe "killing a running topology before launching a new one" do
+    it "will not try to kill a previously running topology first" do
+      storm_runner('identity', '--input=foo', '--output=bar', '--wait=1') do
+        should_not_receive(:execute_command).with(/storm.*kill/)
+      end
+    end
+
+    it "will try to kill a previously running topology if asked" do
+      storm_runner('identity', '--rm', '--input=foo', '--output=bar', '--wait=1') do
+        should_receive(:execute_command).with(/storm.*kill/)
+      end
+    end
+  end
+end
+
data/spec/{wu_storm_spec.rb → wukong-storm/wu-bolt_spec.rb}
@@ -8,47 +8,47 @@ Wu.processor(:test) do
 
 end
 
-describe 'wu-
-  let(:examples) { File.expand_path('
+describe 'wu-bolt' do
+  let(:examples) { File.expand_path('../../support/examples.rb', __FILE__) }
 
   context 'without any arguments' do
-    subject {
+    subject { wu_bolt }
     it { should exit_with(:non_zero) }
-    it { should have_stderr(/provide a
+    it { should have_stderr(/provide a.*dataflow.*run/) }
   end
 
   context 'with a simple processor' do
     let(:input) { 'one event' }
-    subject {
+    subject { wu_bolt(examples, '--run=simple') < input }
     it { should exit_with(0) }
-    it { should have_stdout("one event\n
+    it { should have_stdout("one event\nX\n") }
   end
 
   context 'with a skipped processor' do
     let(:input) { 'never see this' }
-    subject {
+    subject { wu_bolt(examples, '--run=skipped') < input }
     it { should exit_with(0) }
-    it { should have_stdout("
+    it { should have_stdout("X\n") }
  end
 
   context 'with a duplicating processor' do
     let(:input) { 'foo' }
-    subject {
+    subject { wu_bolt(examples, '--run=multi') < input }
     it { should exit_with(0) }
-    it { should have_stdout("foo\nfoo\nfoo\n
+    it { should have_stdout("foo\nfoo\nfoo\nX\n") }
   end
 
   context 'with a flow' do
     let(:input) { '{"foo":"bar"}' }
-    subject {
+    subject { wu_bolt(examples, '--run=flow') < input }
     it { should exit_with(0) }
-    it { should have_stdout("I raised the bar\n
+    it { should have_stdout("I raised the bar\nX\n") }
   end
 
   context 'with multiple arguments' do
     let(:input) { "foo\nbar\nbaz" }
-    subject {
+    subject { wu_bolt(examples, '--run=simple') < input }
     it { should exit_with(0) }
-    it { should have_stdout("foo\
+    it { should have_stdout("foo\nX\nbar\nX\nbaz\nX\n") }
   end
 end
data/spec/wukong-storm/wu-storm_spec.rb
@@ -0,0 +1,17 @@
+require 'spec_helper'
+
+describe 'wu-storm' do
+  context "without any arguments" do
+    let(:subject) { command('wu-storm') }
+    it { should exit_with(:non_zero) }
+    it "displays help on STDERR" do
+      should have_stderr(/processor.*dataflow.*run.*bolt_command/i)
+    end
+  end
+
+  context "in --dry_run mode" do
+    let(:subject) { command('wu-storm', 'identity', "--input=foo", "--output=foo", "--dry_run") }
+    it { should exit_with(0) }
+    it { should have_stdout(/storm.*jar/, /TopologySubmitter/, /wu-bolt.*identity/) }
+  end
+end
data/src/main/java/com/infochimps/wukong/storm/Builder.java
@@ -0,0 +1,53 @@
+package com.infochimps.wukong.storm;
+
+import java.util.List;
+import java.util.ArrayList;
+
+import org.apache.log4j.Logger;
+
+public class Builder {
+
+    static Logger LOG = Logger.getLogger(StateBuilder.class);
+
+    public Builder() {
+    }
+
+    public Boolean valid() {
+        return true;
+    }
+
+    public void logInfo() {
+    }
+
+    static public String usage() {
+        return "";
+    }
+
+    public static String ZOOKEEPER_HOSTS = "wukong.zookeeper.hosts";
+    public static String DEFAULT_ZOOKEEPER_HOSTS = "localhost";
+    public String zookeeperHosts() {
+        return prop(ZOOKEEPER_HOSTS, DEFAULT_ZOOKEEPER_HOSTS);
+    }
+
+    public static String KAFKA_HOSTS = "wukong.kafka.hosts";
+    public static String DEFAULT_KAFKA_HOSTS = "localhost";
+    public List<String> kafkaHosts() {
+        ArrayList<String> kh = new ArrayList();
+        for (String host : prop(KAFKA_HOSTS, DEFAULT_KAFKA_HOSTS).split(",")) {
+            kh.add(host);
+        }
+        return kh;
+    }
+
+    public String prop(String key, String defaultValue) {
+        if (System.getProperty(key) == null) {
+            System.setProperty(key, defaultValue);
+        }
+        return prop(key);
+    }
+
+    public String prop(String key) {
+        return System.getProperty(key);
+    }
+
+}
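All of the Java builders above are configured through JVM system properties, and prop(key, defaultValue) both reads a property and back-fills it with the default when it is unset. The sketch below is not part of the gem (the demo class name is invented); it only illustrates that defaulting behavior, assuming Builder.java above is on the classpath.

package com.infochimps.wukong.storm;

// Hypothetical demo class (not shipped with wukong-storm): exercises Builder's
// property-defaulting pattern. Run with, for example:
//   java -Dwukong.kafka.hosts=kafka1:9092,kafka2:9092 ... PropDefaultsDemo
public class PropDefaultsDemo {
    public static void main(String[] args) {
        Builder builder = new Builder();
        // Falls back to "localhost" when -Dwukong.zookeeper.hosts is absent;
        // note that prop(key, default) also sets the property as a side effect.
        System.out.println("zookeeper hosts: " + builder.zookeeperHosts());
        // A comma-separated -Dwukong.kafka.hosts value is split into a List<String>.
        System.out.println("kafka hosts:     " + builder.kafkaHosts());
    }
}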
data/src/main/java/com/infochimps/wukong/storm/DataflowBuilder.java
@@ -0,0 +1,74 @@
+package com.infochimps.wukong.storm;
+
+import org.apache.log4j.Logger;
+
+import com.infochimps.storm.wukong.WuFunction;
+
+public class DataflowBuilder extends Builder {
+
+    static Logger LOG = Logger.getLogger(DataflowBuilder.class);
+
+    private SpoutBuilder spoutBuilder;
+
+    public DataflowBuilder(SpoutBuilder spoutBuilder) {
+        this.spoutBuilder = spoutBuilder;
+    }
+
+    @Override
+    public Boolean valid() {
+        if (dataflowName() == null) {
+            LOG.error("Must set a dataflow name using the " + DATAFLOW_NAME + " property");
+            return false;
+        };
+        return true;
+    }
+
+    @Override
+    public void logInfo() {
+        LOG.info("DATAFLOW: Launching Wukong dataflow <" + dataflowName() + "> with parallelism " + dataflowParallelism() + " in environment <" + dataflowEnv() + ">" );
+    }
+
+    public static String usage() {
+        String s = "DATAFLOW OPTIONS\n"
+            + "\n"
+            + "The following options can be applied to the dataflow connecting the spout to the state:\n"
+            + "\n"
+            + " " + String.format("%10s", DATAFLOW_NAME) + " Name of the Wukong dataflow to launch (Required)\n"
+            + " " + String.format("%10s", DATAFLOW_ENV) + " Wukong environment (Default: " + DEFAULT_DATAFLOW_ENV + ")\n"
+            + " " + String.format("%10s", BOLT_COMMAND) + " The command-line to execute within a Storm bolt (Required)\n"
+            + " " + String.format("%10s", DATAFLOW_DIRECTORY) + " The directory within which to execute the command-line (Default: " + DEFAULT_DATAFLOW_DIRECTORY + ")\n"
+            + " " + String.format("%10s", DATAFLOW_PARALLELISM) + " Parallelism hint for Wukong dataflow Trident function (Default: same as --input_parallelism)\n";
+        return s;
+    }
+
+    public WuFunction dataflow() {
+        return new WuFunction(dataflowName(), subprocessDirectory(), dataflowEnv());
+    }
+
+    public static String DATAFLOW_DIRECTORY = "wukong.directory";
+    public static String DEFAULT_DATAFLOW_DIRECTORY = System.getProperty("user.dir");
+    public String subprocessDirectory() {
+        return prop(DATAFLOW_DIRECTORY, DEFAULT_DATAFLOW_DIRECTORY);
+    }
+
+    public static String DATAFLOW_NAME = "wukong.dataflow";
+    public String dataflowName() {
+        return prop(DATAFLOW_NAME);
+    }
+
+    // This is actually used directly by WuFunction but it's listed
+    // here for completeness since it is set by the Ruby code.
+    public static String BOLT_COMMAND = "wukong.command";
+
+    public static String DATAFLOW_ENV = "wukong.environment";
+    public static String DEFAULT_DATAFLOW_ENV = "development";
+    public String dataflowEnv() {
+        return prop(DATAFLOW_ENV, DEFAULT_DATAFLOW_ENV);
+    }
+
+    public static String DATAFLOW_PARALLELISM = "wukong.parallelism";
+    public int dataflowParallelism() {
+        return Integer.parseInt(prop(DATAFLOW_PARALLELISM, Integer.toString(spoutBuilder.inputParallelism())));
+    }
+
+}
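For reference, a quick sketch of which properties DataflowBuilder consults (again not from the gem; the demo class name and property values are invented): valid() only requires wukong.dataflow, wukong.environment defaults to "development", and the dataflow parallelism falls back to the spout's wukong.input.parallelism.

package com.infochimps.wukong.storm;

// Hypothetical demo class (not part of the gem); values below are illustrative.
public class DataflowPropsDemo {
    public static void main(String[] args) {
        DataflowBuilder dataflow = new DataflowBuilder(new SpoutBuilder());

        System.out.println(dataflow.valid());          // false: wukong.dataflow is unset

        System.setProperty(DataflowBuilder.DATAFLOW_NAME, "identity");
        System.setProperty(DataflowBuilder.BOLT_COMMAND, "wu-bolt identity"); // illustrative value
        System.out.println(dataflow.valid());               // true: only the name is checked here
        System.out.println(dataflow.dataflowEnv());         // "development" by default
        System.out.println(dataflow.dataflowParallelism()); // falls back to wukong.input.parallelism (1)
    }
}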
data/src/main/java/com/infochimps/wukong/storm/SpoutBuilder.java
@@ -0,0 +1,237 @@
+package com.infochimps.wukong.storm;
+
+import java.lang.IllegalArgumentException;
+
+import org.apache.log4j.Logger;
+
+import backtype.storm.spout.SchemeAsMultiScheme;
+
+import storm.trident.spout.IOpaquePartitionedTridentSpout;
+
+import storm.kafka.KafkaConfig;
+import storm.kafka.StringScheme;
+import storm.kafka.trident.OpaqueTridentKafkaSpout;
+import storm.kafka.trident.TridentKafkaConfig;
+
+import com.infochimps.storm.trident.spout.OpaqueTransactionalBlobSpout;
+import com.infochimps.storm.trident.spout.StartPolicy;
+import com.infochimps.storm.trident.spout.WukongRecordizer;
+import com.infochimps.storm.trident.spout.IBlobStore;
+import com.infochimps.storm.trident.spout.S3BlobStore;
+import com.infochimps.storm.trident.spout.FileBlobStore;
+
+public class SpoutBuilder extends Builder {
+
+    static Logger LOG = Logger.getLogger(SpoutBuilder.class);
+
+    @Override
+    public Boolean valid() {
+        if (spoutType().equals(KAFKA_SPOUT_TYPE)) {
+            if (kafkaInputTopic() == null) {
+                LOG.error("Must set an input topic name using the " + KAFKA_INPUT_TOPIC + " property when using a Kafka spout");
+                return false;
+            };
+        }
+        if (spoutType().equals(BLOB_SPOUT_TYPE)) {
+            if (blobStorePath() == null) {
+                LOG.error("Must set a path using the " + BLOB_STORE_PATH + " property when using a blob store spout");
+                return false;
+            };
+            if (blobStoreType().equals(S3_BLOB_TYPE)) {
+                if (s3Bucket() == null) {
+                    LOG.error("Must set an S3 bucket using the " + S3_BUCKET + " property when using the S3 spout");
+                    return false;
+                };
+                if (awsKey() == null) {
+                    LOG.error("Must set an AWS access key using the " + AWS_KEY + " property when using the S3 spout");
+                    return false;
+                };
+                if (awsSecret() == null) {
+                    LOG.error("Must set an AWS secret key using the " + AWS_SECRET + " property when using the S3 spout");
+                    return false;
+                };
+            }
+        }
+        return true;
+    }
+
+    @Override
+    public void logInfo() {
+        if (spoutType().equals(BLOB_SPOUT_TYPE)) {
+            if (blobStoreType().equals(S3_BLOB_TYPE)) {
+                LOG.info("SPOUT: Reading from S3 bucket s3://" + s3Bucket() + " at path /" + blobStorePath() + ", using AWS key " + awsKey());
+            } else {
+                LOG.info("SPOUT: Reading from local file file:///" + blobStorePath());
+            }
+        } else {
+            LOG.info("SPOUT: Reading from offset " + kafkaInputOffset() + " of Kafka topic <" + kafkaInputTopic() + "> in batches of " + kafkaInputBatchSize() + " with parallelism " + inputParallelism());
+        }
+    }
+
+    public static String usage() {
+        String s = "SPOUT OPTIONS\n"
+            + "\n"
+            + "Choose the spout with he following properties. Each spout has its own further\n"
+            + "configuration\n"
+            + "\n"
+            + " Kafka Spout -- " + SPOUT_TYPE + "=" + KAFKA_SPOUT_TYPE + "\n"
+            + " BlobStore Spout -- " + SPOUT_TYPE + "=" + BLOB_TYPE + "\n"
+            + " Filesystem Spout -- " + BLOB_SPOUT_TYPE + "=" + FILE_BLOB_TYPE + "\n"
+            + " S3 Spout -- " + BLOB_SPOUT_TYPE + "=" + S3_BLOB_TYPE + "\n"
+            + "\n"
+            + "The following options apply for the Kafka spout (" + SPOUT_TYPE + "=" + KAFKA_SPOUT_TYPE + "):\n"
+            + "\n"
+            + " " + String.format("%10s", INPUT_PARALLELISM) + " Parallelism hint for the spout (Default: " + DEFAULT_INPUT_PARALLELISM + ")\n"
+            + " " + String.format("%10s", KAFKA_INPUT_TOPIC) + " Name of the Kafka topic to read input from"
+            + " " + String.format("%10s", KAFKA_INPUT_OFFSET) + " Offset from which to start consuming from the input topic, one of: -1 = 'end', -2 = 'beginning', or an explicit byte offset. (Default: resume if possible, else '1')\n"
+            + " " + String.format("%10s", KAFKA_INPUT_PARTITIONS) + " Number of Storm partitions to use. Should match the number of partitions on the input topic. (Default: " + DEFAULT_KAFKA_INPUT_PARTITIONS + ")\n"
+            + " " + String.format("%10s", KAFKA_INPUT_BATCH) + " Batch size to fetch from Kafka (Default: " + DEFAULT_KAFKA_INPUT_BATCH + ")\n"
+            + "\n"
+            + "The following options apply for all BlobStore spouts (" + SPOUT_TYPE + "=" + BLOB_TYPE + "):\n"
+            + "\n"
+            + " " + String.format("%10s", BLOB_STORE_PATH) + " Directory to read from (Required)\n"
+            + " " + String.format("%10s", BLOB_START) + " Starting policy, one of: EARLIEST, LATEST, EXPLICIT, or RESUME. (Default: 'RESUME' if possible, else 'LATEST')\n"
+            + " " + String.format("%10s", BLOB_MARKER) + " Required name of marker for an EXPLICIT starting policy\n"
+            + "\n"
+            + "The following options apply for the S3 spout (" + BLOB_SPOUT_TYPE + "=" + S3_BLOB_TYPE + "):\n"
+            + "\n"
+            + " " + String.format("%10s", S3_BUCKET) + " S3 bucket (Required)\n"
+            + " " + String.format("%10s", AWS_KEY) + " AWS access key (Required)\n"
+            + " " + String.format("%10s", AWS_SECRET) + " AWS secret key (Required)\n";
+        return s;
+    }
+
+    public IOpaquePartitionedTridentSpout spout() {
+        if (spoutType().equals(BLOB_SPOUT_TYPE)) {
+            return new OpaqueTransactionalBlobSpout(blobStore(), new WukongRecordizer(), blobStart(), blobMarker());
+        } else {
+            return new OpaqueTridentKafkaSpout(kafkaSpoutConfig());
+        }
+    }
+
+    private IBlobStore blobStore() {
+        if (blobStoreType().equals(S3_BLOB_TYPE)) {
+            return new S3BlobStore(blobStorePath(), s3Bucket(), s3Endpoint(), awsKey(), awsSecret());
+        } else {
+            return new FileBlobStore(blobStorePath());
+        }
+    }
+
+    private TridentKafkaConfig kafkaSpoutConfig() {
+        TridentKafkaConfig kafkaConfig = new TridentKafkaConfig(KafkaConfig.StaticHosts.fromHostString(kafkaHosts(), kafkaInputPartitions()), kafkaInputTopic());
+        kafkaConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
+        kafkaConfig.fetchSizeBytes = kafkaInputBatchSize();
+        kafkaConfig.forceStartOffsetTime(kafkaInputOffset());
+        return kafkaConfig;
+    }
+
+    public static String INPUT_PARALLELISM = "wukong.input.parallelism";
+    public static String DEFAULT_INPUT_PARALLELISM = "1";
+    public int inputParallelism() {
+        return Integer.parseInt(prop(INPUT_PARALLELISM, DEFAULT_INPUT_PARALLELISM));
+    }
+
+    public static String SPOUT_TYPE = "wukong.input.type";
+    public static String KAFKA_SPOUT_TYPE = "kafka";
+    public static String BLOB_SPOUT_TYPE = "blob";
+    public String spoutType() {
+        if ((prop(SPOUT_TYPE) != null) && prop(SPOUT_TYPE).equals(BLOB_SPOUT_TYPE)) {
+            return BLOB_SPOUT_TYPE;
+        } else {
+            return KAFKA_SPOUT_TYPE;
+        }
+    }
+
+    public Boolean isBlobSpout() {
+        return spoutType().equals(BLOB_SPOUT_TYPE);
+    }
+
+    public Boolean isKafkaSpout() {
+        return spoutType().equals(KAFKA_SPOUT_TYPE);
+    }
+
+    public static String BLOB_STORE_PATH = "wukong.input.blob.path";
+    public String blobStorePath() {
+        return prop(BLOB_STORE_PATH);
+    }
+
+    public static String BLOB_TYPE = "wukong.input.blob.type";
+    public static String FILE_BLOB_TYPE = "file";
+    public static String S3_BLOB_TYPE = "s3";
+    public String blobStoreType() {
+        if ((prop(BLOB_TYPE) != null) && prop(BLOB_TYPE).equals(S3_BLOB_TYPE)){
+            return S3_BLOB_TYPE;
+        } else {
+            return FILE_BLOB_TYPE;
+        }
+    }
+
+    public Boolean isS3Spout() {
+        return (isBlobSpout() && blobStoreType().equals(S3_BLOB_TYPE));
+    }
+
+    public Boolean isFileSpout() {
+        return (isBlobSpout() && blobStoreType().equals(FILE_BLOB_TYPE));
+    }
+
+    public static String BLOB_START = "wukong.input.blob.start";
+    public static String DEFAULT_BLOB_START = "RESUME";
+    public StartPolicy blobStart() {
+        try {
+            return StartPolicy.valueOf(prop(BLOB_START, DEFAULT_BLOB_START));
+        } catch (IllegalArgumentException e) {
+            return StartPolicy.RESUME;
+        }
+    }
+
+    public static String BLOB_MARKER = "wukong.input.blob.marker";
+    public String blobMarker() {
+        return prop(BLOB_MARKER);
+    }
+
+    public static String S3_BUCKET = "wukong.input.blob.s3_bucket";
+    public String s3Bucket() {
+        return prop(S3_BUCKET);
+    }
+
+    public static String S3_ENDPOINT = "wukong.input.blob.s3_endpoint";
+    public static String DEFAULT_S3_ENDPOINT = "s3.amazonaws.com";
+    public String s3Endpoint() {
+        return prop(S3_ENDPOINT, DEFAULT_S3_ENDPOINT);
+    }
+
+    public static String AWS_KEY = "wukong.input.blob.aws_key";
+    public String awsKey() {
+        return prop(AWS_KEY);
+    }
+
+    public static String AWS_SECRET = "wukong.input.blob.aws_secret";
+    public String awsSecret() {
+        return prop(AWS_SECRET);
+    }
+
+    public static String KAFKA_INPUT_TOPIC = "wukong.input.kafka.topic";
+    public String kafkaInputTopic() {
+        return prop(KAFKA_INPUT_TOPIC);
+    }
+
+    public static String KAFKA_INPUT_OFFSET = "wukong.input.kafka.offset";
+    public static String DEFAULT_KAFKA_INPUT_OFFSET = "-1";
+    public Integer kafkaInputOffset() {
+        return Integer.parseInt(prop(KAFKA_INPUT_OFFSET, DEFAULT_KAFKA_INPUT_OFFSET));
+    }
+
+    public static String KAFKA_INPUT_PARTITIONS = "wukong.input.kafka.partitions";
+    public static String DEFAULT_KAFKA_INPUT_PARTITIONS = "1";
+    public int kafkaInputPartitions() {
+        return Integer.parseInt(prop(KAFKA_INPUT_PARTITIONS, DEFAULT_KAFKA_INPUT_PARTITIONS));
+    }
+
+    public static String KAFKA_INPUT_BATCH = "wukong.input.kafka.batch";
+    public static String DEFAULT_KAFKA_INPUT_BATCH = "1048576";
+    public int kafkaInputBatchSize() {
+        return Integer.parseInt(prop(KAFKA_INPUT_BATCH, DEFAULT_KAFKA_INPUT_BATCH));
+    }
+
+
+}
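Spout selection in SpoutBuilder is likewise property-driven: with nothing set it assumes the Kafka spout, wukong.input.type=blob switches to the BlobStore spout, and wukong.input.blob.type=s3 additionally requires a path, bucket, and AWS credentials before valid() passes. A minimal sketch (demo class and values invented, not shipped with the gem):

package com.infochimps.wukong.storm;

// Hypothetical demo class (not part of the gem): shows how SpoutBuilder picks
// a spout from system properties. Property values below are illustrative.
public class SpoutSelectionDemo {
    public static void main(String[] args) {
        // With no wukong.input.type set, the Kafka spout is assumed.
        System.out.println(new SpoutBuilder().isKafkaSpout());   // true

        // Selecting the S3 blob spout; valid() also wants a path, bucket, and AWS credentials.
        System.setProperty(SpoutBuilder.SPOUT_TYPE, SpoutBuilder.BLOB_SPOUT_TYPE);
        System.setProperty(SpoutBuilder.BLOB_TYPE, SpoutBuilder.S3_BLOB_TYPE);
        System.setProperty(SpoutBuilder.BLOB_STORE_PATH, "some/prefix");
        System.setProperty(SpoutBuilder.S3_BUCKET, "example-bucket");
        System.setProperty(SpoutBuilder.AWS_KEY, "EXAMPLEKEY");
        System.setProperty(SpoutBuilder.AWS_SECRET, "examplesecret");

        SpoutBuilder s3 = new SpoutBuilder();
        System.out.println(s3.isS3Spout());   // true
        System.out.println(s3.valid());       // true once the required S3 properties are present
    }
}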