embulk-input-http 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,196 +1,3 @@
1
- require "net/http"
2
- require "uri"
3
- require "bracecomp"
4
-
5
- module Embulk
6
- module Input
7
-
8
- class HttpInputPlugin < InputPlugin
9
- Plugin.register_input("http", self)
10
-
11
- def self.transaction(config, &control)
12
- task = {
13
- :url => config.param("url", :string),
14
- :method => config.param("method", :string, default: "get"),
15
- :schema => config.param("schema", :array),
16
- :iterate => config.param("iterate", :hash),
17
- :open_timeout => config.param("open_timeout", :float, default: 2.0),
18
- :read_timeout => config.param("read_timeout", :float, default: 10.0),
19
- :done => config.param("done", :array, default: [])
20
- }
21
- params = config.param("params", :array, default: [])
22
- params_unexpand, params_expand = configure_queries(params)
23
-
24
- data_type = task[:iterate]["type"]
25
- unless ["json", "xml"].include?(data_type)
26
- raise "Unknown data_type #{data_type}, only supported for json or xml"
27
- end
28
-
29
- columns = task[:schema].each_with_index.map do |c, i|
30
- Column.new(i, c["name"], c["type"].to_sym)
31
- end
32
-
33
- task[:params] = params_unexpand
34
- task[:params_expand] = params_expand - task[:done]
35
- num_of_threads = task[:params_expand].empty? ? 1 : task[:params_expand].size
36
-
37
- report = yield(task, columns, num_of_threads)
38
- {"done" => report.map{|r| r["done"]}.compact}
39
- end
40
-
41
- def self.configure_queries(params)
42
- base = params.select{|p| !p["expand"]}.map do |p|
43
- [p["name"], p["value"]]
44
- end
45
- expands = params.select{|p| p["expand"] }.map do |p|
46
- p["value"].expand.map do |v|
47
- [p["name"], v]
48
- end
49
- end
50
- if expands.size > 0
51
- dest = expands.first.product(*(expands.slice(1, expands.size - 1)))
52
- dest.sort!{|a, b| "#{a[0]}=#{a[1]}" <=> "#{b[0]}=#{b[1]}"}
53
- else
54
- dest = []
55
- end
56
- [base, dest]
57
- end
58
-
59
- def run
60
- schema = @task["schema"]
61
- iterate = @task["iterate"]
62
- url = @task["url"]
63
- method = @task["method"]
64
-
65
- params_expand = @task["params_expand"][@index] || []
66
- query = URI.encode_www_form(@task["params"] + params_expand)
67
- puts "#{@index}: #{method.upcase} #{url}?#{query}"
68
-
69
- data = fetch(url, method, query).body
70
- data_type = iterate["type"]
71
-
72
- case data_type
73
- when "json"
74
- iter = IterJson.new(schema, data, iterate)
75
- when "xml"
76
- iter = IterXML.new(schema, data, iterate)
77
- else
78
- raise "Unsupported data_type #{data_type}"
79
- end
80
-
81
- iter.each do |record|
82
- @page_builder.add(record)
83
- end
84
- @page_builder.finish
85
-
86
- {:done => params_expand}
87
- end
88
-
89
- private
90
-
91
- def fetch(url, method, query)
92
- uri = URI.parse(url)
93
-
94
- res = Net::HTTP.start(uri.host, uri.port) do |client|
95
- client.open_timeout = @task["open_timeout"]
96
- client.read_timeout = @task["read_timeout"]
97
- case method.downcase
98
- when "get"
99
- client.get([uri.path, query].join("?"))
100
- when "post"
101
- client.post(uri.path, query)
102
- else
103
- raise "Unsupported method #{method}"
104
- end
105
- end
106
-
107
- case res
108
- when Net::HTTPSuccess
109
- res
110
- else
111
- raise "Request is not successful, code=#{res.code}, value=#{res.body}"
112
- end
113
- end
114
-
115
- class Iter
116
- def initialize(schema, data, config)
117
- @schema = schema
118
- @data = data
119
- @config = config
120
- end
121
-
122
- def each
123
- raise NotImplementedError.new("each")
124
- end
125
-
126
- private
127
-
128
- def make_record(e)
129
- @schema.map do |c|
130
- name = c["name"]
131
- path = c["path"]
132
- val = path.nil? ? e[name] : find_by_path(e, path)
133
-
134
- v = val.nil? ? "" : val
135
- type = c["type"]
136
- case type
137
- when "string"
138
- v
139
- when "long"
140
- v.to_i
141
- when "double"
142
- v.to_f
143
- when "boolean"
144
- ["yes", "true", "1"].include?(v)
145
- when "timestamp"
146
- v.empty? ? nil : Time.strptime(v, c["format"])
147
- else
148
- raise "Unsupported type #{type}"
149
- end
150
- end
151
- end
152
-
153
- def find_by_path(e, path)
154
- raise NotImplementedError.new("Find by path is unsupported")
155
- end
156
- end
157
-
158
- class IterXML < Iter
159
- def initialize(schema, data, config)
160
- require "rexml/document"
161
- super
162
- @doc = REXML::Document.new(@data)
163
- end
164
-
165
- def each
166
- @doc.elements.each(@config["path"]) do |e|
167
- dest = {}
168
- e.elements.each do |d|
169
- dest[d.name] = d.text
170
- end
171
- yield make_record(dest)
172
- end
173
- end
174
- end
175
-
176
- class IterJson < Iter
177
- def initialize(schema, data, config)
178
- require "jsonpath"
179
- super
180
- @jsonpath = JsonPath.new(@config["path"])
181
- end
182
-
183
- def each
184
- @jsonpath.on(@data).flatten.each do |e|
185
- yield make_record(e)
186
- end
187
- end
188
-
189
- def find_by_path(e, path)
190
- JsonPath.on(e, path).first
191
- end
192
- end
193
-
194
- end
195
- end
196
- end
1
+ Embulk::JavaPlugin.register_input(
2
+ "http", "org.embulk.input.HttpInputPlugin",
3
+ File.expand_path('../../../../classpath', __FILE__))
@@ -0,0 +1,258 @@
1
+ package org.embulk.input;
2
+
3
+ import com.google.common.base.Optional;
4
+ import com.google.common.base.Throwables;
5
+ import org.apache.http.Header;
6
+ import org.apache.http.HttpException;
7
+ import org.apache.http.HttpResponse;
8
+ import org.apache.http.NameValuePair;
9
+ import org.apache.http.client.HttpClient;
10
+ import org.apache.http.client.config.RequestConfig;
11
+ import org.apache.http.client.entity.UrlEncodedFormEntity;
12
+ import org.apache.http.client.methods.HttpGet;
13
+ import org.apache.http.client.methods.HttpPost;
14
+ import org.apache.http.client.methods.HttpRequestBase;
15
+ import org.apache.http.client.utils.URIBuilder;
16
+ import org.apache.http.impl.client.HttpClientBuilder;
17
+ import org.apache.http.message.BasicHeader;
18
+ import org.apache.http.message.BasicNameValuePair;
19
+ import org.apache.http.util.EntityUtils;
20
+ import org.embulk.config.*;
21
+ import org.embulk.spi.BufferAllocator;
22
+ import org.embulk.spi.Exec;
23
+ import org.embulk.spi.FileInputPlugin;
24
+ import org.embulk.spi.TransactionalFileInput;
25
+ import org.embulk.spi.util.InputStreamFileInput;
26
+ import org.slf4j.Logger;
27
+
28
+ import java.io.IOException;
29
+ import java.io.InputStream;
30
+ import java.io.UnsupportedEncodingException;
31
+ import java.net.URISyntaxException;
32
+ import java.util.ArrayList;
33
+ import java.util.List;
34
+
35
+ public class HttpInputPlugin implements FileInputPlugin {
36
+
37
+ private final Logger logger = Exec.getLogger(getClass());
38
+
39
+ public interface PluginTask extends Task {
40
+ @Config("url")
41
+ public String getUrl();
42
+
43
+ @Config("charset")
44
+ @ConfigDefault("\"utf-8\"")
45
+ public String getCharset();
46
+
47
+ @Config("method")
48
+ @ConfigDefault("\"get\"")
49
+ public String getMethod();
50
+
51
+ @Config("user_agent")
52
+ @ConfigDefault("\"Embulk::Input::HttpInputPlugin\"")
53
+ public String getUserAgent();
54
+
55
+ @Config("open_timeout")
56
+ @ConfigDefault("2000")
57
+ public int getOpenTimeout();
58
+
59
+ @Config("read_timeout")
60
+ @ConfigDefault("10000")
61
+ public int getReadTimeout();
62
+
63
+ @Config("params")
64
+ @ConfigDefault("null")
65
+ public Optional<ParamsConfig> getParams();
66
+
67
+ @ConfigInject
68
+ public BufferAllocator getBufferAllocator();
69
+
70
+ public List<ParamsConfig> getQueries();
71
+ public void setQueries(List<ParamsConfig> queries);
72
+
73
+ public HttpMethod getHttpMethod();
74
+ public void setHttpMethod(HttpMethod httpMethod);
75
+ }
76
+
77
+ public enum HttpMethod {
78
+ POST,
79
+ GET
80
+ }
81
+
82
+ @Override
83
+ public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control) {
84
+ PluginTask task = config.loadConfig(PluginTask.class);
85
+
86
+ int numOfThreads = 1;
87
+ if (task.getParams().isPresent()) {
88
+ List<ParamsConfig> expandedQueries = task.getParams().get().expandQueries();
89
+ task.setQueries(expandedQueries);
90
+ numOfThreads = expandedQueries.size();
91
+ } else {
92
+ task.setQueries(new ArrayList<ParamsConfig>());
93
+ }
94
+
95
+ switch (task.getMethod().toUpperCase()) {
96
+ case "GET":
97
+ task.setHttpMethod(HttpMethod.GET);
98
+ break;
99
+ case "POST":
100
+ task.setHttpMethod(HttpMethod.POST);
101
+ break;
102
+ default:
103
+ throw new ConfigException(String.format("Unsupported http method %s", task.getMethod()));
104
+ }
105
+
106
+ return resume(task.dump(), numOfThreads, control);
107
+ }
108
+
109
+ @Override
110
+ public ConfigDiff resume(TaskSource taskSource,
111
+ int taskCount,
112
+ FileInputPlugin.Control control) {
113
+ control.run(taskSource, taskCount);
114
+ return Exec.newConfigDiff();
115
+ }
116
+
117
+ @Override
118
+ public void cleanup(TaskSource taskSource,
119
+ int taskCount,
120
+ List<CommitReport> successCommitReports) {
121
+ }
122
+
123
+ @Override
124
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex) {
125
+ PluginTask task = taskSource.loadTask(PluginTask.class);
126
+
127
+ HttpRequestBase request;
128
+ try {
129
+ request = makeRequest(task, taskIndex);
130
+ } catch (URISyntaxException | UnsupportedEncodingException e) {
131
+ throw Throwables.propagate(e);
132
+ }
133
+ logger.info(String.format("%s \"%s\"", task.getMethod().toUpperCase(),
134
+ request.getURI().toString()));
135
+
136
+ HttpClient client = HttpClientBuilder.create()
137
+ .setDefaultRequestConfig(makeRequestConfig(task))
138
+ .setDefaultHeaders(makeHeaders(task))
139
+ .build();
140
+ try {
141
+ HttpResponse response = client.execute(request);
142
+ statusIsOkOrThrow(response);
143
+ InputStream stream = response.getEntity().getContent();
144
+ PluginFileInput input = new PluginFileInput(task, stream);
145
+ stream = null;
146
+ return input;
147
+ } catch (IOException | HttpException e) {
148
+ throw Throwables.propagate(e);
149
+ }
150
+ }
151
+
152
+ private HttpRequestBase makeRequest(PluginTask task, int taskIndex)
153
+ throws URISyntaxException, UnsupportedEncodingException {
154
+ final ParamsConfig paramsConfig = (task.getQueries().isEmpty()) ?
155
+ null : task.getQueries().get(taskIndex);
156
+ if (task.getHttpMethod() == HttpMethod.GET) {
157
+ HttpGet request = new HttpGet(task.getUrl());
158
+ if (paramsConfig != null) {
159
+ URIBuilder builder = new URIBuilder(request.getURI());
160
+ for (QueryConfig p : paramsConfig.getQueries()) {
161
+ builder.addParameter(p.getName(), p.getValue());
162
+ }
163
+ request.setURI(builder.build());
164
+ }
165
+ return request;
166
+ } else if (task.getHttpMethod() == HttpMethod.POST) {
167
+ HttpPost request = new HttpPost(task.getUrl());
168
+ if (paramsConfig != null) {
169
+ List<NameValuePair> pairs = new ArrayList<>();
170
+ for (QueryConfig p : paramsConfig.getQueries()) {
171
+ pairs.add(new BasicNameValuePair(p.getName(), p.getValue()));
172
+ }
173
+ request.setEntity(new UrlEncodedFormEntity(pairs));
174
+ }
175
+ return request;
176
+ }
177
+ throw new IllegalArgumentException(String.format("Unsupported http method %s", task.getMethod()));
178
+ }
179
+
180
+ private List<Header> makeHeaders(PluginTask task) {
181
+ List<Header> headers = new ArrayList<>();
182
+ headers.add(new BasicHeader("Accept", "*/*"));
183
+ headers.add(new BasicHeader("Accept-Charset", task.getCharset()));
184
+ headers.add(new BasicHeader("Accept-Encoding", "gzip, deflate"));
185
+ headers.add(new BasicHeader("Accept-Language", "en-us,en;q=0.5"));
186
+ headers.add(new BasicHeader("User-Agent", task.getUserAgent()));
187
+ return headers;
188
+ }
189
+
190
+ private RequestConfig makeRequestConfig(PluginTask task) {
191
+ return RequestConfig.custom()
192
+ .setCircularRedirectsAllowed(true)
193
+ .setMaxRedirects(10)
194
+ .setRedirectsEnabled(true)
195
+ .setConnectTimeout(task.getOpenTimeout())
196
+ .setSocketTimeout(task.getReadTimeout())
197
+ .build();
198
+ }
199
+
200
+ private void statusIsOkOrThrow(HttpResponse response)
201
+ throws HttpException, IOException {
202
+ int code = response.getStatusLine().getStatusCode();
203
+ switch (response.getStatusLine().getStatusCode()) {
204
+ case 200:
205
+ return;
206
+ default:
207
+ throw new HttpException(String.format("Request is not successful, code=%d, body=%s",
208
+ code, EntityUtils.toString(response.getEntity())));
209
+ }
210
+ }
211
+
212
+ public static class PluginFileInput extends InputStreamFileInput
213
+ implements TransactionalFileInput {
214
+
215
+ private static class SingleFileProvider
216
+ implements InputStreamFileInput.Provider {
217
+
218
+ private InputStream stream;
219
+ private boolean opened = false;
220
+
221
+ public SingleFileProvider(InputStream stream) {
222
+ this.stream = stream;
223
+ }
224
+
225
+ @Override
226
+ public InputStream openNext() throws IOException {
227
+ if (opened) {
228
+ return null;
229
+ }
230
+ opened = true;
231
+ return stream;
232
+ }
233
+
234
+ @Override
235
+ public void close() throws IOException {
236
+ if (!opened) {
237
+ stream.close();
238
+ }
239
+ }
240
+ }
241
+
242
+ public PluginFileInput(PluginTask task, InputStream stream) {
243
+ super(task.getBufferAllocator(), new SingleFileProvider(stream));
244
+ }
245
+
246
+ public void abort() {
247
+ }
248
+
249
+ public CommitReport commit() {
250
+ return Exec.newCommitReport();
251
+ }
252
+
253
+ @Override
254
+ public void close() {
255
+ }
256
+ }
257
+
258
+ }
@@ -0,0 +1,66 @@
1
+ package org.embulk.input;
2
+
3
+ import com.fasterxml.jackson.annotation.JsonCreator;
4
+ import com.fasterxml.jackson.annotation.JsonValue;
5
+ import com.google.common.base.Objects;
6
+
7
+ import java.util.ArrayList;
8
+ import java.util.List;
9
+
10
+ public class ParamsConfig {
11
+
12
+ private final List<QueryConfig> queries;
13
+
14
+ @JsonCreator
15
+ public ParamsConfig(List<QueryConfig> queries) {
16
+ this.queries = queries;
17
+ }
18
+
19
+ @JsonValue
20
+ public List<QueryConfig> getQueries() {
21
+ return queries;
22
+ }
23
+
24
+ public List<ParamsConfig> expandQueries() {
25
+ List<List<QueryConfig>> base = new ArrayList<>(queries.size());
26
+ for (QueryConfig p : queries) {
27
+ base.add(p.expand());
28
+ }
29
+
30
+ int productSize = 1;
31
+ int baseSize = base.size();
32
+ for (int i = 0; i < baseSize; productSize *= base.get(i).size(), i++);
33
+
34
+ List<ParamsConfig> expands = new ArrayList<>(productSize);
35
+ for(int i = 0; i < productSize; i++) {
36
+ int j = 1;
37
+ List<QueryConfig> query = new ArrayList<>();
38
+ for(List<QueryConfig> list : base) {
39
+ QueryConfig pc = list.get((i / j) % list.size());
40
+ query.add(pc);
41
+ j *= list.size();
42
+ }
43
+ expands.add(new ParamsConfig(query));
44
+ }
45
+
46
+ return expands;
47
+ }
48
+
49
+ @Override
50
+ public boolean equals(Object obj) {
51
+ if (this == obj) {
52
+ return true;
53
+ }
54
+ if (!(obj instanceof ParamsConfig)) {
55
+ return false;
56
+ }
57
+ ParamsConfig other = (ParamsConfig) obj;
58
+ return Objects.equal(queries, other.queries);
59
+ }
60
+
61
+ @Override
62
+ public int hashCode() {
63
+ return Objects.hashCode(queries);
64
+ }
65
+
66
+ }
@@ -0,0 +1,125 @@
1
+ package org.embulk.input;
2
+
3
+ import com.google.common.base.Objects;
4
+ import com.fasterxml.jackson.annotation.JsonCreator;
5
+ import com.fasterxml.jackson.annotation.JsonProperty;
6
+
7
+ import java.util.ArrayList;
8
+ import java.util.List;
9
+
10
+ public class QueryConfig {
11
+
12
+ private final String name;
13
+ private final String value;
14
+ private final boolean expand;
15
+
16
+ @JsonCreator
17
+ public QueryConfig(
18
+ @JsonProperty("name") String name,
19
+ @JsonProperty("value") String value,
20
+ @JsonProperty("expand") boolean expand) {
21
+ this.name = name;
22
+ this.value = value;
23
+ this.expand = expand;
24
+ }
25
+
26
+ public List<QueryConfig> expand() {
27
+ List<QueryConfig> dest;
28
+ if (!expand) {
29
+ dest = new ArrayList<>(1);
30
+ dest.add(this);
31
+ } else {
32
+ List<String> expanded = BraceExpansion.expand(value);
33
+ dest = new ArrayList<>(expanded.size());
34
+ for(String s : expanded) {
35
+ dest.add(new QueryConfig(name, s, false));
36
+ }
37
+ }
38
+ return dest;
39
+ }
40
+
41
+ @JsonProperty("name")
42
+ public String getName() {
43
+ return name;
44
+ }
45
+
46
+ @JsonProperty("value")
47
+ public String getValue() {
48
+ return value;
49
+ }
50
+
51
+ @JsonProperty("expand")
52
+ public boolean isExpand() {
53
+ return expand;
54
+ }
55
+
56
+ @Override
57
+ public boolean equals(Object obj) {
58
+ if (this == obj) {
59
+ return true;
60
+ }
61
+ if (!(obj instanceof QueryConfig)) {
62
+ return false;
63
+ }
64
+ QueryConfig other = (QueryConfig) obj;
65
+ return Objects.equal(this.name, other.name) &&
66
+ Objects.equal(value, other.value) &&
67
+ Objects.equal(expand, other.expand);
68
+ }
69
+
70
+ @Override
71
+ public int hashCode() {
72
+ return Objects.hashCode(name, value, expand);
73
+ }
74
+
75
+ @Override
76
+ public String toString() {
77
+ return String.format("ParameterConfig[%s, %s, %s]",
78
+ getName(), getValue(), isExpand());
79
+ }
80
+
81
+ private static class BraceExpansion {
82
+
83
+ public static List<String> expand(String s) {
84
+ return expandRecursive("", s, "", new ArrayList<String>());
85
+ }
86
+
87
+ private static List<String> expandRecursive(String prefix, String s,
88
+ String suffix, List<String> dest) {
89
+ // I used the code below as reference.
90
+ // http://rosettacode.org/wiki/Brace_expansion#Java
91
+ int i1 = -1, i2 = 0;
92
+ String noEscape = s.replaceAll("([\\\\]{2}|[\\\\][,}{])", " ");
93
+ StringBuilder sb = null;
94
+
95
+ outer:
96
+ while ((i1 = noEscape.indexOf('{', i1 + 1)) != -1) {
97
+ i2 = i1 + 1;
98
+ sb = new StringBuilder(s);
99
+ for (int depth = 1; i2 < s.length() && depth > 0; i2++) {
100
+ char c = noEscape.charAt(i2);
101
+ depth = (c == '{') ? ++depth : depth;
102
+ depth = (c == '}') ? --depth : depth;
103
+ if (c == ',' && depth == 1) {
104
+ sb.setCharAt(i2, '\u0000');
105
+ } else if (c == '}' && depth == 0 && sb.indexOf("\u0000") != -1) {
106
+ break outer;
107
+ }
108
+ }
109
+ }
110
+
111
+ if (i1 == -1) {
112
+ if (suffix.length() > 0) {
113
+ expandRecursive(prefix + s, suffix, "", dest);
114
+ } else {
115
+ dest.add(String.format("%s%s%s", prefix, s, suffix));
116
+ }
117
+ } else {
118
+ for (String m : sb.substring(i1 + 1, i2).split("\u0000", -1)) {
119
+ expandRecursive(prefix + s.substring(0, i1), m, s.substring(i2 + 1) + suffix, dest);
120
+ }
121
+ }
122
+ return dest;
123
+ }
124
+ }
125
+ }
@@ -0,0 +1,5 @@
1
+ package org.embulk.input;
2
+
3
+ public class TestHttpInputPlugin
4
+ {
5
+ }