embulk-input-http 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,196 +1,3 @@
1
- require "net/http"
2
- require "uri"
3
- require "bracecomp"
4
-
5
module Embulk
  module Input

    # Embulk input plugin that downloads a JSON or XML document over HTTP(S)
    # and emits one record per element selected by `iterate["path"]`.
    #
    # Parameters flagged with "expand" are brace-expanded (String#expand from
    # bracecomp); the cartesian product of the expansions is spread over one
    # task per combination.
    class HttpInputPlugin < InputPlugin
      Plugin.register_input("http", self)

      # Builds the task and the output schema from the configuration and
      # hands them to Embulk through the given block.
      #
      # @param config plugin configuration (url, method, schema, iterate,
      #   open_timeout, read_timeout, done, params)
      # @yield [task, columns, task_count]
      # @return [Hash] {"done" => [...]} with the "done" value of every task
      #   report that has one
      # @raise [RuntimeError] when iterate["type"] is neither "json" nor "xml"
      def self.transaction(config, &control)
        task = {
          :url => config.param("url", :string),
          :method => config.param("method", :string, default: "get"),
          :schema => config.param("schema", :array),
          :iterate => config.param("iterate", :hash),
          :open_timeout => config.param("open_timeout", :float, default: 2.0),
          :read_timeout => config.param("read_timeout", :float, default: 10.0),
          :done => config.param("done", :array, default: [])
        }
        params = config.param("params", :array, default: [])
        params_unexpand, params_expand = configure_queries(params)

        data_type = task[:iterate]["type"]
        unless ["json", "xml"].include?(data_type)
          raise "Unknown data_type #{data_type}, only supported for json or xml"
        end

        columns = task[:schema].each_with_index.map do |c, i|
          Column.new(i, c["name"], c["type"].to_sym)
        end

        task[:params] = params_unexpand
        # Combinations already listed in "done" are not requested again.
        task[:params_expand] = params_expand - task[:done]
        num_of_threads = task[:params_expand].empty? ? 1 : task[:params_expand].size

        report = yield(task, columns, num_of_threads)
        {"done" => report.map{|r| r["done"]}.compact}
      end

      # Splits the configured params into fixed pairs and expanded combinations.
      #
      # @param params [Array<Hash>] entries with "name", "value" and optional "expand"
      # @return [Array] [base, dest]: base is a list of [name, value] pairs for
      #   the non-expanded params; dest is the sorted cartesian product of the
      #   expanded params (empty when nothing is expanded)
      def self.configure_queries(params)
        base = params.select{|p| !p["expand"]}.map do |p|
          [p["name"], p["value"]]
        end
        expands = params.select{|p| p["expand"] }.map do |p|
          p["value"].expand.map do |v|
            [p["name"], v]
          end
        end
        if expands.size > 0
          dest = expands.first.product(*expands.drop(1))
          dest.sort!{|a, b| "#{a[0]}=#{a[1]}" <=> "#{b[0]}=#{b[1]}"}
        else
          dest = []
        end
        [base, dest]
      end

      # Fetches the document for this task's parameter combination, converts
      # every selected element into a record and adds it to the page builder.
      #
      # @return [Hash] {:done => params_expand} naming the combination handled
      # @raise [RuntimeError] when iterate["type"] is unsupported
      def run
        schema = @task["schema"]
        iterate = @task["iterate"]
        url = @task["url"]
        method = @task["method"]

        params_expand = @task["params_expand"][@index] || []
        query = URI.encode_www_form(@task["params"] + params_expand)
        puts "#{@index}: #{method.upcase} #{url}?#{query}"

        data = fetch(url, method, query).body
        data_type = iterate["type"]

        case data_type
        when "json"
          iter = IterJson.new(schema, data, iterate)
        when "xml"
          iter = IterXML.new(schema, data, iterate)
        else
          raise "Unsupported data_type #{data_type}"
        end

        iter.each do |record|
          @page_builder.add(record)
        end
        @page_builder.finish

        {:done => params_expand}
      end

      private

      # Performs the HTTP request and returns the response.
      #
      # @param url [String] target URL; only host, port, scheme and path are used
      # @param method [String] "get" or "post" (case-insensitive)
      # @param query [String] form-encoded parameters (query string for GET,
      #   request body for POST)
      # @return [Net::HTTPResponse] a Net::HTTPSuccess response
      # @raise [RuntimeError] on an unsupported method or a non-success response
      def fetch(url, method, query)
        uri = URI.parse(url)
        # "http://host" parses to an empty path, which is not a valid request target.
        path = uri.path.to_s.empty? ? "/" : uri.path
        # Timeouts and TLS have to be handed to Net::HTTP.start itself: the
        # block below runs only after the connection has been opened, so an
        # open_timeout assigned inside it would never take effect.
        options = {
          :use_ssl => uri.scheme == "https",
          :open_timeout => @task["open_timeout"],
          :read_timeout => @task["read_timeout"]
        }

        res = Net::HTTP.start(uri.host, uri.port, options) do |client|
          case method.downcase
          when "get"
            client.get([path, query].join("?"))
          when "post"
            client.post(path, query)
          else
            raise "Unsupported method #{method}"
          end
        end

        case res
        when Net::HTTPSuccess
          res
        else
          raise "Request is not successful, code=#{res.code}, value=#{res.body}"
        end
      end

      # Base class of the document iterators: keeps the schema, the raw
      # document and the iterate configuration, and converts one element into
      # one record.
      class Iter
        def initialize(schema, data, config)
          # Time.strptime (used for "timestamp" columns) is defined by the
          # stdlib "time" extension, not by core Time.
          require "time"
          @schema = schema
          @data = data
          @config = config
        end

        # Yields one record per element; implemented by subclasses.
        def each
          raise NotImplementedError.new("each")
        end

        private

        # Converts element +e+ into an array of values ordered like the schema.
        # A column with a "path" is looked up through find_by_path, otherwise
        # by its "name". Missing values are treated as "".
        #
        # @raise [RuntimeError] for a column type other than string, long,
        #   double, boolean or timestamp
        def make_record(e)
          @schema.map do |c|
            name = c["name"]
            path = c["path"]
            val = path.nil? ? e[name] : find_by_path(e, path)

            v = val.nil? ? "" : val
            type = c["type"]
            case type
            when "string"
              v
            when "long"
              v.to_i
            when "double"
              v.to_f
            when "boolean"
              # to_s so that native JSON true / 1 are recognised as well as
              # their string spellings.
              ["yes", "true", "1"].include?(v.to_s)
            when "timestamp"
              text = v.to_s
              text.empty? ? nil : Time.strptime(text, c["format"])
            else
              raise "Unsupported type #{type}"
            end
          end
        end

        # Path lookup inside one element; unsupported unless a subclass overrides it.
        def find_by_path(e, path)
          raise NotImplementedError.new("Find by path is unsupported")
        end
      end

      # Iterates over the XML elements matching config["path"]; each element's
      # child elements become a name => text hash.
      class IterXML < Iter
        def initialize(schema, data, config)
          require "rexml/document"
          super
          @doc = REXML::Document.new(@data)
        end

        def each
          @doc.elements.each(@config["path"]) do |e|
            dest = {}
            e.elements.each do |d|
              dest[d.name] = d.text
            end
            yield make_record(dest)
          end
        end
      end

      # Iterates over the JSON values matched by the JsonPath in config["path"].
      class IterJson < Iter
        def initialize(schema, data, config)
          require "jsonpath"
          super
          @jsonpath = JsonPath.new(@config["path"])
        end

        def each
          @jsonpath.on(@data).flatten.each do |e|
            yield make_record(e)
          end
        end

        # Returns the first value matched by the JsonPath +path+ inside +e+.
        def find_by_path(e, path)
          JsonPath.on(e, path).first
        end
      end

    end
  end
end
1
# Registers the "http" input as a Java plugin: the implementing class name and
# the classpath directory, resolved relative to this file.
classpath_dir = File.expand_path('../../../../classpath', __FILE__)
Embulk::JavaPlugin.register_input("http", "org.embulk.input.HttpInputPlugin", classpath_dir)
@@ -0,0 +1,258 @@
1
+ package org.embulk.input;
2
+
3
+ import com.google.common.base.Optional;
4
+ import com.google.common.base.Throwables;
5
+ import org.apache.http.Header;
6
+ import org.apache.http.HttpException;
7
+ import org.apache.http.HttpResponse;
8
+ import org.apache.http.NameValuePair;
9
+ import org.apache.http.client.HttpClient;
10
+ import org.apache.http.client.config.RequestConfig;
11
+ import org.apache.http.client.entity.UrlEncodedFormEntity;
12
+ import org.apache.http.client.methods.HttpGet;
13
+ import org.apache.http.client.methods.HttpPost;
14
+ import org.apache.http.client.methods.HttpRequestBase;
15
+ import org.apache.http.client.utils.URIBuilder;
16
+ import org.apache.http.impl.client.HttpClientBuilder;
17
+ import org.apache.http.message.BasicHeader;
18
+ import org.apache.http.message.BasicNameValuePair;
19
+ import org.apache.http.util.EntityUtils;
20
+ import org.embulk.config.*;
21
+ import org.embulk.spi.BufferAllocator;
22
+ import org.embulk.spi.Exec;
23
+ import org.embulk.spi.FileInputPlugin;
24
+ import org.embulk.spi.TransactionalFileInput;
25
+ import org.embulk.spi.util.InputStreamFileInput;
26
+ import org.slf4j.Logger;
27
+
28
+ import java.io.IOException;
29
+ import java.io.InputStream;
30
+ import java.io.UnsupportedEncodingException;
31
+ import java.net.URISyntaxException;
32
+ import java.util.ArrayList;
33
+ import java.util.List;
34
+
35
+ public class HttpInputPlugin implements FileInputPlugin {
36
+
37
+ private final Logger logger = Exec.getLogger(getClass());
38
+
39
+ public interface PluginTask extends Task {
40
+ @Config("url")
41
+ public String getUrl();
42
+
43
+ @Config("charset")
44
+ @ConfigDefault("\"utf-8\"")
45
+ public String getCharset();
46
+
47
+ @Config("method")
48
+ @ConfigDefault("\"get\"")
49
+ public String getMethod();
50
+
51
+ @Config("user_agent")
52
+ @ConfigDefault("\"Embulk::Input::HttpInputPlugin\"")
53
+ public String getUserAgent();
54
+
55
+ @Config("open_timeout")
56
+ @ConfigDefault("2000")
57
+ public int getOpenTimeout();
58
+
59
+ @Config("read_timeout")
60
+ @ConfigDefault("10000")
61
+ public int getReadTimeout();
62
+
63
+ @Config("params")
64
+ @ConfigDefault("null")
65
+ public Optional<ParamsConfig> getParams();
66
+
67
+ @ConfigInject
68
+ public BufferAllocator getBufferAllocator();
69
+
70
+ public List<ParamsConfig> getQueries();
71
+ public void setQueries(List<ParamsConfig> queries);
72
+
73
+ public HttpMethod getHttpMethod();
74
+ public void setHttpMethod(HttpMethod httpMethod);
75
+ }
76
+
77
+ public enum HttpMethod {
78
+ POST,
79
+ GET
80
+ }
81
+
82
+ @Override
83
+ public ConfigDiff transaction(ConfigSource config, FileInputPlugin.Control control) {
84
+ PluginTask task = config.loadConfig(PluginTask.class);
85
+
86
+ int numOfThreads = 1;
87
+ if (task.getParams().isPresent()) {
88
+ List<ParamsConfig> expandedQueries = task.getParams().get().expandQueries();
89
+ task.setQueries(expandedQueries);
90
+ numOfThreads = expandedQueries.size();
91
+ } else {
92
+ task.setQueries(new ArrayList<ParamsConfig>());
93
+ }
94
+
95
+ switch (task.getMethod().toUpperCase()) {
96
+ case "GET":
97
+ task.setHttpMethod(HttpMethod.GET);
98
+ break;
99
+ case "POST":
100
+ task.setHttpMethod(HttpMethod.POST);
101
+ break;
102
+ default:
103
+ throw new ConfigException(String.format("Unsupported http method %s", task.getMethod()));
104
+ }
105
+
106
+ return resume(task.dump(), numOfThreads, control);
107
+ }
108
+
109
+ @Override
110
+ public ConfigDiff resume(TaskSource taskSource,
111
+ int taskCount,
112
+ FileInputPlugin.Control control) {
113
+ control.run(taskSource, taskCount);
114
+ return Exec.newConfigDiff();
115
+ }
116
+
117
+ @Override
118
+ public void cleanup(TaskSource taskSource,
119
+ int taskCount,
120
+ List<CommitReport> successCommitReports) {
121
+ }
122
+
123
+ @Override
124
+ public TransactionalFileInput open(TaskSource taskSource, int taskIndex) {
125
+ PluginTask task = taskSource.loadTask(PluginTask.class);
126
+
127
+ HttpRequestBase request;
128
+ try {
129
+ request = makeRequest(task, taskIndex);
130
+ } catch (URISyntaxException | UnsupportedEncodingException e) {
131
+ throw Throwables.propagate(e);
132
+ }
133
+ logger.info(String.format("%s \"%s\"", task.getMethod().toUpperCase(),
134
+ request.getURI().toString()));
135
+
136
+ HttpClient client = HttpClientBuilder.create()
137
+ .setDefaultRequestConfig(makeRequestConfig(task))
138
+ .setDefaultHeaders(makeHeaders(task))
139
+ .build();
140
+ try {
141
+ HttpResponse response = client.execute(request);
142
+ statusIsOkOrThrow(response);
143
+ InputStream stream = response.getEntity().getContent();
144
+ PluginFileInput input = new PluginFileInput(task, stream);
145
+ stream = null;
146
+ return input;
147
+ } catch (IOException | HttpException e) {
148
+ throw Throwables.propagate(e);
149
+ }
150
+ }
151
+
152
+ private HttpRequestBase makeRequest(PluginTask task, int taskIndex)
153
+ throws URISyntaxException, UnsupportedEncodingException {
154
+ final ParamsConfig paramsConfig = (task.getQueries().isEmpty()) ?
155
+ null : task.getQueries().get(taskIndex);
156
+ if (task.getHttpMethod() == HttpMethod.GET) {
157
+ HttpGet request = new HttpGet(task.getUrl());
158
+ if (paramsConfig != null) {
159
+ URIBuilder builder = new URIBuilder(request.getURI());
160
+ for (QueryConfig p : paramsConfig.getQueries()) {
161
+ builder.addParameter(p.getName(), p.getValue());
162
+ }
163
+ request.setURI(builder.build());
164
+ }
165
+ return request;
166
+ } else if (task.getHttpMethod() == HttpMethod.POST) {
167
+ HttpPost request = new HttpPost(task.getUrl());
168
+ if (paramsConfig != null) {
169
+ List<NameValuePair> pairs = new ArrayList<>();
170
+ for (QueryConfig p : paramsConfig.getQueries()) {
171
+ pairs.add(new BasicNameValuePair(p.getName(), p.getValue()));
172
+ }
173
+ request.setEntity(new UrlEncodedFormEntity(pairs));
174
+ }
175
+ return request;
176
+ }
177
+ throw new IllegalArgumentException(String.format("Unsupported http method %s", task.getMethod()));
178
+ }
179
+
180
+ private List<Header> makeHeaders(PluginTask task) {
181
+ List<Header> headers = new ArrayList<>();
182
+ headers.add(new BasicHeader("Accept", "*/*"));
183
+ headers.add(new BasicHeader("Accept-Charset", task.getCharset()));
184
+ headers.add(new BasicHeader("Accept-Encoding", "gzip, deflate"));
185
+ headers.add(new BasicHeader("Accept-Language", "en-us,en;q=0.5"));
186
+ headers.add(new BasicHeader("User-Agent", task.getUserAgent()));
187
+ return headers;
188
+ }
189
+
190
+ private RequestConfig makeRequestConfig(PluginTask task) {
191
+ return RequestConfig.custom()
192
+ .setCircularRedirectsAllowed(true)
193
+ .setMaxRedirects(10)
194
+ .setRedirectsEnabled(true)
195
+ .setConnectTimeout(task.getOpenTimeout())
196
+ .setSocketTimeout(task.getReadTimeout())
197
+ .build();
198
+ }
199
+
200
+ private void statusIsOkOrThrow(HttpResponse response)
201
+ throws HttpException, IOException {
202
+ int code = response.getStatusLine().getStatusCode();
203
+ switch (response.getStatusLine().getStatusCode()) {
204
+ case 200:
205
+ return;
206
+ default:
207
+ throw new HttpException(String.format("Request is not successful, code=%d, body=%s",
208
+ code, EntityUtils.toString(response.getEntity())));
209
+ }
210
+ }
211
+
212
+ public static class PluginFileInput extends InputStreamFileInput
213
+ implements TransactionalFileInput {
214
+
215
+ private static class SingleFileProvider
216
+ implements InputStreamFileInput.Provider {
217
+
218
+ private InputStream stream;
219
+ private boolean opened = false;
220
+
221
+ public SingleFileProvider(InputStream stream) {
222
+ this.stream = stream;
223
+ }
224
+
225
+ @Override
226
+ public InputStream openNext() throws IOException {
227
+ if (opened) {
228
+ return null;
229
+ }
230
+ opened = true;
231
+ return stream;
232
+ }
233
+
234
+ @Override
235
+ public void close() throws IOException {
236
+ if (!opened) {
237
+ stream.close();
238
+ }
239
+ }
240
+ }
241
+
242
+ public PluginFileInput(PluginTask task, InputStream stream) {
243
+ super(task.getBufferAllocator(), new SingleFileProvider(stream));
244
+ }
245
+
246
+ public void abort() {
247
+ }
248
+
249
+ public CommitReport commit() {
250
+ return Exec.newCommitReport();
251
+ }
252
+
253
+ @Override
254
+ public void close() {
255
+ }
256
+ }
257
+
258
+ }
@@ -0,0 +1,66 @@
1
+ package org.embulk.input;
2
+
3
+ import com.fasterxml.jackson.annotation.JsonCreator;
4
+ import com.fasterxml.jackson.annotation.JsonValue;
5
+ import com.google.common.base.Objects;
6
+
7
+ import java.util.ArrayList;
8
+ import java.util.List;
9
+
10
+ public class ParamsConfig {
11
+
12
+ private final List<QueryConfig> queries;
13
+
14
+ @JsonCreator
15
+ public ParamsConfig(List<QueryConfig> queries) {
16
+ this.queries = queries;
17
+ }
18
+
19
+ @JsonValue
20
+ public List<QueryConfig> getQueries() {
21
+ return queries;
22
+ }
23
+
24
+ public List<ParamsConfig> expandQueries() {
25
+ List<List<QueryConfig>> base = new ArrayList<>(queries.size());
26
+ for (QueryConfig p : queries) {
27
+ base.add(p.expand());
28
+ }
29
+
30
+ int productSize = 1;
31
+ int baseSize = base.size();
32
+ for (int i = 0; i < baseSize; productSize *= base.get(i).size(), i++);
33
+
34
+ List<ParamsConfig> expands = new ArrayList<>(productSize);
35
+ for(int i = 0; i < productSize; i++) {
36
+ int j = 1;
37
+ List<QueryConfig> query = new ArrayList<>();
38
+ for(List<QueryConfig> list : base) {
39
+ QueryConfig pc = list.get((i / j) % list.size());
40
+ query.add(pc);
41
+ j *= list.size();
42
+ }
43
+ expands.add(new ParamsConfig(query));
44
+ }
45
+
46
+ return expands;
47
+ }
48
+
49
+ @Override
50
+ public boolean equals(Object obj) {
51
+ if (this == obj) {
52
+ return true;
53
+ }
54
+ if (!(obj instanceof ParamsConfig)) {
55
+ return false;
56
+ }
57
+ ParamsConfig other = (ParamsConfig) obj;
58
+ return Objects.equal(queries, other.queries);
59
+ }
60
+
61
+ @Override
62
+ public int hashCode() {
63
+ return Objects.hashCode(queries);
64
+ }
65
+
66
+ }
@@ -0,0 +1,125 @@
1
+ package org.embulk.input;
2
+
3
+ import com.google.common.base.Objects;
4
+ import com.fasterxml.jackson.annotation.JsonCreator;
5
+ import com.fasterxml.jackson.annotation.JsonProperty;
6
+
7
+ import java.util.ArrayList;
8
+ import java.util.List;
9
+
10
+ public class QueryConfig {
11
+
12
+ private final String name;
13
+ private final String value;
14
+ private final boolean expand;
15
+
16
+ @JsonCreator
17
+ public QueryConfig(
18
+ @JsonProperty("name") String name,
19
+ @JsonProperty("value") String value,
20
+ @JsonProperty("expand") boolean expand) {
21
+ this.name = name;
22
+ this.value = value;
23
+ this.expand = expand;
24
+ }
25
+
26
+ public List<QueryConfig> expand() {
27
+ List<QueryConfig> dest;
28
+ if (!expand) {
29
+ dest = new ArrayList<>(1);
30
+ dest.add(this);
31
+ } else {
32
+ List<String> expanded = BraceExpansion.expand(value);
33
+ dest = new ArrayList<>(expanded.size());
34
+ for(String s : expanded) {
35
+ dest.add(new QueryConfig(name, s, false));
36
+ }
37
+ }
38
+ return dest;
39
+ }
40
+
41
+ @JsonProperty("name")
42
+ public String getName() {
43
+ return name;
44
+ }
45
+
46
+ @JsonProperty("value")
47
+ public String getValue() {
48
+ return value;
49
+ }
50
+
51
+ @JsonProperty("expand")
52
+ public boolean isExpand() {
53
+ return expand;
54
+ }
55
+
56
+ @Override
57
+ public boolean equals(Object obj) {
58
+ if (this == obj) {
59
+ return true;
60
+ }
61
+ if (!(obj instanceof QueryConfig)) {
62
+ return false;
63
+ }
64
+ QueryConfig other = (QueryConfig) obj;
65
+ return Objects.equal(this.name, other.name) &&
66
+ Objects.equal(value, other.value) &&
67
+ Objects.equal(expand, other.expand);
68
+ }
69
+
70
+ @Override
71
+ public int hashCode() {
72
+ return Objects.hashCode(name, value, expand);
73
+ }
74
+
75
+ @Override
76
+ public String toString() {
77
+ return String.format("ParameterConfig[%s, %s, %s]",
78
+ getName(), getValue(), isExpand());
79
+ }
80
+
81
+ private static class BraceExpansion {
82
+
83
+ public static List<String> expand(String s) {
84
+ return expandRecursive("", s, "", new ArrayList<String>());
85
+ }
86
+
87
+ private static List<String> expandRecursive(String prefix, String s,
88
+ String suffix, List<String> dest) {
89
+ // I used the code below as reference.
90
+ // http://rosettacode.org/wiki/Brace_expansion#Java
91
+ int i1 = -1, i2 = 0;
92
+ String noEscape = s.replaceAll("([\\\\]{2}|[\\\\][,}{])", " ");
93
+ StringBuilder sb = null;
94
+
95
+ outer:
96
+ while ((i1 = noEscape.indexOf('{', i1 + 1)) != -1) {
97
+ i2 = i1 + 1;
98
+ sb = new StringBuilder(s);
99
+ for (int depth = 1; i2 < s.length() && depth > 0; i2++) {
100
+ char c = noEscape.charAt(i2);
101
+ depth = (c == '{') ? ++depth : depth;
102
+ depth = (c == '}') ? --depth : depth;
103
+ if (c == ',' && depth == 1) {
104
+ sb.setCharAt(i2, '\u0000');
105
+ } else if (c == '}' && depth == 0 && sb.indexOf("\u0000") != -1) {
106
+ break outer;
107
+ }
108
+ }
109
+ }
110
+
111
+ if (i1 == -1) {
112
+ if (suffix.length() > 0) {
113
+ expandRecursive(prefix + s, suffix, "", dest);
114
+ } else {
115
+ dest.add(String.format("%s%s%s", prefix, s, suffix));
116
+ }
117
+ } else {
118
+ for (String m : sb.substring(i1 + 1, i2).split("\u0000", -1)) {
119
+ expandRecursive(prefix + s.substring(0, i1), m, s.substring(i2 + 1) + suffix, dest);
120
+ }
121
+ }
122
+ return dest;
123
+ }
124
+ }
125
+ }
@@ -0,0 +1,5 @@
1
+ package org.embulk.input;
2
+
3
+ public class TestHttpInputPlugin // Placeholder: declares no fields, methods or test cases.
4
+ {
5
+ }