embulk-input-http 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +42 -6
- data/embulk-input-http.gemspec +3 -2
- data/example/json-example-expand.yml +21 -0
- data/example/json-example.yml +2 -2
- data/lib/embulk/input/http.rb +111 -79
- metadata +24 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0bc5ae1c890470e5d30b21ced647bd9e85e8a272
|
4
|
+
data.tar.gz: 77c89043f7c45fae6750eb84152053f10ae691a1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: be0f73d6d3e4f33d7dcba51208e32446b5c31c4b4415bc6b415285234ec2982ffbc131a565914605b8d8331a371fd0daf143f4cf23f24a01d604be8dce1999fc
|
7
|
+
data.tar.gz: 44c57e268a4a9e4d0ecc7c8e06f836e62b7f69f2c760306665aba9e714eb7e522b025daabf9c23b34029ba9b5820094bd32c0170203ae78daaa3bb8f4eec57e0
|
data/README.md
CHANGED
@@ -22,14 +22,14 @@ in:
|
|
22
22
|
params:
|
23
23
|
- {name: method, value: getStations}
|
24
24
|
- {name: x, value: 135.0}
|
25
|
-
- {name: y, value: 35.0}
|
25
|
+
- {name: y, value: "{30..35}.0", expand: true}
|
26
26
|
schema:
|
27
27
|
- {name: name, type: string}
|
28
28
|
- {name: next, type: string}
|
29
29
|
- {name: prev, type: string}
|
30
30
|
- {name: distance, type: string}
|
31
|
-
- {name:
|
32
|
-
- {name:
|
31
|
+
- {name: lat, type: double, path: x}
|
32
|
+
- {name: lng, type: double, path: y}
|
33
33
|
- {name: line, type: string}
|
34
34
|
- {name: postal, type: string}
|
35
35
|
iterate: {type: json, path: $.response.station}
|
@@ -42,7 +42,8 @@ in:
|
|
42
42
|
- iterate: data type and path to find root data, json/xml is supported for now (required)
|
43
43
|
- method: http method, get is used by default (optional)
|
44
44
|
- params: pair of name/value to specify query parameter (optional)
|
45
|
-
|
45
|
+
- open_timeout: timeout to open connection (optional, 2 is used by default)
|
46
|
+
- read_timeout: timeout to read content via http (optional, 10 is used by default)
|
46
47
|
|
47
48
|
### Iterate data
|
48
49
|
|
@@ -71,6 +72,30 @@ You can iterate "students" node by the following configuration:
|
|
71
72
|
|
72
73
|
iterate: {type: json, path: $.students}
|
73
74
|
|
75
|
+
You can also specify a jsonpath as *path* in the schema section:
|
76
|
+
|
77
|
+
```yaml
|
78
|
+
schema:
|
79
|
+
- {name: firstName, type: string, path: "names[0]"}
|
80
|
+
- {name: lastName, type: string, path: "names[1]"}
|
81
|
+
iterate: {type: json, path: $.students}
|
82
|
+
```
|
83
|
+
|
84
|
+
Then you can make records from more complicated json such as the following:
|
85
|
+
|
86
|
+
```json
|
87
|
+
{
|
88
|
+
"result" : "success",
|
89
|
+
"students" : [
|
90
|
+
{ "names" : ["John", "Lennon"], "age" : 10 },
|
91
|
+
{ "names" : ["Paul", "Maccartney"], "age" : 10 }
|
92
|
+
]
|
93
|
+
}
|
94
|
+
```
|
95
|
+
|
96
|
+
In this case, names[0] will be firstName of schema and names[1] will be lastName.
|
97
|
+
|
98
|
+
|
74
99
|
#### xml
|
75
100
|
|
76
101
|
You can also parse xml by specifying **path/to/node** style to *path*.
|
@@ -105,12 +130,23 @@ Configuration as below to iterate student node:
|
|
105
130
|
|
106
131
|
iterate: {type: xml, path: data/students/student}
|
107
132
|
|
133
|
+
### Brace expansion style in params
|
108
134
|
|
109
|
-
|
135
|
+
In the *params* section, you can also specify multiple params by using **brace expansion style**.
|
110
136
|
|
137
|
+
```yaml
|
138
|
+
params:
|
139
|
+
- {name: id, value: "{1..5}", expand: true}
|
140
|
+
- {name: name, value: "{John,Paul,George,Ringo}", expand: true}
|
141
|
+
```
|
142
|
+
|
143
|
+
To use this style, set the *expand* parameter to true; each expanded query pattern is then sent as a separate request.
|
144
|
+
|
145
|
+
|
146
|
+
## TODO
|
147
|
+
- Split input/formatter
|
111
148
|
- BasicAuth
|
112
149
|
- HTTP-proxy
|
113
|
-
- Breace-expansion style parameter, such as curl
|
114
150
|
|
115
151
|
## Patch
|
116
152
|
|
data/embulk-input-http.gemspec
CHANGED
@@ -4,11 +4,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
|
5
5
|
Gem::Specification.new do |spec|
|
6
6
|
spec.name = "embulk-input-http"
|
7
|
-
spec.version = "0.0.
|
7
|
+
spec.version = "0.0.3"
|
8
8
|
spec.authors = ["Takuma kanari"]
|
9
9
|
spec.email = ["chemtrails.t@gmail.com"]
|
10
10
|
spec.summary = %q{Embulk plugin for http input}
|
11
|
-
spec.description = %q{
|
11
|
+
spec.description = %q{Fetch data via http}
|
12
12
|
spec.homepage = "https://github.com/takumakanari/embulk-input-http"
|
13
13
|
spec.license = "MIT"
|
14
14
|
|
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
|
|
17
17
|
spec.require_paths = ["lib"]
|
18
18
|
|
19
19
|
spec.add_dependency "jsonpath", "~> 0.5"
|
20
|
+
spec.add_dependency "bracecomp", "~> 0.1", ">= 0.1.2"
|
20
21
|
spec.add_development_dependency "bundler", "~> 1.0"
|
21
22
|
spec.add_development_dependency "rake", "~> 0.9.2"
|
22
23
|
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
exec: {}
|
2
|
+
in:
|
3
|
+
type: http
|
4
|
+
url: http://express.heartrails.com/api/json
|
5
|
+
params:
|
6
|
+
- {name: method, value: getStations}
|
7
|
+
- {name: x, value: "{130..135}.0", expand: true}
|
8
|
+
- {name: y, value: "{30..35}.0", expand: true}
|
9
|
+
schema:
|
10
|
+
- {name: name, type: string}
|
11
|
+
- {name: next, type: string}
|
12
|
+
- {name: prev, type: string}
|
13
|
+
- {name: distance, type: string}
|
14
|
+
- {name: lat, type: double, path: x}
|
15
|
+
- {name: lng, type: double, path: y}
|
16
|
+
- {name: line, type: string}
|
17
|
+
- {name: postal, type: string}
|
18
|
+
method: get
|
19
|
+
iterate: {type: json, path: $.response.station}
|
20
|
+
out: {type: stdout}
|
21
|
+
|
data/example/json-example.yml
CHANGED
@@ -11,8 +11,8 @@ in:
|
|
11
11
|
- {name: next, type: string}
|
12
12
|
- {name: prev, type: string}
|
13
13
|
- {name: distance, type: string}
|
14
|
-
- {name:
|
15
|
-
- {name:
|
14
|
+
- {name: lat, type: double, path: x}
|
15
|
+
- {name: lng, type: double, path: y}
|
16
16
|
- {name: line, type: string}
|
17
17
|
- {name: postal, type: string}
|
18
18
|
method: get
|
data/lib/embulk/input/http.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "net/http"
|
2
2
|
require "uri"
|
3
|
+
require "bracecomp"
|
3
4
|
|
4
5
|
module Embulk
|
5
6
|
module Input
|
@@ -8,155 +9,186 @@ module Embulk
|
|
8
9
|
Plugin.register_input("http", self)
|
9
10
|
|
10
11
|
def self.transaction(config, &control)
|
11
|
-
|
12
|
-
|
13
|
-
|
12
|
+
task = {
|
13
|
+
:url => config.param("url", :string),
|
14
|
+
:method => config.param("method", :string, default: "get"),
|
15
|
+
:schema => config.param("schema", :array),
|
16
|
+
:iterate => config.param("iterate", :hash),
|
17
|
+
:open_timeout => config.param("open_timeout", :float, default: 2.0),
|
18
|
+
:read_timeout => config.param("read_timeout", :float, default: 10.0),
|
19
|
+
:done => config.param("done", :array, default: [])
|
20
|
+
}
|
14
21
|
params = config.param("params", :array, default: [])
|
15
|
-
|
16
|
-
open_timeout = config.param("open_timeout", :float, default: 2.0)
|
17
|
-
read_timeout = config.param("read_timeout", :float, default: 10.0)
|
22
|
+
params_unexpand, params_expand = configure_queries(params)
|
18
23
|
|
19
|
-
data_type = iterate["type"]
|
24
|
+
data_type = task[:iterate]["type"]
|
20
25
|
unless ["json", "xml"].include?(data_type)
|
21
26
|
raise "Unknown data_type #{data_type}, only supported for json or xml"
|
22
27
|
end
|
23
28
|
|
24
|
-
columns = schema.each_with_index.map do |c, i|
|
29
|
+
columns = task[:schema].each_with_index.map do |c, i|
|
25
30
|
Column.new(i, c["name"], c["type"].to_sym)
|
26
31
|
end
|
27
32
|
|
28
|
-
task =
|
29
|
-
|
30
|
-
|
31
|
-
:params => params,
|
32
|
-
:schema => schema,
|
33
|
-
:iterate => iterate,
|
34
|
-
:open_timeout => open_timeout,
|
35
|
-
:read_timeout => read_timeout
|
36
|
-
}
|
33
|
+
task[:params] = params_unexpand
|
34
|
+
task[:params_expand] = params_expand - task[:done]
|
35
|
+
num_of_threads = task[:params_expand].empty? ? 1 : task[:params_expand].size
|
37
36
|
|
38
|
-
report = yield(task, columns,
|
39
|
-
|
40
|
-
|
37
|
+
report = yield(task, columns, num_of_threads)
|
38
|
+
{"done" => report.map{|r| r["done"]}.compact}
|
39
|
+
end
|
40
|
+
|
41
|
+
def self.configure_queries(params)
|
42
|
+
base = params.select{|p| !p["expand"]}.map do |p|
|
43
|
+
[p["name"], p["value"]]
|
44
|
+
end
|
45
|
+
expands = params.select{|p| p["expand"] }.map do |p|
|
46
|
+
p["value"].expand.map do |v|
|
47
|
+
[p["name"], v]
|
48
|
+
end
|
49
|
+
end
|
50
|
+
if expands.size > 0
|
51
|
+
dest = expands.first.product(*(expands.slice(1, expands.size - 1)))
|
52
|
+
dest.sort!{|a, b| "#{a[0]}=#{a[1]}" <=> "#{b[0]}=#{b[1]}"}
|
53
|
+
else
|
54
|
+
dest = []
|
55
|
+
end
|
56
|
+
[base, dest]
|
41
57
|
end
|
42
58
|
|
43
59
|
def run
|
44
60
|
schema = @task["schema"]
|
45
61
|
iterate = @task["iterate"]
|
62
|
+
url = @task["url"]
|
63
|
+
method = @task["method"]
|
46
64
|
|
47
|
-
|
65
|
+
params_expand = @task["params_expand"][@index] || []
|
66
|
+
query = URI.encode_www_form(@task["params"] + params_expand)
|
67
|
+
puts "#{@index}: #{method.upcase} #{url}?#{query}"
|
68
|
+
|
69
|
+
data = fetch(url, method, query).body
|
48
70
|
data_type = iterate["type"]
|
49
71
|
|
50
72
|
case data_type
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
73
|
+
when "json"
|
74
|
+
iter = IterJson.new(schema, data, iterate)
|
75
|
+
when "xml"
|
76
|
+
iter = IterXML.new(schema, data, iterate)
|
77
|
+
else
|
78
|
+
raise "Unsupported data_type #{data_type}"
|
57
79
|
end
|
58
80
|
|
59
|
-
|
60
|
-
|
61
|
-
rows += 1
|
62
|
-
@page_builder.add(schema.map{|c|
|
63
|
-
name = c["name"]
|
64
|
-
type = c["type"]
|
65
|
-
val = e[name].nil? ? "" : e[name]
|
66
|
-
case type
|
67
|
-
when "string"
|
68
|
-
val
|
69
|
-
when "long"
|
70
|
-
val.to_i
|
71
|
-
when "double"
|
72
|
-
val.to_f
|
73
|
-
when "boolean"
|
74
|
-
["yes", "true", "1"].include?(val)
|
75
|
-
when "timestamp"
|
76
|
-
(val.nil? || val.empty?) ? nil : Time.strptime(val, c["format"])
|
77
|
-
else
|
78
|
-
raise "Unsupported type #{type}"
|
79
|
-
end
|
80
|
-
})
|
81
|
+
iter.each do |record|
|
82
|
+
@page_builder.add(record)
|
81
83
|
end
|
82
84
|
@page_builder.finish
|
83
85
|
|
84
|
-
{:
|
86
|
+
{:done => params_expand}
|
85
87
|
end
|
86
88
|
|
87
89
|
private
|
88
90
|
|
89
|
-
def fetch
|
90
|
-
uri = URI.parse(
|
91
|
-
method = @task["method"]
|
92
|
-
qs = URI.encode_www_form(@task["params"].map {|p|
|
93
|
-
[p["name"], p["value"]]
|
94
|
-
})
|
95
|
-
puts "#{method.upcase} #{uri}?#{qs}"
|
91
|
+
def fetch(url, method, query)
|
92
|
+
uri = URI.parse(url)
|
96
93
|
|
97
94
|
res = Net::HTTP.start(uri.host, uri.port) do |client|
|
98
95
|
client.open_timeout = @task["open_timeout"]
|
99
96
|
client.read_timeout = @task["read_timeout"]
|
100
97
|
case method.downcase
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
98
|
+
when "get"
|
99
|
+
client.get([uri.path, query].join("?"))
|
100
|
+
when "post"
|
101
|
+
client.post(uri.path, query)
|
102
|
+
else
|
103
|
+
raise "Unsupported method #{method}"
|
107
104
|
end
|
108
105
|
end
|
109
106
|
|
110
107
|
case res
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
108
|
+
when Net::HTTPSuccess
|
109
|
+
res
|
110
|
+
else
|
111
|
+
raise "Request is not successful, code=#{res.code}, value=#{res.body}"
|
115
112
|
end
|
116
113
|
end
|
117
114
|
|
118
115
|
class Iter
|
119
|
-
def initialize(data,
|
116
|
+
def initialize(schema, data, config)
|
117
|
+
@schema = schema
|
120
118
|
@data = data
|
121
|
-
@
|
119
|
+
@config = config
|
122
120
|
end
|
123
121
|
|
124
122
|
def each
|
125
|
-
raise NotImplementedError("each")
|
123
|
+
raise NotImplementedError.new("each")
|
124
|
+
end
|
125
|
+
|
126
|
+
private
|
127
|
+
|
128
|
+
def make_record(e)
|
129
|
+
@schema.map do |c|
|
130
|
+
name = c["name"]
|
131
|
+
path = c["path"]
|
132
|
+
val = path.nil? ? e[name] : find_by_path(e, path)
|
133
|
+
|
134
|
+
v = val.nil? ? "" : val
|
135
|
+
type = c["type"]
|
136
|
+
case type
|
137
|
+
when "string"
|
138
|
+
v
|
139
|
+
when "long"
|
140
|
+
v.to_i
|
141
|
+
when "double"
|
142
|
+
v.to_f
|
143
|
+
when "boolean"
|
144
|
+
["yes", "true", "1"].include?(v)
|
145
|
+
when "timestamp"
|
146
|
+
v.empty? ? nil : Time.strptime(v, c["format"])
|
147
|
+
else
|
148
|
+
raise "Unsupported type #{type}"
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
def find_by_path(e, path)
|
154
|
+
raise NotImplementedError.new("Find by path is unsupported")
|
126
155
|
end
|
127
156
|
end
|
128
157
|
|
129
158
|
class IterXML < Iter
|
130
|
-
def initialize(data,
|
159
|
+
def initialize(schema, data, config)
|
131
160
|
require "rexml/document"
|
132
161
|
super
|
133
162
|
@doc = REXML::Document.new(@data)
|
134
163
|
end
|
135
164
|
|
136
165
|
def each
|
137
|
-
@doc.elements.each(@path) do |e|
|
138
|
-
|
166
|
+
@doc.elements.each(@config["path"]) do |e|
|
167
|
+
dest = {}
|
139
168
|
e.elements.each do |d|
|
140
|
-
|
169
|
+
dest[d.name] = d.text
|
141
170
|
end
|
142
|
-
yield
|
171
|
+
yield make_record(dest)
|
143
172
|
end
|
144
173
|
end
|
145
174
|
end
|
146
175
|
|
147
176
|
class IterJson < Iter
|
148
|
-
def initialize(data,
|
177
|
+
def initialize(schema, data, config)
|
149
178
|
require "jsonpath"
|
150
179
|
super
|
151
|
-
@jsonpath = JsonPath.new(@path)
|
180
|
+
@jsonpath = JsonPath.new(@config["path"])
|
152
181
|
end
|
153
182
|
|
154
183
|
def each
|
155
184
|
@jsonpath.on(@data).flatten.each do |e|
|
156
|
-
|
157
|
-
yield e
|
185
|
+
yield make_record(e)
|
158
186
|
end
|
159
187
|
end
|
188
|
+
|
189
|
+
def find_by_path(e, path)
|
190
|
+
JsonPath.on(e, path).first
|
191
|
+
end
|
160
192
|
end
|
161
193
|
|
162
194
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-http
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Takuma kanari
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: jsonpath
|
@@ -24,6 +24,26 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0.5'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bracecomp
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.1'
|
34
|
+
- - ">="
|
35
|
+
- !ruby/object:Gem::Version
|
36
|
+
version: 0.1.2
|
37
|
+
type: :runtime
|
38
|
+
prerelease: false
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - "~>"
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0.1'
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: 0.1.2
|
27
47
|
- !ruby/object:Gem::Dependency
|
28
48
|
name: bundler
|
29
49
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,7 +72,7 @@ dependencies:
|
|
52
72
|
- - "~>"
|
53
73
|
- !ruby/object:Gem::Version
|
54
74
|
version: 0.9.2
|
55
|
-
description:
|
75
|
+
description: Fetch data via http
|
56
76
|
email:
|
57
77
|
- chemtrails.t@gmail.com
|
58
78
|
executables: []
|
@@ -65,6 +85,7 @@ files:
|
|
65
85
|
- README.md
|
66
86
|
- Rakefile
|
67
87
|
- embulk-input-http.gemspec
|
88
|
+
- example/json-example-expand.yml
|
68
89
|
- example/json-example.yml
|
69
90
|
- lib/embulk/input/http.rb
|
70
91
|
homepage: https://github.com/takumakanari/embulk-input-http
|