embulk-input-http 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d8ea58d778613a1a60214badcea8ea098f9dab77
4
- data.tar.gz: c5eb80607d79558874a009a2dc47f12696053c7f
3
+ metadata.gz: 0bc5ae1c890470e5d30b21ced647bd9e85e8a272
4
+ data.tar.gz: 77c89043f7c45fae6750eb84152053f10ae691a1
5
5
  SHA512:
6
- metadata.gz: 98ab84c0d458317898947ded60b4edfd6b4c132b1aaea930a485d76b14e408b742863fac125172bb46d86d54e9e6b6ea6c3aa6229832054b0dbd3233361ffe90
7
- data.tar.gz: 984d9997c12a42438546d8b4acf28053fd2d3c6b5620fde4cbbf07718a6cdce631a7e0ffac8b5156005a284856c884c6c28f1f66bc7d8b3e24fca4922093d4ff
6
+ metadata.gz: be0f73d6d3e4f33d7dcba51208e32446b5c31c4b4415bc6b415285234ec2982ffbc131a565914605b8d8331a371fd0daf143f4cf23f24a01d604be8dce1999fc
7
+ data.tar.gz: 44c57e268a4a9e4d0ecc7c8e06f836e62b7f69f2c760306665aba9e714eb7e522b025daabf9c23b34029ba9b5820094bd32c0170203ae78daaa3bb8f4eec57e0
data/README.md CHANGED
@@ -22,14 +22,14 @@ in:
22
22
  params:
23
23
  - {name: method, value: getStations}
24
24
  - {name: x, value: 135.0}
25
- - {name: y, value: 35.0}
25
+ - {name: y, value: "{30..35}.0", expand: true}
26
26
  schema:
27
27
  - {name: name, type: string}
28
28
  - {name: next, type: string}
29
29
  - {name: prev, type: string}
30
30
  - {name: distance, type: string}
31
- - {name: x, type: double}
32
- - {name: y, type: double}
31
+ - {name: lat, type: double, path: x}
32
+ - {name: lng, type: double, path: y}
33
33
  - {name: line, type: string}
34
34
  - {name: postal, type: string}
35
35
  iterate: {type: json, path: $.response.station}
@@ -42,7 +42,8 @@ in:
42
42
  - iterate: data type and path to find root data, json/xml is supported for now (required)
43
43
  - method: http method, get is used by default (optional)
44
44
  - params: pair of name/value to specify query parameter (optional)
45
-
45
+ - open_timeout: timeout to open connection (optional, 2 is used by default)
46
+ - read_timeout: timeout to read content via http (optional, 10 is used by default)
46
47
 
47
48
  ### Iterate data
48
49
 
@@ -71,6 +72,30 @@ You can iterate "students" node by the following condifuration:
71
72
 
72
73
  iterate: {type: json, path: $.students}
73
74
 
75
+ You can also specify a jsonpath as *path* in the schema section:
76
+
77
+ ```yaml
78
+ schema:
79
+ - {name: firstName, type: string, path: "names[0]"}
80
+ - {name: lastName, type: string, path: "names[1]"}
81
+ iterate: {type: json, path: $.students}
82
+ ```
83
+
84
+ Then you can build records from more complicated json such as the following:
85
+
86
+ ```json
87
+ {
88
+ "result" : "success",
89
+ "students" : [
90
+ { "names" : ["John", "Lennon"], "age" : 10 },
91
+ { "names" : ["Paul", "McCartney"], "age" : 10 }
92
+ ]
93
+ }
94
+ ```
95
+
96
+ In this case, names[0] will be firstName of schema and names[1] will be lastName.
97
+
98
+
74
99
  #### xml
75
100
 
76
101
  You can also parse xml by specifying a **path/to/node** style *path*.
@@ -105,12 +130,23 @@ Configuration as below to iterate student node:
105
130
 
106
131
  iterate: {type: xml, path: data/students/student}
107
132
 
133
+ ### Brace expansion style in params
108
134
 
109
- ## TODO
135
+ In the *params* section, you can also specify multiple params by using **brace expansion style**.
110
136
 
137
+ ```yaml
138
+ params:
139
+ - {name: id, value: "{1..5}", expand: true}
140
+ - {name: name, value: "{John,Paul,George,Ringo}", expand: true}
141
+ ```
142
+
143
+ To use this style, set the parameter *expand* to true; each expanded query pattern is then sent as a separate request.
144
+
145
+
146
+ ## TODO
147
+ - Split input/formatter
111
148
  - BasicAuth
112
149
  - HTTP-proxy
113
- - Breace-expansion style parameter, such as curl
114
150
 
115
151
  ## Patch
116
152
 
@@ -4,11 +4,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
 
5
5
  Gem::Specification.new do |spec|
6
6
  spec.name = "embulk-input-http"
7
- spec.version = "0.0.2"
7
+ spec.version = "0.0.3"
8
8
  spec.authors = ["Takuma kanari"]
9
9
  spec.email = ["chemtrails.t@gmail.com"]
10
10
  spec.summary = %q{Embulk plugin for http input}
11
- spec.description = %q{fetch data via http}
11
+ spec.description = %q{Fetch data via http}
12
12
  spec.homepage = "https://github.com/takumakanari/embulk-input-http"
13
13
  spec.license = "MIT"
14
14
 
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
17
17
  spec.require_paths = ["lib"]
18
18
 
19
19
  spec.add_dependency "jsonpath", "~> 0.5"
20
+ spec.add_dependency "bracecomp", "~> 0.1", ">= 0.1.2"
20
21
  spec.add_development_dependency "bundler", "~> 1.0"
21
22
  spec.add_development_dependency "rake", "~> 0.9.2"
22
23
  end
@@ -0,0 +1,21 @@
1
+ exec: {}
2
+ in:
3
+ type: http
4
+ url: http://express.heartrails.com/api/json
5
+ params:
6
+ - {name: method, value: getStations}
7
+ - {name: x, value: "{130..135}.0", expand: true}
8
+ - {name: y, value: "{30..35}.0", expand: true}
9
+ schema:
10
+ - {name: name, type: string}
11
+ - {name: next, type: string}
12
+ - {name: prev, type: string}
13
+ - {name: distance, type: string}
14
+ - {name: lat, type: double, path: x}
15
+ - {name: lng, type: double, path: y}
16
+ - {name: line, type: string}
17
+ - {name: postal, type: string}
18
+ method: get
19
+ iterate: {type: json, path: $.response.station}
20
+ out: {type: stdout}
21
+
@@ -11,8 +11,8 @@ in:
11
11
  - {name: next, type: string}
12
12
  - {name: prev, type: string}
13
13
  - {name: distance, type: string}
14
- - {name: x, type: double}
15
- - {name: y, type: double}
14
+ - {name: lat, type: double, path: x}
15
+ - {name: lng, type: double, path: y}
16
16
  - {name: line, type: string}
17
17
  - {name: postal, type: string}
18
18
  method: get
@@ -1,5 +1,6 @@
1
1
  require "net/http"
2
2
  require "uri"
3
+ require "bracecomp"
3
4
 
4
5
  module Embulk
5
6
  module Input
@@ -8,155 +9,186 @@ module Embulk
8
9
  Plugin.register_input("http", self)
9
10
 
10
11
  def self.transaction(config, &control)
11
- url = config.param("url", :string)
12
- schema = config.param("schema", :array)
13
- method = config.param("method", :string, default: "get")
12
+ task = {
13
+ :url => config.param("url", :string),
14
+ :method => config.param("method", :string, default: "get"),
15
+ :schema => config.param("schema", :array),
16
+ :iterate => config.param("iterate", :hash),
17
+ :open_timeout => config.param("open_timeout", :float, default: 2.0),
18
+ :read_timeout => config.param("read_timeout", :float, default: 10.0),
19
+ :done => config.param("done", :array, default: [])
20
+ }
14
21
  params = config.param("params", :array, default: [])
15
- iterate = config.param("iterate", :hash)
16
- open_timeout = config.param("open_timeout", :float, default: 2.0)
17
- read_timeout = config.param("read_timeout", :float, default: 10.0)
22
+ params_unexpand, params_expand = configure_queries(params)
18
23
 
19
- data_type = iterate["type"]
24
+ data_type = task[:iterate]["type"]
20
25
  unless ["json", "xml"].include?(data_type)
21
26
  raise "Unknown data_type #{data_type}, only supported for json or xml"
22
27
  end
23
28
 
24
- columns = schema.each_with_index.map do |c, i|
29
+ columns = task[:schema].each_with_index.map do |c, i|
25
30
  Column.new(i, c["name"], c["type"].to_sym)
26
31
  end
27
32
 
28
- task = {
29
- :url => url,
30
- :method => method,
31
- :params => params,
32
- :schema => schema,
33
- :iterate => iterate,
34
- :open_timeout => open_timeout,
35
- :read_timeout => read_timeout
36
- }
33
+ task[:params] = params_unexpand
34
+ task[:params_expand] = params_expand - task[:done]
35
+ num_of_threads = task[:params_expand].empty? ? 1 : task[:params_expand].size
37
36
 
38
- report = yield(task, columns, 1)
39
- config.merge(report["done"].flatten.compact)
40
- {}
37
+ report = yield(task, columns, num_of_threads)
38
+ {"done" => report.map{|r| r["done"]}.compact}
39
+ end
40
+
41
+ def self.configure_queries(params)
42
+ base = params.select{|p| !p["expand"]}.map do |p|
43
+ [p["name"], p["value"]]
44
+ end
45
+ expands = params.select{|p| p["expand"] }.map do |p|
46
+ p["value"].expand.map do |v|
47
+ [p["name"], v]
48
+ end
49
+ end
50
+ if expands.size > 0
51
+ dest = expands.first.product(*(expands.slice(1, expands.size - 1)))
52
+ dest.sort!{|a, b| "#{a[0]}=#{a[1]}" <=> "#{b[0]}=#{b[1]}"}
53
+ else
54
+ dest = []
55
+ end
56
+ [base, dest]
41
57
  end
42
58
 
43
59
  def run
44
60
  schema = @task["schema"]
45
61
  iterate = @task["iterate"]
62
+ url = @task["url"]
63
+ method = @task["method"]
46
64
 
47
- data = fetch.body
65
+ params_expand = @task["params_expand"][@index] || []
66
+ query = URI.encode_www_form(@task["params"] + params_expand)
67
+ puts "#{@index}: #{method.upcase} #{url}?#{query}"
68
+
69
+ data = fetch(url, method, query).body
48
70
  data_type = iterate["type"]
49
71
 
50
72
  case data_type
51
- when "json"
52
- iter = IterJson.new(data, iterate["path"])
53
- when "xml"
54
- iter = IterXML.new(data, iterate["path"])
55
- else
56
- raise "Unsupported data_type #{data_type}"
73
+ when "json"
74
+ iter = IterJson.new(schema, data, iterate)
75
+ when "xml"
76
+ iter = IterXML.new(schema, data, iterate)
77
+ else
78
+ raise "Unsupported data_type #{data_type}"
57
79
  end
58
80
 
59
- rows = 0
60
- iter.each do |e|
61
- rows += 1
62
- @page_builder.add(schema.map{|c|
63
- name = c["name"]
64
- type = c["type"]
65
- val = e[name].nil? ? "" : e[name]
66
- case type
67
- when "string"
68
- val
69
- when "long"
70
- val.to_i
71
- when "double"
72
- val.to_f
73
- when "boolean"
74
- ["yes", "true", "1"].include?(val)
75
- when "timestamp"
76
- (val.nil? || val.empty?) ? nil : Time.strptime(val, c["format"])
77
- else
78
- raise "Unsupported type #{type}"
79
- end
80
- })
81
+ iter.each do |record|
82
+ @page_builder.add(record)
81
83
  end
82
84
  @page_builder.finish
83
85
 
84
- {:rows => rows}
86
+ {:done => params_expand}
85
87
  end
86
88
 
87
89
  private
88
90
 
89
- def fetch
90
- uri = URI.parse(@task["url"])
91
- method = @task["method"]
92
- qs = URI.encode_www_form(@task["params"].map {|p|
93
- [p["name"], p["value"]]
94
- })
95
- puts "#{method.upcase} #{uri}?#{qs}"
91
+ def fetch(url, method, query)
92
+ uri = URI.parse(url)
96
93
 
97
94
  res = Net::HTTP.start(uri.host, uri.port) do |client|
98
95
  client.open_timeout = @task["open_timeout"]
99
96
  client.read_timeout = @task["read_timeout"]
100
97
  case method.downcase
101
- when "get"
102
- client.get([uri.path, qs].join("?"))
103
- when "post"
104
- client.post(uri.path, qs)
105
- else
106
- raise "Unsupported method #{method}"
98
+ when "get"
99
+ client.get([uri.path, query].join("?"))
100
+ when "post"
101
+ client.post(uri.path, query)
102
+ else
103
+ raise "Unsupported method #{method}"
107
104
  end
108
105
  end
109
106
 
110
107
  case res
111
- when Net::HTTPSuccess
112
- res
113
- else
114
- raise "Request is not successful, code=#{res.code}, value=#{res.body}"
108
+ when Net::HTTPSuccess
109
+ res
110
+ else
111
+ raise "Request is not successful, code=#{res.code}, value=#{res.body}"
115
112
  end
116
113
  end
117
114
 
118
115
  class Iter
119
- def initialize(data, path)
116
+ def initialize(schema, data, config)
117
+ @schema = schema
120
118
  @data = data
121
- @path = path
119
+ @config = config
122
120
  end
123
121
 
124
122
  def each
125
- raise NotImplementedError("each")
123
+ raise NotImplementedError.new("each")
124
+ end
125
+
126
+ private
127
+
128
+ def make_record(e)
129
+ @schema.map do |c|
130
+ name = c["name"]
131
+ path = c["path"]
132
+ val = path.nil? ? e[name] : find_by_path(e, path)
133
+
134
+ v = val.nil? ? "" : val
135
+ type = c["type"]
136
+ case type
137
+ when "string"
138
+ v
139
+ when "long"
140
+ v.to_i
141
+ when "double"
142
+ v.to_f
143
+ when "boolean"
144
+ ["yes", "true", "1"].include?(v)
145
+ when "timestamp"
146
+ v.empty? ? nil : Time.strptime(v, c["format"])
147
+ else
148
+ raise "Unsupported type #{type}"
149
+ end
150
+ end
151
+ end
152
+
153
+ def find_by_path(e, path)
154
+ raise NotImplementedError.new("Find by path is unsupported")
126
155
  end
127
156
  end
128
157
 
129
158
  class IterXML < Iter
130
- def initialize(data, path)
159
+ def initialize(schema, data, config)
131
160
  require "rexml/document"
132
161
  super
133
162
  @doc = REXML::Document.new(@data)
134
163
  end
135
164
 
136
165
  def each
137
- @doc.elements.each(@path) do |e|
138
- ret = {}
166
+ @doc.elements.each(@config["path"]) do |e|
167
+ dest = {}
139
168
  e.elements.each do |d|
140
- ret[d.name] = d.text
169
+ dest[d.name] = d.text
141
170
  end
142
- yield ret
171
+ yield make_record(dest)
143
172
  end
144
173
  end
145
174
  end
146
175
 
147
176
  class IterJson < Iter
148
- def initialize(data, path)
177
+ def initialize(schema, data, config)
149
178
  require "jsonpath"
150
179
  super
151
- @jsonpath = JsonPath.new(@path)
180
+ @jsonpath = JsonPath.new(@config["path"])
152
181
  end
153
182
 
154
183
  def each
155
184
  @jsonpath.on(@data).flatten.each do |e|
156
- raise "data is must be hash, but #{e.class}" unless e.instance_of?(Hash)
157
- yield e
185
+ yield make_record(e)
158
186
  end
159
187
  end
188
+
189
+ def find_by_path(e, path)
190
+ JsonPath.on(e, path).first
191
+ end
160
192
  end
161
193
 
162
194
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-http
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takuma kanari
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-08 00:00:00.000000000 Z
11
+ date: 2015-03-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: jsonpath
@@ -24,6 +24,26 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bracecomp
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.1'
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 0.1.2
37
+ type: :runtime
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: '0.1'
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 0.1.2
27
47
  - !ruby/object:Gem::Dependency
28
48
  name: bundler
29
49
  requirement: !ruby/object:Gem::Requirement
@@ -52,7 +72,7 @@ dependencies:
52
72
  - - "~>"
53
73
  - !ruby/object:Gem::Version
54
74
  version: 0.9.2
55
- description: fetch data via http
75
+ description: Fetch data via http
56
76
  email:
57
77
  - chemtrails.t@gmail.com
58
78
  executables: []
@@ -65,6 +85,7 @@ files:
65
85
  - README.md
66
86
  - Rakefile
67
87
  - embulk-input-http.gemspec
88
+ - example/json-example-expand.yml
68
89
  - example/json-example.yml
69
90
  - lib/embulk/input/http.rb
70
91
  homepage: https://github.com/takumakanari/embulk-input-http