embulk-input-http 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d8ea58d778613a1a60214badcea8ea098f9dab77
4
- data.tar.gz: c5eb80607d79558874a009a2dc47f12696053c7f
3
+ metadata.gz: 0bc5ae1c890470e5d30b21ced647bd9e85e8a272
4
+ data.tar.gz: 77c89043f7c45fae6750eb84152053f10ae691a1
5
5
  SHA512:
6
- metadata.gz: 98ab84c0d458317898947ded60b4edfd6b4c132b1aaea930a485d76b14e408b742863fac125172bb46d86d54e9e6b6ea6c3aa6229832054b0dbd3233361ffe90
7
- data.tar.gz: 984d9997c12a42438546d8b4acf28053fd2d3c6b5620fde4cbbf07718a6cdce631a7e0ffac8b5156005a284856c884c6c28f1f66bc7d8b3e24fca4922093d4ff
6
+ metadata.gz: be0f73d6d3e4f33d7dcba51208e32446b5c31c4b4415bc6b415285234ec2982ffbc131a565914605b8d8331a371fd0daf143f4cf23f24a01d604be8dce1999fc
7
+ data.tar.gz: 44c57e268a4a9e4d0ecc7c8e06f836e62b7f69f2c760306665aba9e714eb7e522b025daabf9c23b34029ba9b5820094bd32c0170203ae78daaa3bb8f4eec57e0
data/README.md CHANGED
@@ -22,14 +22,14 @@ in:
22
22
  params:
23
23
  - {name: method, value: getStations}
24
24
  - {name: x, value: 135.0}
25
- - {name: y, value: 35.0}
25
+ - {name: y, value: "{30..35}.0", expand: true}
26
26
  schema:
27
27
  - {name: name, type: string}
28
28
  - {name: next, type: string}
29
29
  - {name: prev, type: string}
30
30
  - {name: distance, type: string}
31
- - {name: x, type: double}
32
- - {name: y, type: double}
31
+ - {name: lat, type: double, path: x}
32
+ - {name: lng, type: double, path: y}
33
33
  - {name: line, type: string}
34
34
  - {name: postal, type: string}
35
35
  iterate: {type: json, path: $.response.station}
@@ -42,7 +42,8 @@ in:
42
42
  - iterate: data type and path to find root data, json/xml is supported for now (required)
43
43
  - method: http method, get is used by default (optional)
44
44
  - params: pair of name/value to specify query parameter (optional)
45
-
45
+ - open_timeout: timeout to open connection (optional, 2 is used by default)
46
+ - read_timeout: timeout to read content via http (optional, 10 is used by default)
46
47
 
47
48
  ### Iterate data
48
49
 
@@ -71,6 +72,30 @@ You can iterate "students" node by the following condifuration:
71
72
 
72
73
  iterate: {type: json, path: $.students}
73
74
 
75
+ You can also specify a jsonpath as *path* in the schema section:
76
+
77
+ ```yaml
78
+ schema:
79
+ - {name: firstName, type: string, path: "names[0]"}
80
+ - {name: lastName, type: string, path: "names[1]"}
81
+ iterate: {type: json, path: $.students}
82
+ ```
83
+
84
+ Then you can build records from more complicated json such as the following:
85
+
86
+ ```json
87
+ {
88
+ "result" : "success",
89
+ "students" : [
90
+ { "names" : ["John", "Lennon"], "age" : 10 },
91
+ { "names" : ["Paul", "Maccartney"], "age" : 10 }
92
+ ]
93
+ }
94
+ ```
95
+
96
+ In this case, names[0] will be firstName of schema and names[1] will be lastName.
97
+
98
+
74
99
  #### xml
75
100
 
76
101
  You can also parse xml by specifying **path/to/node** style to *path*.
@@ -105,12 +130,23 @@ Configuration as below to iterate student node:
105
130
 
106
131
  iterate: {type: xml, path: data/students/student}
107
132
 
133
+ ### Brace expansion style in params
108
134
 
109
- ## TODO
135
+ In the *params* section, you can also specify multiple params by using **brace expansion style**.
110
136
 
137
+ ```yaml
138
+ params:
139
+ - {name: id, value: "{1..5}", expand: true}
140
+ - {name: name, value: "{John,Paul,George,Ringo}", expand: true}
141
+ ```
142
+
143
+ To use this style, set the *expand* parameter to true; each expanded query pattern is then sent as a separate request.
144
+
145
+
146
+ ## TODO
147
+ - Split input/formatter
111
148
  - BasicAuth
112
149
  - HTTP-proxy
113
- - Breace-expansion style parameter, such as curl
114
150
 
115
151
  ## Patch
116
152
 
@@ -4,11 +4,11 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
4
 
5
5
  Gem::Specification.new do |spec|
6
6
  spec.name = "embulk-input-http"
7
- spec.version = "0.0.2"
7
+ spec.version = "0.0.3"
8
8
  spec.authors = ["Takuma kanari"]
9
9
  spec.email = ["chemtrails.t@gmail.com"]
10
10
  spec.summary = %q{Embulk plugin for http input}
11
- spec.description = %q{fetch data via http}
11
+ spec.description = %q{Fetch data via http}
12
12
  spec.homepage = "https://github.com/takumakanari/embulk-input-http"
13
13
  spec.license = "MIT"
14
14
 
@@ -17,6 +17,7 @@ Gem::Specification.new do |spec|
17
17
  spec.require_paths = ["lib"]
18
18
 
19
19
  spec.add_dependency "jsonpath", "~> 0.5"
20
+ spec.add_dependency "bracecomp", "~> 0.1", ">= 0.1.2"
20
21
  spec.add_development_dependency "bundler", "~> 1.0"
21
22
  spec.add_development_dependency "rake", "~> 0.9.2"
22
23
  end
@@ -0,0 +1,21 @@
1
+ exec: {}
2
+ in:
3
+ type: http
4
+ url: http://express.heartrails.com/api/json
5
+ params:
6
+ - {name: method, value: getStations}
7
+ - {name: x, value: "{130..135}.0", expand: true}
8
+ - {name: y, value: "{30..35}.0", expand: true}
9
+ schema:
10
+ - {name: name, type: string}
11
+ - {name: next, type: string}
12
+ - {name: prev, type: string}
13
+ - {name: distance, type: string}
14
+ - {name: lat, type: double, path: x}
15
+ - {name: lng, type: double, path: y}
16
+ - {name: line, type: string}
17
+ - {name: postal, type: string}
18
+ method: get
19
+ iterate: {type: json, path: $.response.station}
20
+ out: {type: stdout}
21
+
@@ -11,8 +11,8 @@ in:
11
11
  - {name: next, type: string}
12
12
  - {name: prev, type: string}
13
13
  - {name: distance, type: string}
14
- - {name: x, type: double}
15
- - {name: y, type: double}
14
+ - {name: lat, type: double, path: x}
15
+ - {name: lng, type: double, path: y}
16
16
  - {name: line, type: string}
17
17
  - {name: postal, type: string}
18
18
  method: get
@@ -1,5 +1,6 @@
1
1
  require "net/http"
2
2
  require "uri"
3
+ require "bracecomp"
3
4
 
4
5
  module Embulk
5
6
  module Input
@@ -8,155 +9,186 @@ module Embulk
8
9
  Plugin.register_input("http", self)
9
10
 
10
11
  def self.transaction(config, &control)
11
- url = config.param("url", :string)
12
- schema = config.param("schema", :array)
13
- method = config.param("method", :string, default: "get")
12
+ task = {
13
+ :url => config.param("url", :string),
14
+ :method => config.param("method", :string, default: "get"),
15
+ :schema => config.param("schema", :array),
16
+ :iterate => config.param("iterate", :hash),
17
+ :open_timeout => config.param("open_timeout", :float, default: 2.0),
18
+ :read_timeout => config.param("read_timeout", :float, default: 10.0),
19
+ :done => config.param("done", :array, default: [])
20
+ }
14
21
  params = config.param("params", :array, default: [])
15
- iterate = config.param("iterate", :hash)
16
- open_timeout = config.param("open_timeout", :float, default: 2.0)
17
- read_timeout = config.param("read_timeout", :float, default: 10.0)
22
+ params_unexpand, params_expand = configure_queries(params)
18
23
 
19
- data_type = iterate["type"]
24
+ data_type = task[:iterate]["type"]
20
25
  unless ["json", "xml"].include?(data_type)
21
26
  raise "Unknown data_type #{data_type}, only supported for json or xml"
22
27
  end
23
28
 
24
- columns = schema.each_with_index.map do |c, i|
29
+ columns = task[:schema].each_with_index.map do |c, i|
25
30
  Column.new(i, c["name"], c["type"].to_sym)
26
31
  end
27
32
 
28
- task = {
29
- :url => url,
30
- :method => method,
31
- :params => params,
32
- :schema => schema,
33
- :iterate => iterate,
34
- :open_timeout => open_timeout,
35
- :read_timeout => read_timeout
36
- }
33
+ task[:params] = params_unexpand
34
+ task[:params_expand] = params_expand - task[:done]
35
+ num_of_threads = task[:params_expand].empty? ? 1 : task[:params_expand].size
37
36
 
38
- report = yield(task, columns, 1)
39
- config.merge(report["done"].flatten.compact)
40
- {}
37
+ report = yield(task, columns, num_of_threads)
38
+ {"done" => report.map{|r| r["done"]}.compact}
39
+ end
40
+
41
+ def self.configure_queries(params)
42
+ base = params.select{|p| !p["expand"]}.map do |p|
43
+ [p["name"], p["value"]]
44
+ end
45
+ expands = params.select{|p| p["expand"] }.map do |p|
46
+ p["value"].expand.map do |v|
47
+ [p["name"], v]
48
+ end
49
+ end
50
+ if expands.size > 0
51
+ dest = expands.first.product(*(expands.slice(1, expands.size - 1)))
52
+ dest.sort!{|a, b| "#{a[0]}=#{a[1]}" <=> "#{b[0]}=#{b[1]}"}
53
+ else
54
+ dest = []
55
+ end
56
+ [base, dest]
41
57
  end
42
58
 
43
59
  def run
44
60
  schema = @task["schema"]
45
61
  iterate = @task["iterate"]
62
+ url = @task["url"]
63
+ method = @task["method"]
46
64
 
47
- data = fetch.body
65
+ params_expand = @task["params_expand"][@index] || []
66
+ query = URI.encode_www_form(@task["params"] + params_expand)
67
+ puts "#{@index}: #{method.upcase} #{url}?#{query}"
68
+
69
+ data = fetch(url, method, query).body
48
70
  data_type = iterate["type"]
49
71
 
50
72
  case data_type
51
- when "json"
52
- iter = IterJson.new(data, iterate["path"])
53
- when "xml"
54
- iter = IterXML.new(data, iterate["path"])
55
- else
56
- raise "Unsupported data_type #{data_type}"
73
+ when "json"
74
+ iter = IterJson.new(schema, data, iterate)
75
+ when "xml"
76
+ iter = IterXML.new(schema, data, iterate)
77
+ else
78
+ raise "Unsupported data_type #{data_type}"
57
79
  end
58
80
 
59
- rows = 0
60
- iter.each do |e|
61
- rows += 1
62
- @page_builder.add(schema.map{|c|
63
- name = c["name"]
64
- type = c["type"]
65
- val = e[name].nil? ? "" : e[name]
66
- case type
67
- when "string"
68
- val
69
- when "long"
70
- val.to_i
71
- when "double"
72
- val.to_f
73
- when "boolean"
74
- ["yes", "true", "1"].include?(val)
75
- when "timestamp"
76
- (val.nil? || val.empty?) ? nil : Time.strptime(val, c["format"])
77
- else
78
- raise "Unsupported type #{type}"
79
- end
80
- })
81
+ iter.each do |record|
82
+ @page_builder.add(record)
81
83
  end
82
84
  @page_builder.finish
83
85
 
84
- {:rows => rows}
86
+ {:done => params_expand}
85
87
  end
86
88
 
87
89
  private
88
90
 
89
- def fetch
90
- uri = URI.parse(@task["url"])
91
- method = @task["method"]
92
- qs = URI.encode_www_form(@task["params"].map {|p|
93
- [p["name"], p["value"]]
94
- })
95
- puts "#{method.upcase} #{uri}?#{qs}"
91
+ def fetch(url, method, query)
92
+ uri = URI.parse(url)
96
93
 
97
94
  res = Net::HTTP.start(uri.host, uri.port) do |client|
98
95
  client.open_timeout = @task["open_timeout"]
99
96
  client.read_timeout = @task["read_timeout"]
100
97
  case method.downcase
101
- when "get"
102
- client.get([uri.path, qs].join("?"))
103
- when "post"
104
- client.post(uri.path, qs)
105
- else
106
- raise "Unsupported method #{method}"
98
+ when "get"
99
+ client.get([uri.path, query].join("?"))
100
+ when "post"
101
+ client.post(uri.path, query)
102
+ else
103
+ raise "Unsupported method #{method}"
107
104
  end
108
105
  end
109
106
 
110
107
  case res
111
- when Net::HTTPSuccess
112
- res
113
- else
114
- raise "Request is not successful, code=#{res.code}, value=#{res.body}"
108
+ when Net::HTTPSuccess
109
+ res
110
+ else
111
+ raise "Request is not successful, code=#{res.code}, value=#{res.body}"
115
112
  end
116
113
  end
117
114
 
118
115
  class Iter
119
- def initialize(data, path)
116
+ def initialize(schema, data, config)
117
+ @schema = schema
120
118
  @data = data
121
- @path = path
119
+ @config = config
122
120
  end
123
121
 
124
122
  def each
125
- raise NotImplementedError("each")
123
+ raise NotImplementedError.new("each")
124
+ end
125
+
126
+ private
127
+
128
+ def make_record(e)
129
+ @schema.map do |c|
130
+ name = c["name"]
131
+ path = c["path"]
132
+ val = path.nil? ? e[name] : find_by_path(e, path)
133
+
134
+ v = val.nil? ? "" : val
135
+ type = c["type"]
136
+ case type
137
+ when "string"
138
+ v
139
+ when "long"
140
+ v.to_i
141
+ when "double"
142
+ v.to_f
143
+ when "boolean"
144
+ ["yes", "true", "1"].include?(v)
145
+ when "timestamp"
146
+ v.empty? ? nil : Time.strptime(v, c["format"])
147
+ else
148
+ raise "Unsupported type #{type}"
149
+ end
150
+ end
151
+ end
152
+
153
+ def find_by_path(e, path)
154
+ raise NotImplementedError.new("Find by path is unsupported")
126
155
  end
127
156
  end
128
157
 
129
158
  class IterXML < Iter
130
- def initialize(data, path)
159
+ def initialize(schema, data, config)
131
160
  require "rexml/document"
132
161
  super
133
162
  @doc = REXML::Document.new(@data)
134
163
  end
135
164
 
136
165
  def each
137
- @doc.elements.each(@path) do |e|
138
- ret = {}
166
+ @doc.elements.each(@config["path"]) do |e|
167
+ dest = {}
139
168
  e.elements.each do |d|
140
- ret[d.name] = d.text
169
+ dest[d.name] = d.text
141
170
  end
142
- yield ret
171
+ yield make_record(dest)
143
172
  end
144
173
  end
145
174
  end
146
175
 
147
176
  class IterJson < Iter
148
- def initialize(data, path)
177
+ def initialize(schema, data, config)
149
178
  require "jsonpath"
150
179
  super
151
- @jsonpath = JsonPath.new(@path)
180
+ @jsonpath = JsonPath.new(@config["path"])
152
181
  end
153
182
 
154
183
  def each
155
184
  @jsonpath.on(@data).flatten.each do |e|
156
- raise "data is must be hash, but #{e.class}" unless e.instance_of?(Hash)
157
- yield e
185
+ yield make_record(e)
158
186
  end
159
187
  end
188
+
189
+ def find_by_path(e, path)
190
+ JsonPath.on(e, path).first
191
+ end
160
192
  end
161
193
 
162
194
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-http
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Takuma kanari
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-08 00:00:00.000000000 Z
11
+ date: 2015-03-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: jsonpath
@@ -24,6 +24,26 @@ dependencies:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0.5'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bracecomp
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.1'
34
+ - - ">="
35
+ - !ruby/object:Gem::Version
36
+ version: 0.1.2
37
+ type: :runtime
38
+ prerelease: false
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - "~>"
42
+ - !ruby/object:Gem::Version
43
+ version: '0.1'
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 0.1.2
27
47
  - !ruby/object:Gem::Dependency
28
48
  name: bundler
29
49
  requirement: !ruby/object:Gem::Requirement
@@ -52,7 +72,7 @@ dependencies:
52
72
  - - "~>"
53
73
  - !ruby/object:Gem::Version
54
74
  version: 0.9.2
55
- description: fetch data via http
75
+ description: Fetch data via http
56
76
  email:
57
77
  - chemtrails.t@gmail.com
58
78
  executables: []
@@ -65,6 +85,7 @@ files:
65
85
  - README.md
66
86
  - Rakefile
67
87
  - embulk-input-http.gemspec
88
+ - example/json-example-expand.yml
68
89
  - example/json-example.yml
69
90
  - lib/embulk/input/http.rb
70
91
  homepage: https://github.com/takumakanari/embulk-input-http