sinew 1.0.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,135 @@
1
+ require_relative 'test_helper'
2
+
3
+ class TestRequests < MiniTest::Test
4
+ def test_user_agent
5
+ sinew.dsl.get('http://httpbin.org/get', a: 1, b: 2)
6
+ assert_match(/sinew/, sinew.dsl.json[:headers][:'User-Agent'])
7
+ end
8
+
9
+ def test_basic_methods
10
+ sinew.dsl.get('http://httpbin.org/get', a: 1, b: 2)
11
+ assert_equal({ a: '1', b: '2' }, sinew.dsl.json[:args])
12
+
13
+ sinew.dsl.post('http://httpbin.org/post', a: 1, b: 2)
14
+ assert_equal({ a: '1', b: '2' }, sinew.dsl.json[:form])
15
+
16
+ sinew.dsl.post_json('http://httpbin.org/post', a: 1, b: 2)
17
+ assert_equal({ a: 1, b: 2 }, sinew.dsl.json[:json])
18
+ end
19
+
20
+ def test_custom_headers
21
+ sinew.dsl.http('get', 'http://httpbin.org/get', headers: { "User-Agent": '007' })
22
+ assert_match(/007/, sinew.dsl.json[:headers][:'User-Agent'])
23
+ end
24
+
25
+ def test_redirects
26
+ # absolute redirect
27
+ sinew.dsl.get('http://httpbin.org/redirect/2')
28
+ assert_equal 'http://httpbin.org/get', sinew.dsl.url
29
+
30
+ # and relative redirect
31
+ sinew.dsl.get('http://httpbin.org/relative-redirect/2')
32
+ assert_equal 'http://httpbin.org/get', sinew.dsl.url
33
+ end
34
+
35
+ def test_errors
36
+ skip if test_network?
37
+
38
+ # 500
39
+ assert_output(/failed with 500/) do
40
+ sinew.dsl.get('http://httpbin.org/status/500')
41
+ assert_equal '500', sinew.dsl.raw
42
+ end
43
+
44
+ # timeout
45
+ assert_output(/failed with 999/) do
46
+ sinew.dsl.get('http://httpbin.org/delay/1')
47
+ assert_equal 'timeout', sinew.dsl.raw
48
+ end
49
+ end
50
+
51
+ def test_retry_timeout
52
+ skip if test_network?
53
+
54
+ errors = 2
55
+ stub_request(:get, %r{http://[^/]+/error}).to_return do
56
+ if errors > 0
57
+ errors -= 1
58
+ raise Timeout::Error
59
+ end
60
+ { body: 'done', status: 200 }
61
+ end
62
+ sinew.dsl.get('http://httpbin.org/error')
63
+ assert_equal 0, errors
64
+ assert_equal 'done', sinew.dsl.raw
65
+ end
66
+
67
+ def test_retry_500 # a request that gets 500 twice should be retried until the stub returns 200
68
+ skip if test_network?
69
+
70
+ errors = 2 # number of 500 responses the stub serves before succeeding
71
+ stub_request(:get, %r{http://[^/]+/error}).to_return do
72
+ if errors > 0
73
+ errors -= 1
74
+ next({ status: 500 }) # `next` ends only this block call; `return` would exit the test method and skip the assertions
75
+ end
76
+ { body: 'done', status: 200 }
77
+ end
78
+ sinew.dsl.get('http://httpbin.org/error')
79
+ assert_equal 0, errors # both stubbed failures were consumed
80
+ assert_equal 'done', sinew.dsl.raw
81
+ end
82
+
83
+ def test_cache_key # cache keys derived from host, path, query, method and body
84
+ # empty path is keyed as _root_
85
+ req = Sinew::Request.new(sinew, 'get', 'http://host')
86
+ assert_equal 'host/_root_', req.cache_key
87
+
88
+ # path
89
+ req = Sinew::Request.new(sinew, 'get', 'http://host/path')
90
+ assert_equal 'host/path', req.cache_key
91
+
92
+ # query is appended after a comma
93
+ req = Sinew::Request.new(sinew, 'get', 'http://host/path', query: { a: 'b' })
94
+ assert_equal 'host/path,a=b', req.cache_key
95
+
96
+ # post with body: method and body both appear in the key
97
+ req = Sinew::Request.new(sinew, 'post', 'http://host/path', body: { c: 'd' })
98
+ assert_equal 'host/post,path,c=d', req.cache_key
99
+
100
+ # too long should turn into digest
101
+ path = 'xyz' * 123
102
+ req = Sinew::Request.new(sinew, 'get', "http://host/#{path}")
103
+ assert_equal "host/#{Digest::MD5.hexdigest(path)}", req.cache_key
104
+ end
105
+
106
+ def test_before_generate_cache_key
107
+ sinew.runtime_options.before_generate_cache_key = method(:redact_cache_key)
108
+ req = Sinew::Request.new(sinew, 'get', 'http://host', query: { secret: 'xyz' })
109
+ assert_equal 'host/secret=redacted', req.cache_key
110
+ end
111
+
112
+ def test_urls
113
+ # simple
114
+ req = Sinew::Request.new(sinew, 'get', 'https://host')
115
+ assert_equal 'https://host', req.uri.to_s
116
+
117
+ # with query
118
+ req = Sinew::Request.new(sinew, 'get', 'https://host', query: { a: 1 })
119
+ assert_equal 'https://host?a=1', req.uri.to_s
120
+
121
+ # entity decoding
122
+ req = Sinew::Request.new(sinew, 'get', 'https://host?a=&lt;5')
123
+ assert_equal 'https://host?a=%3C5', req.uri.to_s
124
+
125
+ # sloppy urls
126
+ req = Sinew::Request.new(sinew, 'get', 'https://host?a=b c&d=f\'g')
127
+ assert_equal 'https://host?a=b%20c&d=f%27g', req.uri.to_s
128
+ end
129
+
130
+ def redact_cache_key(key)
131
+ key[:query].gsub!(/secret=[^&]+/, 'secret=redacted')
132
+ key
133
+ end
134
+ protected :redact_cache_key
135
+ end
@@ -0,0 +1,39 @@
1
+ require_relative 'test_helper'
2
+
3
+ class TestRequests < MiniTest::Test
4
+ def test_get
5
+ # network (or stub)
6
+ sinew.dsl.get('http://httpbin.org/get')
7
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
8
+
9
+ # disk
10
+ sinew.dsl.get('http://httpbin.org/get')
11
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
12
+ end
13
+
14
+ def test_utf8
15
+ skip if !test_network?
16
+
17
+ # network
18
+ sinew.dsl.get('http://httpbin.org/encoding/utf8')
19
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
20
+ assert_match(/∑/, sinew.dsl.raw)
21
+
22
+ # disk
23
+ sinew.dsl.get('http://httpbin.org/encoding/utf8')
24
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
25
+ assert_match(/∑/, sinew.dsl.raw)
26
+ end
27
+
28
+ def test_encode
29
+ skip if !test_network?
30
+
31
+ # network
32
+ sinew.dsl.get('https://www.google.co.jp')
33
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
34
+
35
+ # disk
36
+ sinew.dsl.get('https://www.google.co.jp')
37
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
38
+ end
39
+ end
metadata CHANGED
@@ -1,113 +1,155 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Doppelt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-10 00:00:00.000000000 Z
11
+ date: 2018-05-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: activesupport
14
+ name: awesome_print
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '3.0'
19
+ version: '1.8'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '3.0'
26
+ version: '1.8'
27
27
  - !ruby/object:Gem::Dependency
28
- name: awesome_print
28
+ name: htmlentities
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '4.3'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '4.3'
41
41
  - !ruby/object:Gem::Dependency
42
- name: htmlentities
42
+ name: httparty
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '>='
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '0.16'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - '>='
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '0.16'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: nokogiri
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '>='
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.8'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.8'
69
+ - !ruby/object:Gem::Dependency
70
+ name: scripto
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
60
74
  - !ruby/object:Gem::Version
61
75
  version: '0'
62
76
  type: :runtime
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - '>='
80
+ - - "~>"
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
- name: stringex
84
+ name: slop
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
- - - ~>
87
+ - - "~>"
74
88
  - !ruby/object:Gem::Version
75
- version: '2.0'
89
+ version: '4.6'
76
90
  type: :runtime
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
- - - ~>
94
+ - - "~>"
81
95
  - !ruby/object:Gem::Version
82
- version: '2.0'
96
+ version: '4.6'
83
97
  - !ruby/object:Gem::Dependency
84
- name: trollop
98
+ name: stringex
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - '>='
101
+ - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '0'
103
+ version: '2.8'
90
104
  type: :runtime
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
- - - '>='
108
+ - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '0'
110
+ version: '2.8'
111
+ - !ruby/object:Gem::Dependency
112
+ name: minitest
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '5.11'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '5.11'
97
125
  - !ruby/object:Gem::Dependency
98
126
  name: rake
99
127
  requirement: !ruby/object:Gem::Requirement
100
128
  requirements:
101
- - - '>='
129
+ - - "~>"
102
130
  - !ruby/object:Gem::Version
103
- version: '0'
131
+ version: '12.3'
104
132
  type: :development
105
133
  prerelease: false
106
134
  version_requirements: !ruby/object:Gem::Requirement
107
135
  requirements:
108
- - - '>='
136
+ - - "~>"
109
137
  - !ruby/object:Gem::Version
110
- version: '0'
138
+ version: '12.3'
139
+ - !ruby/object:Gem::Dependency
140
+ name: webmock
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '3.4'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '3.4'
111
153
  description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
112
154
  email:
113
155
  - amd@gurge.com
@@ -116,29 +158,40 @@ executables:
116
158
  extensions: []
117
159
  extra_rdoc_files: []
118
160
  files:
119
- - .gitignore
161
+ - ".gitignore"
162
+ - ".rubocop.yml"
163
+ - ".travis.yml"
164
+ - ".vscode/extensions.json"
165
+ - ".vscode/settings.json"
120
166
  - Gemfile
121
167
  - LICENSE
122
168
  - README.md
123
169
  - Rakefile
124
170
  - bin/sinew
125
171
  - lib/sinew.rb
126
- - lib/sinew/curler.rb
172
+ - lib/sinew/cache.rb
173
+ - lib/sinew/core_ext.rb
174
+ - lib/sinew/dsl.rb
127
175
  - lib/sinew/main.rb
128
176
  - lib/sinew/nokogiri_ext.rb
129
- - lib/sinew/text_util.rb
130
- - lib/sinew/util.rb
177
+ - lib/sinew/output.rb
178
+ - lib/sinew/request.rb
179
+ - lib/sinew/response.rb
180
+ - lib/sinew/runtime_options.rb
131
181
  - lib/sinew/version.rb
132
182
  - sample.sinew
133
183
  - sinew.gemspec
134
- - test/helper.rb
135
184
  - test/test.html
136
- - test/test_curler.rb
185
+ - test/test_cache.rb
186
+ - test/test_helper.rb
137
187
  - test/test_main.rb
138
188
  - test/test_nokogiri_ext.rb
139
- - test/test_text_util.rb
189
+ - test/test_output.rb
190
+ - test/test_requests.rb
191
+ - test/test_utf8.rb
140
192
  homepage: http://github.com/gurgeous/sinew
141
- licenses: []
193
+ licenses:
194
+ - MIT
142
195
  metadata: {}
143
196
  post_install_message:
144
197
  rdoc_options: []
@@ -146,24 +199,26 @@ require_paths:
146
199
  - lib
147
200
  required_ruby_version: !ruby/object:Gem::Requirement
148
201
  requirements:
149
- - - '>='
202
+ - - "~>"
150
203
  - !ruby/object:Gem::Version
151
- version: '0'
204
+ version: '2.3'
152
205
  required_rubygems_version: !ruby/object:Gem::Requirement
153
206
  requirements:
154
- - - '>='
207
+ - - ">="
155
208
  - !ruby/object:Gem::Version
156
209
  version: '0'
157
210
  requirements: []
158
211
  rubyforge_project: sinew
159
- rubygems_version: 2.0.6
212
+ rubygems_version: 2.7.6
160
213
  signing_key:
161
214
  specification_version: 4
162
215
  summary: Sinew - structured web crawling using recipes.
163
216
  test_files:
164
- - test/helper.rb
165
217
  - test/test.html
166
- - test/test_curler.rb
218
+ - test/test_cache.rb
219
+ - test/test_helper.rb
167
220
  - test/test_main.rb
168
221
  - test/test_nokogiri_ext.rb
169
- - test/test_text_util.rb
222
+ - test/test_output.rb
223
+ - test/test_requests.rb
224
+ - test/test_utf8.rb
@@ -1,173 +0,0 @@
1
- require "uri"
2
-
3
- module Sinew
4
- class Curler
5
- class Error < StandardError ; end
6
-
7
- DEFAULT_OPTIONS = {
8
- :cache_errors => true,
9
- :max_time => 30,
10
- :retry => 3,
11
- :verbose => true,
12
- }
13
-
14
- attr_reader :url, :uri, :root
15
-
16
- def initialize(options = {})
17
- @options = DEFAULT_OPTIONS.merge(options)
18
- @curl_args = ["--silent", "--fail", "--user-agent", @options[:user_agent], "--max-time", @options[:max_time], "--retry", @options[:retry], "--location", "--max-redirs", "3"]
19
- @last_request = Time.at(0)
20
-
21
- @root = @options[:dir]
22
- if !@root
23
- if File.exists?(ENV["HOME"]) && File.stat(ENV["HOME"]).writable?
24
- @root = "#{ENV["HOME"]}/.sinew"
25
- else
26
- @root = "/tmp/sinew"
27
- end
28
- end
29
- end
30
-
31
- def get(url)
32
- curl(url, nil)
33
- end
34
-
35
- def post(url, body)
36
- curl(url, body)
37
- end
38
-
39
- def curl(url, body)
40
- #
41
- # prepare url/uri and calculate paths
42
- #
43
-
44
- @uri = url.is_a?(URI) ? url : Curler.url_to_uri(url.to_s)
45
- @url = @uri.to_s
46
-
47
- path = fullpath(@uri)
48
- path = "#{path},#{Util.pathify(body)}" if body
49
-
50
- # shorten long paths
51
- if path.length > 250
52
- dir, base = File.dirname(path), File.basename(path)
53
- path = "#{dir}/#{Util.md5(base)}"
54
- end
55
-
56
- head = "#{File.dirname(path)}/head/#{File.basename(path)}"
57
-
58
- if !File.exists?(path)
59
- verbose(body ? "curl #{@url} (POST)" : "curl #{@url}")
60
- tmp = "/tmp/curler_#{Util.random_text(6)}"
61
- tmph = "#{tmp}.head"
62
- begin
63
- rate_limit
64
- Util.mkdir_if_necessary(File.dirname(path))
65
- Util.mkdir_if_necessary(File.dirname(head))
66
- begin
67
- command = []
68
- command += @curl_args
69
- if body
70
- command += ["--data-binary", body]
71
- command += ["--header", "Content-Type: application/x-www-form-urlencoded"]
72
- end
73
- command += ["--output", tmp]
74
- command += ["--dump-header", tmph]
75
- command << @url
76
-
77
- Util.run("curl", command)
78
-
79
- # empty response?
80
- if !File.exists?(tmp)
81
- Util.touch(tmp)
82
- Util.touch(tmph)
83
- end
84
- rescue Util::RunError => e
85
- message = "curl error"
86
- if e.message =~ /(\d+)$/
87
- message = "#{message} (#{$1})"
88
- end
89
-
90
- # cache the error?
91
- if @options[:cache_errors]
92
- File.open(path, "w") { |f| f.puts "" }
93
- File.open(head, "w") { |f| f.puts "CURLER_ERROR\t#{message}" }
94
- end
95
-
96
- raise Error, message
97
- end
98
- Util.mv(tmp, path)
99
- Util.mv(tmph, head)
100
- ensure
101
- Util.rm_if_necessary(tmp)
102
- Util.rm_if_necessary(tmph)
103
- end
104
- end
105
-
106
- #
107
- # handle redirects (recalculate @uri/@url)
108
- #
109
-
110
- if File.exists?(head)
111
- head_contents = File.read(head)
112
- # handle cached errors
113
- if head_contents =~ /^CURLER_ERROR\t(.*)/
114
- raise Error, $1
115
- end
116
- original = @uri
117
- head_contents.scan(/\A(HTTP\/\d\.\d (\d+).*?\r\n\r\n)/m) do |i|
118
- headers, code = $1, $2
119
- if code =~ /^3/
120
- if redir = headers[/^Location: ([^\r\n]+)/, 1]
121
- @uri += redir
122
- @url = @uri.to_s
123
- end
124
- end
125
- end
126
- # kill unnecessary head files
127
- if original == @uri
128
- Util.rm(head)
129
- end
130
- end
131
-
132
- path
133
- end
134
-
135
- def verbose(s)
136
- $stderr.puts s if @options[:verbose]
137
- end
138
-
139
- #
140
- # helpers
141
- #
142
-
143
- def fullpath(uri)
144
- "#{@root}/#{Curler.uri_to_path(uri)}"
145
- end
146
-
147
- def uncache!(url)
148
- Util.rm_if_necessary("#{@root}/#{Curler.url_to_path(url)}")
149
- end
150
-
151
- def self.url_to_uri(url)
152
- url = url.gsub(" ", "%20")
153
- url = url.gsub("'", "%27")
154
- URI.parse(url)
155
- end
156
-
157
- def self.url_to_path(url)
158
- uri_to_path(url_to_uri(url))
159
- end
160
-
161
- def self.uri_to_path(uri)
162
- s = uri.path
163
- s = "#{s}?#{uri.query}" if uri.query
164
- "#{Util.pathify(uri.host)}/#{Util.pathify(s)}"
165
- end
166
-
167
- def rate_limit
168
- sleep = (@last_request + 1) - Time.now
169
- sleep(sleep) if sleep > 0
170
- @last_request = Time.now
171
- end
172
- end
173
- end