sinew 1.0.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
+ require_relative 'test_helper'
2
+
3
+ class TestRequests < MiniTest::Test
4
+ def test_user_agent # with no custom headers, the User-Agent reported in the JSON response should match /sinew/
5
+ sinew.dsl.get('http://httpbin.org/get', a: 1, b: 2)
6
+ assert_match(/sinew/, sinew.dsl.json[:headers][:'User-Agent'])
7
+ end
8
+
9
+ def test_basic_methods # exercises get, post and post_json and checks the parsed JSON of each response
10
+ sinew.dsl.get('http://httpbin.org/get', a: 1, b: 2)
11
+ assert_equal({ a: '1', b: '2' }, sinew.dsl.json[:args]) # get: params expected back under :args, as strings
12
+
13
+ sinew.dsl.post('http://httpbin.org/post', a: 1, b: 2)
14
+ assert_equal({ a: '1', b: '2' }, sinew.dsl.json[:form]) # post: params expected back under :form, as strings
15
+
16
+ sinew.dsl.post_json('http://httpbin.org/post', a: 1, b: 2)
17
+ assert_equal({ a: 1, b: 2 }, sinew.dsl.json[:json]) # post_json: values expected back under :json, still integers
18
+ end
19
+
20
+ def test_custom_headers # a User-Agent passed via headers: should be the one reported in the JSON response
21
+ sinew.dsl.http('get', 'http://httpbin.org/get', headers: { "User-Agent": '007' })
22
+ assert_match(/007/, sinew.dsl.json[:headers][:'User-Agent'])
23
+ end
24
+
25
+ def test_redirects # after a redirecting get, sinew.dsl.url should be the final URL, not the requested one
26
+ # absolute redirect
27
+ sinew.dsl.get('http://httpbin.org/redirect/2')
28
+ assert_equal 'http://httpbin.org/get', sinew.dsl.url
29
+
30
+ # and relative redirect
31
+ sinew.dsl.get('http://httpbin.org/relative-redirect/2')
32
+ assert_equal 'http://httpbin.org/get', sinew.dsl.url # same final URL expected as in the absolute case
33
+ end
34
+
35
+ def test_errors # a failed get should emit "failed with <code>" and leave a short marker string in raw
36
+ skip if test_network? # NOTE(review): skipped when test_network? is true; presumably relies on stubs from test_helper — confirm
37
+
38
+ # 500
39
+ assert_output(/failed with 500/) do
40
+ sinew.dsl.get('http://httpbin.org/status/500')
41
+ assert_equal '500', sinew.dsl.raw # raw is expected to hold the status code as a string
42
+ end
43
+
44
+ # timeout
45
+ assert_output(/failed with 999/) do # the timeout case is expected to be reported as 999
46
+ sinew.dsl.get('http://httpbin.org/delay/1')
47
+ assert_equal 'timeout', sinew.dsl.raw
48
+ end
49
+ end
50
+
51
+ def test_retry_timeout # the stub raises Timeout::Error twice, then answers 200 'done'; get should end with 'done'
52
+ skip if test_network?
53
+
54
+ errors = 2 # failures the stub still has to produce; decremented inside the stub block
55
+ stub_request(:get, %r{http://[^/]+/error}).to_return do
56
+ if errors > 0
57
+ errors -= 1
58
+ raise Timeout::Error
59
+ end
60
+ { body: 'done', status: 200 }
61
+ end
62
+ sinew.dsl.get('http://httpbin.org/error')
63
+ assert_equal 0, errors # both failures were consumed, i.e. the stub was hit again after each one
64
+ assert_equal 'done', sinew.dsl.raw
65
+ end
66
+
67
+ def test_retry_500 # the stub answers 500 twice, then 200 'done'; get should keep retrying and end with 'done'
68
+ skip if test_network?
69
+
70
+ errors = 2 # 500 responses the stub still has to produce; decremented inside the stub block
71
+ stub_request(:get, %r{http://[^/]+/error}).to_return do
72
+ if errors > 0
73
+ errors -= 1
74
+ next({ status: 500 }) # `next` ends only this block call; `return` would exit test_retry_500 itself and skip the assertions
75
+ end
76
+ { body: 'done', status: 200 }
77
+ end
78
+ sinew.dsl.get('http://httpbin.org/error')
79
+ assert_equal 0, errors # both 500s were consumed, i.e. the stub was hit again after each one
80
+ assert_equal 'done', sinew.dsl.raw
81
+ end
82
+
83
+ def test_cache_key
84
+ # empty
85
+ req = Sinew::Request.new(sinew, 'get', 'http://host')
86
+ assert_equal req.cache_key, 'host/_root_'
87
+
88
+ # path
89
+ req = Sinew::Request.new(sinew, 'get', 'http://host/path')
90
+ assert_equal req.cache_key, 'host/path'
91
+
92
+ # query
93
+ req = Sinew::Request.new(sinew, 'get', 'http://host/path', query: { a: 'b' })
94
+ assert_equal req.cache_key, 'host/path,a=b'
95
+
96
+ # post with body
97
+ req = Sinew::Request.new(sinew, 'post', 'http://host/path', body: { c: 'd' })
98
+ assert_equal req.cache_key, 'host/post,path,c=d'
99
+
100
+ # too long should turn into digest
101
+ path = 'xyz' * 123
102
+ req = Sinew::Request.new(sinew, 'get', "http://host/#{path}")
103
+ assert_equal "host/#{Digest::MD5.hexdigest(path)}", req.cache_key
104
+ end
105
+
106
+ def test_before_generate_cache_key # a before_generate_cache_key hook should be able to rewrite the key's query part
107
+ sinew.runtime_options.before_generate_cache_key = method(:redact_cache_key) # install the redacting helper as the hook
108
+ req = Sinew::Request.new(sinew, 'get', 'http://host', query: { secret: 'xyz' })
109
+ assert_equal 'host/secret=redacted', req.cache_key # 'xyz' must not appear in the cache key
110
+ end
111
+
112
+ def test_urls # checks the URI string that Sinew::Request builds from various input URLs
113
+ # simple
114
+ req = Sinew::Request.new(sinew, 'get', 'https://host')
115
+ assert_equal 'https://host', req.uri.to_s
116
+
117
+ # with query
118
+ req = Sinew::Request.new(sinew, 'get', 'https://host', query: { a: 1 })
119
+ assert_equal 'https://host?a=1', req.uri.to_s
120
+
121
+ # entity decoding
122
+ req = Sinew::Request.new(sinew, 'get', 'https://host?a=&lt;5')
123
+ assert_equal 'https://host?a=%3C5', req.uri.to_s # the HTML entity is expected to become a percent-encoded '<'
124
+
125
+ # sloppy urls
126
+ req = Sinew::Request.new(sinew, 'get', 'https://host?a=b c&d=f\'g')
127
+ assert_equal 'https://host?a=b%20c&d=f%27g', req.uri.to_s # space and apostrophe are expected to be percent-encoded
128
+ end
129
+
130
+ def redact_cache_key(key) # hook used by test_before_generate_cache_key: masks the secret param and returns key
131
+ key[:query].gsub!(/secret=[^&]+/, 'secret=redacted') # edits key[:query] in place; gsub!'s own return value is ignored
132
+ key
133
+ end
134
+ protected :redact_cache_key
135
+ end
@@ -0,0 +1,39 @@
1
+ require_relative 'test_helper'
2
+
3
+ class TestRequests < MiniTest::Test # NOTE(review): same class name as the request tests in the previous hunk; presumably meant to be a UTF-8 specific name — confirm
4
+ def test_get # raw should be UTF-8 encoded on a first get and on an identical second get
5
+ # network (or stub)
6
+ sinew.dsl.get('http://httpbin.org/get')
7
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
8
+
9
+ # disk
10
+ sinew.dsl.get('http://httpbin.org/get') # same URL again; presumably answered from sinew's cache — confirm
11
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
12
+ end
13
+
14
+ def test_utf8 # a page containing non-ASCII text should come back as UTF-8 with the character intact, both times
15
+ skip if !test_network? # only runs when test_network? is true
16
+
17
+ # network
18
+ sinew.dsl.get('http://httpbin.org/encoding/utf8')
19
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
20
+ assert_match(/∑/, sinew.dsl.raw)
21
+
22
+ # disk
23
+ sinew.dsl.get('http://httpbin.org/encoding/utf8') # same URL again; presumably answered from sinew's cache — confirm
24
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
25
+ assert_match(/∑/, sinew.dsl.raw)
26
+ end
27
+
28
+ def test_encode # raw from www.google.co.jp should be reported as UTF-8, both times
29
+ skip if !test_network? # only runs when test_network? is true
30
+
31
+ # network
32
+ sinew.dsl.get('https://www.google.co.jp')
33
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
34
+
35
+ # disk
36
+ sinew.dsl.get('https://www.google.co.jp') # same URL again; presumably answered from sinew's cache — confirm
37
+ assert_equal 'UTF-8', sinew.dsl.raw.encoding.name
38
+ end
39
+ end
metadata CHANGED
@@ -1,113 +1,155 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Doppelt
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-11-10 00:00:00.000000000 Z
11
+ date: 2018-05-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: activesupport
14
+ name: awesome_print
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '3.0'
19
+ version: '1.8'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '3.0'
26
+ version: '1.8'
27
27
  - !ruby/object:Gem::Dependency
28
- name: awesome_print
28
+ name: htmlentities
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - '>='
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '4.3'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - '>='
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '4.3'
41
41
  - !ruby/object:Gem::Dependency
42
- name: htmlentities
42
+ name: httparty
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - '>='
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '0.16'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - '>='
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
54
+ version: '0.16'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: nokogiri
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
- - - '>='
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '1.8'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '1.8'
69
+ - !ruby/object:Gem::Dependency
70
+ name: scripto
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
60
74
  - !ruby/object:Gem::Version
61
75
  version: '0'
62
76
  type: :runtime
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - '>='
80
+ - - "~>"
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
83
  - !ruby/object:Gem::Dependency
70
- name: stringex
84
+ name: slop
71
85
  requirement: !ruby/object:Gem::Requirement
72
86
  requirements:
73
- - - ~>
87
+ - - "~>"
74
88
  - !ruby/object:Gem::Version
75
- version: '2.0'
89
+ version: '4.6'
76
90
  type: :runtime
77
91
  prerelease: false
78
92
  version_requirements: !ruby/object:Gem::Requirement
79
93
  requirements:
80
- - - ~>
94
+ - - "~>"
81
95
  - !ruby/object:Gem::Version
82
- version: '2.0'
96
+ version: '4.6'
83
97
  - !ruby/object:Gem::Dependency
84
- name: trollop
98
+ name: stringex
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - '>='
101
+ - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '0'
103
+ version: '2.8'
90
104
  type: :runtime
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
- - - '>='
108
+ - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '0'
110
+ version: '2.8'
111
+ - !ruby/object:Gem::Dependency
112
+ name: minitest
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '5.11'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '5.11'
97
125
  - !ruby/object:Gem::Dependency
98
126
  name: rake
99
127
  requirement: !ruby/object:Gem::Requirement
100
128
  requirements:
101
- - - '>='
129
+ - - "~>"
102
130
  - !ruby/object:Gem::Version
103
- version: '0'
131
+ version: '12.3'
104
132
  type: :development
105
133
  prerelease: false
106
134
  version_requirements: !ruby/object:Gem::Requirement
107
135
  requirements:
108
- - - '>='
136
+ - - "~>"
109
137
  - !ruby/object:Gem::Version
110
- version: '0'
138
+ version: '12.3'
139
+ - !ruby/object:Gem::Dependency
140
+ name: webmock
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: '3.4'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: '3.4'
111
153
  description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
112
154
  email:
113
155
  - amd@gurge.com
@@ -116,29 +158,40 @@ executables:
116
158
  extensions: []
117
159
  extra_rdoc_files: []
118
160
  files:
119
- - .gitignore
161
+ - ".gitignore"
162
+ - ".rubocop.yml"
163
+ - ".travis.yml"
164
+ - ".vscode/extensions.json"
165
+ - ".vscode/settings.json"
120
166
  - Gemfile
121
167
  - LICENSE
122
168
  - README.md
123
169
  - Rakefile
124
170
  - bin/sinew
125
171
  - lib/sinew.rb
126
- - lib/sinew/curler.rb
172
+ - lib/sinew/cache.rb
173
+ - lib/sinew/core_ext.rb
174
+ - lib/sinew/dsl.rb
127
175
  - lib/sinew/main.rb
128
176
  - lib/sinew/nokogiri_ext.rb
129
- - lib/sinew/text_util.rb
130
- - lib/sinew/util.rb
177
+ - lib/sinew/output.rb
178
+ - lib/sinew/request.rb
179
+ - lib/sinew/response.rb
180
+ - lib/sinew/runtime_options.rb
131
181
  - lib/sinew/version.rb
132
182
  - sample.sinew
133
183
  - sinew.gemspec
134
- - test/helper.rb
135
184
  - test/test.html
136
- - test/test_curler.rb
185
+ - test/test_cache.rb
186
+ - test/test_helper.rb
137
187
  - test/test_main.rb
138
188
  - test/test_nokogiri_ext.rb
139
- - test/test_text_util.rb
189
+ - test/test_output.rb
190
+ - test/test_requests.rb
191
+ - test/test_utf8.rb
140
192
  homepage: http://github.com/gurgeous/sinew
141
- licenses: []
193
+ licenses:
194
+ - MIT
142
195
  metadata: {}
143
196
  post_install_message:
144
197
  rdoc_options: []
@@ -146,24 +199,26 @@ require_paths:
146
199
  - lib
147
200
  required_ruby_version: !ruby/object:Gem::Requirement
148
201
  requirements:
149
- - - '>='
202
+ - - "~>"
150
203
  - !ruby/object:Gem::Version
151
- version: '0'
204
+ version: '2.3'
152
205
  required_rubygems_version: !ruby/object:Gem::Requirement
153
206
  requirements:
154
- - - '>='
207
+ - - ">="
155
208
  - !ruby/object:Gem::Version
156
209
  version: '0'
157
210
  requirements: []
158
211
  rubyforge_project: sinew
159
- rubygems_version: 2.0.6
212
+ rubygems_version: 2.7.6
160
213
  signing_key:
161
214
  specification_version: 4
162
215
  summary: Sinew - structured web crawling using recipes.
163
216
  test_files:
164
- - test/helper.rb
165
217
  - test/test.html
166
- - test/test_curler.rb
218
+ - test/test_cache.rb
219
+ - test/test_helper.rb
167
220
  - test/test_main.rb
168
221
  - test/test_nokogiri_ext.rb
169
- - test/test_text_util.rb
222
+ - test/test_output.rb
223
+ - test/test_requests.rb
224
+ - test/test_utf8.rb
@@ -1,173 +0,0 @@
1
- require "uri"
2
-
3
- module Sinew
4
- class Curler
5
- class Error < StandardError ; end
6
-
7
- DEFAULT_OPTIONS = {
8
- :cache_errors => true,
9
- :max_time => 30,
10
- :retry => 3,
11
- :verbose => true,
12
- }
13
-
14
- attr_reader :url, :uri, :root
15
-
16
- def initialize(options = {})
17
- @options = DEFAULT_OPTIONS.merge(options)
18
- @curl_args = ["--silent", "--fail", "--user-agent", @options[:user_agent], "--max-time", @options[:max_time], "--retry", @options[:retry], "--location", "--max-redirs", "3"]
19
- @last_request = Time.at(0)
20
-
21
- @root = @options[:dir]
22
- if !@root
23
- if File.exists?(ENV["HOME"]) && File.stat(ENV["HOME"]).writable?
24
- @root = "#{ENV["HOME"]}/.sinew"
25
- else
26
- @root = "/tmp/sinew"
27
- end
28
- end
29
- end
30
-
31
- def get(url)
32
- curl(url, nil)
33
- end
34
-
35
- def post(url, body)
36
- curl(url, body)
37
- end
38
-
39
- def curl(url, body)
40
- #
41
- # prepare url/uri and calculate paths
42
- #
43
-
44
- @uri = url.is_a?(URI) ? url : Curler.url_to_uri(url.to_s)
45
- @url = @uri.to_s
46
-
47
- path = fullpath(@uri)
48
- path = "#{path},#{Util.pathify(body)}" if body
49
-
50
- # shorten long paths
51
- if path.length > 250
52
- dir, base = File.dirname(path), File.basename(path)
53
- path = "#{dir}/#{Util.md5(base)}"
54
- end
55
-
56
- head = "#{File.dirname(path)}/head/#{File.basename(path)}"
57
-
58
- if !File.exists?(path)
59
- verbose(body ? "curl #{@url} (POST)" : "curl #{@url}")
60
- tmp = "/tmp/curler_#{Util.random_text(6)}"
61
- tmph = "#{tmp}.head"
62
- begin
63
- rate_limit
64
- Util.mkdir_if_necessary(File.dirname(path))
65
- Util.mkdir_if_necessary(File.dirname(head))
66
- begin
67
- command = []
68
- command += @curl_args
69
- if body
70
- command += ["--data-binary", body]
71
- command += ["--header", "Content-Type: application/x-www-form-urlencoded"]
72
- end
73
- command += ["--output", tmp]
74
- command += ["--dump-header", tmph]
75
- command << @url
76
-
77
- Util.run("curl", command)
78
-
79
- # empty response?
80
- if !File.exists?(tmp)
81
- Util.touch(tmp)
82
- Util.touch(tmph)
83
- end
84
- rescue Util::RunError => e
85
- message = "curl error"
86
- if e.message =~ /(\d+)$/
87
- message = "#{message} (#{$1})"
88
- end
89
-
90
- # cache the error?
91
- if @options[:cache_errors]
92
- File.open(path, "w") { |f| f.puts "" }
93
- File.open(head, "w") { |f| f.puts "CURLER_ERROR\t#{message}" }
94
- end
95
-
96
- raise Error, message
97
- end
98
- Util.mv(tmp, path)
99
- Util.mv(tmph, head)
100
- ensure
101
- Util.rm_if_necessary(tmp)
102
- Util.rm_if_necessary(tmph)
103
- end
104
- end
105
-
106
- #
107
- # handle redirects (recalculate @uri/@url)
108
- #
109
-
110
- if File.exists?(head)
111
- head_contents = File.read(head)
112
- # handle cached errors
113
- if head_contents =~ /^CURLER_ERROR\t(.*)/
114
- raise Error, $1
115
- end
116
- original = @uri
117
- head_contents.scan(/\A(HTTP\/\d\.\d (\d+).*?\r\n\r\n)/m) do |i|
118
- headers, code = $1, $2
119
- if code =~ /^3/
120
- if redir = headers[/^Location: ([^\r\n]+)/, 1]
121
- @uri += redir
122
- @url = @uri.to_s
123
- end
124
- end
125
- end
126
- # kill unnecessary head files
127
- if original == @uri
128
- Util.rm(head)
129
- end
130
- end
131
-
132
- path
133
- end
134
-
135
- def verbose(s)
136
- $stderr.puts s if @options[:verbose]
137
- end
138
-
139
- #
140
- # helpers
141
- #
142
-
143
- def fullpath(uri)
144
- "#{@root}/#{Curler.uri_to_path(uri)}"
145
- end
146
-
147
- def uncache!(url)
148
- Util.rm_if_necessary("#{@root}/#{Curler.url_to_path(url)}")
149
- end
150
-
151
- def self.url_to_uri(url)
152
- url = url.gsub(" ", "%20")
153
- url = url.gsub("'", "%27")
154
- URI.parse(url)
155
- end
156
-
157
- def self.url_to_path(url)
158
- uri_to_path(url_to_uri(url))
159
- end
160
-
161
- def self.uri_to_path(uri)
162
- s = uri.path
163
- s = "#{s}?#{uri.query}" if uri.query
164
- "#{Util.pathify(uri.host)}/#{Util.pathify(s)}"
165
- end
166
-
167
- def rate_limit
168
- sleep = (@last_request + 1) - Time.now
169
- sleep(sleep) if sleep > 0
170
- @last_request = Time.now
171
- end
172
- end
173
- end