sinew 2.0.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +26 -0
- data/.rubocop.yml +9 -6
- data/.vscode/settings.json +0 -10
- data/Gemfile +9 -0
- data/LICENSE +1 -1
- data/README.md +77 -58
- data/Rakefile +33 -18
- data/bin/sinew +8 -4
- data/lib/sinew.rb +0 -1
- data/lib/sinew/connection.rb +52 -0
- data/lib/sinew/connection/log_formatter.rb +22 -0
- data/lib/sinew/connection/rate_limit.rb +29 -0
- data/lib/sinew/core_ext.rb +1 -1
- data/lib/sinew/dsl.rb +27 -10
- data/lib/sinew/main.rb +7 -54
- data/lib/sinew/output.rb +26 -19
- data/lib/sinew/request.rb +28 -49
- data/lib/sinew/response.rb +25 -55
- data/lib/sinew/runtime_options.rb +4 -2
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +2 -2
- data/sinew.gemspec +16 -17
- metadata +41 -81
- data/.travis.yml +0 -4
- data/lib/sinew/cache.rb +0 -79
- data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
- data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
- data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
- data/test/legacy/eu.httpbin.org/status,500 +0 -1
- data/test/legacy/legacy.sinew +0 -2
- data/test/test.html +0 -45
- data/test/test_cache.rb +0 -69
- data/test/test_helper.rb +0 -113
- data/test/test_legacy.rb +0 -21
- data/test/test_main.rb +0 -46
- data/test/test_nokogiri_ext.rb +0 -18
- data/test/test_output.rb +0 -73
- data/test/test_requests.rb +0 -135
- data/test/test_utf8.rb +0 -39
data/lib/sinew/response.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
+
require 'stringio'
|
2
|
+
require 'zlib'
|
3
|
+
|
1
4
|
#
|
2
|
-
# An HTTP response.
|
5
|
+
# An HTTP response.
|
3
6
|
#
|
4
7
|
|
5
8
|
module Sinew
|
@@ -10,69 +13,36 @@ module Sinew
|
|
10
13
|
# factory methods
|
11
14
|
#
|
12
15
|
|
13
|
-
def self.from_network(request,
|
14
|
-
Response.new.tap do
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
# force to utf-8 as best we can
|
21
|
-
body = party_response.body
|
22
|
-
if body.encoding != Encoding::UTF_8
|
23
|
-
body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
24
|
-
end
|
25
|
-
response.body = body
|
16
|
+
def self.from_network(request, fday_response)
|
17
|
+
Response.new.tap do
|
18
|
+
_1.request = request
|
19
|
+
_1.uri = fday_response.env.url
|
20
|
+
_1.code = fday_response.status
|
21
|
+
_1.headers = fday_response.headers.to_h
|
22
|
+
_1.body = process_body(fday_response)
|
26
23
|
end
|
27
24
|
end
|
28
25
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
response.body = body
|
26
|
+
# helper for decoding bodies before parsing
|
27
|
+
def self.process_body(response)
|
28
|
+
body = response.body
|
33
29
|
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
30
|
+
# inflate if necessary
|
31
|
+
bits = body[0, 10].force_encoding('BINARY')
|
32
|
+
if bits =~ /\A\x1f\x8b/n
|
33
|
+
body = Zlib::GzipReader.new(StringIO.new(body)).read
|
34
|
+
end
|
38
35
|
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
36
|
+
# force to utf-8 if we think this could be text
|
37
|
+
if body.encoding != Encoding::UTF_8
|
38
|
+
if content_type = response.headers['content-type']
|
39
|
+
if content_type =~ /\b(html|javascript|json|text|xml)\b/
|
40
|
+
body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
43
41
|
end
|
44
|
-
head = JSON.parse(head, symbolize_names: true)
|
45
|
-
response.uri = URI.parse(head[:uri])
|
46
|
-
response.code = head[:code]
|
47
|
-
response.headers = head[:headers]
|
48
42
|
end
|
49
43
|
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def self.from_timeout(request)
|
53
|
-
Response.new.tap do |response|
|
54
|
-
response.request = request
|
55
|
-
response.uri = request.uri
|
56
|
-
response.body = 'timeout'
|
57
|
-
response.code = 999
|
58
|
-
response.headers = {}
|
59
|
-
end
|
60
|
-
end
|
61
44
|
|
62
|
-
|
63
|
-
response.tap do |response|
|
64
|
-
case head
|
65
|
-
when /\ACURLER_ERROR/
|
66
|
-
# error
|
67
|
-
response.code = 999
|
68
|
-
when /\AHTTP/
|
69
|
-
# redirect
|
70
|
-
location = head.scan(/Location: ([^\r\n]+)/).flatten.last
|
71
|
-
response.uri += location
|
72
|
-
else
|
73
|
-
$stderr.puts "unknown cached /head for #{response.uri}"
|
74
|
-
end
|
75
|
-
end
|
45
|
+
body
|
76
46
|
end
|
77
47
|
|
78
48
|
#
|
@@ -7,7 +7,8 @@ module Sinew
|
|
7
7
|
attr_accessor :retries
|
8
8
|
attr_accessor :rate_limit
|
9
9
|
attr_accessor :headers
|
10
|
-
attr_accessor :
|
10
|
+
attr_accessor :httpdisk_options
|
11
|
+
attr_accessor :insecure
|
11
12
|
|
12
13
|
def initialize
|
13
14
|
self.retries = 3
|
@@ -15,7 +16,8 @@ module Sinew
|
|
15
16
|
self.headers = {
|
16
17
|
'User-Agent' => "sinew/#{VERSION}",
|
17
18
|
}
|
18
|
-
self.
|
19
|
+
self.httpdisk_options = {}
|
20
|
+
self.insecure = false
|
19
21
|
|
20
22
|
# for testing
|
21
23
|
if ENV['SINEW_TEST']
|
data/lib/sinew/version.rb
CHANGED
data/sample.sinew
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
get 'http://
|
1
|
+
get 'http://httpbingo.org'
|
2
2
|
noko.css('ul li a').each do |a|
|
3
3
|
row = {}
|
4
4
|
row[:url] = a[:href]
|
@@ -6,4 +6,4 @@ noko.css('ul li a').each do |a|
|
|
6
6
|
csv_emit(row)
|
7
7
|
end
|
8
8
|
|
9
|
-
get 'http://
|
9
|
+
get 'http://httpbingo.org/redirect/2'
|
data/sinew.gemspec
CHANGED
@@ -5,30 +5,29 @@ require 'sinew/version'
|
|
5
5
|
Gem::Specification.new do |s|
|
6
6
|
s.name = 'sinew'
|
7
7
|
s.version = Sinew::VERSION
|
8
|
-
s.platform = Gem::Platform::RUBY
|
9
8
|
s.license = 'MIT'
|
10
|
-
s.authors = [ 'Adam Doppelt' ]
|
9
|
+
s.authors = [ 'Adam Doppelt', 'Nathan Kriege' ]
|
11
10
|
s.email = [ 'amd@gurge.com' ]
|
12
11
|
s.homepage = 'http://github.com/gurgeous/sinew'
|
13
12
|
s.summary = 'Sinew - structured web crawling using recipes.'
|
14
13
|
s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
|
15
|
-
s.required_ruby_version = '
|
14
|
+
s.required_ruby_version = '>= 2.7'
|
16
15
|
|
17
|
-
s
|
16
|
+
# what's in the gem?
|
17
|
+
s.files = Dir.chdir(File.expand_path(__dir__)) do
|
18
|
+
`git ls-files -z`.split("\x0").reject { _1.match(%r{^test/}) }
|
19
|
+
end
|
20
|
+
s.bindir = 'bin'
|
21
|
+
s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
|
22
|
+
s.require_paths = [ 'lib' ]
|
18
23
|
|
19
|
-
s.add_runtime_dependency '
|
24
|
+
s.add_runtime_dependency 'amazing_print', '~> 1.3'
|
25
|
+
s.add_runtime_dependency 'faraday', '~> 1.4'
|
26
|
+
s.add_runtime_dependency 'faraday-encoding', '~> 0'
|
20
27
|
s.add_runtime_dependency 'htmlentities', '~> 4.3'
|
21
|
-
s.add_runtime_dependency '
|
22
|
-
s.add_runtime_dependency 'nokogiri', '~> 1.
|
28
|
+
s.add_runtime_dependency 'httpdisk', '~> 0'
|
29
|
+
s.add_runtime_dependency 'nokogiri', '~> 1.11'
|
23
30
|
s.add_runtime_dependency 'scripto', '~> 0'
|
24
|
-
s.add_runtime_dependency 'slop', '~> 4.
|
25
|
-
s.add_runtime_dependency '
|
26
|
-
s.add_development_dependency 'minitest', '~> 5.11'
|
27
|
-
s.add_development_dependency 'rake', '~> 12.3'
|
28
|
-
s.add_development_dependency 'webmock', '~> 3.4'
|
29
|
-
|
30
|
-
s.files = `git ls-files`.split("\n")
|
31
|
-
s.test_files = `git ls-files -- test/*`.split("\n")
|
32
|
-
s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
33
|
-
s.require_paths = [ 'lib' ]
|
31
|
+
s.add_runtime_dependency 'slop', '~> 4.8'
|
32
|
+
s.add_runtime_dependency 'sterile', '~> 1.0'
|
34
33
|
end
|
metadata
CHANGED
@@ -1,73 +1,74 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sinew
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 3.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Doppelt
|
8
|
-
|
8
|
+
- Nathan Kriege
|
9
|
+
autorequire:
|
9
10
|
bindir: bin
|
10
11
|
cert_chain: []
|
11
|
-
date:
|
12
|
+
date: 2021-05-11 00:00:00.000000000 Z
|
12
13
|
dependencies:
|
13
14
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
15
|
+
name: amazing_print
|
15
16
|
requirement: !ruby/object:Gem::Requirement
|
16
17
|
requirements:
|
17
18
|
- - "~>"
|
18
19
|
- !ruby/object:Gem::Version
|
19
|
-
version: '1.
|
20
|
+
version: '1.3'
|
20
21
|
type: :runtime
|
21
22
|
prerelease: false
|
22
23
|
version_requirements: !ruby/object:Gem::Requirement
|
23
24
|
requirements:
|
24
25
|
- - "~>"
|
25
26
|
- !ruby/object:Gem::Version
|
26
|
-
version: '1.
|
27
|
+
version: '1.3'
|
27
28
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
29
|
+
name: faraday
|
29
30
|
requirement: !ruby/object:Gem::Requirement
|
30
31
|
requirements:
|
31
32
|
- - "~>"
|
32
33
|
- !ruby/object:Gem::Version
|
33
|
-
version: '4
|
34
|
+
version: '1.4'
|
34
35
|
type: :runtime
|
35
36
|
prerelease: false
|
36
37
|
version_requirements: !ruby/object:Gem::Requirement
|
37
38
|
requirements:
|
38
39
|
- - "~>"
|
39
40
|
- !ruby/object:Gem::Version
|
40
|
-
version: '4
|
41
|
+
version: '1.4'
|
41
42
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
43
|
+
name: faraday-encoding
|
43
44
|
requirement: !ruby/object:Gem::Requirement
|
44
45
|
requirements:
|
45
46
|
- - "~>"
|
46
47
|
- !ruby/object:Gem::Version
|
47
|
-
version: '0
|
48
|
+
version: '0'
|
48
49
|
type: :runtime
|
49
50
|
prerelease: false
|
50
51
|
version_requirements: !ruby/object:Gem::Requirement
|
51
52
|
requirements:
|
52
53
|
- - "~>"
|
53
54
|
- !ruby/object:Gem::Version
|
54
|
-
version: '0
|
55
|
+
version: '0'
|
55
56
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
57
|
+
name: htmlentities
|
57
58
|
requirement: !ruby/object:Gem::Requirement
|
58
59
|
requirements:
|
59
60
|
- - "~>"
|
60
61
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
62
|
+
version: '4.3'
|
62
63
|
type: :runtime
|
63
64
|
prerelease: false
|
64
65
|
version_requirements: !ruby/object:Gem::Requirement
|
65
66
|
requirements:
|
66
67
|
- - "~>"
|
67
68
|
- !ruby/object:Gem::Version
|
68
|
-
version: '
|
69
|
+
version: '4.3'
|
69
70
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
71
|
+
name: httpdisk
|
71
72
|
requirement: !ruby/object:Gem::Requirement
|
72
73
|
requirements:
|
73
74
|
- - "~>"
|
@@ -81,75 +82,61 @@ dependencies:
|
|
81
82
|
- !ruby/object:Gem::Version
|
82
83
|
version: '0'
|
83
84
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
85
|
+
name: nokogiri
|
85
86
|
requirement: !ruby/object:Gem::Requirement
|
86
87
|
requirements:
|
87
88
|
- - "~>"
|
88
89
|
- !ruby/object:Gem::Version
|
89
|
-
version: '
|
90
|
+
version: '1.11'
|
90
91
|
type: :runtime
|
91
92
|
prerelease: false
|
92
93
|
version_requirements: !ruby/object:Gem::Requirement
|
93
94
|
requirements:
|
94
95
|
- - "~>"
|
95
96
|
- !ruby/object:Gem::Version
|
96
|
-
version: '
|
97
|
+
version: '1.11'
|
97
98
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
99
|
+
name: scripto
|
99
100
|
requirement: !ruby/object:Gem::Requirement
|
100
101
|
requirements:
|
101
102
|
- - "~>"
|
102
103
|
- !ruby/object:Gem::Version
|
103
|
-
version: '
|
104
|
+
version: '0'
|
104
105
|
type: :runtime
|
105
106
|
prerelease: false
|
106
107
|
version_requirements: !ruby/object:Gem::Requirement
|
107
108
|
requirements:
|
108
109
|
- - "~>"
|
109
110
|
- !ruby/object:Gem::Version
|
110
|
-
version: '
|
111
|
-
- !ruby/object:Gem::Dependency
|
112
|
-
name: minitest
|
113
|
-
requirement: !ruby/object:Gem::Requirement
|
114
|
-
requirements:
|
115
|
-
- - "~>"
|
116
|
-
- !ruby/object:Gem::Version
|
117
|
-
version: '5.11'
|
118
|
-
type: :development
|
119
|
-
prerelease: false
|
120
|
-
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
requirements:
|
122
|
-
- - "~>"
|
123
|
-
- !ruby/object:Gem::Version
|
124
|
-
version: '5.11'
|
111
|
+
version: '0'
|
125
112
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
113
|
+
name: slop
|
127
114
|
requirement: !ruby/object:Gem::Requirement
|
128
115
|
requirements:
|
129
116
|
- - "~>"
|
130
117
|
- !ruby/object:Gem::Version
|
131
|
-
version: '
|
132
|
-
type: :
|
118
|
+
version: '4.8'
|
119
|
+
type: :runtime
|
133
120
|
prerelease: false
|
134
121
|
version_requirements: !ruby/object:Gem::Requirement
|
135
122
|
requirements:
|
136
123
|
- - "~>"
|
137
124
|
- !ruby/object:Gem::Version
|
138
|
-
version: '
|
125
|
+
version: '4.8'
|
139
126
|
- !ruby/object:Gem::Dependency
|
140
|
-
name:
|
127
|
+
name: sterile
|
141
128
|
requirement: !ruby/object:Gem::Requirement
|
142
129
|
requirements:
|
143
130
|
- - "~>"
|
144
131
|
- !ruby/object:Gem::Version
|
145
|
-
version: '
|
146
|
-
type: :
|
132
|
+
version: '1.0'
|
133
|
+
type: :runtime
|
147
134
|
prerelease: false
|
148
135
|
version_requirements: !ruby/object:Gem::Requirement
|
149
136
|
requirements:
|
150
137
|
- - "~>"
|
151
138
|
- !ruby/object:Gem::Version
|
152
|
-
version: '
|
139
|
+
version: '1.0'
|
153
140
|
description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
|
154
141
|
email:
|
155
142
|
- amd@gurge.com
|
@@ -158,9 +145,9 @@ executables:
|
|
158
145
|
extensions: []
|
159
146
|
extra_rdoc_files: []
|
160
147
|
files:
|
148
|
+
- ".github/workflows/test.yml"
|
161
149
|
- ".gitignore"
|
162
150
|
- ".rubocop.yml"
|
163
|
-
- ".travis.yml"
|
164
151
|
- ".vscode/extensions.json"
|
165
152
|
- ".vscode/settings.json"
|
166
153
|
- Gemfile
|
@@ -169,7 +156,9 @@ files:
|
|
169
156
|
- Rakefile
|
170
157
|
- bin/sinew
|
171
158
|
- lib/sinew.rb
|
172
|
-
- lib/sinew/
|
159
|
+
- lib/sinew/connection.rb
|
160
|
+
- lib/sinew/connection/log_formatter.rb
|
161
|
+
- lib/sinew/connection/rate_limit.rb
|
173
162
|
- lib/sinew/core_ext.rb
|
174
163
|
- lib/sinew/dsl.rb
|
175
164
|
- lib/sinew/main.rb
|
@@ -181,56 +170,27 @@ files:
|
|
181
170
|
- lib/sinew/version.rb
|
182
171
|
- sample.sinew
|
183
172
|
- sinew.gemspec
|
184
|
-
- test/legacy/eu.httpbin.org/head/redirect,3
|
185
|
-
- test/legacy/eu.httpbin.org/head/status,500
|
186
|
-
- test/legacy/eu.httpbin.org/redirect,3
|
187
|
-
- test/legacy/eu.httpbin.org/status,500
|
188
|
-
- test/legacy/legacy.sinew
|
189
|
-
- test/test.html
|
190
|
-
- test/test_cache.rb
|
191
|
-
- test/test_helper.rb
|
192
|
-
- test/test_legacy.rb
|
193
|
-
- test/test_main.rb
|
194
|
-
- test/test_nokogiri_ext.rb
|
195
|
-
- test/test_output.rb
|
196
|
-
- test/test_requests.rb
|
197
|
-
- test/test_utf8.rb
|
198
173
|
homepage: http://github.com/gurgeous/sinew
|
199
174
|
licenses:
|
200
175
|
- MIT
|
201
176
|
metadata: {}
|
202
|
-
post_install_message:
|
177
|
+
post_install_message:
|
203
178
|
rdoc_options: []
|
204
179
|
require_paths:
|
205
180
|
- lib
|
206
181
|
required_ruby_version: !ruby/object:Gem::Requirement
|
207
182
|
requirements:
|
208
|
-
- - "
|
183
|
+
- - ">="
|
209
184
|
- !ruby/object:Gem::Version
|
210
|
-
version: '2.
|
185
|
+
version: '2.7'
|
211
186
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
212
187
|
requirements:
|
213
188
|
- - ">="
|
214
189
|
- !ruby/object:Gem::Version
|
215
190
|
version: '0'
|
216
191
|
requirements: []
|
217
|
-
|
218
|
-
|
219
|
-
signing_key:
|
192
|
+
rubygems_version: 3.1.4
|
193
|
+
signing_key:
|
220
194
|
specification_version: 4
|
221
195
|
summary: Sinew - structured web crawling using recipes.
|
222
|
-
test_files:
|
223
|
-
- test/legacy/eu.httpbin.org/head/redirect,3
|
224
|
-
- test/legacy/eu.httpbin.org/head/status,500
|
225
|
-
- test/legacy/eu.httpbin.org/redirect,3
|
226
|
-
- test/legacy/eu.httpbin.org/status,500
|
227
|
-
- test/legacy/legacy.sinew
|
228
|
-
- test/test.html
|
229
|
-
- test/test_cache.rb
|
230
|
-
- test/test_helper.rb
|
231
|
-
- test/test_legacy.rb
|
232
|
-
- test/test_main.rb
|
233
|
-
- test/test_nokogiri_ext.rb
|
234
|
-
- test/test_output.rb
|
235
|
-
- test/test_requests.rb
|
236
|
-
- test/test_utf8.rb
|
196
|
+
test_files: []
|