sinew 2.0.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,8 @@
1
+ require 'stringio'
2
+ require 'zlib'
3
+
1
4
  #
2
- # An HTTP response. Mostly a wrapper around HTTParty.
5
+ # An HTTP response.
3
6
  #
4
7
 
5
8
  module Sinew
@@ -10,69 +13,36 @@ module Sinew
10
13
  # factory methods
11
14
  #
12
15
 
13
- def self.from_network(request, party_response)
14
- Response.new.tap do |response|
15
- response.request = request
16
- response.uri = party_response.request.last_uri
17
- response.code = party_response.code
18
- response.headers = party_response.headers.to_h
19
-
20
- # force to utf-8 as best we can
21
- body = party_response.body
22
- if body.encoding != Encoding::UTF_8
23
- body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
24
- end
25
- response.body = body
16
+ def self.from_network(request, fday_response)
17
+ Response.new.tap do
18
+ _1.request = request
19
+ _1.uri = fday_response.env.url
20
+ _1.code = fday_response.status
21
+ _1.headers = fday_response.headers.to_h
22
+ _1.body = process_body(fday_response)
26
23
  end
27
24
  end
28
25
 
29
- def self.from_cache(request, body, head)
30
- Response.new.tap do |response|
31
- response.request = request
32
- response.body = body
26
+ # helper for decoding bodies before parsing
27
+ def self.process_body(response)
28
+ body = response.body
33
29
 
34
- # defaults
35
- response.uri = request.uri
36
- response.code = 200
37
- response.headers = {}
30
+ # inflate if necessary
31
+ bits = body[0, 10].force_encoding('BINARY')
32
+ if bits =~ /\A\x1f\x8b/n
33
+ body = Zlib::GzipReader.new(StringIO.new(body)).read
34
+ end
38
35
 
39
- # overwrite with cached response headers
40
- if head
41
- if head !~ /^{/
42
- return from_legacy_head(response, head)
36
+ # force to utf-8 if we think this could be text
37
+ if body.encoding != Encoding::UTF_8
38
+ if content_type = response.headers['content-type']
39
+ if content_type =~ /\b(html|javascript|json|text|xml)\b/
40
+ body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
43
41
  end
44
- head = JSON.parse(head, symbolize_names: true)
45
- response.uri = URI.parse(head[:uri])
46
- response.code = head[:code]
47
- response.headers = head[:headers]
48
42
  end
49
43
  end
50
- end
51
-
52
- def self.from_timeout(request)
53
- Response.new.tap do |response|
54
- response.request = request
55
- response.uri = request.uri
56
- response.body = 'timeout'
57
- response.code = 999
58
- response.headers = {}
59
- end
60
- end
61
44
 
62
- def self.from_legacy_head(response, head)
63
- response.tap do |response|
64
- case head
65
- when /\ACURLER_ERROR/
66
- # error
67
- response.code = 999
68
- when /\AHTTP/
69
- # redirect
70
- location = head.scan(/Location: ([^\r\n]+)/).flatten.last
71
- response.uri += location
72
- else
73
- $stderr.puts "unknown cached /head for #{response.uri}"
74
- end
75
- end
45
+ body
76
46
  end
77
47
 
78
48
  #
@@ -7,7 +7,8 @@ module Sinew
7
7
  attr_accessor :retries
8
8
  attr_accessor :rate_limit
9
9
  attr_accessor :headers
10
- attr_accessor :before_generate_cache_key
10
+ attr_accessor :httpdisk_options
11
+ attr_accessor :insecure
11
12
 
12
13
  def initialize
13
14
  self.retries = 3
@@ -15,7 +16,8 @@ module Sinew
15
16
  self.headers = {
16
17
  'User-Agent' => "sinew/#{VERSION}",
17
18
  }
18
- self.before_generate_cache_key = ->(i) { i }
19
+ self.httpdisk_options = {}
20
+ self.insecure = false
19
21
 
20
22
  # for testing
21
23
  if ENV['SINEW_TEST']
data/lib/sinew/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = '2.0.1'.freeze
3
+ VERSION = '3.0.0'.freeze
4
4
  end
data/sample.sinew CHANGED
@@ -1,4 +1,4 @@
1
- get 'http://httpbin.org'
1
+ get 'http://httpbingo.org'
2
2
  noko.css('ul li a').each do |a|
3
3
  row = {}
4
4
  row[:url] = a[:href]
@@ -6,4 +6,4 @@ noko.css('ul li a').each do |a|
6
6
  csv_emit(row)
7
7
  end
8
8
 
9
- get 'http://httpbin.org/redirect/2'
9
+ get 'http://httpbingo.org/redirect/2'
data/sinew.gemspec CHANGED
@@ -5,30 +5,29 @@ require 'sinew/version'
5
5
  Gem::Specification.new do |s|
6
6
  s.name = 'sinew'
7
7
  s.version = Sinew::VERSION
8
- s.platform = Gem::Platform::RUBY
9
8
  s.license = 'MIT'
10
- s.authors = [ 'Adam Doppelt' ]
9
+ s.authors = [ 'Adam Doppelt', 'Nathan Kriege' ]
11
10
  s.email = [ 'amd@gurge.com' ]
12
11
  s.homepage = 'http://github.com/gurgeous/sinew'
13
12
  s.summary = 'Sinew - structured web crawling using recipes.'
14
13
  s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
15
- s.required_ruby_version = '~> 2.3'
14
+ s.required_ruby_version = '>= 2.7'
16
15
 
17
- s.rubyforge_project = 'sinew'
16
+ # what's in the gem?
17
+ s.files = Dir.chdir(File.expand_path(__dir__)) do
18
+ `git ls-files -z`.split("\x0").reject { _1.match(%r{^test/}) }
19
+ end
20
+ s.bindir = 'bin'
21
+ s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
22
+ s.require_paths = [ 'lib' ]
18
23
 
19
- s.add_runtime_dependency 'awesome_print', '~> 1.8'
24
+ s.add_runtime_dependency 'amazing_print', '~> 1.3'
25
+ s.add_runtime_dependency 'faraday', '~> 1.4'
26
+ s.add_runtime_dependency 'faraday-encoding', '~> 0'
20
27
  s.add_runtime_dependency 'htmlentities', '~> 4.3'
21
- s.add_runtime_dependency 'httparty', '~> 0.16'
22
- s.add_runtime_dependency 'nokogiri', '~> 1.8'
28
+ s.add_runtime_dependency 'httpdisk', '~> 0'
29
+ s.add_runtime_dependency 'nokogiri', '~> 1.11'
23
30
  s.add_runtime_dependency 'scripto', '~> 0'
24
- s.add_runtime_dependency 'slop', '~> 4.6'
25
- s.add_runtime_dependency 'stringex', '~> 2.8'
26
- s.add_development_dependency 'minitest', '~> 5.11'
27
- s.add_development_dependency 'rake', '~> 12.3'
28
- s.add_development_dependency 'webmock', '~> 3.4'
29
-
30
- s.files = `git ls-files`.split("\n")
31
- s.test_files = `git ls-files -- test/*`.split("\n")
32
- s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
33
- s.require_paths = [ 'lib' ]
31
+ s.add_runtime_dependency 'slop', '~> 4.8'
32
+ s.add_runtime_dependency 'sterile', '~> 1.0'
34
33
  end
metadata CHANGED
@@ -1,73 +1,74 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 3.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Doppelt
8
- autorequire:
8
+ - Nathan Kriege
9
+ autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2018-05-02 00:00:00.000000000 Z
12
+ date: 2021-05-11 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
- name: awesome_print
15
+ name: amazing_print
15
16
  requirement: !ruby/object:Gem::Requirement
16
17
  requirements:
17
18
  - - "~>"
18
19
  - !ruby/object:Gem::Version
19
- version: '1.8'
20
+ version: '1.3'
20
21
  type: :runtime
21
22
  prerelease: false
22
23
  version_requirements: !ruby/object:Gem::Requirement
23
24
  requirements:
24
25
  - - "~>"
25
26
  - !ruby/object:Gem::Version
26
- version: '1.8'
27
+ version: '1.3'
27
28
  - !ruby/object:Gem::Dependency
28
- name: htmlentities
29
+ name: faraday
29
30
  requirement: !ruby/object:Gem::Requirement
30
31
  requirements:
31
32
  - - "~>"
32
33
  - !ruby/object:Gem::Version
33
- version: '4.3'
34
+ version: '1.4'
34
35
  type: :runtime
35
36
  prerelease: false
36
37
  version_requirements: !ruby/object:Gem::Requirement
37
38
  requirements:
38
39
  - - "~>"
39
40
  - !ruby/object:Gem::Version
40
- version: '4.3'
41
+ version: '1.4'
41
42
  - !ruby/object:Gem::Dependency
42
- name: httparty
43
+ name: faraday-encoding
43
44
  requirement: !ruby/object:Gem::Requirement
44
45
  requirements:
45
46
  - - "~>"
46
47
  - !ruby/object:Gem::Version
47
- version: '0.16'
48
+ version: '0'
48
49
  type: :runtime
49
50
  prerelease: false
50
51
  version_requirements: !ruby/object:Gem::Requirement
51
52
  requirements:
52
53
  - - "~>"
53
54
  - !ruby/object:Gem::Version
54
- version: '0.16'
55
+ version: '0'
55
56
  - !ruby/object:Gem::Dependency
56
- name: nokogiri
57
+ name: htmlentities
57
58
  requirement: !ruby/object:Gem::Requirement
58
59
  requirements:
59
60
  - - "~>"
60
61
  - !ruby/object:Gem::Version
61
- version: '1.8'
62
+ version: '4.3'
62
63
  type: :runtime
63
64
  prerelease: false
64
65
  version_requirements: !ruby/object:Gem::Requirement
65
66
  requirements:
66
67
  - - "~>"
67
68
  - !ruby/object:Gem::Version
68
- version: '1.8'
69
+ version: '4.3'
69
70
  - !ruby/object:Gem::Dependency
70
- name: scripto
71
+ name: httpdisk
71
72
  requirement: !ruby/object:Gem::Requirement
72
73
  requirements:
73
74
  - - "~>"
@@ -81,75 +82,61 @@ dependencies:
81
82
  - !ruby/object:Gem::Version
82
83
  version: '0'
83
84
  - !ruby/object:Gem::Dependency
84
- name: slop
85
+ name: nokogiri
85
86
  requirement: !ruby/object:Gem::Requirement
86
87
  requirements:
87
88
  - - "~>"
88
89
  - !ruby/object:Gem::Version
89
- version: '4.6'
90
+ version: '1.11'
90
91
  type: :runtime
91
92
  prerelease: false
92
93
  version_requirements: !ruby/object:Gem::Requirement
93
94
  requirements:
94
95
  - - "~>"
95
96
  - !ruby/object:Gem::Version
96
- version: '4.6'
97
+ version: '1.11'
97
98
  - !ruby/object:Gem::Dependency
98
- name: stringex
99
+ name: scripto
99
100
  requirement: !ruby/object:Gem::Requirement
100
101
  requirements:
101
102
  - - "~>"
102
103
  - !ruby/object:Gem::Version
103
- version: '2.8'
104
+ version: '0'
104
105
  type: :runtime
105
106
  prerelease: false
106
107
  version_requirements: !ruby/object:Gem::Requirement
107
108
  requirements:
108
109
  - - "~>"
109
110
  - !ruby/object:Gem::Version
110
- version: '2.8'
111
- - !ruby/object:Gem::Dependency
112
- name: minitest
113
- requirement: !ruby/object:Gem::Requirement
114
- requirements:
115
- - - "~>"
116
- - !ruby/object:Gem::Version
117
- version: '5.11'
118
- type: :development
119
- prerelease: false
120
- version_requirements: !ruby/object:Gem::Requirement
121
- requirements:
122
- - - "~>"
123
- - !ruby/object:Gem::Version
124
- version: '5.11'
111
+ version: '0'
125
112
  - !ruby/object:Gem::Dependency
126
- name: rake
113
+ name: slop
127
114
  requirement: !ruby/object:Gem::Requirement
128
115
  requirements:
129
116
  - - "~>"
130
117
  - !ruby/object:Gem::Version
131
- version: '12.3'
132
- type: :development
118
+ version: '4.8'
119
+ type: :runtime
133
120
  prerelease: false
134
121
  version_requirements: !ruby/object:Gem::Requirement
135
122
  requirements:
136
123
  - - "~>"
137
124
  - !ruby/object:Gem::Version
138
- version: '12.3'
125
+ version: '4.8'
139
126
  - !ruby/object:Gem::Dependency
140
- name: webmock
127
+ name: sterile
141
128
  requirement: !ruby/object:Gem::Requirement
142
129
  requirements:
143
130
  - - "~>"
144
131
  - !ruby/object:Gem::Version
145
- version: '3.4'
146
- type: :development
132
+ version: '1.0'
133
+ type: :runtime
147
134
  prerelease: false
148
135
  version_requirements: !ruby/object:Gem::Requirement
149
136
  requirements:
150
137
  - - "~>"
151
138
  - !ruby/object:Gem::Version
152
- version: '3.4'
139
+ version: '1.0'
153
140
  description: Crawl web sites easily using ruby recipes, with caching and nokogiri.
154
141
  email:
155
142
  - amd@gurge.com
@@ -158,9 +145,9 @@ executables:
158
145
  extensions: []
159
146
  extra_rdoc_files: []
160
147
  files:
148
+ - ".github/workflows/test.yml"
161
149
  - ".gitignore"
162
150
  - ".rubocop.yml"
163
- - ".travis.yml"
164
151
  - ".vscode/extensions.json"
165
152
  - ".vscode/settings.json"
166
153
  - Gemfile
@@ -169,7 +156,9 @@ files:
169
156
  - Rakefile
170
157
  - bin/sinew
171
158
  - lib/sinew.rb
172
- - lib/sinew/cache.rb
159
+ - lib/sinew/connection.rb
160
+ - lib/sinew/connection/log_formatter.rb
161
+ - lib/sinew/connection/rate_limit.rb
173
162
  - lib/sinew/core_ext.rb
174
163
  - lib/sinew/dsl.rb
175
164
  - lib/sinew/main.rb
@@ -181,56 +170,27 @@ files:
181
170
  - lib/sinew/version.rb
182
171
  - sample.sinew
183
172
  - sinew.gemspec
184
- - test/legacy/eu.httpbin.org/head/redirect,3
185
- - test/legacy/eu.httpbin.org/head/status,500
186
- - test/legacy/eu.httpbin.org/redirect,3
187
- - test/legacy/eu.httpbin.org/status,500
188
- - test/legacy/legacy.sinew
189
- - test/test.html
190
- - test/test_cache.rb
191
- - test/test_helper.rb
192
- - test/test_legacy.rb
193
- - test/test_main.rb
194
- - test/test_nokogiri_ext.rb
195
- - test/test_output.rb
196
- - test/test_requests.rb
197
- - test/test_utf8.rb
198
173
  homepage: http://github.com/gurgeous/sinew
199
174
  licenses:
200
175
  - MIT
201
176
  metadata: {}
202
- post_install_message:
177
+ post_install_message:
203
178
  rdoc_options: []
204
179
  require_paths:
205
180
  - lib
206
181
  required_ruby_version: !ruby/object:Gem::Requirement
207
182
  requirements:
208
- - - "~>"
183
+ - - ">="
209
184
  - !ruby/object:Gem::Version
210
- version: '2.3'
185
+ version: '2.7'
211
186
  required_rubygems_version: !ruby/object:Gem::Requirement
212
187
  requirements:
213
188
  - - ">="
214
189
  - !ruby/object:Gem::Version
215
190
  version: '0'
216
191
  requirements: []
217
- rubyforge_project: sinew
218
- rubygems_version: 2.7.6
219
- signing_key:
192
+ rubygems_version: 3.1.4
193
+ signing_key:
220
194
  specification_version: 4
221
195
  summary: Sinew - structured web crawling using recipes.
222
- test_files:
223
- - test/legacy/eu.httpbin.org/head/redirect,3
224
- - test/legacy/eu.httpbin.org/head/status,500
225
- - test/legacy/eu.httpbin.org/redirect,3
226
- - test/legacy/eu.httpbin.org/status,500
227
- - test/legacy/legacy.sinew
228
- - test/test.html
229
- - test/test_cache.rb
230
- - test/test_helper.rb
231
- - test/test_legacy.rb
232
- - test/test_main.rb
233
- - test/test_nokogiri_ext.rb
234
- - test/test_output.rb
235
- - test/test_requests.rb
236
- - test/test_utf8.rb
196
+ test_files: []