sinew 3.0.1 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,72 +1,61 @@
1
- require 'stringio'
2
- require 'zlib'
3
-
4
- #
5
- # An HTTP response.
6
- #
1
+ require "delegate"
2
+ require "hashie/mash"
3
+ require "json"
4
+ require "nokogiri"
7
5
 
8
6
  module Sinew
9
- class Response
10
- attr_accessor :request, :uri, :body, :code, :headers
7
+ # A wrapper around Faraday::Response, with some parsing helpers.
8
+ class Response < SimpleDelegator
9
+ # Like body, but tries to cleanup whitespace around HTML for easier parsing.
10
+ def html
11
+ @html ||= body.dup.tap do
12
+ # fix invalid utf8
13
+ if _1.encoding == Encoding::UTF_8
14
+ _1.encode!("UTF-8", invalid: :replace, undef: :replace, replace: "?")
15
+ end
11
16
 
12
- #
13
- # factory methods
14
- #
17
+ # squish
18
+ _1.strip!
19
+ _1.gsub!(/\s+/, " ")
15
20
 
16
- def self.from_network(request, fday_response)
17
- Response.new.tap do
18
- _1.request = request
19
- _1.uri = fday_response.env.url
20
- _1.code = fday_response.status
21
- _1.headers = fday_response.headers.to_h
22
- _1.body = process_body(fday_response)
21
+ # kill whitespace around tags
22
+ _1.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
23
23
  end
24
24
  end
25
25
 
26
- # helper for decoding bodies before parsing
27
- def self.process_body(response)
28
- body = response.body
29
-
30
- # inflate if necessary
31
- bits = body[0, 10].force_encoding('BINARY')
32
- if bits =~ /\A\x1f\x8b/n
33
- body = Zlib::GzipReader.new(StringIO.new(body)).read
34
- end
35
-
36
- # force to utf-8 if we think this could be text
37
- if body.encoding != Encoding::UTF_8
38
- if content_type = response.headers['content-type']
39
- if content_type =~ /\b(html|javascript|json|text|xml)\b/
40
- body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
41
- end
42
- end
43
- end
26
+ # Return body as JSON
27
+ def json
28
+ @json ||= JSON.parse(body, symbolize_names: true)
29
+ end
44
30
 
45
- body
31
+ # Return JSON body as Hashie::Mash
32
+ def mash
33
+ @mash ||= Hashie::Mash.new(json)
46
34
  end
47
35
 
48
- #
49
- # accessors
50
- #
36
+ # Return body HTML as Nokogiri document
37
+ def noko
38
+ @noko ||= Nokogiri::HTML(html)
39
+ end
51
40
 
52
- def error?
53
- code >= 400
41
+ # Return body XML as Nokogiri document
42
+ def xml
43
+ @xml ||= Nokogiri::XML(html)
54
44
  end
55
45
 
56
- def error_500?
57
- code / 100 >= 5
46
+ # Return the final URI for the request, after redirects
47
+ def url
48
+ env.url
58
49
  end
59
50
 
60
- def redirected?
61
- request.uri != uri
51
+ # Return the cache diskpath for this response
52
+ def diskpath
53
+ env[:httpdisk_diskpath]
62
54
  end
63
55
 
64
- def head_as_json
65
- {
66
- uri: uri,
67
- code: code,
68
- headers: headers,
69
- }
56
+ # Remove cached response from disk, if any
57
+ def uncache
58
+ File.unlink(diskpath) if File.exist?(diskpath)
70
59
  end
71
60
  end
72
61
  end
data/lib/sinew/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = '3.0.1'.freeze
3
+ VERSION = "4.0.1".freeze
4
4
  end
data/lib/sinew.rb CHANGED
@@ -1,9 +1,23 @@
1
- require_relative 'sinew/core_ext'
2
- require_relative 'sinew/dsl'
3
- require_relative 'sinew/main'
4
- require_relative 'sinew/nokogiri_ext'
5
- require_relative 'sinew/output'
6
- require_relative 'sinew/request'
7
- require_relative 'sinew/response'
8
- require_relative 'sinew/runtime_options'
9
- require_relative 'sinew/version'
1
+ # sinew
2
+ require "sinew/args"
3
+ require "sinew/base"
4
+ require "sinew/csv"
5
+ require "sinew/main"
6
+ require "sinew/nokogiri_ext"
7
+ require "sinew/response"
8
+ require "sinew/version"
9
+
10
+ # custom faraday middleware
11
+ require "sinew/middleware/log_formatter"
12
+
13
+ module Sinew
14
+ # flow control for --limit
15
+ class LimitError < StandardError; end
16
+
17
+ # shortcut for Sinew::Base.new
18
+ class << self
19
+ def new(**args)
20
+ Sinew::Base.new(**args)
21
+ end
22
+ end
23
+ end
data/sample.rb ADDED
@@ -0,0 +1,13 @@
1
+ require_relative "lib/sinew"
2
+
3
+ sinew = Sinew.new(output: "sample.csv", verbose: true)
4
+
5
+ response = sinew.get "http://httpbingo.org"
6
+ response.noko.css("ul li a").each do |a|
7
+ row = {}
8
+ row[:url] = a[:href]
9
+ row[:title] = a.text
10
+ sinew.csv_emit(row)
11
+ end
12
+
13
+ sinew.get "http://httpbingo.org/redirect/2"
data/sample.sinew CHANGED
@@ -1,9 +1,9 @@
1
- get 'http://httpbingo.org'
2
- noko.css('ul li a').each do |a|
1
+ response = sinew.get 'http://httpbingo.org'
2
+ response.noko.css('ul li a').each do |a|
3
3
  row = {}
4
4
  row[:url] = a[:href]
5
5
  row[:title] = a.text
6
- csv_emit(row)
6
+ sinew.csv_emit(row)
7
7
  end
8
8
 
9
- get 'http://httpbingo.org/redirect/2'
9
+ sinew.get 'http://httpbingo.org/redirect/2'
data/sinew.gemspec CHANGED
@@ -1,32 +1,36 @@
1
1
  $LOAD_PATH.unshift("#{__dir__}/lib")
2
2
 
3
- require 'sinew/version'
3
+ require "sinew/version"
4
4
 
5
5
  Gem::Specification.new do |s|
6
- s.name = 'sinew'
7
- s.version = Sinew::VERSION
8
- s.license = 'MIT'
9
- s.authors = [ 'Adam Doppelt', 'Nathan Kriege' ]
10
- s.email = [ 'amd@gurge.com' ]
11
- s.homepage = 'http://github.com/gurgeous/sinew'
12
- s.summary = 'Sinew - structured web crawling using recipes.'
13
- s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
14
- s.required_ruby_version = '>= 2.7'
6
+ s.name = "sinew"
7
+ s.version = Sinew::VERSION
8
+ s.authors = ["Adam Doppelt", "Nathan Kriege"]
9
+ s.email = ["amd@gurge.com"]
10
+
11
+ s.summary = "Sinew - structured web crawling using recipes."
12
+ s.description = "Crawl web sites easily using ruby recipes, with caching and nokogiri."
13
+ s.homepage = "http://github.com/gurgeous/sinew"
14
+ s.license = "MIT"
15
+ s.required_ruby_version = ">= 3.1"
15
16
 
16
17
  # what's in the gem?
17
18
  s.files = Dir.chdir(File.expand_path(__dir__)) do
18
19
  `git ls-files -z`.split("\x0").reject { _1.match(%r{^test/}) }
19
20
  end
20
- s.bindir = 'bin'
21
+ s.bindir = "bin"
21
22
  s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
22
- s.require_paths = [ 'lib' ]
23
+ s.require_paths = ["lib"]
23
24
 
24
- s.add_runtime_dependency 'amazing_print', '~> 1.3'
25
- s.add_runtime_dependency 'faraday', '~> 1.4'
26
- s.add_runtime_dependency 'faraday-encoding', '~> 0'
27
- s.add_runtime_dependency 'httpdisk', '~> 0'
28
- s.add_runtime_dependency 'nokogiri', '~> 1.11'
29
- s.add_runtime_dependency 'scripto', '~> 0'
30
- s.add_runtime_dependency 'slop', '~> 4.8'
31
- s.add_runtime_dependency 'sterile', '~> 1.0'
25
+ # gem dependencies
26
+ s.add_dependency "amazing_print", "~> 1.5"
27
+ s.add_dependency "faraday", "~> 2.7"
28
+ s.add_dependency "faraday-encoding", "~> 0.0"
29
+ s.add_dependency "faraday-rate_limiter", "~> 0.0"
30
+ s.add_dependency "faraday-retry", "~> 2.0"
31
+ s.add_dependency "hashie", "~> 5.0"
32
+ s.add_dependency "httpdisk", "~> 1.0"
33
+ s.add_dependency "nokogiri", "~> 1.15"
34
+ s.add_dependency "slop", "~> 4.10"
35
+ s.add_dependency "sterile", "~> 1.0"
32
36
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.1
4
+ version: 4.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Doppelt
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-06-04 00:00:00.000000000 Z
12
+ date: 2023-08-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: amazing_print
@@ -17,98 +17,126 @@ dependencies:
17
17
  requirements:
18
18
  - - "~>"
19
19
  - !ruby/object:Gem::Version
20
- version: '1.3'
20
+ version: '1.5'
21
21
  type: :runtime
22
22
  prerelease: false
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
25
  - - "~>"
26
26
  - !ruby/object:Gem::Version
27
- version: '1.3'
27
+ version: '1.5'
28
28
  - !ruby/object:Gem::Dependency
29
29
  name: faraday
30
30
  requirement: !ruby/object:Gem::Requirement
31
31
  requirements:
32
32
  - - "~>"
33
33
  - !ruby/object:Gem::Version
34
- version: '1.4'
34
+ version: '2.7'
35
35
  type: :runtime
36
36
  prerelease: false
37
37
  version_requirements: !ruby/object:Gem::Requirement
38
38
  requirements:
39
39
  - - "~>"
40
40
  - !ruby/object:Gem::Version
41
- version: '1.4'
41
+ version: '2.7'
42
42
  - !ruby/object:Gem::Dependency
43
43
  name: faraday-encoding
44
44
  requirement: !ruby/object:Gem::Requirement
45
45
  requirements:
46
46
  - - "~>"
47
47
  - !ruby/object:Gem::Version
48
- version: '0'
48
+ version: '0.0'
49
49
  type: :runtime
50
50
  prerelease: false
51
51
  version_requirements: !ruby/object:Gem::Requirement
52
52
  requirements:
53
53
  - - "~>"
54
54
  - !ruby/object:Gem::Version
55
- version: '0'
55
+ version: '0.0'
56
56
  - !ruby/object:Gem::Dependency
57
- name: httpdisk
57
+ name: faraday-rate_limiter
58
58
  requirement: !ruby/object:Gem::Requirement
59
59
  requirements:
60
60
  - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '0'
62
+ version: '0.0'
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
- version: '0'
69
+ version: '0.0'
70
70
  - !ruby/object:Gem::Dependency
71
- name: nokogiri
71
+ name: faraday-retry
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '2.0'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '2.0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: hashie
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '5.0'
91
+ type: :runtime
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - "~>"
96
+ - !ruby/object:Gem::Version
97
+ version: '5.0'
98
+ - !ruby/object:Gem::Dependency
99
+ name: httpdisk
72
100
  requirement: !ruby/object:Gem::Requirement
73
101
  requirements:
74
102
  - - "~>"
75
103
  - !ruby/object:Gem::Version
76
- version: '1.11'
104
+ version: '1.0'
77
105
  type: :runtime
78
106
  prerelease: false
79
107
  version_requirements: !ruby/object:Gem::Requirement
80
108
  requirements:
81
109
  - - "~>"
82
110
  - !ruby/object:Gem::Version
83
- version: '1.11'
111
+ version: '1.0'
84
112
  - !ruby/object:Gem::Dependency
85
- name: scripto
113
+ name: nokogiri
86
114
  requirement: !ruby/object:Gem::Requirement
87
115
  requirements:
88
116
  - - "~>"
89
117
  - !ruby/object:Gem::Version
90
- version: '0'
118
+ version: '1.15'
91
119
  type: :runtime
92
120
  prerelease: false
93
121
  version_requirements: !ruby/object:Gem::Requirement
94
122
  requirements:
95
123
  - - "~>"
96
124
  - !ruby/object:Gem::Version
97
- version: '0'
125
+ version: '1.15'
98
126
  - !ruby/object:Gem::Dependency
99
127
  name: slop
100
128
  requirement: !ruby/object:Gem::Requirement
101
129
  requirements:
102
130
  - - "~>"
103
131
  - !ruby/object:Gem::Version
104
- version: '4.8'
132
+ version: '4.10'
105
133
  type: :runtime
106
134
  prerelease: false
107
135
  version_requirements: !ruby/object:Gem::Requirement
108
136
  requirements:
109
137
  - - "~>"
110
138
  - !ruby/object:Gem::Version
111
- version: '4.8'
139
+ version: '4.10'
112
140
  - !ruby/object:Gem::Dependency
113
141
  name: sterile
114
142
  requirement: !ruby/object:Gem::Requirement
@@ -134,26 +162,23 @@ files:
134
162
  - ".github/workflows/test.yml"
135
163
  - ".gitignore"
136
164
  - ".rubocop.yml"
137
- - ".vscode/extensions.json"
138
- - ".vscode/settings.json"
139
165
  - Gemfile
166
+ - Gemfile.lock
140
167
  - LICENSE
141
168
  - README.md
142
169
  - Rakefile
143
170
  - bin/sinew
171
+ - justfile
144
172
  - lib/sinew.rb
145
- - lib/sinew/connection.rb
146
- - lib/sinew/connection/log_formatter.rb
147
- - lib/sinew/connection/rate_limit.rb
148
- - lib/sinew/core_ext.rb
149
- - lib/sinew/dsl.rb
173
+ - lib/sinew/args.rb
174
+ - lib/sinew/base.rb
175
+ - lib/sinew/csv.rb
150
176
  - lib/sinew/main.rb
177
+ - lib/sinew/middleware/log_formatter.rb
151
178
  - lib/sinew/nokogiri_ext.rb
152
- - lib/sinew/output.rb
153
- - lib/sinew/request.rb
154
179
  - lib/sinew/response.rb
155
- - lib/sinew/runtime_options.rb
156
180
  - lib/sinew/version.rb
181
+ - sample.rb
157
182
  - sample.sinew
158
183
  - sinew.gemspec
159
184
  homepage: http://github.com/gurgeous/sinew
@@ -168,14 +193,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
168
193
  requirements:
169
194
  - - ">="
170
195
  - !ruby/object:Gem::Version
171
- version: '2.7'
196
+ version: '3.1'
172
197
  required_rubygems_version: !ruby/object:Gem::Requirement
173
198
  requirements:
174
199
  - - ">="
175
200
  - !ruby/object:Gem::Version
176
201
  version: '0'
177
202
  requirements: []
178
- rubygems_version: 3.1.4
203
+ rubygems_version: 3.3.7
179
204
  signing_key:
180
205
  specification_version: 4
181
206
  summary: Sinew - structured web crawling using recipes.
@@ -1,3 +0,0 @@
1
- {
2
- "recommendations": ["rebornix.Ruby"]
3
- }
@@ -1,5 +0,0 @@
1
- {
2
- "files.associations": {
3
- "*.sinew": "ruby"
4
- }
5
- }
@@ -1,29 +0,0 @@
1
- module Sinew
2
- module Connection
3
- class RateLimit < Faraday::Middleware
4
- attr_reader :rate_limit
5
-
6
- def initialize(app, options = {})
7
- super(app)
8
-
9
- @last_request_tm = @current_request_tm = nil
10
- @rate_limit = options.fetch(:rate_limit, 1)
11
- end
12
-
13
- def on_request(_env)
14
- if @last_request_tm
15
- sleep = (@last_request_tm + rate_limit) - Time.now
16
- sleep(sleep) if sleep > 0
17
- end
18
-
19
- @current_request_tm = Time.now
20
- end
21
-
22
- def on_complete(env)
23
- # Only rate limit on uncached requests
24
- @last_request_tm = @current_request_tm unless env[:httpdisk]
25
- @current_request_tm = nil
26
- end
27
- end
28
- end
29
- end
@@ -1,52 +0,0 @@
1
- require 'faraday'
2
- require 'faraday-encoding'
3
- require 'faraday/logging/formatter'
4
- require 'httpdisk'
5
- require 'sinew/connection/log_formatter'
6
- require 'sinew/connection/rate_limit'
7
-
8
- module Sinew
9
- module Connection
10
- def self.create(options:, runtime_options:)
11
- connection_options = {}
12
- connection_options[:ssl] = { verify: false } if runtime_options.insecure
13
-
14
- Faraday.new(nil, connection_options) do
15
- _1.use RateLimit, rate_limit: runtime_options.rate_limit
16
-
17
- # auto-encode form bodies
18
- _1.request :url_encoded
19
-
20
- # Before httpdisk so each redirect segment is cached
21
- # Keep track of redirect status for logger
22
- _1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
23
-
24
- # set Ruby string encoding based on Content-Type (should be above httpdisk)
25
- _1.response :encoding
26
-
27
- # disk caching
28
- httpdisk_options = {
29
- dir: options[:cache],
30
- force: options[:force],
31
- force_errors: options[:force_errors],
32
- }.merge(runtime_options.httpdisk_options)
33
-
34
- _1.use :httpdisk, httpdisk_options
35
-
36
- # After httpdisk so that only non-cached requests are logged.
37
- # Before retry so that we don't log each retry attempt.
38
- _1.response :logger, nil, formatter: LogFormatter if !options[:quiet]
39
-
40
- # After httpdisk so transient failures are not cached
41
- retry_options = {
42
- interval: runtime_options.rate_limit,
43
- max: runtime_options.retries,
44
- methods: %w[delete get head options patch post put trace],
45
- retry_statuses: (500..600).to_a,
46
- retry_if: ->(_env, _err) { true },
47
- }
48
- _1.request :retry, retry_options
49
- end
50
- end
51
- end
52
- end
@@ -1,59 +0,0 @@
1
- #
2
- # A few core extensions brought over from ActiveSupport. These are handy for
3
- # parsing.
4
- #
5
-
6
- class String
7
- def squish
8
- dup.squish!
9
- end
10
-
11
- def squish!
12
- strip!
13
- gsub!(/\s+/, ' ')
14
- self
15
- end
16
-
17
- def first(limit = 1)
18
- if limit == 0
19
- ''
20
- elsif limit >= size
21
- dup
22
- else
23
- self[0..limit - 1]
24
- end
25
- end
26
-
27
- def last(limit = 1)
28
- if limit == 0
29
- ''
30
- elsif limit >= size
31
- dup
32
- else
33
- self[-limit..]
34
- end
35
- end
36
-
37
- alias starts_with? start_with?
38
- alias ends_with? end_with?
39
- end
40
-
41
- #
42
- # blank?/present?
43
- #
44
-
45
- class Object
46
- def blank?
47
- respond_to?(:empty?) ? !!empty? : !self
48
- end
49
-
50
- def present?
51
- !blank?
52
- end
53
- end
54
-
55
- class String
56
- def blank?
57
- !!(self =~ /\A\s*\z/)
58
- end
59
- end