sinew 3.0.1 → 4.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,72 +1,61 @@
1
- require 'stringio'
2
- require 'zlib'
3
-
4
- #
5
- # An HTTP response.
6
- #
1
+ require "delegate"
2
+ require "hashie/mash"
3
+ require "json"
4
+ require "nokogiri"
7
5
 
8
6
  module Sinew
9
- class Response
10
- attr_accessor :request, :uri, :body, :code, :headers
7
+ # A wrapper around Faraday::Response, with some parsing helpers.
8
+ class Response < SimpleDelegator
9
+ # Like body, but tries to cleanup whitespace around HTML for easier parsing.
10
+ def html
11
+ @html ||= body.dup.tap do
12
+ # fix invalid utf8
13
+ if _1.encoding == Encoding::UTF_8
14
+ _1.encode!("UTF-8", invalid: :replace, undef: :replace, replace: "?")
15
+ end
11
16
 
12
- #
13
- # factory methods
14
- #
17
+ # squish
18
+ _1.strip!
19
+ _1.gsub!(/\s+/, " ")
15
20
 
16
- def self.from_network(request, fday_response)
17
- Response.new.tap do
18
- _1.request = request
19
- _1.uri = fday_response.env.url
20
- _1.code = fday_response.status
21
- _1.headers = fday_response.headers.to_h
22
- _1.body = process_body(fday_response)
21
+ # kill whitespace around tags
22
+ _1.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
23
23
  end
24
24
  end
25
25
 
26
- # helper for decoding bodies before parsing
27
- def self.process_body(response)
28
- body = response.body
29
-
30
- # inflate if necessary
31
- bits = body[0, 10].force_encoding('BINARY')
32
- if bits =~ /\A\x1f\x8b/n
33
- body = Zlib::GzipReader.new(StringIO.new(body)).read
34
- end
35
-
36
- # force to utf-8 if we think this could be text
37
- if body.encoding != Encoding::UTF_8
38
- if content_type = response.headers['content-type']
39
- if content_type =~ /\b(html|javascript|json|text|xml)\b/
40
- body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
41
- end
42
- end
43
- end
26
+ # Return body as JSON
27
+ def json
28
+ @json ||= JSON.parse(body, symbolize_names: true)
29
+ end
44
30
 
45
- body
31
+ # Return JSON body as Hashie::Mash
32
+ def mash
33
+ @mash ||= Hashie::Mash.new(json)
46
34
  end
47
35
 
48
- #
49
- # accessors
50
- #
36
+ # Return body HTML as Nokogiri document
37
+ def noko
38
+ @noko ||= Nokogiri::HTML(html)
39
+ end
51
40
 
52
- def error?
53
- code >= 400
41
+ # Return body XML as Nokogiri document
42
+ def xml
43
+ @xml ||= Nokogiri::XML(html)
54
44
  end
55
45
 
56
- def error_500?
57
- code / 100 >= 5
46
+ # Return the final URI for the request, after redirects
47
+ def url
48
+ env.url
58
49
  end
59
50
 
60
- def redirected?
61
- request.uri != uri
51
+ # Return the cache diskpath for this response
52
+ def diskpath
53
+ env[:httpdisk_diskpath]
62
54
  end
63
55
 
64
- def head_as_json
65
- {
66
- uri: uri,
67
- code: code,
68
- headers: headers,
69
- }
56
+ # Remove cached response from disk, if any
57
+ def uncache
58
+ File.unlink(diskpath) if File.exist?(diskpath)
70
59
  end
71
60
  end
72
61
  end
data/lib/sinew/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = '3.0.1'.freeze
3
+ VERSION = "4.0.1".freeze
4
4
  end
data/lib/sinew.rb CHANGED
@@ -1,9 +1,23 @@
1
- require_relative 'sinew/core_ext'
2
- require_relative 'sinew/dsl'
3
- require_relative 'sinew/main'
4
- require_relative 'sinew/nokogiri_ext'
5
- require_relative 'sinew/output'
6
- require_relative 'sinew/request'
7
- require_relative 'sinew/response'
8
- require_relative 'sinew/runtime_options'
9
- require_relative 'sinew/version'
1
+ # sinew
2
+ require "sinew/args"
3
+ require "sinew/base"
4
+ require "sinew/csv"
5
+ require "sinew/main"
6
+ require "sinew/nokogiri_ext"
7
+ require "sinew/response"
8
+ require "sinew/version"
9
+
10
+ # custom faraday middleware
11
+ require "sinew/middleware/log_formatter"
12
+
13
+ module Sinew
14
+ # flow control for --limit
15
+ class LimitError < StandardError; end
16
+
17
+ # shortcut for Sinew::Base.new
18
+ class << self
19
+ def new(**args)
20
+ Sinew::Base.new(**args)
21
+ end
22
+ end
23
+ end
data/sample.rb ADDED
@@ -0,0 +1,13 @@
1
+ require_relative "lib/sinew"
2
+
3
+ sinew = Sinew.new(output: "sample.csv", verbose: true)
4
+
5
+ response = sinew.get "http://httpbingo.org"
6
+ response.noko.css("ul li a").each do |a|
7
+ row = {}
8
+ row[:url] = a[:href]
9
+ row[:title] = a.text
10
+ sinew.csv_emit(row)
11
+ end
12
+
13
+ sinew.get "http://httpbingo.org/redirect/2"
data/sample.sinew CHANGED
@@ -1,9 +1,9 @@
1
- get 'http://httpbingo.org'
2
- noko.css('ul li a').each do |a|
1
+ response = sinew.get 'http://httpbingo.org'
2
+ response.noko.css('ul li a').each do |a|
3
3
  row = {}
4
4
  row[:url] = a[:href]
5
5
  row[:title] = a.text
6
- csv_emit(row)
6
+ sinew.csv_emit(row)
7
7
  end
8
8
 
9
- get 'http://httpbingo.org/redirect/2'
9
+ sinew.get 'http://httpbingo.org/redirect/2'
data/sinew.gemspec CHANGED
@@ -1,32 +1,36 @@
1
1
  $LOAD_PATH.unshift("#{__dir__}/lib")
2
2
 
3
- require 'sinew/version'
3
+ require "sinew/version"
4
4
 
5
5
  Gem::Specification.new do |s|
6
- s.name = 'sinew'
7
- s.version = Sinew::VERSION
8
- s.license = 'MIT'
9
- s.authors = [ 'Adam Doppelt', 'Nathan Kriege' ]
10
- s.email = [ 'amd@gurge.com' ]
11
- s.homepage = 'http://github.com/gurgeous/sinew'
12
- s.summary = 'Sinew - structured web crawling using recipes.'
13
- s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
14
- s.required_ruby_version = '>= 2.7'
6
+ s.name = "sinew"
7
+ s.version = Sinew::VERSION
8
+ s.authors = ["Adam Doppelt", "Nathan Kriege"]
9
+ s.email = ["amd@gurge.com"]
10
+
11
+ s.summary = "Sinew - structured web crawling using recipes."
12
+ s.description = "Crawl web sites easily using ruby recipes, with caching and nokogiri."
13
+ s.homepage = "http://github.com/gurgeous/sinew"
14
+ s.license = "MIT"
15
+ s.required_ruby_version = ">= 3.1"
15
16
 
16
17
  # what's in the gem?
17
18
  s.files = Dir.chdir(File.expand_path(__dir__)) do
18
19
  `git ls-files -z`.split("\x0").reject { _1.match(%r{^test/}) }
19
20
  end
20
- s.bindir = 'bin'
21
+ s.bindir = "bin"
21
22
  s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
22
- s.require_paths = [ 'lib' ]
23
+ s.require_paths = ["lib"]
23
24
 
24
- s.add_runtime_dependency 'amazing_print', '~> 1.3'
25
- s.add_runtime_dependency 'faraday', '~> 1.4'
26
- s.add_runtime_dependency 'faraday-encoding', '~> 0'
27
- s.add_runtime_dependency 'httpdisk', '~> 0'
28
- s.add_runtime_dependency 'nokogiri', '~> 1.11'
29
- s.add_runtime_dependency 'scripto', '~> 0'
30
- s.add_runtime_dependency 'slop', '~> 4.8'
31
- s.add_runtime_dependency 'sterile', '~> 1.0'
25
+ # gem dependencies
26
+ s.add_dependency "amazing_print", "~> 1.5"
27
+ s.add_dependency "faraday", "~> 2.7"
28
+ s.add_dependency "faraday-encoding", "~> 0.0"
29
+ s.add_dependency "faraday-rate_limiter", "~> 0.0"
30
+ s.add_dependency "faraday-retry", "~> 2.0"
31
+ s.add_dependency "hashie", "~> 5.0"
32
+ s.add_dependency "httpdisk", "~> 1.0"
33
+ s.add_dependency "nokogiri", "~> 1.15"
34
+ s.add_dependency "slop", "~> 4.10"
35
+ s.add_dependency "sterile", "~> 1.0"
32
36
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: sinew
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.0.1
4
+ version: 4.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Doppelt
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2021-06-04 00:00:00.000000000 Z
12
+ date: 2023-08-19 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: amazing_print
@@ -17,98 +17,126 @@ dependencies:
17
17
  requirements:
18
18
  - - "~>"
19
19
  - !ruby/object:Gem::Version
20
- version: '1.3'
20
+ version: '1.5'
21
21
  type: :runtime
22
22
  prerelease: false
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
25
  - - "~>"
26
26
  - !ruby/object:Gem::Version
27
- version: '1.3'
27
+ version: '1.5'
28
28
  - !ruby/object:Gem::Dependency
29
29
  name: faraday
30
30
  requirement: !ruby/object:Gem::Requirement
31
31
  requirements:
32
32
  - - "~>"
33
33
  - !ruby/object:Gem::Version
34
- version: '1.4'
34
+ version: '2.7'
35
35
  type: :runtime
36
36
  prerelease: false
37
37
  version_requirements: !ruby/object:Gem::Requirement
38
38
  requirements:
39
39
  - - "~>"
40
40
  - !ruby/object:Gem::Version
41
- version: '1.4'
41
+ version: '2.7'
42
42
  - !ruby/object:Gem::Dependency
43
43
  name: faraday-encoding
44
44
  requirement: !ruby/object:Gem::Requirement
45
45
  requirements:
46
46
  - - "~>"
47
47
  - !ruby/object:Gem::Version
48
- version: '0'
48
+ version: '0.0'
49
49
  type: :runtime
50
50
  prerelease: false
51
51
  version_requirements: !ruby/object:Gem::Requirement
52
52
  requirements:
53
53
  - - "~>"
54
54
  - !ruby/object:Gem::Version
55
- version: '0'
55
+ version: '0.0'
56
56
  - !ruby/object:Gem::Dependency
57
- name: httpdisk
57
+ name: faraday-rate_limiter
58
58
  requirement: !ruby/object:Gem::Requirement
59
59
  requirements:
60
60
  - - "~>"
61
61
  - !ruby/object:Gem::Version
62
- version: '0'
62
+ version: '0.0'
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
- version: '0'
69
+ version: '0.0'
70
70
  - !ruby/object:Gem::Dependency
71
- name: nokogiri
71
+ name: faraday-retry
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - "~>"
75
+ - !ruby/object:Gem::Version
76
+ version: '2.0'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - "~>"
82
+ - !ruby/object:Gem::Version
83
+ version: '2.0'
84
+ - !ruby/object:Gem::Dependency
85
+ name: hashie
86
+ requirement: !ruby/object:Gem::Requirement
87
+ requirements:
88
+ - - "~>"
89
+ - !ruby/object:Gem::Version
90
+ version: '5.0'
91
+ type: :runtime
92
+ prerelease: false
93
+ version_requirements: !ruby/object:Gem::Requirement
94
+ requirements:
95
+ - - "~>"
96
+ - !ruby/object:Gem::Version
97
+ version: '5.0'
98
+ - !ruby/object:Gem::Dependency
99
+ name: httpdisk
72
100
  requirement: !ruby/object:Gem::Requirement
73
101
  requirements:
74
102
  - - "~>"
75
103
  - !ruby/object:Gem::Version
76
- version: '1.11'
104
+ version: '1.0'
77
105
  type: :runtime
78
106
  prerelease: false
79
107
  version_requirements: !ruby/object:Gem::Requirement
80
108
  requirements:
81
109
  - - "~>"
82
110
  - !ruby/object:Gem::Version
83
- version: '1.11'
111
+ version: '1.0'
84
112
  - !ruby/object:Gem::Dependency
85
- name: scripto
113
+ name: nokogiri
86
114
  requirement: !ruby/object:Gem::Requirement
87
115
  requirements:
88
116
  - - "~>"
89
117
  - !ruby/object:Gem::Version
90
- version: '0'
118
+ version: '1.15'
91
119
  type: :runtime
92
120
  prerelease: false
93
121
  version_requirements: !ruby/object:Gem::Requirement
94
122
  requirements:
95
123
  - - "~>"
96
124
  - !ruby/object:Gem::Version
97
- version: '0'
125
+ version: '1.15'
98
126
  - !ruby/object:Gem::Dependency
99
127
  name: slop
100
128
  requirement: !ruby/object:Gem::Requirement
101
129
  requirements:
102
130
  - - "~>"
103
131
  - !ruby/object:Gem::Version
104
- version: '4.8'
132
+ version: '4.10'
105
133
  type: :runtime
106
134
  prerelease: false
107
135
  version_requirements: !ruby/object:Gem::Requirement
108
136
  requirements:
109
137
  - - "~>"
110
138
  - !ruby/object:Gem::Version
111
- version: '4.8'
139
+ version: '4.10'
112
140
  - !ruby/object:Gem::Dependency
113
141
  name: sterile
114
142
  requirement: !ruby/object:Gem::Requirement
@@ -134,26 +162,23 @@ files:
134
162
  - ".github/workflows/test.yml"
135
163
  - ".gitignore"
136
164
  - ".rubocop.yml"
137
- - ".vscode/extensions.json"
138
- - ".vscode/settings.json"
139
165
  - Gemfile
166
+ - Gemfile.lock
140
167
  - LICENSE
141
168
  - README.md
142
169
  - Rakefile
143
170
  - bin/sinew
171
+ - justfile
144
172
  - lib/sinew.rb
145
- - lib/sinew/connection.rb
146
- - lib/sinew/connection/log_formatter.rb
147
- - lib/sinew/connection/rate_limit.rb
148
- - lib/sinew/core_ext.rb
149
- - lib/sinew/dsl.rb
173
+ - lib/sinew/args.rb
174
+ - lib/sinew/base.rb
175
+ - lib/sinew/csv.rb
150
176
  - lib/sinew/main.rb
177
+ - lib/sinew/middleware/log_formatter.rb
151
178
  - lib/sinew/nokogiri_ext.rb
152
- - lib/sinew/output.rb
153
- - lib/sinew/request.rb
154
179
  - lib/sinew/response.rb
155
- - lib/sinew/runtime_options.rb
156
180
  - lib/sinew/version.rb
181
+ - sample.rb
157
182
  - sample.sinew
158
183
  - sinew.gemspec
159
184
  homepage: http://github.com/gurgeous/sinew
@@ -168,14 +193,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
168
193
  requirements:
169
194
  - - ">="
170
195
  - !ruby/object:Gem::Version
171
- version: '2.7'
196
+ version: '3.1'
172
197
  required_rubygems_version: !ruby/object:Gem::Requirement
173
198
  requirements:
174
199
  - - ">="
175
200
  - !ruby/object:Gem::Version
176
201
  version: '0'
177
202
  requirements: []
178
- rubygems_version: 3.1.4
203
+ rubygems_version: 3.3.7
179
204
  signing_key:
180
205
  specification_version: 4
181
206
  summary: Sinew - structured web crawling using recipes.
@@ -1,3 +0,0 @@
1
- {
2
- "recommendations": ["rebornix.Ruby"]
3
- }
@@ -1,5 +0,0 @@
1
- {
2
- "files.associations": {
3
- "*.sinew": "ruby"
4
- }
5
- }
@@ -1,29 +0,0 @@
1
- module Sinew
2
- module Connection
3
- class RateLimit < Faraday::Middleware
4
- attr_reader :rate_limit
5
-
6
- def initialize(app, options = {})
7
- super(app)
8
-
9
- @last_request_tm = @current_request_tm = nil
10
- @rate_limit = options.fetch(:rate_limit, 1)
11
- end
12
-
13
- def on_request(_env)
14
- if @last_request_tm
15
- sleep = (@last_request_tm + rate_limit) - Time.now
16
- sleep(sleep) if sleep > 0
17
- end
18
-
19
- @current_request_tm = Time.now
20
- end
21
-
22
- def on_complete(env)
23
- # Only rate limit on uncached requests
24
- @last_request_tm = @current_request_tm unless env[:httpdisk]
25
- @current_request_tm = nil
26
- end
27
- end
28
- end
29
- end
@@ -1,52 +0,0 @@
1
- require 'faraday'
2
- require 'faraday-encoding'
3
- require 'faraday/logging/formatter'
4
- require 'httpdisk'
5
- require 'sinew/connection/log_formatter'
6
- require 'sinew/connection/rate_limit'
7
-
8
- module Sinew
9
- module Connection
10
- def self.create(options:, runtime_options:)
11
- connection_options = {}
12
- connection_options[:ssl] = { verify: false } if runtime_options.insecure
13
-
14
- Faraday.new(nil, connection_options) do
15
- _1.use RateLimit, rate_limit: runtime_options.rate_limit
16
-
17
- # auto-encode form bodies
18
- _1.request :url_encoded
19
-
20
- # Before httpdisk so each redirect segment is cached
21
- # Keep track of redirect status for logger
22
- _1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
23
-
24
- # set Ruby string encoding based on Content-Type (should be above httpdisk)
25
- _1.response :encoding
26
-
27
- # disk caching
28
- httpdisk_options = {
29
- dir: options[:cache],
30
- force: options[:force],
31
- force_errors: options[:force_errors],
32
- }.merge(runtime_options.httpdisk_options)
33
-
34
- _1.use :httpdisk, httpdisk_options
35
-
36
- # After httpdisk so that only non-cached requests are logged.
37
- # Before retry so that we don't log each retry attempt.
38
- _1.response :logger, nil, formatter: LogFormatter if !options[:quiet]
39
-
40
- # After httpdisk so transient failures are not cached
41
- retry_options = {
42
- interval: runtime_options.rate_limit,
43
- max: runtime_options.retries,
44
- methods: %w[delete get head options patch post put trace],
45
- retry_statuses: (500..600).to_a,
46
- retry_if: ->(_env, _err) { true },
47
- }
48
- _1.request :retry, retry_options
49
- end
50
- end
51
- end
52
- end
@@ -1,59 +0,0 @@
1
- #
2
- # A few core extensions brought over from ActiveSupport. These are handy for
3
- # parsing.
4
- #
5
-
6
- class String
7
- def squish
8
- dup.squish!
9
- end
10
-
11
- def squish!
12
- strip!
13
- gsub!(/\s+/, ' ')
14
- self
15
- end
16
-
17
- def first(limit = 1)
18
- if limit == 0
19
- ''
20
- elsif limit >= size
21
- dup
22
- else
23
- self[0..limit - 1]
24
- end
25
- end
26
-
27
- def last(limit = 1)
28
- if limit == 0
29
- ''
30
- elsif limit >= size
31
- dup
32
- else
33
- self[-limit..]
34
- end
35
- end
36
-
37
- alias starts_with? start_with?
38
- alias ends_with? end_with?
39
- end
40
-
41
- #
42
- # blank?/present?
43
- #
44
-
45
- class Object
46
- def blank?
47
- respond_to?(:empty?) ? !!empty? : !self
48
- end
49
-
50
- def present?
51
- !blank?
52
- end
53
- end
54
-
55
- class String
56
- def blank?
57
- !!(self =~ /\A\s*\z/)
58
- end
59
- end