spidr 0.7.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ruby.yml +4 -4
- data/ChangeLog.md +8 -0
- data/LICENSE.txt +1 -1
- data/README.md +0 -2
- data/lib/spidr/agent/actions.rb +2 -0
- data/lib/spidr/agent/events.rb +2 -0
- data/lib/spidr/agent/filters.rb +5 -3
- data/lib/spidr/agent/robots.rb +2 -0
- data/lib/spidr/agent/sanitizers.rb +2 -0
- data/lib/spidr/agent.rb +13 -11
- data/lib/spidr/auth_credential.rb +2 -0
- data/lib/spidr/auth_store.rb +8 -6
- data/lib/spidr/cookie_jar.rb +7 -5
- data/lib/spidr/extensions/uri.rb +3 -1
- data/lib/spidr/extensions.rb +3 -1
- data/lib/spidr/page/content_types.rb +2 -0
- data/lib/spidr/page/cookies.rb +2 -0
- data/lib/spidr/page/html.rb +5 -2
- data/lib/spidr/page/status_codes.rb +3 -1
- data/lib/spidr/page.rb +3 -1
- data/lib/spidr/proxy.rb +2 -0
- data/lib/spidr/rules.rb +2 -0
- data/lib/spidr/session_cache.rb +6 -4
- data/lib/spidr/settings/proxy.rb +3 -1
- data/lib/spidr/settings/timeouts.rb +2 -0
- data/lib/spidr/settings/user_agent.rb +2 -0
- data/lib/spidr/settings.rb +5 -3
- data/lib/spidr/spidr.rb +9 -7
- data/lib/spidr/version.rb +3 -1
- data/lib/spidr.rb +5 -3
- data/spidr.gemspec +1 -4
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 471764341b98b0cfeb57db24ac34a849dcfdcf43a751b648451a20c29c1ec051
|
4
|
+
data.tar.gz: '009c903cf30a13e55bbb8029fe2fdbfa4f8a8af32126b74aeb558f1afd3d3d88'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bddb65750dce8f6193764ac9d372adfa1893dc8743c24c383c359069043b51cd94e09ecd8bffad16bb8b4d92f99324c98ca95f8f59a9c9655a3f2fb7c42b9f57
|
7
|
+
data.tar.gz: c02f98806d9297ee22c6552eaaf6bb82f619001af25b0d8eeaabf91d0e32ab7154b5436de71ed4773b15353ba5556b52ece92a6035a891eb001c27b90e5cdda5
|
data/.github/workflows/ruby.yml
CHANGED
@@ -9,18 +9,18 @@ jobs:
|
|
9
9
|
fail-fast: false
|
10
10
|
matrix:
|
11
11
|
ruby:
|
12
|
-
- 2.7
|
13
12
|
- '3.0'
|
14
13
|
- '3.1'
|
14
|
+
- '3.2'
|
15
|
+
- '3.3'
|
15
16
|
- jruby
|
16
17
|
name: Ruby ${{ matrix.ruby }}
|
17
18
|
steps:
|
18
|
-
- uses: actions/checkout@
|
19
|
+
- uses: actions/checkout@v4
|
19
20
|
- name: Set up Ruby
|
20
21
|
uses: ruby/setup-ruby@v1
|
21
22
|
with:
|
22
23
|
ruby-version: ${{ matrix.ruby }}
|
23
|
-
|
24
|
-
run: bundle install --jobs 4 --retry 3
|
24
|
+
bundler-cache: true
|
25
25
|
- name: Run tests
|
26
26
|
run: bundle exec rake test
|
data/ChangeLog.md
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
### 0.7.1 / 2024-01-25
|
2
|
+
|
3
|
+
* Switched to using `require_relative` to improve load-times.
|
4
|
+
* Added `# frozen_string_literal: true` to all files.
|
5
|
+
* Use keyword arguments for {Spidr.domain}.
|
6
|
+
* Rescue `URI::Error` instead of `Exception` when calling `URI::HTTP#merge` in
|
7
|
+
{Spidr::Page#to_absolute}.
|
8
|
+
|
1
9
|
### 0.7.0 / 2022-12-31
|
2
10
|
|
3
11
|
* Added {Spidr.domain} and {Spidr::Agent.domain}.
|
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
data/lib/spidr/agent/actions.rb
CHANGED
data/lib/spidr/agent/events.rb
CHANGED
data/lib/spidr/agent/filters.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../rules'
|
2
4
|
|
3
5
|
module Spidr
|
4
6
|
class Agent
|
@@ -170,7 +172,7 @@ module Spidr
|
|
170
172
|
#
|
171
173
|
# @yieldparam [String] link
|
172
174
|
# A link to accept or reject.
|
173
|
-
#
|
175
|
+
#
|
174
176
|
# @since 0.2.4
|
175
177
|
#
|
176
178
|
def visit_links_like(pattern=nil,&block)
|
@@ -238,7 +240,7 @@ module Spidr
|
|
238
240
|
#
|
239
241
|
# @yieldparam [URI::HTTP, URI::HTTPS] url
|
240
242
|
# A URL to accept or reject.
|
241
|
-
#
|
243
|
+
#
|
242
244
|
# @since 0.2.4
|
243
245
|
#
|
244
246
|
def visit_urls_like(pattern=nil,&block)
|
data/lib/spidr/agent/robots.rb
CHANGED
data/lib/spidr/agent.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'settings/user_agent'
|
4
|
+
require_relative 'agent/sanitizers'
|
5
|
+
require_relative 'agent/filters'
|
6
|
+
require_relative 'agent/events'
|
7
|
+
require_relative 'agent/actions'
|
8
|
+
require_relative 'agent/robots'
|
9
|
+
require_relative 'page'
|
10
|
+
require_relative 'session_cache'
|
11
|
+
require_relative 'cookie_jar'
|
12
|
+
require_relative 'auth_store'
|
13
|
+
require_relative 'spidr'
|
12
14
|
|
13
15
|
require 'openssl'
|
14
16
|
require 'net/http'
|
data/lib/spidr/auth_store.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'extensions/uri'
|
4
|
+
require_relative 'auth_credential'
|
5
|
+
require_relative 'page'
|
4
6
|
|
5
7
|
require 'base64'
|
6
8
|
|
@@ -20,7 +22,7 @@ module Spidr
|
|
20
22
|
@credentials = {}
|
21
23
|
end
|
22
24
|
|
23
|
-
#
|
25
|
+
#
|
24
26
|
# Given a URL, return the most specific matching auth credential.
|
25
27
|
#
|
26
28
|
# @param [URI] url
|
@@ -54,7 +56,7 @@ module Spidr
|
|
54
56
|
return nil
|
55
57
|
end
|
56
58
|
|
57
|
-
#
|
59
|
+
#
|
58
60
|
# Add an auth credential to the store for supplied base URL.
|
59
61
|
#
|
60
62
|
# @param [URI] url
|
@@ -122,7 +124,7 @@ module Spidr
|
|
122
124
|
end
|
123
125
|
end
|
124
126
|
|
125
|
-
#
|
127
|
+
#
|
126
128
|
# Clear the contents of the auth store.
|
127
129
|
#
|
128
130
|
# @return [AuthStore]
|
data/lib/spidr/cookie_jar.rb
CHANGED
@@ -1,4 +1,6 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'page'
|
2
4
|
|
3
5
|
require 'set'
|
4
6
|
|
@@ -42,8 +44,8 @@ module Spidr
|
|
42
44
|
@params.each(&block)
|
43
45
|
end
|
44
46
|
|
45
|
-
#
|
46
|
-
# Return all relevant cookies in a single string for the
|
47
|
+
#
|
48
|
+
# Return all relevant cookies in a single string for the
|
47
49
|
# named host or domain (in browser request format).
|
48
50
|
#
|
49
51
|
# @param [String] host
|
@@ -59,7 +61,7 @@ module Spidr
|
|
59
61
|
@params[host] ||= {}
|
60
62
|
end
|
61
63
|
|
62
|
-
#
|
64
|
+
#
|
63
65
|
# Add a cookie to the jar for a particular domain.
|
64
66
|
#
|
65
67
|
# @param [String] host
|
@@ -166,7 +168,7 @@ module Spidr
|
|
166
168
|
return host_cookies
|
167
169
|
end
|
168
170
|
|
169
|
-
#
|
171
|
+
#
|
170
172
|
# Clear out the jar, removing all stored cookies.
|
171
173
|
#
|
172
174
|
# @since 0.2.2
|
data/lib/spidr/extensions/uri.rb
CHANGED
data/lib/spidr/extensions.rb
CHANGED
data/lib/spidr/page/cookies.rb
CHANGED
data/lib/spidr/page/html.rb
CHANGED
@@ -1,5 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative '../extensions/uri'
|
4
|
+
|
1
5
|
require 'nokogiri'
|
2
|
-
require 'spidr/extensions/uri'
|
3
6
|
|
4
7
|
module Spidr
|
5
8
|
class Page
|
@@ -265,7 +268,7 @@ module Spidr
|
|
265
268
|
link = link.to_s
|
266
269
|
new_url = begin
|
267
270
|
url.merge(link)
|
268
|
-
rescue
|
271
|
+
rescue URI::Error
|
269
272
|
return
|
270
273
|
end
|
271
274
|
|
data/lib/spidr/page/status_codes.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Spidr
|
2
4
|
class Page
|
3
5
|
#
|
@@ -92,7 +94,7 @@ module Spidr
|
|
92
94
|
|
93
95
|
#
|
94
96
|
# Determines if the response code is `300`, `301`, `302`, `303`
|
95
|
-
# or `307`. Also checks for "soft" redirects added at the page
|
97
|
+
# or `307`. Also checks for "soft" redirects added at the page
|
96
98
|
# level by a meta refresh tag.
|
97
99
|
#
|
98
100
|
# @return [Boolean]
|
data/lib/spidr/page.rb
CHANGED
data/lib/spidr/proxy.rb
CHANGED
data/lib/spidr/rules.rb
CHANGED
data/lib/spidr/session_cache.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'settings/proxy'
|
4
|
+
require_relative 'settings/timeouts'
|
5
|
+
require_relative 'spidr'
|
4
6
|
|
5
7
|
require 'net/http'
|
6
8
|
require 'openssl'
|
@@ -135,7 +137,7 @@ module Spidr
|
|
135
137
|
key = key_for(url)
|
136
138
|
|
137
139
|
if (sess = @sessions[key])
|
138
|
-
begin
|
140
|
+
begin
|
139
141
|
sess.finish
|
140
142
|
rescue IOError
|
141
143
|
end
|
data/lib/spidr/settings/proxy.rb
CHANGED
data/lib/spidr/settings.rb
CHANGED
data/lib/spidr/spidr.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative 'settings/proxy'
|
4
|
+
require_relative 'settings/timeouts'
|
5
|
+
require_relative 'settings/user_agent'
|
6
|
+
require_relative 'agent'
|
5
7
|
|
6
8
|
module Spidr
|
7
9
|
extend Settings::Proxy
|
@@ -52,8 +54,8 @@ module Spidr
|
|
52
54
|
#
|
53
55
|
# @since 0.7.0
|
54
56
|
#
|
55
|
-
def self.domain(name
|
56
|
-
Agent.domain(name
|
57
|
+
def self.domain(name,**kwargs,&block)
|
58
|
+
Agent.domain(name,**kwargs,&block)
|
57
59
|
end
|
58
60
|
|
59
61
|
#
|
@@ -63,7 +65,7 @@ module Spidr
|
|
63
65
|
Agent.site(url,**kwargs,&block)
|
64
66
|
end
|
65
67
|
|
66
|
-
#
|
68
|
+
#
|
67
69
|
# @abstract
|
68
70
|
#
|
69
71
|
def self.robots
|
data/lib/spidr/version.rb
CHANGED
data/lib/spidr.rb
CHANGED
data/spidr.gemspec
CHANGED
@@ -7,10 +7,7 @@ Gem::Specification.new do |gem|
|
|
7
7
|
|
8
8
|
gem.name = gemspec.fetch('name')
|
9
9
|
gem.version = gemspec.fetch('version') do
|
10
|
-
|
11
|
-
$LOAD_PATH << lib_dir unless $LOAD_PATH.include?(lib_dir)
|
12
|
-
|
13
|
-
require 'spidr/version'
|
10
|
+
require_relative 'lib/spidr/version'
|
14
11
|
Spidr::VERSION
|
15
12
|
end
|
16
13
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidr
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.0
|
4
|
+
version: 0.7.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Postmodern
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -128,7 +128,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
128
128
|
- !ruby/object:Gem::Version
|
129
129
|
version: '0'
|
130
130
|
requirements: []
|
131
|
-
rubygems_version: 3.
|
131
|
+
rubygems_version: 3.4.10
|
132
132
|
signing_key:
|
133
133
|
specification_version: 4
|
134
134
|
summary: A versatile Ruby web spidering library
|