wgit 0.0.18 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/wgit.rb +0 -1
- data/lib/wgit/assertable.rb +20 -23
- data/lib/wgit/core_ext.rb +6 -14
- data/lib/wgit/crawler.rb +94 -183
- data/lib/wgit/database/database.rb +209 -185
- data/lib/wgit/database/model.rb +7 -7
- data/lib/wgit/document.rb +281 -241
- data/lib/wgit/indexer.rb +99 -92
- data/lib/wgit/logger.rb +5 -1
- data/lib/wgit/url.rb +171 -185
- data/lib/wgit/utils.rb +57 -68
- data/lib/wgit/version.rb +1 -1
- metadata +86 -60
- data/CHANGELOG.md +0 -61
- data/LICENSE.txt +0 -21
- data/README.md +0 -361
- data/TODO.txt +0 -34
- data/lib/wgit/database/connection_details.rb +0 -41
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6956381fcc74e20521f0e219cbfaaa74da79de5bdb24349c2fdf4643ca384a31
|
4
|
+
data.tar.gz: a544446aa9333d2001119df37ca929cdf2585f89ed084071e077c460b4ff24c9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 517665017a25419d9213df10347cd704a98ee0061243ebcd8d482465461a16d5b8319971321703b663ec8d6ef8f453d60d771d2122590b1655a6fc08be461026
|
7
|
+
data.tar.gz: 760e1c8b1b5cf385dfb1d0418c3b416cdef7a9e02595b1f729a30179848145cdc3c4fa25e2bacf073779baba9909b20ef9f2c5038c8b9df1437f0ade81e05990
|
data/lib/wgit.rb
CHANGED
@@ -8,7 +8,6 @@ require_relative 'wgit/url'
|
|
8
8
|
require_relative 'wgit/document'
|
9
9
|
require_relative 'wgit/document_extensions'
|
10
10
|
require_relative 'wgit/crawler'
|
11
|
-
require_relative 'wgit/database/connection_details'
|
12
11
|
require_relative 'wgit/database/model'
|
13
12
|
require_relative 'wgit/database/database'
|
14
13
|
require_relative 'wgit/indexer'
|
data/lib/wgit/assertable.rb
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wgit
|
4
|
-
# Module containing
|
5
|
-
# for asserting the integrity of method definitions etc.
|
4
|
+
# Module containing assertion methods including type checking and duck typing.
|
6
5
|
module Assertable
|
7
6
|
# Default type fail message.
|
8
7
|
DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
|
@@ -11,21 +10,23 @@ module Wgit
|
|
11
10
|
# Default duck fail message.
|
12
11
|
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
|
13
12
|
# Default required keys message.
|
14
|
-
DEFAULT_REQUIRED_KEYS_MSG =
|
13
|
+
DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not \
|
14
|
+
present: %s"
|
15
15
|
|
16
|
-
# Tests if the obj
|
16
|
+
# Tests if the obj is_a? given type; raises an Exception if not.
|
17
17
|
#
|
18
18
|
# @param obj [Object] The Object to test.
|
19
19
|
# @param type_or_types [Type, Array<Type>] The type/types that obj must
|
20
20
|
# belong to or an exception is thrown.
|
21
|
-
# @param msg [String] The raised
|
21
|
+
# @param msg [String] The raised StandardError message, if provided.
|
22
|
+
# @raise [StandardError] If the assertion fails.
|
22
23
|
# @return [Object] The given obj on successful assertion.
|
23
24
|
def assert_types(obj, type_or_types, msg = nil)
|
24
25
|
msg ||= format(DEFAULT_TYPE_FAIL_MSG, type_or_types, obj.class)
|
25
26
|
match = if type_or_types.respond_to?(:any?)
|
26
|
-
type_or_types.any? { |type| obj.
|
27
|
+
type_or_types.any? { |type| obj.is_a?(type) }
|
27
28
|
else
|
28
|
-
obj.
|
29
|
+
obj.is_a?(type_or_types)
|
29
30
|
end
|
30
31
|
raise msg unless match
|
31
32
|
|
@@ -33,36 +34,36 @@ module Wgit
|
|
33
34
|
end
|
34
35
|
|
35
36
|
# Each object within arr must match one of the types listed in
|
36
|
-
# type_or_types or an exception is raised using msg, if provided.
|
37
|
+
# type_or_types; or an exception is raised using msg, if provided.
|
37
38
|
#
|
38
39
|
# @param arr [Enumerable#each] Enumerable of objects to type check.
|
39
40
|
# @param type_or_types [Type, Array<Type>] The allowed type(s).
|
40
|
-
# @param msg [String] The raised
|
41
|
+
# @param msg [String] The raised StandardError message, if provided.
|
42
|
+
# @raise [StandardError] If the assertion fails.
|
41
43
|
# @return [Object] The given arr on successful assertion.
|
42
44
|
def assert_arr_types(arr, type_or_types, msg = nil)
|
43
45
|
raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
|
44
46
|
|
45
|
-
arr.each
|
46
|
-
assert_types(obj, type_or_types, msg)
|
47
|
-
end
|
47
|
+
arr.each { |obj| assert_types(obj, type_or_types, msg) }
|
48
48
|
end
|
49
49
|
|
50
50
|
# The obj_or_objs must respond_to? all of the given methods or an
|
51
51
|
# Exception is raised using msg, if provided.
|
52
52
|
#
|
53
|
-
# @param obj_or_objs [Object, Enumerable#each] The
|
53
|
+
# @param obj_or_objs [Object, Enumerable#each] The object(s) to duck check.
|
54
54
|
# @param methods [Array<Symbol>] The methods to :respond_to?.
|
55
|
-
# @param msg [String] The raised
|
55
|
+
# @param msg [String] The raised StandardError message, if provided.
|
56
|
+
# @raise [StandardError] If the assertion fails.
|
56
57
|
# @return [Object] The given obj_or_objs on successful assertion.
|
57
58
|
def assert_respond_to(obj_or_objs, methods, msg = nil)
|
58
59
|
methods = [methods] unless methods.respond_to?(:all?)
|
60
|
+
|
59
61
|
if obj_or_objs.respond_to?(:each)
|
60
|
-
obj_or_objs.each
|
61
|
-
_assert_respond_to(obj, methods, msg)
|
62
|
-
end
|
62
|
+
obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
|
63
63
|
else
|
64
64
|
_assert_respond_to(obj_or_objs, methods, msg)
|
65
65
|
end
|
66
|
+
|
66
67
|
obj_or_objs
|
67
68
|
end
|
68
69
|
|
@@ -71,6 +72,7 @@ module Wgit
|
|
71
72
|
# @param hash [Hash] The hash which should include the required keys.
|
72
73
|
# @param keys [Array<String, Symbol>] The keys whose presence to assert.
|
73
74
|
# @param msg [String] The raised KeyError message, if provided.
|
75
|
+
# @raise [KeyError] If the assertion fails.
|
74
76
|
# @return [Hash] The given hash on successful assertion.
|
75
77
|
def assert_required_keys(hash, keys, msg = nil)
|
76
78
|
msg ||= format(DEFAULT_REQUIRED_KEYS_MSG, keys.join(', '))
|
@@ -93,12 +95,7 @@ module Wgit
|
|
93
95
|
obj
|
94
96
|
end
|
95
97
|
|
96
|
-
alias assert_type
|
97
|
-
alias type assert_types
|
98
|
-
alias types assert_types
|
98
|
+
alias assert_type assert_types
|
99
99
|
alias assert_arr_type assert_arr_types
|
100
|
-
alias arr_type assert_arr_types
|
101
|
-
alias arr_types assert_arr_types
|
102
|
-
alias respond_to assert_respond_to
|
103
100
|
end
|
104
101
|
end
|
data/lib/wgit/core_ext.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
# Script which extends Ruby's core functionality when parsed.
|
4
|
-
# Needs to be required separately using `require 'wgit/core_ext'`.
|
4
|
+
# Needs to be required separately to 'wgit' using `require 'wgit/core_ext'`.
|
5
5
|
|
6
6
|
require_relative 'url'
|
7
7
|
|
@@ -22,19 +22,15 @@ module Enumerable
|
|
22
22
|
#
|
23
23
|
# @return [Array<Wgit::Url>] The converted URL's.
|
24
24
|
def to_urls
|
25
|
-
map
|
26
|
-
process_url_element(element)
|
27
|
-
end
|
25
|
+
map { |element| process_url_element(element) }
|
28
26
|
end
|
29
27
|
|
30
|
-
# Converts each String instance into a Wgit::Url object and returns
|
31
|
-
#
|
28
|
+
# Converts each String instance into a Wgit::Url object and returns self
|
29
|
+
# having modified the receiver.
|
32
30
|
#
|
33
31
|
# @return [Array<Wgit::Url>] Self containing the converted URL's.
|
34
32
|
def to_urls!
|
35
|
-
map!
|
36
|
-
process_url_element(element)
|
37
|
-
end
|
33
|
+
map! { |element| process_url_element(element) }
|
38
34
|
end
|
39
35
|
end
|
40
36
|
|
@@ -42,9 +38,5 @@ private
|
|
42
38
|
|
43
39
|
# Converts the element to a Wgit::Url if the element is a String.
|
44
40
|
def process_url_element(element)
|
45
|
-
|
46
|
-
element.to_url
|
47
|
-
else
|
48
|
-
element
|
49
|
-
end
|
41
|
+
element.is_a?(String) ? element.to_url : element
|
50
42
|
end
|
data/lib/wgit/crawler.rb
CHANGED
@@ -7,142 +7,24 @@ require_relative 'assertable'
|
|
7
7
|
require 'net/http' # Requires 'uri'.
|
8
8
|
|
9
9
|
module Wgit
|
10
|
-
# The Crawler class provides a means of crawling web based Wgit::Url's,
|
11
|
-
# their HTML into Wgit::Document instances.
|
10
|
+
# The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
|
11
|
+
# serialising their HTML into Wgit::Document instances.
|
12
12
|
class Crawler
|
13
13
|
include Assertable
|
14
14
|
|
15
|
-
# The
|
16
|
-
|
17
|
-
|
18
|
-
class << self
|
19
|
-
# Class level instance accessor methods for @default_redirect_limit.
|
20
|
-
# Call using Wgit::Crawler.default_redirect_limit etc.
|
21
|
-
attr_accessor :default_redirect_limit
|
22
|
-
end
|
23
|
-
|
24
|
-
# The urls to crawl.
|
25
|
-
attr_reader :urls
|
26
|
-
|
27
|
-
# The docs of the crawled @urls.
|
28
|
-
attr_reader :docs
|
15
|
+
# The amount of allowed redirects before raising an error. Set to 0 to
|
16
|
+
# disable redirects completely.
|
17
|
+
attr_accessor :redirect_limit
|
29
18
|
|
30
19
|
# The Net::HTTPResponse of the most recently crawled URL or nil.
|
31
20
|
attr_reader :last_response
|
32
21
|
|
33
|
-
# Initializes
|
34
|
-
#
|
35
|
-
# @param urls [*Wgit::Url] The URL's to crawl in the future using either
|
36
|
-
# Crawler#crawl_url or Crawler#crawl_site. Note that the urls passed here
|
37
|
-
# will NOT update if they happen to redirect when crawled. If in doubt,
|
38
|
-
# pass the url(s) directly to the crawl_* method instead of to the new
|
39
|
-
# method.
|
40
|
-
def initialize(*urls)
|
41
|
-
self.[](*urls)
|
42
|
-
@docs = []
|
43
|
-
end
|
44
|
-
|
45
|
-
# Sets this Crawler's @urls.
|
46
|
-
#
|
47
|
-
# @param urls [*Wgit::Url] The URL's to crawl in the future using either
|
48
|
-
# crawl_url or crawl_site. Note that the urls passed here will NOT update
|
49
|
-
# if they happen to redirect when crawled. If in doubt, pass the url(s)
|
50
|
-
# directly to the crawl_* method instead of to the new method.
|
51
|
-
def urls=(urls)
|
52
|
-
@urls = []
|
53
|
-
Wgit::Utils.each(urls) { |url| add_url(url) }
|
54
|
-
end
|
55
|
-
|
56
|
-
# Sets this Crawler's @urls.
|
57
|
-
#
|
58
|
-
# @param urls [*Wgit::Url] The URL's to crawl in the future using either
|
59
|
-
# crawl_url or crawl_site. Note that the urls passed here will NOT update
|
60
|
-
# if they happen to redirect when crawled. If in doubt, pass the url(s)
|
61
|
-
# directly to the crawl_* method instead of to the new method.
|
62
|
-
def [](*urls)
|
63
|
-
# If urls is nil then add_url (when called later) will set @urls = []
|
64
|
-
# so we do nothing here.
|
65
|
-
unless urls.nil?
|
66
|
-
# Due to *urls you can end up with [[url1,url2,url3]] etc. where the
|
67
|
-
# outer array is bogus so we use the inner one only.
|
68
|
-
if urls.is_a?(Enumerable) &&
|
69
|
-
urls.length == 1 &&
|
70
|
-
urls.first.is_a?(Enumerable)
|
71
|
-
urls = urls.first
|
72
|
-
end
|
73
|
-
|
74
|
-
# Here we call urls= method using self because the param name is also
|
75
|
-
# urls which conflicts.
|
76
|
-
self.urls = urls
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
# Adds the url to this Crawler's @urls.
|
22
|
+
# Initializes and returns a Wgit::Crawler instance.
|
81
23
|
#
|
82
|
-
# @param
|
83
|
-
#
|
84
|
-
|
85
|
-
|
86
|
-
def <<(url)
|
87
|
-
add_url(url)
|
88
|
-
end
|
89
|
-
|
90
|
-
# Crawls one or more individual urls using Wgit::Crawler#crawl_url
|
91
|
-
# underneath. See Wgit::Crawler#crawl_site for crawling entire sites. Note
|
92
|
-
# that any external redirects are followed. Use Wgit::Crawler#crawl_url if
|
93
|
-
# this isn't desirable.
|
94
|
-
#
|
95
|
-
# @param urls [Array<Wgit::Url>] The URLs to crawl.
|
96
|
-
# @yield [Wgit::Document] If provided, the block is given each crawled
|
97
|
-
# Document. Otherwise each doc is added to @docs which can be accessed
|
98
|
-
# by Crawler#docs after this method returns.
|
99
|
-
# @return [Wgit::Document] The last Document crawled.
|
100
|
-
def crawl_urls(urls = @urls, &block)
|
101
|
-
raise 'No urls to crawl' unless urls
|
102
|
-
|
103
|
-
@docs = []
|
104
|
-
doc = nil
|
105
|
-
Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
|
106
|
-
doc || @docs.last
|
107
|
-
end
|
108
|
-
|
109
|
-
# Crawl the url returning the response Wgit::Document or nil if an error
|
110
|
-
# occurs.
|
111
|
-
#
|
112
|
-
# @param url [Wgit::Url] The URL to crawl.
|
113
|
-
# @param follow_external_redirects [Boolean] Whether or not to follow
|
114
|
-
# an external redirect. False will return nil for such a crawl. If false,
|
115
|
-
# you must also provide a `host:` parameter.
|
116
|
-
# @param host [Wgit::Url, String] Specify the host by which
|
117
|
-
# an absolute redirect is determined to be internal or not. Must be
|
118
|
-
# absolute and contain a protocol prefix. For example, a `host:` of
|
119
|
-
# 'http://www.example.com' will only allow redirects for Urls with a
|
120
|
-
# `to_host` value of 'www.example.com'.
|
121
|
-
# @yield [Wgit::Document] The crawled HTML Document regardless if the
|
122
|
-
# crawl was successful or not. Therefore, the Document#url can be used.
|
123
|
-
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
124
|
-
# crawl was unsuccessful.
|
125
|
-
def crawl_url(
|
126
|
-
url = @urls.first,
|
127
|
-
follow_external_redirects: true,
|
128
|
-
host: nil
|
129
|
-
)
|
130
|
-
assert_type(url, Wgit::Url)
|
131
|
-
if !follow_external_redirects && host.nil?
|
132
|
-
raise 'host cannot be nil if follow_external_redirects is false'
|
133
|
-
end
|
134
|
-
|
135
|
-
html = fetch(
|
136
|
-
url,
|
137
|
-
follow_external_redirects: follow_external_redirects,
|
138
|
-
host: host
|
139
|
-
)
|
140
|
-
url.crawled = true
|
141
|
-
|
142
|
-
doc = Wgit::Document.new(url, html)
|
143
|
-
yield(doc) if block_given?
|
144
|
-
|
145
|
-
doc.empty? ? nil : doc
|
24
|
+
# @param redirect_limit [Integer] The amount of allowed redirects before
|
25
|
+
# raising an error. Set to 0 to disable redirects completely.
|
26
|
+
def initialize(redirect_limit: 5)
|
27
|
+
@redirect_limit = redirect_limit
|
146
28
|
end
|
147
29
|
|
148
30
|
# Crawls an entire website's HTML pages by recursively going through
|
@@ -159,18 +41,16 @@ module Wgit
|
|
159
41
|
# @param url [Wgit::Url] The base URL of the website to be crawled.
|
160
42
|
# It is recommended that this URL be the index page of the site to give a
|
161
43
|
# greater chance of finding all pages within that site/host.
|
162
|
-
# @yield [
|
44
|
+
# @yield [doc] Given each crawled page (Wgit::Document) of the site.
|
163
45
|
# A block is the only way to interact with each crawled Document.
|
164
46
|
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
165
47
|
# from all of the site's pages or nil if the url could not be
|
166
48
|
# crawled successfully.
|
167
|
-
def crawl_site(url
|
168
|
-
assert_type(url, Wgit::Url)
|
169
|
-
|
49
|
+
def crawl_site(url, &block)
|
170
50
|
doc = crawl_url(url, &block)
|
171
51
|
return nil if doc.nil?
|
172
52
|
|
173
|
-
|
53
|
+
opts = { follow_external_redirects: false, host: url.to_base }
|
174
54
|
alt_url = url.end_with?('/') ? url.chop : url + '/'
|
175
55
|
crawled = [url, alt_url]
|
176
56
|
externals = doc.external_links
|
@@ -187,9 +67,7 @@ module Wgit
|
|
187
67
|
|
188
68
|
links.each do |link|
|
189
69
|
orig_link = link.dup
|
190
|
-
doc = crawl_url(
|
191
|
-
link, follow_external_redirects: false, host: host, &block
|
192
|
-
)
|
70
|
+
doc = crawl_url(link, opts, &block)
|
193
71
|
|
194
72
|
crawled.push(orig_link, link) # Push both in case of redirects.
|
195
73
|
next if doc.nil?
|
@@ -202,6 +80,66 @@ module Wgit
|
|
202
80
|
externals.uniq
|
203
81
|
end
|
204
82
|
|
83
|
+
# Crawls one or more individual urls using Wgit::Crawler#crawl_url
|
84
|
+
# underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
|
85
|
+
#
|
86
|
+
# @param urls [*Wgit::Url] The Url's to crawl.
|
87
|
+
# @yield [doc] Given each crawled page (Wgit::Document); this is the only
|
88
|
+
# way to interact with them.
|
89
|
+
# @raise [StandardError] If no urls are provided.
|
90
|
+
# @return [Wgit::Document] The last Document crawled.
|
91
|
+
def crawl_urls(*urls, follow_external_redirects: true, host: nil, &block)
|
92
|
+
raise 'You must provide at least one Url' if urls.empty?
|
93
|
+
|
94
|
+
opts = {
|
95
|
+
follow_external_redirects: follow_external_redirects,
|
96
|
+
host: host
|
97
|
+
}
|
98
|
+
doc = nil
|
99
|
+
|
100
|
+
Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
|
101
|
+
|
102
|
+
doc
|
103
|
+
end
|
104
|
+
|
105
|
+
# Crawl the url returning the response Wgit::Document or nil if an error
|
106
|
+
# occurs.
|
107
|
+
#
|
108
|
+
# @param url [Wgit::Url] The Url to crawl.
|
109
|
+
# @param follow_external_redirects [Boolean] Whether or not to follow
|
110
|
+
# an external redirect. External meaning to a different host. False will
|
111
|
+
# return nil for such a crawl. If false, you must also provide a `host:`
|
112
|
+
# parameter.
|
113
|
+
# @param host [Wgit::Url, String] Specify the host by which
|
114
|
+
# an absolute redirect is determined to be internal or not. Must be
|
115
|
+
# absolute and contain a protocol prefix. For example, a `host:` of
|
116
|
+
# 'http://www.example.com' will only allow redirects for Url's with a
|
117
|
+
# `to_host` value of 'www.example.com'.
|
118
|
+
# @yield [doc] The crawled HTML page (Wgit::Document) regardless if the
|
119
|
+
# crawl was successful or not. Therefore, Document#url etc. can be used.
|
120
|
+
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
121
|
+
# crawl was unsuccessful.
|
122
|
+
def crawl_url(url, follow_external_redirects: true, host: nil)
|
123
|
+
# A String url isn't allowed because it's passed by value not reference,
|
124
|
+
# meaning a redirect isn't reflected; A Wgit::Url is passed by reference.
|
125
|
+
assert_type(url, Wgit::Url)
|
126
|
+
if !follow_external_redirects && host.nil?
|
127
|
+
raise 'host cannot be nil if follow_external_redirects is false'
|
128
|
+
end
|
129
|
+
|
130
|
+
html = fetch(
|
131
|
+
url,
|
132
|
+
follow_external_redirects: follow_external_redirects,
|
133
|
+
host: host
|
134
|
+
)
|
135
|
+
url.crawled = true
|
136
|
+
|
137
|
+
doc = Wgit::Document.new(url, html)
|
138
|
+
yield(doc) if block_given?
|
139
|
+
|
140
|
+
doc.empty? ? nil : doc
|
141
|
+
end
|
142
|
+
|
205
143
|
protected
|
206
144
|
|
207
145
|
# This method calls Wgit::Crawler#resolve to obtain the page HTML, handling
|
@@ -227,22 +165,19 @@ module Wgit
|
|
227
165
|
host: host
|
228
166
|
)
|
229
167
|
@last_response = response
|
168
|
+
|
230
169
|
response.body.empty? ? nil : response.body
|
231
170
|
rescue StandardError => e
|
232
|
-
Wgit.logger.debug(
|
233
|
-
"Wgit::Crawler#fetch('#{url}') exception: #{e.message}"
|
234
|
-
)
|
171
|
+
Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
|
235
172
|
@last_response = nil
|
173
|
+
|
236
174
|
nil
|
237
175
|
end
|
238
176
|
|
239
177
|
# The resolve method performs a HTTP GET to obtain the HTML response. The
|
240
|
-
# Net::HTTPResponse will be returned or an error raised.
|
241
|
-
# disabled by setting `redirect_limit: 0`.
|
178
|
+
# Net::HTTPResponse will be returned or an error raised.
|
242
179
|
#
|
243
180
|
# @param url [Wgit::Url] The URL to fetch the HTML from.
|
244
|
-
# @param redirect_limit [Integer] The number of redirect hops to allow
|
245
|
-
# before raising an error.
|
246
181
|
# @param follow_external_redirects [Boolean] Whether or not to follow
|
247
182
|
# an external redirect. If false, you must also provide a `host:`
|
248
183
|
# parameter.
|
@@ -254,12 +189,7 @@ module Wgit
|
|
254
189
|
# @raise [StandardError] If !url.respond_to? :to_uri or a redirect isn't
|
255
190
|
# allowed.
|
256
191
|
# @return [Net::HTTPResponse] The HTTP response of the GET request.
|
257
|
-
def resolve(
|
258
|
-
url,
|
259
|
-
redirect_limit: Wgit::Crawler.default_redirect_limit,
|
260
|
-
follow_external_redirects: true,
|
261
|
-
host: nil
|
262
|
-
)
|
192
|
+
def resolve(url, follow_external_redirects: true, host: nil)
|
263
193
|
raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
|
264
194
|
|
265
195
|
redirect_count = 0
|
@@ -267,25 +197,25 @@ module Wgit
|
|
267
197
|
|
268
198
|
loop do
|
269
199
|
response = Net::HTTP.get_response(url.to_uri)
|
200
|
+
break unless response.is_a?(Net::HTTPRedirection)
|
201
|
+
|
270
202
|
location = Wgit::Url.new(response.fetch('location', ''))
|
203
|
+
raise 'Encountered redirect without Location header' if location.empty?
|
271
204
|
|
272
|
-
break unless response.is_a?(Net::HTTPRedirection)
|
273
205
|
yield(url, response, location) if block_given?
|
274
206
|
|
275
|
-
|
276
|
-
|
277
|
-
!location.is_relative?(host: host)
|
278
|
-
raise "External redirect not allowed - Redirected to: \
|
207
|
+
if !follow_external_redirects && !location.is_relative?(host: host)
|
208
|
+
raise "External redirect not allowed - Redirected to: \
|
279
209
|
'#{location}', which is outside of host: '#{host}'"
|
280
|
-
|
210
|
+
end
|
281
211
|
|
282
|
-
|
212
|
+
raise "Too many redirects: #{redirect_count}" \
|
213
|
+
if redirect_count >= @redirect_limit
|
283
214
|
|
284
|
-
|
215
|
+
redirect_count += 1
|
285
216
|
|
286
|
-
|
287
|
-
|
288
|
-
end
|
217
|
+
location = url.to_base.concat(location) if location.is_relative?
|
218
|
+
url.replace(location) # Update the url on redirect.
|
289
219
|
end
|
290
220
|
|
291
221
|
response
|
@@ -300,7 +230,7 @@ module Wgit
|
|
300
230
|
# internal page links.
|
301
231
|
# @return [Array<Wgit::Url>] The internal page links from doc.
|
302
232
|
def get_internal_links(doc)
|
303
|
-
doc.
|
233
|
+
doc.internal_absolute_links
|
304
234
|
.map(&:without_anchor) # Because anchors don't change page content.
|
305
235
|
.uniq
|
306
236
|
.reject do |link|
|
@@ -309,28 +239,9 @@ module Wgit
|
|
309
239
|
end
|
310
240
|
end
|
311
241
|
|
312
|
-
|
313
|
-
|
314
|
-
# Add the document to the @docs array for later processing or let the block
|
315
|
-
# process it here and now.
|
316
|
-
def handle_crawl_block(url, &block)
|
317
|
-
if block_given?
|
318
|
-
crawl_url(url, &block)
|
319
|
-
else
|
320
|
-
@docs << crawl_url(url)
|
321
|
-
nil
|
322
|
-
end
|
323
|
-
end
|
324
|
-
|
325
|
-
# Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
|
326
|
-
def add_url(url)
|
327
|
-
@urls = [] if @urls.nil?
|
328
|
-
@urls << Wgit::Url.new(url)
|
329
|
-
end
|
330
|
-
|
331
|
-
alias crawl crawl_urls
|
242
|
+
alias crawl crawl_urls
|
332
243
|
alias crawl_pages crawl_urls
|
333
|
-
alias crawl_page
|
334
|
-
alias crawl_r
|
244
|
+
alias crawl_page crawl_url
|
245
|
+
alias crawl_r crawl_site
|
335
246
|
end
|
336
247
|
end
|