wgit 0.0.18 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wgit.rb +0 -1
- data/lib/wgit/assertable.rb +20 -23
- data/lib/wgit/core_ext.rb +6 -14
- data/lib/wgit/crawler.rb +94 -183
- data/lib/wgit/database/database.rb +209 -185
- data/lib/wgit/database/model.rb +7 -7
- data/lib/wgit/document.rb +281 -241
- data/lib/wgit/indexer.rb +99 -92
- data/lib/wgit/logger.rb +5 -1
- data/lib/wgit/url.rb +171 -185
- data/lib/wgit/utils.rb +57 -68
- data/lib/wgit/version.rb +1 -1
- metadata +86 -60
- data/CHANGELOG.md +0 -61
- data/LICENSE.txt +0 -21
- data/README.md +0 -361
- data/TODO.txt +0 -34
- data/lib/wgit/database/connection_details.rb +0 -41
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6956381fcc74e20521f0e219cbfaaa74da79de5bdb24349c2fdf4643ca384a31
|
4
|
+
data.tar.gz: a544446aa9333d2001119df37ca929cdf2585f89ed084071e077c460b4ff24c9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 517665017a25419d9213df10347cd704a98ee0061243ebcd8d482465461a16d5b8319971321703b663ec8d6ef8f453d60d771d2122590b1655a6fc08be461026
|
7
|
+
data.tar.gz: 760e1c8b1b5cf385dfb1d0418c3b416cdef7a9e02595b1f729a30179848145cdc3c4fa25e2bacf073779baba9909b20ef9f2c5038c8b9df1437f0ade81e05990
|
data/lib/wgit.rb
CHANGED
@@ -8,7 +8,6 @@ require_relative 'wgit/url'
|
|
8
8
|
require_relative 'wgit/document'
|
9
9
|
require_relative 'wgit/document_extensions'
|
10
10
|
require_relative 'wgit/crawler'
|
11
|
-
require_relative 'wgit/database/connection_details'
|
12
11
|
require_relative 'wgit/database/model'
|
13
12
|
require_relative 'wgit/database/database'
|
14
13
|
require_relative 'wgit/indexer'
|
data/lib/wgit/assertable.rb
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module Wgit
|
4
|
-
# Module containing
|
5
|
-
# for asserting the integrity of method definitions etc.
|
4
|
+
# Module containing assertion methods including type checking and duck typing.
|
6
5
|
module Assertable
|
7
6
|
# Default type fail message.
|
8
7
|
DEFAULT_TYPE_FAIL_MSG = 'Expected: %s, Actual: %s'
|
@@ -11,21 +10,23 @@ module Wgit
|
|
11
10
|
# Default duck fail message.
|
12
11
|
DEFAULT_DUCK_FAIL_MSG = "%s doesn't respond_to? %s"
|
13
12
|
# Default required keys message.
|
14
|
-
DEFAULT_REQUIRED_KEYS_MSG =
|
13
|
+
DEFAULT_REQUIRED_KEYS_MSG = "Some or all of the required keys are not \
|
14
|
+
present: %s"
|
15
15
|
|
16
|
-
# Tests if the obj
|
16
|
+
# Tests if the obj is_a? given type; raises an Exception if not.
|
17
17
|
#
|
18
18
|
# @param obj [Object] The Object to test.
|
19
19
|
# @param type_or_types [Type, Array<Type>] The type/types that obj must
|
20
20
|
# belong to or an exception is thrown.
|
21
|
-
# @param msg [String] The raised
|
21
|
+
# @param msg [String] The raised StandardError message, if provided.
|
22
|
+
# @raise [StandardError] If the assertion fails.
|
22
23
|
# @return [Object] The given obj on successful assertion.
|
23
24
|
def assert_types(obj, type_or_types, msg = nil)
|
24
25
|
msg ||= format(DEFAULT_TYPE_FAIL_MSG, type_or_types, obj.class)
|
25
26
|
match = if type_or_types.respond_to?(:any?)
|
26
|
-
type_or_types.any? { |type| obj.
|
27
|
+
type_or_types.any? { |type| obj.is_a?(type) }
|
27
28
|
else
|
28
|
-
obj.
|
29
|
+
obj.is_a?(type_or_types)
|
29
30
|
end
|
30
31
|
raise msg unless match
|
31
32
|
|
@@ -33,36 +34,36 @@ module Wgit
|
|
33
34
|
end
|
34
35
|
|
35
36
|
# Each object within arr must match one of the types listed in
|
36
|
-
# type_or_types or an exception is raised using msg, if provided.
|
37
|
+
# type_or_types; or an exception is raised using msg, if provided.
|
37
38
|
#
|
38
39
|
# @param arr [Enumerable#each] Enumerable of objects to type check.
|
39
40
|
# @param type_or_types [Type, Array<Type>] The allowed type(s).
|
40
|
-
# @param msg [String] The raised
|
41
|
+
# @param msg [String] The raised StandardError message, if provided.
|
42
|
+
# @raise [StandardError] If the assertion fails.
|
41
43
|
# @return [Object] The given arr on successful assertion.
|
42
44
|
def assert_arr_types(arr, type_or_types, msg = nil)
|
43
45
|
raise WRONG_METHOD_MSG unless arr.respond_to?(:each)
|
44
46
|
|
45
|
-
arr.each
|
46
|
-
assert_types(obj, type_or_types, msg)
|
47
|
-
end
|
47
|
+
arr.each { |obj| assert_types(obj, type_or_types, msg) }
|
48
48
|
end
|
49
49
|
|
50
50
|
# The obj_or_objs must respond_to? all of the given methods or an
|
51
51
|
# Exception is raised using msg, if provided.
|
52
52
|
#
|
53
|
-
# @param obj_or_objs [Object, Enumerable#each] The
|
53
|
+
# @param obj_or_objs [Object, Enumerable#each] The object(s) to duck check.
|
54
54
|
# @param methods [Array<Symbol>] The methods to :respond_to?.
|
55
|
-
# @param msg [String] The raised
|
55
|
+
# @param msg [String] The raised StandardError message, if provided.
|
56
|
+
# @raise [StandardError] If the assertion fails.
|
56
57
|
# @return [Object] The given obj_or_objs on successful assertion.
|
57
58
|
def assert_respond_to(obj_or_objs, methods, msg = nil)
|
58
59
|
methods = [methods] unless methods.respond_to?(:all?)
|
60
|
+
|
59
61
|
if obj_or_objs.respond_to?(:each)
|
60
|
-
obj_or_objs.each
|
61
|
-
_assert_respond_to(obj, methods, msg)
|
62
|
-
end
|
62
|
+
obj_or_objs.each { |obj| _assert_respond_to(obj, methods, msg) }
|
63
63
|
else
|
64
64
|
_assert_respond_to(obj_or_objs, methods, msg)
|
65
65
|
end
|
66
|
+
|
66
67
|
obj_or_objs
|
67
68
|
end
|
68
69
|
|
@@ -71,6 +72,7 @@ module Wgit
|
|
71
72
|
# @param hash [Hash] The hash which should include the required keys.
|
72
73
|
# @param keys [Array<String, Symbol>] The keys whose presence to assert.
|
73
74
|
# @param msg [String] The raised KeyError message, if provided.
|
75
|
+
# @raise [KeyError] If the assertion fails.
|
74
76
|
# @return [Hash] The given hash on successful assertion.
|
75
77
|
def assert_required_keys(hash, keys, msg = nil)
|
76
78
|
msg ||= format(DEFAULT_REQUIRED_KEYS_MSG, keys.join(', '))
|
@@ -93,12 +95,7 @@ module Wgit
|
|
93
95
|
obj
|
94
96
|
end
|
95
97
|
|
96
|
-
alias assert_type
|
97
|
-
alias type assert_types
|
98
|
-
alias types assert_types
|
98
|
+
alias assert_type assert_types
|
99
99
|
alias assert_arr_type assert_arr_types
|
100
|
-
alias arr_type assert_arr_types
|
101
|
-
alias arr_types assert_arr_types
|
102
|
-
alias respond_to assert_respond_to
|
103
100
|
end
|
104
101
|
end
|
data/lib/wgit/core_ext.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
# Script which extends Ruby's core functionality when parsed.
|
4
|
-
# Needs to be required separately using `require 'wgit/core_ext'`.
|
4
|
+
# Needs to be required separately to 'wgit' using `require 'wgit/core_ext'`.
|
5
5
|
|
6
6
|
require_relative 'url'
|
7
7
|
|
@@ -22,19 +22,15 @@ module Enumerable
|
|
22
22
|
#
|
23
23
|
# @return [Array<Wgit::Url>] The converted URL's.
|
24
24
|
def to_urls
|
25
|
-
map
|
26
|
-
process_url_element(element)
|
27
|
-
end
|
25
|
+
map { |element| process_url_element(element) }
|
28
26
|
end
|
29
27
|
|
30
|
-
# Converts each String instance into a Wgit::Url object and returns
|
31
|
-
#
|
28
|
+
# Converts each String instance into a Wgit::Url object and returns self
|
29
|
+
# having modified the receiver.
|
32
30
|
#
|
33
31
|
# @return [Array<Wgit::Url>] Self containing the converted URL's.
|
34
32
|
def to_urls!
|
35
|
-
map!
|
36
|
-
process_url_element(element)
|
37
|
-
end
|
33
|
+
map! { |element| process_url_element(element) }
|
38
34
|
end
|
39
35
|
end
|
40
36
|
|
@@ -42,9 +38,5 @@ private
|
|
42
38
|
|
43
39
|
# Converts the element to a Wgit::Url if the element is a String.
|
44
40
|
def process_url_element(element)
|
45
|
-
|
46
|
-
element.to_url
|
47
|
-
else
|
48
|
-
element
|
49
|
-
end
|
41
|
+
element.is_a?(String) ? element.to_url : element
|
50
42
|
end
|
data/lib/wgit/crawler.rb
CHANGED
@@ -7,142 +7,24 @@ require_relative 'assertable'
|
|
7
7
|
require 'net/http' # Requires 'uri'.
|
8
8
|
|
9
9
|
module Wgit
|
10
|
-
# The Crawler class provides a means of crawling web based Wgit::Url's,
|
11
|
-
# their HTML into Wgit::Document instances.
|
10
|
+
# The Crawler class provides a means of crawling web based HTTP Wgit::Url's,
|
11
|
+
# serialising their HTML into Wgit::Document instances.
|
12
12
|
class Crawler
|
13
13
|
include Assertable
|
14
14
|
|
15
|
-
# The
|
16
|
-
|
17
|
-
|
18
|
-
class << self
|
19
|
-
# Class level instance accessor methods for @default_redirect_limit.
|
20
|
-
# Call using Wgit::Crawler.default_redirect_limit etc.
|
21
|
-
attr_accessor :default_redirect_limit
|
22
|
-
end
|
23
|
-
|
24
|
-
# The urls to crawl.
|
25
|
-
attr_reader :urls
|
26
|
-
|
27
|
-
# The docs of the crawled @urls.
|
28
|
-
attr_reader :docs
|
15
|
+
# The number of allowed redirects before raising an error. Set to 0 to
|
16
|
+
# disable redirects completely.
|
17
|
+
attr_accessor :redirect_limit
|
29
18
|
|
30
19
|
# The Net::HTTPResponse of the most recently crawled URL or nil.
|
31
20
|
attr_reader :last_response
|
32
21
|
|
33
|
-
# Initializes
|
34
|
-
#
|
35
|
-
# @param urls [*Wgit::Url] The URL's to crawl in the future using either
|
36
|
-
# Crawler#crawl_url or Crawler#crawl_site. Note that the urls passed here
|
37
|
-
# will NOT update if they happen to redirect when crawled. If in doubt,
|
38
|
-
# pass the url(s) directly to the crawl_* method instead of to the new
|
39
|
-
# method.
|
40
|
-
def initialize(*urls)
|
41
|
-
self.[](*urls)
|
42
|
-
@docs = []
|
43
|
-
end
|
44
|
-
|
45
|
-
# Sets this Crawler's @urls.
|
46
|
-
#
|
47
|
-
# @param urls [*Wgit::Url] The URL's to crawl in the future using either
|
48
|
-
# crawl_url or crawl_site. Note that the urls passed here will NOT update
|
49
|
-
# if they happen to redirect when crawled. If in doubt, pass the url(s)
|
50
|
-
# directly to the crawl_* method instead of to the new method.
|
51
|
-
def urls=(urls)
|
52
|
-
@urls = []
|
53
|
-
Wgit::Utils.each(urls) { |url| add_url(url) }
|
54
|
-
end
|
55
|
-
|
56
|
-
# Sets this Crawler's @urls.
|
57
|
-
#
|
58
|
-
# @param urls [*Wgit::Url] The URL's to crawl in the future using either
|
59
|
-
# crawl_url or crawl_site. Note that the urls passed here will NOT update
|
60
|
-
# if they happen to redirect when crawled. If in doubt, pass the url(s)
|
61
|
-
# directly to the crawl_* method instead of to the new method.
|
62
|
-
def [](*urls)
|
63
|
-
# If urls is nil then add_url (when called later) will set @urls = []
|
64
|
-
# so we do nothing here.
|
65
|
-
unless urls.nil?
|
66
|
-
# Due to *urls you can end up with [[url1,url2,url3]] etc. where the
|
67
|
-
# outer array is bogus so we use the inner one only.
|
68
|
-
if urls.is_a?(Enumerable) &&
|
69
|
-
urls.length == 1 &&
|
70
|
-
urls.first.is_a?(Enumerable)
|
71
|
-
urls = urls.first
|
72
|
-
end
|
73
|
-
|
74
|
-
# Here we call urls= method using self because the param name is also
|
75
|
-
# urls which conflicts.
|
76
|
-
self.urls = urls
|
77
|
-
end
|
78
|
-
end
|
79
|
-
|
80
|
-
# Adds the url to this Crawler's @urls.
|
22
|
+
# Initializes and returns a Wgit::Crawler instance.
|
81
23
|
#
|
82
|
-
# @param
|
83
|
-
#
|
84
|
-
|
85
|
-
|
86
|
-
def <<(url)
|
87
|
-
add_url(url)
|
88
|
-
end
|
89
|
-
|
90
|
-
# Crawls one or more individual urls using Wgit::Crawler#crawl_url
|
91
|
-
# underneath. See Wgit::Crawler#crawl_site for crawling entire sites. Note
|
92
|
-
# that any external redirects are followed. Use Wgit::Crawler#crawl_url if
|
93
|
-
# this isn't desirable.
|
94
|
-
#
|
95
|
-
# @param urls [Array<Wgit::Url>] The URLs to crawl.
|
96
|
-
# @yield [Wgit::Document] If provided, the block is given each crawled
|
97
|
-
# Document. Otherwise each doc is added to @docs which can be accessed
|
98
|
-
# by Crawler#docs after this method returns.
|
99
|
-
# @return [Wgit::Document] The last Document crawled.
|
100
|
-
def crawl_urls(urls = @urls, &block)
|
101
|
-
raise 'No urls to crawl' unless urls
|
102
|
-
|
103
|
-
@docs = []
|
104
|
-
doc = nil
|
105
|
-
Wgit::Utils.each(urls) { |url| doc = handle_crawl_block(url, &block) }
|
106
|
-
doc || @docs.last
|
107
|
-
end
|
108
|
-
|
109
|
-
# Crawl the url returning the response Wgit::Document or nil if an error
|
110
|
-
# occurs.
|
111
|
-
#
|
112
|
-
# @param url [Wgit::Url] The URL to crawl.
|
113
|
-
# @param follow_external_redirects [Boolean] Whether or not to follow
|
114
|
-
# an external redirect. False will return nil for such a crawl. If false,
|
115
|
-
# you must also provide a `host:` parameter.
|
116
|
-
# @param host [Wgit::Url, String] Specify the host by which
|
117
|
-
# an absolute redirect is determined to be internal or not. Must be
|
118
|
-
# absolute and contain a protocol prefix. For example, a `host:` of
|
119
|
-
# 'http://www.example.com' will only allow redirects for Urls with a
|
120
|
-
# `to_host` value of 'www.example.com'.
|
121
|
-
# @yield [Wgit::Document] The crawled HTML Document regardless if the
|
122
|
-
# crawl was successful or not. Therefore, the Document#url can be used.
|
123
|
-
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
124
|
-
# crawl was unsuccessful.
|
125
|
-
def crawl_url(
|
126
|
-
url = @urls.first,
|
127
|
-
follow_external_redirects: true,
|
128
|
-
host: nil
|
129
|
-
)
|
130
|
-
assert_type(url, Wgit::Url)
|
131
|
-
if !follow_external_redirects && host.nil?
|
132
|
-
raise 'host cannot be nil if follow_external_redirects is false'
|
133
|
-
end
|
134
|
-
|
135
|
-
html = fetch(
|
136
|
-
url,
|
137
|
-
follow_external_redirects: follow_external_redirects,
|
138
|
-
host: host
|
139
|
-
)
|
140
|
-
url.crawled = true
|
141
|
-
|
142
|
-
doc = Wgit::Document.new(url, html)
|
143
|
-
yield(doc) if block_given?
|
144
|
-
|
145
|
-
doc.empty? ? nil : doc
|
24
|
+
# @param redirect_limit [Integer] The number of allowed redirects before
|
25
|
+
# raising an error. Set to 0 to disable redirects completely.
|
26
|
+
def initialize(redirect_limit: 5)
|
27
|
+
@redirect_limit = redirect_limit
|
146
28
|
end
|
147
29
|
|
148
30
|
# Crawls an entire website's HTML pages by recursively going through
|
@@ -159,18 +41,16 @@ module Wgit
|
|
159
41
|
# @param url [Wgit::Url] The base URL of the website to be crawled.
|
160
42
|
# It is recommended that this URL be the index page of the site to give a
|
161
43
|
# greater chance of finding all pages within that site/host.
|
162
|
-
# @yield [
|
44
|
+
# @yield [doc] Given each crawled page (Wgit::Document) of the site.
|
163
45
|
# A block is the only way to interact with each crawled Document.
|
164
46
|
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
165
47
|
# from all of the site's pages or nil if the url could not be
|
166
48
|
# crawled successfully.
|
167
|
-
def crawl_site(url
|
168
|
-
assert_type(url, Wgit::Url)
|
169
|
-
|
49
|
+
def crawl_site(url, &block)
|
170
50
|
doc = crawl_url(url, &block)
|
171
51
|
return nil if doc.nil?
|
172
52
|
|
173
|
-
|
53
|
+
opts = { follow_external_redirects: false, host: url.to_base }
|
174
54
|
alt_url = url.end_with?('/') ? url.chop : url + '/'
|
175
55
|
crawled = [url, alt_url]
|
176
56
|
externals = doc.external_links
|
@@ -187,9 +67,7 @@ module Wgit
|
|
187
67
|
|
188
68
|
links.each do |link|
|
189
69
|
orig_link = link.dup
|
190
|
-
doc = crawl_url(
|
191
|
-
link, follow_external_redirects: false, host: host, &block
|
192
|
-
)
|
70
|
+
doc = crawl_url(link, opts, &block)
|
193
71
|
|
194
72
|
crawled.push(orig_link, link) # Push both in case of redirects.
|
195
73
|
next if doc.nil?
|
@@ -202,6 +80,66 @@ module Wgit
|
|
202
80
|
externals.uniq
|
203
81
|
end
|
204
82
|
|
83
|
+
# Crawls one or more individual urls using Wgit::Crawler#crawl_url
|
84
|
+
# underneath. See Wgit::Crawler#crawl_site for crawling entire sites.
|
85
|
+
#
|
86
|
+
# @param urls [*Wgit::Url] The Url's to crawl.
|
87
|
+
# @yield [doc] Given each crawled page (Wgit::Document); this is the only
|
88
|
+
# way to interact with them.
|
89
|
+
# @raise [StandardError] If no urls are provided.
|
90
|
+
# @return [Wgit::Document, nil] The last Document crawled or nil if that crawl was unsuccessful.
|
91
|
+
def crawl_urls(*urls, follow_external_redirects: true, host: nil, &block)
|
92
|
+
raise 'You must provide at least one Url' if urls.empty?
|
93
|
+
|
94
|
+
opts = {
|
95
|
+
follow_external_redirects: follow_external_redirects,
|
96
|
+
host: host
|
97
|
+
}
|
98
|
+
doc = nil
|
99
|
+
|
100
|
+
Wgit::Utils.each(urls) { |url| doc = crawl_url(url, opts, &block) }
|
101
|
+
|
102
|
+
doc
|
103
|
+
end
|
104
|
+
|
105
|
+
# Crawl the url returning the response Wgit::Document or nil if an error
|
106
|
+
# occurs.
|
107
|
+
#
|
108
|
+
# @param url [Wgit::Url] The Url to crawl.
|
109
|
+
# @param follow_external_redirects [Boolean] Whether or not to follow
|
110
|
+
# an external redirect. External meaning to a different host. False will
|
111
|
+
# return nil for such a crawl. If false, you must also provide a `host:`
|
112
|
+
# parameter.
|
113
|
+
# @param host [Wgit::Url, String] Specify the host by which
|
114
|
+
# an absolute redirect is determined to be internal or not. Must be
|
115
|
+
# absolute and contain a protocol prefix. For example, a `host:` of
|
116
|
+
# 'http://www.example.com' will only allow redirects for Url's with a
|
117
|
+
# `to_host` value of 'www.example.com'.
|
118
|
+
# @yield [doc] The crawled HTML page (Wgit::Document) regardless of whether the
|
119
|
+
# crawl was successful or not. Therefore, Document#url etc. can be used.
|
120
|
+
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
121
|
+
# crawl was unsuccessful.
|
122
|
+
def crawl_url(url, follow_external_redirects: true, host: nil)
|
123
|
+
# A plain String url isn't allowed because the url is modified in place below
|
124
|
+
# (crawled flag set, replaced on redirect) so the caller's Wgit::Url reflects it.
|
125
|
+
assert_type(url, Wgit::Url)
|
126
|
+
if !follow_external_redirects && host.nil?
|
127
|
+
raise 'host cannot be nil if follow_external_redirects is false'
|
128
|
+
end
|
129
|
+
|
130
|
+
html = fetch(
|
131
|
+
url,
|
132
|
+
follow_external_redirects: follow_external_redirects,
|
133
|
+
host: host
|
134
|
+
)
|
135
|
+
url.crawled = true
|
136
|
+
|
137
|
+
doc = Wgit::Document.new(url, html)
|
138
|
+
yield(doc) if block_given?
|
139
|
+
|
140
|
+
doc.empty? ? nil : doc
|
141
|
+
end
|
142
|
+
|
205
143
|
protected
|
206
144
|
|
207
145
|
# This method calls Wgit::Crawler#resolve to obtain the page HTML, handling
|
@@ -227,22 +165,19 @@ module Wgit
|
|
227
165
|
host: host
|
228
166
|
)
|
229
167
|
@last_response = response
|
168
|
+
|
230
169
|
response.body.empty? ? nil : response.body
|
231
170
|
rescue StandardError => e
|
232
|
-
Wgit.logger.debug(
|
233
|
-
"Wgit::Crawler#fetch('#{url}') exception: #{e.message}"
|
234
|
-
)
|
171
|
+
Wgit.logger.debug("Wgit::Crawler#fetch('#{url}') exception: #{e.message}")
|
235
172
|
@last_response = nil
|
173
|
+
|
236
174
|
nil
|
237
175
|
end
|
238
176
|
|
239
177
|
# The resolve method performs an HTTP GET to obtain the HTML response. The
|
240
|
-
# Net::HTTPResponse will be returned or an error raised.
|
241
|
-
# disabled by setting `redirect_limit: 0`.
|
178
|
+
# Net::HTTPResponse will be returned or an error raised.
|
242
179
|
#
|
243
180
|
# @param url [Wgit::Url] The URL to fetch the HTML from.
|
244
|
-
# @param redirect_limit [Integer] The number of redirect hops to allow
|
245
|
-
# before raising an error.
|
246
181
|
# @param follow_external_redirects [Boolean] Whether or not to follow
|
247
182
|
# an external redirect. If false, you must also provide a `host:`
|
248
183
|
# parameter.
|
@@ -254,12 +189,7 @@ module Wgit
|
|
254
189
|
# @raise [StandardError] If !url.respond_to? :to_uri or a redirect isn't
|
255
190
|
# allowed.
|
256
191
|
# @return [Net::HTTPResponse] The HTTP response of the GET request.
|
257
|
-
def resolve(
|
258
|
-
url,
|
259
|
-
redirect_limit: Wgit::Crawler.default_redirect_limit,
|
260
|
-
follow_external_redirects: true,
|
261
|
-
host: nil
|
262
|
-
)
|
192
|
+
def resolve(url, follow_external_redirects: true, host: nil)
|
263
193
|
raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
|
264
194
|
|
265
195
|
redirect_count = 0
|
@@ -267,25 +197,25 @@ module Wgit
|
|
267
197
|
|
268
198
|
loop do
|
269
199
|
response = Net::HTTP.get_response(url.to_uri)
|
200
|
+
break unless response.is_a?(Net::HTTPRedirection)
|
201
|
+
|
270
202
|
location = Wgit::Url.new(response.fetch('location', ''))
|
203
|
+
raise 'Encountered redirect without Location header' if location.empty?
|
271
204
|
|
272
|
-
break unless response.is_a?(Net::HTTPRedirection)
|
273
205
|
yield(url, response, location) if block_given?
|
274
206
|
|
275
|
-
|
276
|
-
|
277
|
-
!location.is_relative?(host: host)
|
278
|
-
raise "External redirect not allowed - Redirected to: \
|
207
|
+
if !follow_external_redirects && !location.is_relative?(host: host)
|
208
|
+
raise "External redirect not allowed - Redirected to: \
|
279
209
|
'#{location}', which is outside of host: '#{host}'"
|
280
|
-
|
210
|
+
end
|
281
211
|
|
282
|
-
|
212
|
+
raise "Too many redirects: #{redirect_count}" \
|
213
|
+
if redirect_count >= @redirect_limit
|
283
214
|
|
284
|
-
|
215
|
+
redirect_count += 1
|
285
216
|
|
286
|
-
|
287
|
-
|
288
|
-
end
|
217
|
+
location = url.to_base.concat(location) if location.is_relative?
|
218
|
+
url.replace(location) # Update the url on redirect.
|
289
219
|
end
|
290
220
|
|
291
221
|
response
|
@@ -300,7 +230,7 @@ module Wgit
|
|
300
230
|
# internal page links.
|
301
231
|
# @return [Array<Wgit::Url>] The internal page links from doc.
|
302
232
|
def get_internal_links(doc)
|
303
|
-
doc.
|
233
|
+
doc.internal_absolute_links
|
304
234
|
.map(&:without_anchor) # Because anchors don't change page content.
|
305
235
|
.uniq
|
306
236
|
.reject do |link|
|
@@ -309,28 +239,9 @@ module Wgit
|
|
309
239
|
end
|
310
240
|
end
|
311
241
|
|
312
|
-
|
313
|
-
|
314
|
-
# Add the document to the @docs array for later processing or let the block
|
315
|
-
# process it here and now.
|
316
|
-
def handle_crawl_block(url, &block)
|
317
|
-
if block_given?
|
318
|
-
crawl_url(url, &block)
|
319
|
-
else
|
320
|
-
@docs << crawl_url(url)
|
321
|
-
nil
|
322
|
-
end
|
323
|
-
end
|
324
|
-
|
325
|
-
# Add the url to @urls ensuring it is cast to a Wgit::Url if necessary.
|
326
|
-
def add_url(url)
|
327
|
-
@urls = [] if @urls.nil?
|
328
|
-
@urls << Wgit::Url.new(url)
|
329
|
-
end
|
330
|
-
|
331
|
-
alias crawl crawl_urls
|
242
|
+
alias crawl crawl_urls
|
332
243
|
alias crawl_pages crawl_urls
|
333
|
-
alias crawl_page
|
334
|
-
alias crawl_r
|
244
|
+
alias crawl_page crawl_url
|
245
|
+
alias crawl_r crawl_site
|
335
246
|
end
|
336
247
|
end
|