wgit 0.0.16 → 0.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -9
- data/lib/wgit/crawler.rb +111 -48
- data/lib/wgit/document.rb +46 -32
- data/lib/wgit/document_extensions.rb +11 -9
- data/lib/wgit/indexer.rb +2 -2
- data/lib/wgit/logger.rb +5 -1
- data/lib/wgit/url.rb +79 -17
- data/lib/wgit/version.rb +1 -1
- metadata +1 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a805551a72869241a425dc4d0f88ed6f740c75b95db6e3acf2564393b79708d9
|
4
|
+
data.tar.gz: 5425e8bb21c7822b5ac93afbe6c7d90777a649808702c3e6012c0ec19cbe1dfb
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fd5dcc1b4e9706326810b3fdbdf1df285ec1a98788aac9521fbcd52ad4132c039ab2a2b2d2e574af115845d1968c0eb1bc8d487dbbec4ee9a3427597bb99b09f
|
7
|
+
data.tar.gz: 3e947536f694ea74460f919cab1ec8e42274eb6bd0f856ac900c6b2e4f31da22ddc920afbaaa3a4b80abe3d9729fdcf00964e794363f3d76e1f35dc33a05224a
|
data/README.md
CHANGED
@@ -57,13 +57,13 @@ doc.stats # => {
|
|
57
57
|
|
58
58
|
# doc responds to the following methods:
|
59
59
|
Wgit::Document.instance_methods(false).sort # => [
|
60
|
-
#
|
61
|
-
# :external_urls, :html, :
|
62
|
-
# :
|
63
|
-
#
|
64
|
-
#
|
65
|
-
# :xpath
|
66
|
-
#]
|
60
|
+
# :==, :[], :author, :base, :base_url, :css, :date_crawled, :doc, :empty?,
|
61
|
+
# :external_links, :external_urls, :html, :internal_absolute_links,
|
62
|
+
# :internal_full_links, :internal_links, :keywords, :links,
|
63
|
+
# :relative_absolute_links, :relative_absolute_urls, :relative_full_links,
|
64
|
+
# :relative_full_urls, :relative_links, :relative_urls, :score, :search,
|
65
|
+
# :search!, :size, :stats, :text, :title, :to_h, :to_json, :url, :xpath
|
66
|
+
# ]
|
67
67
|
|
68
68
|
results = doc.search "corruption"
|
69
69
|
results.first # => "ial materials involving war, spying and corruption.
|
@@ -325,7 +325,7 @@ Currently there is no executable provided with Wgit, however...
|
|
325
325
|
|
326
326
|
In future versions of Wgit, an executable will be packaged with the gem. The executable will provide a `pry` console with the `wgit` gem already loaded. Using the console, you'll easily be able to index and search the web without having to write your own scripts.
|
327
327
|
|
328
|
-
This executable will be
|
328
|
+
This executable will be similar in nature to `./bin/console` which is currently used for development and isn't packaged as part of the `wgit` gem.
|
329
329
|
|
330
330
|
## Change Log
|
331
331
|
|
@@ -345,7 +345,7 @@ The current road map is rudimentally listed in the [TODO.txt](https://github.com
|
|
345
345
|
|
346
346
|
For a full list of available Rake tasks, run `bundle exec rake help`. The most commonly used tasks are listed below...
|
347
347
|
|
348
|
-
After checking out the repo, run
|
348
|
+
After checking out the repo, run `bundle exec rake setup` to install the dependencies (requires `bundler`). Then, run `bundle exec rake test` to run the tests. You can also run `bundle exec rake console` for an interactive (`pry`) REPL that will allow you to experiment with the code.
|
349
349
|
|
350
350
|
To generate code documentation run `bundle exec yard doc`. To browse the generated documentation run `bundle exec yard server -r`.
|
351
351
|
|
data/lib/wgit/crawler.rb
CHANGED
@@ -6,8 +6,8 @@ require 'net/http' # Requires 'uri'.
|
|
6
6
|
|
7
7
|
module Wgit
|
8
8
|
|
9
|
-
# The Crawler class provides a means of crawling web based
|
10
|
-
# their HTML into Wgit::Document
|
9
|
+
# The Crawler class provides a means of crawling web based Wgit::Url's, turning
|
10
|
+
# their HTML into Wgit::Document instances.
|
11
11
|
class Crawler
|
12
12
|
include Assertable
|
13
13
|
|
@@ -29,9 +29,13 @@ module Wgit
|
|
29
29
|
# The Net::HTTPResponse of the most recently crawled URL or nil.
|
30
30
|
attr_reader :last_response
|
31
31
|
|
32
|
-
# Initializes the Crawler
|
32
|
+
# Initializes the Crawler and sets the @urls and @docs.
|
33
33
|
#
|
34
|
-
# @param urls [*Wgit::Url] The
|
34
|
+
# @param urls [*Wgit::Url] The URL's to crawl in the future using either
|
35
|
+
# Crawler#crawl_url or Crawler#crawl_site. Note that the urls passed here
|
36
|
+
# will NOT update if they happen to redirect when crawled. If in doubt,
|
37
|
+
# pass the url(s) directly to the crawl_* method instead of to the new
|
38
|
+
# method.
|
35
39
|
def initialize(*urls)
|
36
40
|
self.[](*urls)
|
37
41
|
@docs = []
|
@@ -39,7 +43,10 @@ module Wgit
|
|
39
43
|
|
40
44
|
# Sets this Crawler's @urls.
|
41
45
|
#
|
42
|
-
# @param urls [
|
46
|
+
# @param urls [*Wgit::Url] The URL's to crawl in the future using either
|
47
|
+
# crawl_url or crawl_site. Note that the urls passed here will NOT update
|
48
|
+
# if they happen to redirect when crawled. If in doubt, pass the url(s)
|
49
|
+
# directly to the crawl_* method instead of to the new method.
|
43
50
|
def urls=(urls)
|
44
51
|
@urls = []
|
45
52
|
Wgit::Utils.each(urls) { |url| add_url(url) }
|
@@ -47,7 +54,10 @@ module Wgit
|
|
47
54
|
|
48
55
|
# Sets this Crawler's @urls.
|
49
56
|
#
|
50
|
-
# @param urls [*Wgit::Url] The
|
57
|
+
# @param urls [*Wgit::Url] The URL's to crawl in the future using either
|
58
|
+
# crawl_url or crawl_site. Note that the urls passed here will NOT update
|
59
|
+
# if they happen to redirect when crawled. If in doubt, pass the url(s)
|
60
|
+
# directly to the crawl_* method instead of to the new method.
|
51
61
|
def [](*urls)
|
52
62
|
# If urls is nil then add_url (when called later) will set @urls = []
|
53
63
|
# so we do nothing here.
|
@@ -68,12 +78,18 @@ module Wgit
|
|
68
78
|
|
69
79
|
# Adds the url to this Crawler's @urls.
|
70
80
|
#
|
71
|
-
# @param url [Wgit::Url] A URL to crawl.
|
81
|
+
# @param url [Wgit::Url] A URL to crawl later by calling a crawl_* method.
|
82
|
+
# Note that the url added here will NOT update if it happens to
|
83
|
+
# redirect when crawled. If in doubt, pass the url directly to the
|
84
|
+
# crawl_* method instead of to the new method.
|
72
85
|
def <<(url)
|
73
86
|
add_url(url)
|
74
87
|
end
|
75
88
|
|
76
|
-
# Crawls individual urls
|
89
|
+
# Crawls one or more individual urls using Wgit::Crawler#crawl_url
|
90
|
+
# underneath. See Wgit::Crawler#crawl_site for crawling entire sites. Note
|
91
|
+
# that any external redirects are followed. Use Wgit::Crawler#crawl_url if
|
92
|
+
# this isn't desirable.
|
77
93
|
#
|
78
94
|
# @param urls [Array<Wgit::Url>] The URLs to crawl.
|
79
95
|
# @yield [Wgit::Document] If provided, the block is given each crawled
|
@@ -88,68 +104,100 @@ module Wgit
|
|
88
104
|
doc ? doc : @docs.last
|
89
105
|
end
|
90
106
|
|
91
|
-
# Crawl the url
|
107
|
+
# Crawl the url returning the response Wgit::Document or nil if an error
|
108
|
+
# occurs.
|
92
109
|
#
|
93
|
-
# @param url [Wgit::
|
110
|
+
# @param url [Wgit::Url] The URL to crawl.
|
94
111
|
# @param follow_external_redirects [Boolean] Whether or not to follow
|
95
|
-
# external
|
112
|
+
# an external redirect. False will return nil for such a crawl. If false,
|
113
|
+
# you must also provide a `host:` parameter.
|
114
|
+
# @param host [Wgit::Url, String] Specify the host by which
|
115
|
+
# an absolute redirect is determined to be internal or not. Must be
|
116
|
+
# absolute and contain a protocol prefix. For example, a `host:` of
|
117
|
+
# 'http://www.example.com' will only allow redirects for Urls with a
|
118
|
+
# `to_host` value of 'www.example.com'.
|
96
119
|
# @yield [Wgit::Document] The crawled HTML Document regardless if the
|
97
120
|
# crawl was successful or not. Therefore, the Document#url can be used.
|
98
121
|
# @return [Wgit::Document, nil] The crawled HTML Document or nil if the
|
99
122
|
# crawl was unsuccessful.
|
100
|
-
def crawl_url(
|
123
|
+
def crawl_url(
|
124
|
+
url = @urls.first,
|
125
|
+
follow_external_redirects: true,
|
126
|
+
host: nil
|
127
|
+
)
|
101
128
|
assert_type(url, Wgit::Url)
|
102
|
-
|
129
|
+
if !follow_external_redirects and host.nil?
|
130
|
+
raise 'host cannot be nil if follow_external_redirects is false'
|
131
|
+
end
|
132
|
+
|
133
|
+
html = fetch(
|
134
|
+
url,
|
135
|
+
follow_external_redirects: follow_external_redirects,
|
136
|
+
host: host
|
137
|
+
)
|
103
138
|
url.crawled = true
|
104
|
-
|
139
|
+
|
140
|
+
doc = Wgit::Document.new(url, html)
|
105
141
|
yield(doc) if block_given?
|
142
|
+
|
106
143
|
doc.empty? ? nil : doc
|
107
144
|
end
|
108
145
|
|
109
146
|
# Crawls an entire website's HTML pages by recursively going through
|
110
|
-
# its internal links. Each crawled
|
147
|
+
# its internal links. Each crawled Document is yielded to a block.
|
111
148
|
#
|
112
|
-
#
|
149
|
+
# Only redirects to the same host are followed. For example, the Url
|
150
|
+
# 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
|
151
|
+
# a link which redirects to 'https://ftp.example.co.uk' or
|
152
|
+
# 'https://www.example.com' will not be followed. The only exception to
|
153
|
+
# this is the initially crawled url which is allowed to redirect anywhere;
|
154
|
+
# its host is then used for other link redirections on the site, as
|
155
|
+
# described above.
|
156
|
+
#
|
157
|
+
# @param url [Wgit::Url] The base URL of the website to be crawled.
|
158
|
+
# It is recommended that this URL be the index page of the site to give a
|
159
|
+
# greater chance of finding all pages within that site/host.
|
113
160
|
# @yield [Wgit::Document] Given each crawled Document/page of the site.
|
114
161
|
# A block is the only way to interact with each crawled Document.
|
115
162
|
# @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
|
116
|
-
# from all of the site's pages or nil if the
|
163
|
+
# from all of the site's pages or nil if the url could not be
|
117
164
|
# crawled successfully.
|
118
|
-
def crawl_site(
|
119
|
-
assert_type(
|
165
|
+
def crawl_site(url = @urls.first, &block)
|
166
|
+
assert_type(url, Wgit::Url)
|
120
167
|
|
121
|
-
doc = crawl_url(
|
168
|
+
doc = crawl_url(url, &block)
|
122
169
|
return nil if doc.nil?
|
123
170
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
171
|
+
host = url.to_base
|
172
|
+
alt_url = url.end_with?('/') ? url.chop : url + '/'
|
173
|
+
crawled = [url, alt_url]
|
174
|
+
externals = doc.external_links
|
175
|
+
internals = get_internal_links(doc)
|
128
176
|
|
129
|
-
return doc.external_links.uniq if
|
177
|
+
return doc.external_links.uniq if internals.empty?
|
130
178
|
|
131
179
|
loop do
|
132
|
-
|
180
|
+
crawled.uniq!
|
181
|
+
internals.uniq!
|
133
182
|
|
134
|
-
links =
|
183
|
+
links = internals - crawled
|
135
184
|
break if links.empty?
|
136
185
|
|
137
186
|
links.each do |link|
|
187
|
+
orig_link = link.dup
|
138
188
|
doc = crawl_url(
|
139
|
-
|
140
|
-
follow_external_redirects: false,
|
141
|
-
&block
|
189
|
+
link, follow_external_redirects: false, host: host, &block
|
142
190
|
)
|
143
191
|
|
144
|
-
|
192
|
+
crawled.push(orig_link, link) # Push both in case of redirects.
|
145
193
|
next if doc.nil?
|
146
194
|
|
147
|
-
|
148
|
-
|
195
|
+
internals.concat(get_internal_links(doc))
|
196
|
+
externals.concat(doc.external_links)
|
149
197
|
end
|
150
198
|
end
|
151
199
|
|
152
|
-
|
200
|
+
externals.uniq
|
153
201
|
end
|
154
202
|
|
155
203
|
private
|
@@ -168,8 +216,13 @@ module Wgit
|
|
168
216
|
# The fetch method performs a HTTP GET to obtain the HTML document.
|
169
217
|
# Invalid urls or any HTTP response that doesn't return a HTML body will be
|
170
218
|
# ignored and nil will be returned. Otherwise, the HTML is returned.
|
171
|
-
|
172
|
-
|
219
|
+
# External redirects are followed by default but can be disabled.
|
220
|
+
def fetch(url, follow_external_redirects: true, host: nil)
|
221
|
+
response = resolve(
|
222
|
+
url,
|
223
|
+
follow_external_redirects: follow_external_redirects,
|
224
|
+
host: host
|
225
|
+
)
|
173
226
|
@last_response = response
|
174
227
|
response.body.empty? ? nil : response.body
|
175
228
|
rescue Exception => ex
|
@@ -183,28 +236,35 @@ module Wgit
|
|
183
236
|
# The resolve method performs a HTTP GET to obtain the HTML document.
|
184
237
|
# A certain amount of redirects will be followed by default before raising
|
185
238
|
# an exception. Redirects can be disabled by setting `redirect_limit: 0`.
|
239
|
+
# External redirects are followed by default but can be disabled.
|
186
240
|
# The Net::HTTPResponse will be returned.
|
187
241
|
def resolve(
|
188
242
|
url,
|
189
243
|
redirect_limit: Wgit::Crawler.default_redirect_limit,
|
190
|
-
follow_external_redirects: true
|
244
|
+
follow_external_redirects: true,
|
245
|
+
host: nil
|
191
246
|
)
|
192
|
-
raise 'url must respond to :
|
193
|
-
redirect_count =
|
247
|
+
raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
|
248
|
+
redirect_count = 0
|
194
249
|
|
195
250
|
begin
|
196
|
-
raise 'Too many redirects' if redirect_count >= redirect_limit
|
197
|
-
redirect_count += 1
|
198
|
-
|
199
251
|
response = Net::HTTP.get_response(url.to_uri)
|
200
252
|
location = Wgit::Url.new(response.fetch('location', ''))
|
201
253
|
|
254
|
+
yield(url, response, location) if block_given?
|
255
|
+
|
202
256
|
if not location.empty?
|
203
|
-
if
|
204
|
-
|
257
|
+
if !follow_external_redirects and
|
258
|
+
!location.is_relative?(host: host)
|
259
|
+
raise "External redirect not allowed - Redirected to: \
|
260
|
+
'#{location}', which is outside of host: '#{host}'"
|
205
261
|
end
|
206
262
|
|
207
|
-
|
263
|
+
raise 'Too many redirects' if redirect_count >= redirect_limit
|
264
|
+
redirect_count += 1
|
265
|
+
|
266
|
+
location = url.to_base.concat(location) if location.is_relative?
|
267
|
+
url.replace(location)
|
208
268
|
end
|
209
269
|
end while response.is_a?(Net::HTTPRedirection)
|
210
270
|
|
@@ -217,10 +277,13 @@ module Wgit
|
|
217
277
|
@urls << Wgit::Url.new(url)
|
218
278
|
end
|
219
279
|
|
220
|
-
#
|
280
|
+
# Returns doc's internal HTML page links in absolute form for crawling.
|
281
|
+
# We remove anchors because they are client side and don't change the
|
282
|
+
# resulting page's HTML; unlike query strings for example, which do.
|
221
283
|
def get_internal_links(doc)
|
222
|
-
doc.
|
223
|
-
|
284
|
+
doc.internal_full_links.
|
285
|
+
map(&:without_anchor).
|
286
|
+
uniq.
|
224
287
|
reject do |link|
|
225
288
|
ext = link.to_extension
|
226
289
|
ext ? !['htm', 'html'].include?(ext) : false
|
data/lib/wgit/document.rb
CHANGED
@@ -126,6 +126,38 @@ module Wgit
|
|
126
126
|
@url.date_crawled
|
127
127
|
end
|
128
128
|
|
129
|
+
# Returns the base URL of this Wgit::Document. The base URL is either the
|
130
|
+
# <base> element's href value or @url (if @base is nil). If @base is
|
131
|
+
# present and relative, then @url.to_base + @base is returned. This method
|
132
|
+
# should be used instead of `doc.url.to_base` etc. if manually building
|
133
|
+
# absolute links.
|
134
|
+
#
|
135
|
+
# Provide the `link:` parameter to get the correct base URL for that type
|
136
|
+
# of link. For example, a link of `#top` would always return @url because
|
137
|
+
# it applies to that page, not a different one. Query strings work in the
|
138
|
+
# same way. Use this parameter if manually concatenating links e.g.
|
139
|
+
# `absolute_link = doc.base_url(link: link).concat(link)` etc.
|
140
|
+
#
|
141
|
+
# @param link [Wgit::Url] The link to obtain the correct base URL for.
|
142
|
+
# @return [Wgit::Url] The base URL of this Document e.g.
|
143
|
+
# 'http://example.com/public'.
|
144
|
+
def base_url(link: nil)
|
145
|
+
get_base = -> { @base.is_relative? ? @url.to_base.concat(@base) : @base }
|
146
|
+
|
147
|
+
if link
|
148
|
+
assert_type(link, Wgit::Url)
|
149
|
+
raise "link must be relative: #{link}" unless link.is_relative?
|
150
|
+
|
151
|
+
if link.is_anchor? or link.is_query_string?
|
152
|
+
base_url = @base ? get_base.call : @url
|
153
|
+
return base_url.without_anchor.without_query_string
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
base_url = @base ? get_base.call : @url.base
|
158
|
+
base_url.without_anchor.without_query_string
|
159
|
+
end
|
160
|
+
|
129
161
|
# Returns a Hash containing this Document's instance vars.
|
130
162
|
# Used when storing the Document in a Database e.g. MongoDB etc.
|
131
163
|
# By default the @html var is excluded from the returned Hash.
|
@@ -209,23 +241,19 @@ module Wgit
|
|
209
241
|
end
|
210
242
|
|
211
243
|
# Get all the internal links of this Document in relative form. Internal
|
212
|
-
# meaning a link to another document on
|
244
|
+
# meaning a link to another document on the same host. This Document's host
|
213
245
|
# is used to determine if an absolute URL is actually a relative link e.g.
|
214
|
-
# For a Document representing http://server.com/about, an absolute link
|
215
|
-
# <a href='http://server.com/search'> will be recognized and
|
216
|
-
# internal link because both Documents live on the same
|
217
|
-
# Wgit::Document#internal_full_links.
|
246
|
+
# For a Document representing http://www.server.com/about, an absolute link
|
247
|
+
# of <a href='http://www.server.com/search'> will be recognized and
|
248
|
+
# returned as an internal link because both Documents live on the same
|
249
|
+
# host. Also see Wgit::Document#internal_full_links.
|
218
250
|
#
|
219
251
|
# @return [Array<Wgit::Url>] self's internal/relative URL's.
|
220
252
|
def internal_links
|
221
253
|
return [] if @links.empty?
|
222
254
|
|
223
255
|
links = @links.
|
224
|
-
reject
|
225
|
-
not link.relative_link?(base: @url.to_base)
|
226
|
-
rescue
|
227
|
-
true
|
228
|
-
end.
|
256
|
+
reject { |link| !link.is_relative?(host: @url.to_base) }.
|
229
257
|
map(&:without_base).
|
230
258
|
map do |link| # We map @url.to_host into / because it's a duplicate.
|
231
259
|
link.to_host == @url.to_host ? Wgit::Url.new('/') : link
|
@@ -234,19 +262,6 @@ module Wgit
|
|
234
262
|
Wgit::Utils.process_arr(links)
|
235
263
|
end
|
236
264
|
|
237
|
-
# Get all the internal links of this Document with their anchors removed
|
238
|
-
# (if present). Also see Wgit::Document#internal_links.
|
239
|
-
#
|
240
|
-
# @return [Array<Wgit::Url>] self's internal/relative URL's with their
|
241
|
-
# anchors removed.
|
242
|
-
def internal_links_without_anchors
|
243
|
-
in_links = internal_links
|
244
|
-
return [] if in_links.empty?
|
245
|
-
in_links.
|
246
|
-
map(&:without_anchor).
|
247
|
-
reject(&:empty?)
|
248
|
-
end
|
249
|
-
|
250
265
|
# Get all the internal links of this Document and append them to this
|
251
266
|
# Document's base URL making them absolute. Also see
|
252
267
|
# Wgit::Document#internal_links.
|
@@ -254,24 +269,20 @@ module Wgit
|
|
254
269
|
# @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
|
255
270
|
# form.
|
256
271
|
def internal_full_links
|
257
|
-
|
258
|
-
return [] if
|
259
|
-
|
272
|
+
links = internal_links
|
273
|
+
return [] if links.empty?
|
274
|
+
links.map { |link| base_url(link: link).concat(link) }
|
260
275
|
end
|
261
276
|
|
262
277
|
# Get all the external links of this Document. External meaning a link to
|
263
|
-
# a different
|
278
|
+
# a different host.
|
264
279
|
#
|
265
280
|
# @return [Array<Wgit::Url>] self's external/absolute URL's.
|
266
281
|
def external_links
|
267
282
|
return [] if @links.empty?
|
268
283
|
|
269
284
|
links = @links.
|
270
|
-
reject
|
271
|
-
link.relative_link?(base: @url.to_base)
|
272
|
-
rescue
|
273
|
-
true
|
274
|
-
end.
|
285
|
+
reject { |link| link.relative_link?(host: @url.to_base) }.
|
275
286
|
map(&:without_trailing_slash)
|
276
287
|
|
277
288
|
Wgit::Utils.process_arr(links)
|
@@ -506,6 +517,9 @@ module Wgit
|
|
506
517
|
alias :relative_urls :internal_links
|
507
518
|
alias :relative_full_links :internal_full_links
|
508
519
|
alias :relative_full_urls :internal_full_links
|
520
|
+
alias :internal_absolute_links :internal_full_links
|
521
|
+
alias :relative_absolute_links :internal_full_links
|
522
|
+
alias :relative_absolute_urls :internal_full_links
|
509
523
|
alias :external_urls :external_links
|
510
524
|
end
|
511
525
|
end
|
data/lib/wgit/document_extensions.rb
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
### Default Document Extensions ###
|
2
2
|
|
3
|
+
# Base.
|
4
|
+
Wgit::Document.define_extension(
|
5
|
+
:base,
|
6
|
+
'//base/@href',
|
7
|
+
singleton: true,
|
8
|
+
text_content_only: true,
|
9
|
+
) do |base|
|
10
|
+
base = Wgit::Url.new(base) if base
|
11
|
+
end
|
12
|
+
|
3
13
|
# Title.
|
4
14
|
Wgit::Document.define_extension(
|
5
15
|
:title,
|
@@ -37,15 +47,7 @@ Wgit::Document.define_extension(
|
|
37
47
|
singleton: false,
|
38
48
|
text_content_only: true,
|
39
49
|
) do |links|
|
40
|
-
if links
|
41
|
-
links.map! do |link|
|
42
|
-
Wgit::Url.new(link)
|
43
|
-
rescue
|
44
|
-
nil
|
45
|
-
end
|
46
|
-
links.compact!
|
47
|
-
end
|
48
|
-
links
|
50
|
+
links.map! { |link| Wgit::Url.new(link) } if links
|
49
51
|
end
|
50
52
|
|
51
53
|
# Text.
|
data/lib/wgit/indexer.rb
CHANGED
@@ -219,7 +219,7 @@ site: #{url}")
|
|
219
219
|
# manipulation. Return nil or false from the block to prevent the
|
220
220
|
# document from being saved into the database.
|
221
221
|
def index_this_page(url, insert_externals = true)
|
222
|
-
|
222
|
+
document = @crawler.crawl_page(url) do |doc|
|
223
223
|
result = true
|
224
224
|
if block_given?
|
225
225
|
result = yield(doc)
|
@@ -236,7 +236,7 @@ site: #{url}")
|
|
236
236
|
@db.url?(url) ? @db.update(url) : @db.insert(url)
|
237
237
|
|
238
238
|
if insert_externals
|
239
|
-
ext_urls =
|
239
|
+
ext_urls = document.external_links
|
240
240
|
write_urls_to_db(ext_urls)
|
241
241
|
Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
|
242
242
|
end
|
data/lib/wgit/logger.rb
CHANGED
@@ -23,7 +23,11 @@ module Wgit
|
|
23
23
|
# Returns the default Logger instance.
|
24
24
|
# @return [Logger] The default Logger instance.
|
25
25
|
def self.default_logger
|
26
|
-
Logger.new(STDOUT, progname: 'wgit', level: :info)
|
26
|
+
logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
|
27
|
+
logger.formatter = proc do |severity, datetime, progname, msg|
|
28
|
+
"[#{progname}] #{msg}\n"
|
29
|
+
end
|
30
|
+
logger
|
27
31
|
end
|
28
32
|
|
29
33
|
# Sets the default Logger instance to be used by Wgit.
|
data/lib/wgit/url.rb
CHANGED
@@ -117,30 +117,54 @@ module Wgit
|
|
117
117
|
Wgit::Url.new(host + separator + link)
|
118
118
|
end
|
119
119
|
|
120
|
+
# Overrides String#replace setting the new_url @uri and String value.
|
121
|
+
#
|
122
|
+
# @param new_url [Wgit::Url, String] The new URL value.
|
123
|
+
# @return [String] The new URL value once set.
|
124
|
+
def replace(new_url)
|
125
|
+
@uri = Addressable::URI.parse(new_url)
|
126
|
+
super(new_url)
|
127
|
+
end
|
128
|
+
|
120
129
|
# Returns true if self is a relative Url.
|
121
130
|
#
|
122
131
|
# All external links in a page are expected to have a protocol prefix e.g.
|
123
132
|
# "http://", otherwise the link is treated as an internal link (regardless
|
124
|
-
# of whether it's valid or not). The only exception is if
|
125
|
-
# and self is a page
|
133
|
+
# of whether it's valid or not). The only exception is if host or domain is
|
134
|
+
# provided and self is a page belonging to that host/domain; then the link
|
135
|
+
# is relative.
|
126
136
|
#
|
127
|
-
# @param
|
137
|
+
# @param host [Wgit::Url, String] The Url host e.g.
|
138
|
+
# http://www.google.com/how which gives a host of www.google.com.
|
139
|
+
# The host must be absolute and prefixed with a protocol.
|
140
|
+
# @param domain [Wgit::Url, String] The Url domain e.g.
|
141
|
+
# http://www.google.com/how which gives a domain of google.com. The
|
142
|
+
# domain must be absolute and prefixed with a protocol.
|
128
143
|
# @return [Boolean] True if relative, false if absolute.
|
129
144
|
# @raise [RuntimeError] If self is invalid e.g. empty.
|
130
|
-
def is_relative?(
|
145
|
+
def is_relative?(host: nil, domain: nil)
|
131
146
|
raise "Invalid link: #{self}" if nil? or empty?
|
147
|
+
raise "Provide host or domain, not both" if host and domain
|
148
|
+
|
149
|
+
if host
|
150
|
+
host = Wgit::Url.new(host)
|
151
|
+
if host.to_base.nil?
|
152
|
+
raise "Invalid host, must be absolute and contain protocol: #{host}"
|
153
|
+
end
|
154
|
+
end
|
132
155
|
|
133
|
-
if
|
134
|
-
|
135
|
-
if
|
136
|
-
raise "Invalid
|
156
|
+
if domain
|
157
|
+
domain = Wgit::Url.new(domain)
|
158
|
+
if domain.to_base.nil?
|
159
|
+
raise "Invalid domain, must be absolute and contain protocol: #{domain}"
|
137
160
|
end
|
138
161
|
end
|
139
162
|
|
140
163
|
if @uri.relative?
|
141
164
|
true
|
142
165
|
else
|
143
|
-
|
166
|
+
return host ? to_host == host.to_host : false if host
|
167
|
+
return domain ? to_domain == domain.to_domain : false if domain
|
144
168
|
end
|
145
169
|
end
|
146
170
|
|
@@ -207,6 +231,15 @@ module Wgit
|
|
207
231
|
host ? Wgit::Url.new(host) : nil
|
208
232
|
end
|
209
233
|
|
234
|
+
# Returns a new Wgit::Url containing just the domain of this URL e.g.
|
235
|
+
# Given http://www.google.co.uk/about.html, google.co.uk is returned.
|
236
|
+
#
|
237
|
+
# @return [Wgit::Url, nil] Containing just the domain or nil.
|
238
|
+
def to_domain
|
239
|
+
domain = @uri.domain
|
240
|
+
domain ? Wgit::Url.new(domain) : nil
|
241
|
+
end
|
242
|
+
|
210
243
|
# Returns only the base of this URL e.g. the protocol and host combined.
|
211
244
|
#
|
212
245
|
# @return [Wgit::Url, nil] Base of self e.g. http://www.google.co.uk or nil.
|
@@ -226,9 +259,7 @@ module Wgit
|
|
226
259
|
path = @uri.path
|
227
260
|
return nil if path.nil? or path.empty?
|
228
261
|
return Wgit::Url.new('/') if path == '/'
|
229
|
-
Wgit::Url.new(path).
|
230
|
-
without_leading_slash.
|
231
|
-
without_trailing_slash
|
262
|
+
Wgit::Url.new(path).without_slashes
|
232
263
|
end
|
233
264
|
|
234
265
|
# Returns the endpoint of this URL e.g. the bit after the host with any
|
@@ -245,12 +276,12 @@ module Wgit
|
|
245
276
|
end
|
246
277
|
|
247
278
|
# Returns a new Wgit::Url containing just the query string of this URL
|
248
|
-
# e.g. Given http://google.com?q=ruby, 'ruby' is returned.
|
279
|
+
# e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
|
249
280
|
#
|
250
281
|
# @return [Wgit::Url, nil] Containing just the query string or nil.
|
251
282
|
def to_query_string
|
252
283
|
query = @uri.query
|
253
|
-
query ? Wgit::Url.new(query) : nil
|
284
|
+
query ? Wgit::Url.new("?#{query}") : nil
|
254
285
|
end
|
255
286
|
|
256
287
|
# Returns a new Wgit::Url containing just the anchor string of this URL
|
@@ -313,9 +344,21 @@ module Wgit
|
|
313
344
|
without_base = base_url ? gsub(base_url, '') : self
|
314
345
|
|
315
346
|
return self if ['', '/'].include?(without_base)
|
316
|
-
Wgit::Url.new(without_base).
|
317
|
-
|
318
|
-
|
347
|
+
Wgit::Url.new(without_base).without_slashes
|
348
|
+
end
|
349
|
+
|
350
|
+
# Returns a new Wgit::Url with the query string portion removed e.g. Given
|
351
|
+
# http://google.com/search?q=hello, http://google.com/search is
|
352
|
+
# returned. Self is returned as is if no query string is present. A URL
|
353
|
+
# consisting of only a query string e.g. '?q=hello' will return an empty
|
354
|
+
# URL.
|
355
|
+
#
|
356
|
+
# @return [Wgit::Url] Self with the query string portion removed.
|
357
|
+
def without_query_string
|
358
|
+
query = to_query_string
|
359
|
+
without_query_string = query ? gsub(query, '') : self
|
360
|
+
|
361
|
+
Wgit::Url.new(without_query_string)
|
319
362
|
end
|
320
363
|
|
321
364
|
# Returns a new Wgit::Url with the anchor portion removed e.g. Given
|
@@ -333,6 +376,20 @@ module Wgit
|
|
333
376
|
Wgit::Url.new(without_anchor)
|
334
377
|
end
|
335
378
|
|
379
|
+
# Returns true if self is a URL query string e.g. ?q=hello etc.
|
380
|
+
#
|
381
|
+
# @return [Boolean] True if self is a query string, false otherwise.
|
382
|
+
def is_query_string?
|
383
|
+
start_with?('?')
|
384
|
+
end
|
385
|
+
|
386
|
+
# Returns true if self is a URL anchor/fragment e.g. #top etc.
|
387
|
+
#
|
388
|
+
# @return [Boolean] True if self is an anchor/fragment, false otherwise.
|
389
|
+
def is_anchor?
|
390
|
+
start_with?('#')
|
391
|
+
end
|
392
|
+
|
336
393
|
# Returns a Hash containing this Url's instance vars excluding @uri.
|
337
394
|
# Used when storing the URL in a Database e.g. MongoDB etc.
|
338
395
|
#
|
@@ -349,6 +406,7 @@ module Wgit
|
|
349
406
|
alias :to_protocol :to_scheme
|
350
407
|
alias :protocol :to_scheme
|
351
408
|
alias :host :to_host
|
409
|
+
alias :domain :to_domain
|
352
410
|
alias :base :to_base
|
353
411
|
alias :path :to_path
|
354
412
|
alias :endpoint :to_endpoint
|
@@ -358,10 +416,14 @@ module Wgit
|
|
358
416
|
alias :to_fragment :to_anchor
|
359
417
|
alias :fragment :to_anchor
|
360
418
|
alias :extension :to_extension
|
419
|
+
alias :without_query :without_query_string
|
361
420
|
alias :without_fragment :without_anchor
|
421
|
+
alias :is_query? :is_query_string?
|
422
|
+
alias :is_fragment? :is_anchor?
|
362
423
|
alias :relative_link? :is_relative?
|
363
424
|
alias :internal_link? :is_relative?
|
364
425
|
alias :is_internal? :is_relative?
|
426
|
+
alias :relative? :is_relative?
|
365
427
|
alias :crawled? :crawled
|
366
428
|
alias :normalize :normalise
|
367
429
|
end
|
data/lib/wgit/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wgit
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.16
|
4
|
+
version: 0.0.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Michael Telford
|
@@ -128,20 +128,6 @@ dependencies:
|
|
128
128
|
- - "~>"
|
129
129
|
- !ruby/object:Gem::Version
|
130
130
|
version: '3.6'
|
131
|
-
- !ruby/object:Gem::Dependency
|
132
|
-
name: rack
|
133
|
-
requirement: !ruby/object:Gem::Requirement
|
134
|
-
requirements:
|
135
|
-
- - "~>"
|
136
|
-
- !ruby/object:Gem::Version
|
137
|
-
version: '2.0'
|
138
|
-
type: :development
|
139
|
-
prerelease: false
|
140
|
-
version_requirements: !ruby/object:Gem::Requirement
|
141
|
-
requirements:
|
142
|
-
- - "~>"
|
143
|
-
- !ruby/object:Gem::Version
|
144
|
-
version: '2.0'
|
145
131
|
- !ruby/object:Gem::Dependency
|
146
132
|
name: addressable
|
147
133
|
requirement: !ruby/object:Gem::Requirement
|