wgit 0.0.16 → 0.0.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 269236ab66e07aaabe01d61f765689e3d997628ad76d5f61a9c477e35d67880b
4
- data.tar.gz: 5fd11a994c23cd9569099109f8e2236873cf2d6267ea38bd661329620ece50b0
3
+ metadata.gz: a805551a72869241a425dc4d0f88ed6f740c75b95db6e3acf2564393b79708d9
4
+ data.tar.gz: 5425e8bb21c7822b5ac93afbe6c7d90777a649808702c3e6012c0ec19cbe1dfb
5
5
  SHA512:
6
- metadata.gz: 1c97aab9a225690205fcf10a99d2f632c45c08e7c3c5a543a0d374eb0595a6953baf77acd16eebf032c741d671d9e0fece030b01578af14fabc2acfa446734aa
7
- data.tar.gz: eefb60a4462142fce4643dc12edac1fa11951c32f7e2d72f3295369fc8db83b3b126ea5e51410bc9ca9955cc1d7c386ac3c0aac77b1c6eaf9bc89ffc517f44ee
6
+ metadata.gz: fd5dcc1b4e9706326810b3fdbdf1df285ec1a98788aac9521fbcd52ad4132c039ab2a2b2d2e574af115845d1968c0eb1bc8d487dbbec4ee9a3427597bb99b09f
7
+ data.tar.gz: 3e947536f694ea74460f919cab1ec8e42274eb6bd0f856ac900c6b2e4f31da22ddc920afbaaa3a4b80abe3d9729fdcf00964e794363f3d76e1f35dc33a05224a
data/README.md CHANGED
@@ -57,13 +57,13 @@ doc.stats # => {
57
57
 
58
58
  # doc responds to the following methods:
59
59
  Wgit::Document.instance_methods(false).sort # => [
60
- # :==, :[], :author, :css, :date_crawled, :doc, :empty?, :external_links,
61
- # :external_urls, :html, :internal_full_links, :internal_links,
62
- # :internal_links_without_anchors, :keywords, :links, :relative_full_links,
63
- # :relative_full_urls, :relative_links, :relative_urls, :score, :search,
64
- # :search!, :size, :stats, :text, :title, :to_h, :to_json, :url,
65
- # :xpath
66
- #]
60
+ # :==, :[], :author, :base, :base_url, :css, :date_crawled, :doc, :empty?,
61
+ # :external_links, :external_urls, :html, :internal_absolute_links,
62
+ # :internal_full_links, :internal_links, :keywords, :links,
63
+ # :relative_absolute_links, :relative_absolute_urls, :relative_full_links,
64
+ # :relative_full_urls, :relative_links, :relative_urls, :score, :search,
65
+ # :search!, :size, :stats, :text, :title, :to_h, :to_json, :url, :xpath
66
+ # ]
67
67
 
68
68
  results = doc.search "corruption"
69
69
  results.first # => "ial materials involving war, spying and corruption.
@@ -325,7 +325,7 @@ Currently there is no executable provided with Wgit, however...
325
325
 
326
326
  In future versions of Wgit, an executable will be packaged with the gem. The executable will provide a `pry` console with the `wgit` gem already loaded. Using the console, you'll easily be able to index and search the web without having to write your own scripts.
327
327
 
328
- This executable will be very similar in nature to `./bin/console` which is currently used only for development and isn't packaged as part of the `wgit` gem.
328
+ This executable will be similar in nature to `./bin/console` which is currently used for development and isn't packaged as part of the `wgit` gem.
329
329
 
330
330
  ## Change Log
331
331
 
@@ -345,7 +345,7 @@ The current road map is rudimentally listed in the [TODO.txt](https://github.com
345
345
 
346
346
  For a full list of available Rake tasks, run `bundle exec rake help`. The most commonly used tasks are listed below...
347
347
 
348
- After checking out the repo, run `./bin/setup` to install dependencies (requires `bundler`). Then, run `bundle exec rake test` to run the tests. You can also run `./bin/console` for an interactive (`pry`) REPL that will allow you to experiment with the code.
348
+ After checking out the repo, run `bundle exec rake setup` to install the dependencies (requires `bundler`). Then, run `bundle exec rake test` to run the tests. You can also run `bundle exec rake console` for an interactive (`pry`) REPL that will allow you to experiment with the code.
349
349
 
350
350
  To generate code documentation run `bundle exec yard doc`. To browse the generated documentation run `bundle exec yard server -r`.
351
351
 
data/lib/wgit/crawler.rb CHANGED
@@ -6,8 +6,8 @@ require 'net/http' # Requires 'uri'.
6
6
 
7
7
  module Wgit
8
8
 
9
- # The Crawler class provides a means of crawling web based URL's, turning
10
- # their HTML into Wgit::Document's.
9
+ # The Crawler class provides a means of crawling web based Wgit::Url's, turning
10
+ # their HTML into Wgit::Document instances.
11
11
  class Crawler
12
12
  include Assertable
13
13
 
@@ -29,9 +29,13 @@ module Wgit
29
29
  # The Net::HTTPResponse of the most recently crawled URL or nil.
30
30
  attr_reader :last_response
31
31
 
32
- # Initializes the Crawler by setting the @urls and @docs.
32
+ # Initializes the Crawler and sets the @urls and @docs.
33
33
  #
34
- # @param urls [*Wgit::Url] The URLs to crawl.
34
+ # @param urls [*Wgit::Url] The URL's to crawl in the future using either
35
+ # Crawler#crawl_url or Crawler#crawl_site. Note that the urls passed here
36
+ # will NOT update if they happen to redirect when crawled. If in doubt,
37
+ # pass the url(s) directly to the crawl_* method instead of to the new
38
+ # method.
35
39
  def initialize(*urls)
36
40
  self.[](*urls)
37
41
  @docs = []
@@ -39,7 +43,10 @@ module Wgit
39
43
 
40
44
  # Sets this Crawler's @urls.
41
45
  #
42
- # @param urls [Array<Wgit::Url>] The URLs to crawl.
46
+ # @param urls [*Wgit::Url] The URL's to crawl in the future using either
47
+ # crawl_url or crawl_site. Note that the urls passed here will NOT update
48
+ # if they happen to redirect when crawled. If in doubt, pass the url(s)
49
+ # directly to the crawl_* method instead of to this method.
43
50
  def urls=(urls)
44
51
  @urls = []
45
52
  Wgit::Utils.each(urls) { |url| add_url(url) }
@@ -47,7 +54,10 @@ module Wgit
47
54
 
48
55
  # Sets this Crawler's @urls.
49
56
  #
50
- # @param urls [*Wgit::Url] The URLs to crawl.
57
+ # @param urls [*Wgit::Url] The URL's to crawl in the future using either
58
+ # crawl_url or crawl_site. Note that the urls passed here will NOT update
59
+ # if they happen to redirect when crawled. If in doubt, pass the url(s)
60
+ # directly to the crawl_* method instead of to this method.
51
61
  def [](*urls)
52
62
  # If urls is nil then add_url (when called later) will set @urls = []
53
63
  # so we do nothing here.
@@ -68,12 +78,18 @@ module Wgit
68
78
 
69
79
  # Adds the url to this Crawler's @urls.
70
80
  #
71
- # @param url [Wgit::Url] A URL to crawl.
81
+ # @param url [Wgit::Url] A URL to crawl later by calling a crawl_* method.
82
+ # Note that the url added here will NOT update if it happens to
83
+ # redirect when crawled. If in doubt, pass the url directly to the
84
+ # crawl_* method instead of to this method.
72
85
  def <<(url)
73
86
  add_url(url)
74
87
  end
75
88
 
76
- # Crawls individual urls, not entire sites.
89
+ # Crawls one or more individual urls using Wgit::Crawler#crawl_url
90
+ # underneath. See Wgit::Crawler#crawl_site for crawling entire sites. Note
91
+ # that any external redirects are followed. Use Wgit::Crawler#crawl_url if
92
+ # this isn't desirable.
77
93
  #
78
94
  # @param urls [Array<Wgit::Url>] The URLs to crawl.
79
95
  # @yield [Wgit::Document] If provided, the block is given each crawled
@@ -88,68 +104,100 @@ module Wgit
88
104
  doc ? doc : @docs.last
89
105
  end
90
106
 
91
- # Crawl the url and return the response document or nil.
107
+ # Crawl the url returning the response Wgit::Document or nil if an error
108
+ # occurs.
92
109
  #
93
- # @param url [Wgit::Document] The URL to crawl.
110
+ # @param url [Wgit::Url] The URL to crawl.
94
111
  # @param follow_external_redirects [Boolean] Whether or not to follow
95
- # external redirects. False will return nil for such a crawl.
112
+ # an external redirect. False will return nil for such a crawl. If false,
113
+ # you must also provide a `host:` parameter.
114
+ # @param host [Wgit::Url, String] Specify the host by which
115
+ # an absolute redirect is determined to be internal or not. Must be
116
+ # absolute and contain a protocol prefix. For example, a `host:` of
117
+ # 'http://www.example.com' will only allow redirects for Urls with a
118
+ # `to_host` value of 'www.example.com'.
96
119
  # @yield [Wgit::Document] The crawled HTML Document regardless if the
97
120
  # crawl was successful or not. Therefore, the Document#url can be used.
98
121
  # @return [Wgit::Document, nil] The crawled HTML Document or nil if the
99
122
  # crawl was unsuccessful.
100
- def crawl_url(url = @urls.first, follow_external_redirects: true)
123
+ def crawl_url(
124
+ url = @urls.first,
125
+ follow_external_redirects: true,
126
+ host: nil
127
+ )
101
128
  assert_type(url, Wgit::Url)
102
- markup = fetch(url, follow_external_redirects: follow_external_redirects)
129
+ if !follow_external_redirects and host.nil?
130
+ raise 'host cannot be nil if follow_external_redirects is false'
131
+ end
132
+
133
+ html = fetch(
134
+ url,
135
+ follow_external_redirects: follow_external_redirects,
136
+ host: host
137
+ )
103
138
  url.crawled = true
104
- doc = Wgit::Document.new(url, markup)
139
+
140
+ doc = Wgit::Document.new(url, html)
105
141
  yield(doc) if block_given?
142
+
106
143
  doc.empty? ? nil : doc
107
144
  end
108
145
 
109
146
  # Crawls an entire website's HTML pages by recursively going through
110
- # its internal links. Each crawled web Document is yielded to a block.
147
+ # its internal links. Each crawled Document is yielded to a block.
111
148
  #
112
- # @param base_url [Wgit::Url] The base URL of the website to be crawled.
149
+ # Only redirects to the same host are followed. For example, the Url
150
+ # 'http://www.example.co.uk/how' has a host of 'www.example.co.uk' meaning
151
+ # a link which redirects to 'https://ftp.example.co.uk' or
152
+ # 'https://www.example.com' will not be followed. The only exception to
153
+ # this is the initially crawled url which is allowed to redirect anywhere;
154
+ # it's host is then used for other link redirections on the site, as
155
+ # described above.
156
+ #
157
+ # @param url [Wgit::Url] The base URL of the website to be crawled.
158
+ # It is recommended that this URL be the index page of the site to give a
159
+ # greater chance of finding all pages within that site/host.
113
160
  # @yield [Wgit::Document] Given each crawled Document/page of the site.
114
161
  # A block is the only way to interact with each crawled Document.
115
162
  # @return [Array<Wgit::Url>, nil] Unique Array of external urls collected
116
- # from all of the site's pages or nil if the base_url could not be
163
+ # from all of the site's pages or nil if the url could not be
117
164
  # crawled successfully.
118
- def crawl_site(base_url = @urls.first, &block)
119
- assert_type(base_url, Wgit::Url)
165
+ def crawl_site(url = @urls.first, &block)
166
+ assert_type(url, Wgit::Url)
120
167
 
121
- doc = crawl_url(base_url, follow_external_redirects: false, &block)
168
+ doc = crawl_url(url, &block)
122
169
  return nil if doc.nil?
123
170
 
124
- path = base_url.path.nil? ? '/' : base_url.path
125
- crawled_urls = [path]
126
- external_urls = doc.external_links
127
- internal_urls = get_internal_links(doc)
171
+ host = url.to_base
172
+ alt_url = url.end_with?('/') ? url.chop : url + '/'
173
+ crawled = [url, alt_url]
174
+ externals = doc.external_links
175
+ internals = get_internal_links(doc)
128
176
 
129
- return doc.external_links.uniq if internal_urls.empty?
177
+ return doc.external_links.uniq if internals.empty?
130
178
 
131
179
  loop do
132
- internal_urls.uniq!
180
+ crawled.uniq!
181
+ internals.uniq!
133
182
 
134
- links = internal_urls - crawled_urls
183
+ links = internals - crawled
135
184
  break if links.empty?
136
185
 
137
186
  links.each do |link|
187
+ orig_link = link.dup
138
188
  doc = crawl_url(
139
- Wgit::Url.concat(base_url.to_base, link),
140
- follow_external_redirects: false,
141
- &block
189
+ link, follow_external_redirects: false, host: host, &block
142
190
  )
143
191
 
144
- crawled_urls << link
192
+ crawled.push(orig_link, link) # Push both in case of redirects.
145
193
  next if doc.nil?
146
194
 
147
- internal_urls.concat(get_internal_links(doc))
148
- external_urls.concat(doc.external_links)
195
+ internals.concat(get_internal_links(doc))
196
+ externals.concat(doc.external_links)
149
197
  end
150
198
  end
151
199
 
152
- external_urls.uniq
200
+ externals.uniq
153
201
  end
154
202
 
155
203
  private
@@ -168,8 +216,13 @@ module Wgit
168
216
  # The fetch method performs a HTTP GET to obtain the HTML document.
169
217
  # Invalid urls or any HTTP response that doesn't return a HTML body will be
170
218
  # ignored and nil will be returned. Otherwise, the HTML is returned.
171
- def fetch(url, follow_external_redirects: true)
172
- response = resolve(url, follow_external_redirects: follow_external_redirects)
219
+ # External redirects are followed by default but can be disabled.
220
+ def fetch(url, follow_external_redirects: true, host: nil)
221
+ response = resolve(
222
+ url,
223
+ follow_external_redirects: follow_external_redirects,
224
+ host: host
225
+ )
173
226
  @last_response = response
174
227
  response.body.empty? ? nil : response.body
175
228
  rescue Exception => ex
@@ -183,28 +236,35 @@ module Wgit
183
236
  # The resolve method performs a HTTP GET to obtain the HTML document.
184
237
  # A certain amount of redirects will be followed by default before raising
185
238
  # an exception. Redirects can be disabled by setting `redirect_limit: 0`.
239
+ # External redirects are followed by default but can be disabled.
186
240
  # The Net::HTTPResponse will be returned.
187
241
  def resolve(
188
242
  url,
189
243
  redirect_limit: Wgit::Crawler.default_redirect_limit,
190
- follow_external_redirects: true
244
+ follow_external_redirects: true,
245
+ host: nil
191
246
  )
192
- raise 'url must respond to :normalise' unless url.respond_to?(:normalise)
193
- redirect_count = -1
247
+ raise 'url must respond to :to_uri' unless url.respond_to?(:to_uri)
248
+ redirect_count = 0
194
249
 
195
250
  begin
196
- raise 'Too many redirects' if redirect_count >= redirect_limit
197
- redirect_count += 1
198
-
199
251
  response = Net::HTTP.get_response(url.to_uri)
200
252
  location = Wgit::Url.new(response.fetch('location', ''))
201
253
 
254
+ yield(url, response, location) if block_given?
255
+
202
256
  if not location.empty?
203
- if !follow_external_redirects and !location.is_relative?
204
- raise 'External redirect encountered but not allowed'
257
+ if !follow_external_redirects and
258
+ !location.is_relative?(host: host)
259
+ raise "External redirect not allowed - Redirected to: \
260
+ '#{location}', which is outside of host: '#{host}'"
205
261
  end
206
262
 
207
- url = location.is_relative? ? url.to_base.concat(location) : location
263
+ raise 'Too many redirects' if redirect_count >= redirect_limit
264
+ redirect_count += 1
265
+
266
+ location = url.to_base.concat(location) if location.is_relative?
267
+ url.replace(location)
208
268
  end
209
269
  end while response.is_a?(Net::HTTPRedirection)
210
270
 
@@ -217,10 +277,13 @@ module Wgit
217
277
  @urls << Wgit::Url.new(url)
218
278
  end
219
279
 
220
- # Pull out the doc's internal HTML page links for crawling.
280
+ # Returns doc's internal HTML page links in absolute form for crawling.
281
+ # We remove anchors because they are client side and don't change the
282
+ # resulting page's HTML; unlike query strings for example, which do.
221
283
  def get_internal_links(doc)
222
- doc.
223
- internal_links_without_anchors.
284
+ doc.internal_full_links.
285
+ map(&:without_anchor).
286
+ uniq.
224
287
  reject do |link|
225
288
  ext = link.to_extension
226
289
  ext ? !['htm', 'html'].include?(ext) : false
data/lib/wgit/document.rb CHANGED
@@ -126,6 +126,38 @@ module Wgit
126
126
  @url.date_crawled
127
127
  end
128
128
 
129
+ # Returns the base URL of this Wgit::Document. The base URL is either the
130
+ # <base> element's href value or @url (if @base is nil). If @base is
131
+ # present and relative, then @url.to_base + @base is returned. This method
132
+ # should be used instead of `doc.url.to_base` etc. if manually building
133
+ # absolute links.
134
+ #
135
+ # Provide the `link:` parameter to get the correct base URL for that type
136
+ # of link. For example, a link of `#top` would always return @url because
137
+ # it applies to that page, not a different one. Query strings work in the
138
+ # same way. Use this parameter if manually concatting links e.g.
139
+ # `absolute_link = doc.base_url(link: link).concat(link)` etc.
140
+ #
141
+ # @param link [Wgit::Url] The link to obtain the correct base URL for.
142
+ # @return [Wgit::Url] The base URL of this Document e.g.
143
+ # 'http://example.com/public'.
144
+ def base_url(link: nil)
145
+ get_base = -> { @base.is_relative? ? @url.to_base.concat(@base) : @base }
146
+
147
+ if link
148
+ assert_type(link, Wgit::Url)
149
+ raise "link must be relative: #{link}" unless link.is_relative?
150
+
151
+ if link.is_anchor? or link.is_query_string?
152
+ base_url = @base ? get_base.call : @url
153
+ return base_url.without_anchor.without_query_string
154
+ end
155
+ end
156
+
157
+ base_url = @base ? get_base.call : @url.base
158
+ base_url.without_anchor.without_query_string
159
+ end
160
+
129
161
  # Returns a Hash containing this Document's instance vars.
130
162
  # Used when storing the Document in a Database e.g. MongoDB etc.
131
163
  # By default the @html var is excluded from the returned Hash.
@@ -209,23 +241,19 @@ module Wgit
209
241
  end
210
242
 
211
243
  # Get all the internal links of this Document in relative form. Internal
212
- # meaning a link to another document on this domain. This Document's domain
244
+ # meaning a link to another document on the same host. This Document's host
213
245
  # is used to determine if an absolute URL is actually a relative link e.g.
214
- # For a Document representing http://server.com/about, an absolute link of
215
- # <a href='http://server.com/search'> will be recognized and returned as an
216
- # internal link because both Documents live on the same domain. Also see
217
- # Wgit::Document#internal_full_links.
246
+ # For a Document representing http://www.server.com/about, an absolute link
247
+ # of <a href='http://www.server.com/search'> will be recognized and
248
+ # returned as an internal link because both Documents live on the same
249
+ # host. Also see Wgit::Document#internal_full_links.
218
250
  #
219
251
  # @return [Array<Wgit::Url>] self's internal/relative URL's.
220
252
  def internal_links
221
253
  return [] if @links.empty?
222
254
 
223
255
  links = @links.
224
- reject do |link|
225
- not link.relative_link?(base: @url.to_base)
226
- rescue
227
- true
228
- end.
256
+ reject { |link| !link.is_relative?(host: @url.to_base) }.
229
257
  map(&:without_base).
230
258
  map do |link| # We map @url.to_host into / because it's a duplicate.
231
259
  link.to_host == @url.to_host ? Wgit::Url.new('/') : link
@@ -234,19 +262,6 @@ module Wgit
234
262
  Wgit::Utils.process_arr(links)
235
263
  end
236
264
 
237
- # Get all the internal links of this Document with their anchors removed
238
- # (if present). Also see Wgit::Document#internal_links.
239
- #
240
- # @return [Array<Wgit::Url>] self's internal/relative URL's with their
241
- # anchors removed.
242
- def internal_links_without_anchors
243
- in_links = internal_links
244
- return [] if in_links.empty?
245
- in_links.
246
- map(&:without_anchor).
247
- reject(&:empty?)
248
- end
249
-
250
265
  # Get all the internal links of this Document and append them to this
251
266
  # Document's base URL making them absolute. Also see
252
267
  # Wgit::Document#internal_links.
@@ -254,24 +269,20 @@ module Wgit
254
269
  # @return [Array<Wgit::Url>] self's internal/relative URL's in absolute
255
270
  # form.
256
271
  def internal_full_links
257
- in_links = internal_links
258
- return [] if in_links.empty?
259
- in_links.map { |link| @url.to_base.concat(link) }
272
+ links = internal_links
273
+ return [] if links.empty?
274
+ links.map { |link| base_url(link: link).concat(link) }
260
275
  end
261
276
 
262
277
  # Get all the external links of this Document. External meaning a link to
263
- # a different domain.
278
+ # a different host.
264
279
  #
265
280
  # @return [Array<Wgit::Url>] self's external/absolute URL's.
266
281
  def external_links
267
282
  return [] if @links.empty?
268
283
 
269
284
  links = @links.
270
- reject do |link|
271
- link.relative_link?(base: @url.to_base)
272
- rescue
273
- true
274
- end.
285
+ reject { |link| link.relative_link?(host: @url.to_base) }.
275
286
  map(&:without_trailing_slash)
276
287
 
277
288
  Wgit::Utils.process_arr(links)
@@ -506,6 +517,9 @@ module Wgit
506
517
  alias :relative_urls :internal_links
507
518
  alias :relative_full_links :internal_full_links
508
519
  alias :relative_full_urls :internal_full_links
520
+ alias :internal_absolute_links :internal_full_links
521
+ alias :relative_absolute_links :internal_full_links
522
+ alias :relative_absolute_urls :internal_full_links
509
523
  alias :external_urls :external_links
510
524
  end
511
525
  end
@@ -1,5 +1,15 @@
1
1
  ### Default Document Extensions ###
2
2
 
3
+ # Base.
4
+ Wgit::Document.define_extension(
5
+ :base,
6
+ '//base/@href',
7
+ singleton: true,
8
+ text_content_only: true,
9
+ ) do |base|
10
+ base = Wgit::Url.new(base) if base
11
+ end
12
+
3
13
  # Title.
4
14
  Wgit::Document.define_extension(
5
15
  :title,
@@ -37,15 +47,7 @@ Wgit::Document.define_extension(
37
47
  singleton: false,
38
48
  text_content_only: true,
39
49
  ) do |links|
40
- if links
41
- links.map! do |link|
42
- Wgit::Url.new(link)
43
- rescue
44
- nil
45
- end
46
- links.compact!
47
- end
48
- links
50
+ links.map! { |link| Wgit::Url.new(link) } if links
49
51
  end
50
52
 
51
53
  # Text.
data/lib/wgit/indexer.rb CHANGED
@@ -219,7 +219,7 @@ site: #{url}")
219
219
  # manipulation. Return nil or false from the block to prevent the
220
220
  # document from being saved into the database.
221
221
  def index_this_page(url, insert_externals = true)
222
- doc = @crawler.crawl_page(url) do |doc|
222
+ document = @crawler.crawl_page(url) do |doc|
223
223
  result = true
224
224
  if block_given?
225
225
  result = yield(doc)
@@ -236,7 +236,7 @@ site: #{url}")
236
236
  @db.url?(url) ? @db.update(url) : @db.insert(url)
237
237
 
238
238
  if insert_externals
239
- ext_urls = doc.external_links
239
+ ext_urls = document.external_links
240
240
  write_urls_to_db(ext_urls)
241
241
  Wgit.logger.info("Found and saved #{ext_urls.length} external url(s)")
242
242
  end
data/lib/wgit/logger.rb CHANGED
@@ -23,7 +23,11 @@ module Wgit
23
23
  # Returns the default Logger instance.
24
24
  # @return [Logger] The default Logger instance.
25
25
  def self.default_logger
26
- Logger.new(STDOUT, progname: 'wgit', level: :info)
26
+ logger = Logger.new(STDOUT, progname: 'wgit', level: :info)
27
+ logger.formatter = proc do |severity, datetime, progname, msg|
28
+ "[#{progname}] #{msg}\n"
29
+ end
30
+ logger
27
31
  end
28
32
 
29
33
  # Sets the default Logger instance to be used by Wgit.
data/lib/wgit/url.rb CHANGED
@@ -117,30 +117,54 @@ module Wgit
117
117
  Wgit::Url.new(host + separator + link)
118
118
  end
119
119
 
120
+ # Overrides String#replace, setting both @uri and the String value to new_url.
121
+ #
122
+ # @param new_url [Wgit::Url, String] The new URL value.
123
+ # @return [String] The new URL value once set.
124
+ def replace(new_url)
125
+ @uri = Addressable::URI.parse(new_url)
126
+ super(new_url)
127
+ end
128
+
120
129
  # Returns true if self is a relative Url.
121
130
  #
122
131
  # All external links in a page are expected to have a protocol prefix e.g.
123
132
  # "http://", otherwise the link is treated as an internal link (regardless
124
- # of whether it's valid or not). The only exception is if base is provided
125
- # and self is a page within that site; then the link is relative.
133
+ # of whether it's valid or not). The only exception is if host or domain is
134
+ # provided and self is a page belonging to that host/domain; then the link
135
+ # is relative.
126
136
  #
127
- # @param base [Wgit::Url, String] The Url base e.g. http://www.google.com.
137
+ # @param host [Wgit::Url, String] The Url host e.g.
138
+ # http://www.google.com/how which gives a host of www.google.com.
139
+ # The host must be absolute and prefixed with a protocol.
140
+ # @param domain [Wgit::Url, String] The Url domain e.g.
141
+ # http://www.google.com/how which gives a domain of google.com. The
142
+ # domain must be absolute and prefixed with a protocol.
128
143
  # @return [Boolean] True if relative, false if absolute.
129
144
  # @raise [RuntimeError] If self is invalid e.g. empty.
130
- def is_relative?(base: nil)
145
+ def is_relative?(host: nil, domain: nil)
131
146
  raise "Invalid link: #{self}" if nil? or empty?
147
+ raise "Provide host or domain, not both" if host and domain
148
+
149
+ if host
150
+ host = Wgit::Url.new(host)
151
+ if host.to_base.nil?
152
+ raise "Invalid host, must be absolute and contain protocol: #{host}"
153
+ end
154
+ end
132
155
 
133
- if base
134
- base = Wgit::Url.new(base)
135
- if base.to_scheme.nil?
136
- raise "Invalid base, must contain protocol prefix: #{base}"
156
+ if domain
157
+ domain = Wgit::Url.new(domain)
158
+ if domain.to_base.nil?
159
+ raise "Invalid domain, must be absolute and contain protocol: #{domain}"
137
160
  end
138
161
  end
139
162
 
140
163
  if @uri.relative?
141
164
  true
142
165
  else
143
- base ? to_host == base.to_host : false
166
+ return host ? to_host == host.to_host : false if host
167
+ return domain ? to_domain == domain.to_domain : false if domain
144
168
  end
145
169
  end
146
170
 
@@ -207,6 +231,15 @@ module Wgit
207
231
  host ? Wgit::Url.new(host) : nil
208
232
  end
209
233
 
234
+ # Returns a new Wgit::Url containing just the domain of this URL e.g.
235
+ # Given http://www.google.co.uk/about.html, google.co.uk is returned.
236
+ #
237
+ # @return [Wgit::Url, nil] Containing just the domain or nil.
238
+ def to_domain
239
+ domain = @uri.domain
240
+ domain ? Wgit::Url.new(domain) : nil
241
+ end
242
+
210
243
  # Returns only the base of this URL e.g. the protocol and host combined.
211
244
  #
212
245
  # @return [Wgit::Url, nil] Base of self e.g. http://www.google.co.uk or nil.
@@ -226,9 +259,7 @@ module Wgit
226
259
  path = @uri.path
227
260
  return nil if path.nil? or path.empty?
228
261
  return Wgit::Url.new('/') if path == '/'
229
- Wgit::Url.new(path).
230
- without_leading_slash.
231
- without_trailing_slash
262
+ Wgit::Url.new(path).without_slashes
232
263
  end
233
264
 
234
265
  # Returns the endpoint of this URL e.g. the bit after the host with any
@@ -245,12 +276,12 @@ module Wgit
245
276
  end
246
277
 
247
278
  # Returns a new Wgit::Url containing just the query string of this URL
248
- # e.g. Given http://google.com?q=ruby, 'ruby' is returned.
279
+ # e.g. Given http://google.com?q=ruby, '?q=ruby' is returned.
249
280
  #
250
281
  # @return [Wgit::Url, nil] Containing just the query string or nil.
251
282
  def to_query_string
252
283
  query = @uri.query
253
- query ? Wgit::Url.new(query) : nil
284
+ query ? Wgit::Url.new("?#{query}") : nil
254
285
  end
255
286
 
256
287
  # Returns a new Wgit::Url containing just the anchor string of this URL
@@ -313,9 +344,21 @@ module Wgit
313
344
  without_base = base_url ? gsub(base_url, '') : self
314
345
 
315
346
  return self if ['', '/'].include?(without_base)
316
- Wgit::Url.new(without_base).
317
- without_leading_slash.
318
- without_trailing_slash
347
+ Wgit::Url.new(without_base).without_slashes
348
+ end
349
+
350
+ # Returns a new Wgit::Url with the query string portion removed e.g. Given
351
+ # http://google.com/search?q=hello, http://google.com/search is
352
+ # returned. Self is returned as is if no query string is present. A URL
353
+ # consisting of only a query string e.g. '?q=hello' will return an empty
354
+ # URL.
355
+ #
356
+ # @return [Wgit::Url] Self with the query string portion removed.
357
+ def without_query_string
358
+ query = to_query_string
359
+ without_query_string = query ? gsub(query, '') : self
360
+
361
+ Wgit::Url.new(without_query_string)
319
362
  end
320
363
 
321
364
  # Returns a new Wgit::Url with the anchor portion removed e.g. Given
@@ -333,6 +376,20 @@ module Wgit
333
376
  Wgit::Url.new(without_anchor)
334
377
  end
335
378
 
379
+ # Returns true if self is a URL query string e.g. ?q=hello etc.
380
+ #
381
+ # @return [Boolean] True if self is a query string, false otherwise.
382
+ def is_query_string?
383
+ start_with?('?')
384
+ end
385
+
386
+ # Returns true if self is a URL anchor/fragment e.g. #top etc.
387
+ #
388
+ # @return [Boolean] True if self is an anchor/fragment, false otherwise.
389
+ def is_anchor?
390
+ start_with?('#')
391
+ end
392
+
336
393
  # Returns a Hash containing this Url's instance vars excluding @uri.
337
394
  # Used when storing the URL in a Database e.g. MongoDB etc.
338
395
  #
@@ -349,6 +406,7 @@ module Wgit
349
406
  alias :to_protocol :to_scheme
350
407
  alias :protocol :to_scheme
351
408
  alias :host :to_host
409
+ alias :domain :to_domain
352
410
  alias :base :to_base
353
411
  alias :path :to_path
354
412
  alias :endpoint :to_endpoint
@@ -358,10 +416,14 @@ module Wgit
358
416
  alias :to_fragment :to_anchor
359
417
  alias :fragment :to_anchor
360
418
  alias :extension :to_extension
419
+ alias :without_query :without_query_string
361
420
  alias :without_fragment :without_anchor
421
+ alias :is_query? :is_query_string?
422
+ alias :is_fragment? :is_anchor?
362
423
  alias :relative_link? :is_relative?
363
424
  alias :internal_link? :is_relative?
364
425
  alias :is_internal? :is_relative?
426
+ alias :relative? :is_relative?
365
427
  alias :crawled? :crawled
366
428
  alias :normalize :normalise
367
429
  end
data/lib/wgit/version.rb CHANGED
@@ -3,5 +3,5 @@
3
3
  # @author Michael Telford
4
4
  module Wgit
5
5
  # The current gem version of Wgit.
6
- VERSION = "0.0.16".freeze
6
+ VERSION = "0.0.17".freeze
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wgit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.16
4
+ version: 0.0.17
5
5
  platform: ruby
6
6
  authors:
7
7
  - Michael Telford
@@ -128,20 +128,6 @@ dependencies:
128
128
  - - "~>"
129
129
  - !ruby/object:Gem::Version
130
130
  version: '3.6'
131
- - !ruby/object:Gem::Dependency
132
- name: rack
133
- requirement: !ruby/object:Gem::Requirement
134
- requirements:
135
- - - "~>"
136
- - !ruby/object:Gem::Version
137
- version: '2.0'
138
- type: :development
139
- prerelease: false
140
- version_requirements: !ruby/object:Gem::Requirement
141
- requirements:
142
- - - "~>"
143
- - !ruby/object:Gem::Version
144
- version: '2.0'
145
131
  - !ruby/object:Gem::Dependency
146
132
  name: addressable
147
133
  requirement: !ruby/object:Gem::Requirement